Commit 487e2c9f authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'afs-next-20171113' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs

Pull AFS updates from David Howells:
 "kAFS filesystem driver overhaul.

  The major points of the overhaul are:

   (1) Preliminary groundwork is laid for supporting network-namespacing
       of kAFS. The remainder of the namespacing work requires some way
       to pass namespace information to submounts triggered by an
       automount. This requires something like the mount overhaul that's
       in progress.

   (2) sockaddr_rxrpc is used in preference to in_addr for holding
       addresses internally, and support is added for talking to the YFS VL
       server. With this, kAFS can do everything over IPv6 as well as
       IPv4 if it's talking to servers that support it.

   (3) Callback handling is overhauled to be generally passive rather
       than active. 'Callbacks' are promises by the server to tell us
       about data and metadata changes. Callbacks are now checked when
       we next touch an inode rather than actively going and looking for
       it where possible.

   (4) File access permit caching is overhauled to store the caching
       information per-inode rather than per-directory, shared over
       subordinate files. Whilst older AFS servers only allow ACLs on
       directories (shared to the files in that directory), newer AFS
       servers break that restriction.

       To improve memory usage and to make it easier to do mass-key
       removal, permit combinations are cached and shared.

   (5) Cell database management is overhauled to allow lighter locks to
       be used and to make cell records autonomous state machines that
       look after getting their own DNS records and cleaning themselves
       up, in particular preventing races in acquiring and relinquishing
       the fscache token for the cell.

   (6) Volume caching is overhauled. The afs_vlocation record is got rid
       of to simplify things and the superblock is now keyed on the cell
       and the numeric volume ID only. The volume record is tied to a
       superblock and normal superblock management is used to mediate
       the lifetime of the volume fscache token.

   (7) File server record caching is overhauled to make server records
       independent of cells and volumes. A server can be in multiple
       cells (in such a case, the administrator must make sure that the
       VL services for all cells correctly reflect the volumes shared
       between those cells).

       Server records are now indexed using the UUID of the server
       rather than the address since a server can have multiple
       addresses.

   (8) File server rotation is overhauled to handle VMOVED, VBUSY (and
       similar), VOFFLINE and VNOVOL indications and to handle rotation
       both of servers and addresses of those servers. The rotation will
       also wait and retry if the server says it is busy.

   (9) Data writeback is overhauled. Each inode no longer stores a list
       of modified sections tagged with the key that authorised it in
       favour of noting the modified region of a page in page->private
       and storing a list of keys that made modifications in the inode.

       This simplifies things and allows other keys to be used to
       actually write to the server if a key that made a modification
       becomes useless.

  (10) Writable mmap() is implemented. This allows a kernel to be built
       entirely on AFS.

  Note that Pre AFS-3.4 servers are no longer supported, though this can
  be added back if necessary (AFS-3.4 was released in 1998)"

* tag 'afs-next-20171113' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs: (35 commits)
  afs: Protect call->state changes against signals
  afs: Trace page dirty/clean
  afs: Implement shared-writeable mmap
  afs: Get rid of the afs_writeback record
  afs: Introduce a file-private data record
  afs: Use a dynamic port if 7001 is in use
  afs: Fix directory read/modify race
  afs: Trace the sending of pages
  afs: Trace the initiation and completion of client calls
  afs: Fix documentation on # vs % prefix in mount source specification
  afs: Fix total-length calculation for multiple-page send
  afs: Only progress call state at end of Tx phase from rxrpc callback
  afs: Make use of the YFS service upgrade to fully support IPv6
  afs: Overhaul volume and server record caching and fileserver rotation
  afs: Move server rotation code into its own file
  afs: Add an address list concept
  afs: Overhaul cell database management
  afs: Overhaul permit caching
  afs: Overhaul the callback handling
  afs: Rename struct afs_call server member to cm_server
  ...
parents b630a23a 98bf40cd
......@@ -91,8 +91,8 @@ Filesystems can be mounted anywhere by commands similar to the following:
mount -t afs "#root.cell." /afs/cambridge
Where the initial character is either a hash or a percent symbol depending on
whether you definitely want a R/W volume (hash) or whether you'd prefer a R/O
volume, but are willing to use a R/W volume instead (percent).
whether you definitely want a R/W volume (percent) or whether you'd prefer a
R/O volume, but are willing to use a R/W volume instead (hash).
The name of the volume can be suffixed with ".backup" or ".readonly" to
specify connection to only volumes of those types.
......
......@@ -1233,18 +1233,6 @@ static int default_cu2_call(struct notifier_block *nfb, unsigned long action,
return NOTIFY_OK;
}
static int wait_on_fp_mode_switch(atomic_t *p)
{
/*
* The FP mode for this task is currently being switched. That may
* involve modifications to the format of this tasks FP context which
* make it unsafe to proceed with execution for the moment. Instead,
* schedule some other task.
*/
schedule();
return 0;
}
static int enable_restore_fp_context(int msa)
{
int err, was_fpu_owner, prior_msa;
......@@ -1254,7 +1242,7 @@ static int enable_restore_fp_context(int msa)
* complete before proceeding.
*/
wait_on_atomic_t(&current->mm->context.fp_mode_switching,
wait_on_fp_mode_switch, TASK_KILLABLE);
atomic_t_wait, TASK_KILLABLE);
if (!used_math()) {
/* First time FP context user. */
......
......@@ -263,12 +263,6 @@ static struct drm_dp_aux_dev *drm_dp_aux_dev_get_by_aux(struct drm_dp_aux *aux)
return aux_dev;
}
static int auxdev_wait_atomic_t(atomic_t *p)
{
schedule();
return 0;
}
void drm_dp_aux_unregister_devnode(struct drm_dp_aux *aux)
{
struct drm_dp_aux_dev *aux_dev;
......@@ -283,7 +277,7 @@ void drm_dp_aux_unregister_devnode(struct drm_dp_aux *aux)
mutex_unlock(&aux_idr_mutex);
atomic_dec(&aux_dev->usecount);
wait_on_atomic_t(&aux_dev->usecount, auxdev_wait_atomic_t,
wait_on_atomic_t(&aux_dev->usecount, atomic_t_wait,
TASK_UNINTERRUPTIBLE);
minor = aux_dev->index;
......
......@@ -271,13 +271,7 @@ struct igt_wakeup {
u32 seqno;
};
static int wait_atomic(atomic_t *p)
{
schedule();
return 0;
}
static int wait_atomic_timeout(atomic_t *p)
static int wait_atomic_timeout(atomic_t *p, unsigned int mode)
{
return schedule_timeout(10 * HZ) ? 0 : -ETIMEDOUT;
}
......@@ -348,7 +342,7 @@ static void igt_wake_all_sync(atomic_t *ready,
atomic_set(ready, 0);
wake_up_all(wq);
wait_on_atomic_t(set, wait_atomic, TASK_UNINTERRUPTIBLE);
wait_on_atomic_t(set, atomic_t_wait, TASK_UNINTERRUPTIBLE);
atomic_set(ready, count);
atomic_set(done, count);
}
......
......@@ -88,12 +88,6 @@ int hfi_core_init(struct venus_core *core)
return ret;
}
static int core_deinit_wait_atomic_t(atomic_t *p)
{
schedule();
return 0;
}
int hfi_core_deinit(struct venus_core *core, bool blocking)
{
int ret = 0, empty;
......@@ -112,7 +106,7 @@ int hfi_core_deinit(struct venus_core *core, bool blocking)
if (!empty) {
mutex_unlock(&core->lock);
wait_on_atomic_t(&core->insts_count, core_deinit_wait_atomic_t,
wait_on_atomic_t(&core->insts_count, atomic_t_wait,
TASK_UNINTERRUPTIBLE);
mutex_lock(&core->lock);
}
......
......@@ -7,6 +7,7 @@ afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
kafs-objs := \
$(afs-cache-y) \
addr_list.o \
callback.o \
cell.o \
cmservice.o \
......@@ -19,14 +20,14 @@ kafs-objs := \
misc.o \
mntpt.o \
proc.o \
rotate.o \
rxrpc.o \
security.o \
server.o \
server_list.o \
super.o \
netdevices.o \
vlclient.o \
vlocation.o \
vnode.o \
volume.o \
write.o \
xattr.o
......
/* Server address list management
*
* Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public Licence
* as published by the Free Software Foundation; either version
* 2 of the Licence, or (at your option) any later version.
*/
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/dns_resolver.h>
#include <linux/inet.h>
#include <keys/rxrpc-type.h>
#include "internal.h"
#include "afs_fs.h"
//#define AFS_MAX_ADDRESSES
// ((unsigned int)((PAGE_SIZE - sizeof(struct afs_addr_list)) /
// sizeof(struct sockaddr_rxrpc)))
#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
/*
 * Drop a reference on an address list.
 *
 * A NULL @alist is ignored.  When the reference count reaches zero, the
 * actual freeing is handed to call_rcu() rather than being done directly.
 *
 * NOTE(review): kfree() is cast to an RCU callback and will therefore be
 * passed &alist->rcu; this presumably relies on the rcu member sitting at the
 * start of struct afs_addr_list - confirm against the struct definition.
 */
void afs_put_addrlist(struct afs_addr_list *alist)
{
	if (!alist)
		return;

	/* Only the holder of the final reference disposes of the list. */
	if (!refcount_dec_and_test(&alist->usage))
		return;

	call_rcu(&alist->rcu, (rcu_callback_t)kfree);
}
/*
* Allocate an address list.
*/
struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
unsigned short service,
unsigned short port)
{
struct afs_addr_list *alist;
unsigned int i;
_enter("%u,%u,%u", nr, service, port);
alist = kzalloc(sizeof(*alist) + sizeof(alist->addrs[0]) * nr,
GFP_KERNEL);
if (!alist)
return NULL;
refcount_set(&alist->usage, 1);
for (i = 0; i < nr; i++) {
struct sockaddr_rxrpc *srx = &alist->addrs[i];
srx->srx_family = AF_RXRPC;
srx->srx_service = service;
srx->transport_type = SOCK_DGRAM;
srx->transport_len = sizeof(srx->transport.sin6);
srx->transport.sin6.sin6_family = AF_INET6;
srx->transport.sin6.sin6_port = htons(port);
}
return alist;
}
/*
 * Parse a text string consisting of delimited addresses.
 *
 * @text need not be NUL-terminated; exactly @len bytes are examined.  Each
 * element is an IPv4 or IPv6 address, optionally wrapped in square brackets
 * and optionally followed by "+<port>" to override @port.  Elements are
 * separated by @delim; empty elements are skipped.  If @delim is ':' but the
 * text contains a ',' or contains no '.', ',' is used as the delimiter
 * instead.
 *
 * IPv4 addresses are stored in IPv4-mapped form (::ffff:a.b.c.d) so that every
 * entry is an AF_INET6 transport address.  At most AFS_MAX_ADDRESSES entries
 * are extracted; any further elements are ignored.
 *
 * Returns a new address list holding one reference, or an error pointer:
 * -EDESTADDRREQ if @len is zero, -EINVAL if the text is malformed and -ENOMEM
 * if the list cannot be allocated.
 */
struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
					   char delim,
					   unsigned short service,
					   unsigned short port)
{
	struct afs_addr_list *alist;
	const char *p, *end = text + len;
	unsigned int nr = 0;

	_enter("%*.*s,%c", (int)len, (int)len, text, delim);

	if (!len)
		return ERR_PTR(-EDESTADDRREQ);

	if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len)))
		delim = ',';

	/* Count the addresses */
	p = text;
	do {
		if (!*p)
			return ERR_PTR(-EINVAL);
		if (*p == delim) {
			/* Step over an empty element.  A bare "continue" would
			 * re-test "p < end" without moving p and so loop
			 * forever on a leading or doubled delimiter.
			 */
			p++;
			continue;
		}
		nr++;
		if (*p == '[') {
			/* Skip the bracketed part so that any delimiter
			 * characters inside it are not counted.
			 */
			p++;
			if (p == end)
				return ERR_PTR(-EINVAL);
			p = memchr(p, ']', end - p);
			if (!p)
				return ERR_PTR(-EINVAL);
			p++;
			if (p >= end)
				break;
		}

		p = memchr(p, delim, end - p);
		if (!p)
			break;
		p++;
	} while (p < end);

	_debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES);
	if (nr > AFS_MAX_ADDRESSES)
		nr = AFS_MAX_ADDRESSES;

	alist = afs_alloc_addrlist(nr, service, port);
	if (!alist)
		return ERR_PTR(-ENOMEM);

	/* Extract the addresses.  The loop is bounded by the number of slots
	 * actually allocated so that it can never write past the end of
	 * alist->addrs[], whatever the text looks like.
	 */
	p = text;
	while (p < end && alist->nr_addrs < nr) {
		struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
		char tdelim = delim;

		if (*p == delim) {
			p++;
			continue;
		}

		if (*p == '[') {
			p++;
			tdelim = ']';
		}

		if (in4_pton(p, end - p,
			     (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
			     tdelim, &p)) {
			/* Turn the IPv4 address into an IPv4-mapped one */
			srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
			srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
			srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
		} else if (in6_pton(p, end - p,
				    srx->transport.sin6.sin6_addr.s6_addr,
				    tdelim, &p)) {
			/* Nothing to do */
		} else {
			goto bad_address;
		}

		if (tdelim == ']') {
			if (p == end || *p != ']')
				goto bad_address;
			p++;
		}

		if (p < end) {
			if (*p == '+') {
				/* Port number specification "+1234" */
				unsigned int xport = 0;

				p++;
				if (p >= end || !isdigit(*p))
					goto bad_address;
				do {
					xport *= 10;
					xport += *p - '0';
					if (xport > 65535)
						goto bad_address;
					p++;
				} while (p < end && isdigit(*p));
				srx->transport.sin6.sin6_port = htons(xport);

				/* The port must end the element.  Otherwise the
				 * trailing text would be parsed as a further
				 * address that the counting pass above never
				 * accounted for.
				 */
				if (p < end) {
					if (*p != delim)
						goto bad_address;
					p++;
				}
			} else if (*p == delim) {
				p++;
			} else {
				goto bad_address;
			}
		}

		alist->nr_addrs++;
	}

	_leave(" = [nr %u]", alist->nr_addrs);
	return alist;

bad_address:
	/* The list has not been handed to anyone else yet, so it can be freed
	 * directly.
	 */
	kfree(alist);
	return ERR_PTR(-EINVAL);
}
/*
 * Compare old and new address lists to see if there's been any change.
 * - How to do this in better than O(Nlog(N)) time?
 * - We don't really want to sort the address list, but would rather take the
 * list as we got it so as not to undo record rotation by the DNS server.
 *
 * NOTE: this is an unimplemented placeholder.  It is compiled out by the
 * "#if 0" guard and its body is empty.
 */
#if 0
static int afs_cmp_addr_list(const struct afs_addr_list *a1,
const struct afs_addr_list *a2)
{
}
#endif
/*
 * Perform a DNS query for VL servers and build up an address list.
 *
 * An "afsdb" query is issued for the cell's name and the text it returns is
 * parsed as a comma-separated list of addresses for the VL service on
 * AFS_VL_PORT.  @_expiry is passed straight through to dns_query().
 *
 * Returns the new address list, or an error pointer carrying either the
 * dns_query() error or the parse error.  Parse failures other than -ENOMEM
 * are logged.  The text buffer obtained from dns_query() is always freed
 * before returning.
 */
struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
{
	struct afs_addr_list *addrs;
	char *answer = NULL;
	int err;

	_enter("%s", cell->name);

	err = dns_query("afsdb", cell->name, cell->name_len,
			"ipv4", &answer, _expiry);
	if (err < 0)
		return ERR_PTR(err);

	addrs = afs_parse_text_addrs(answer, strlen(answer), ',',
				     VL_SERVICE, AFS_VL_PORT);

	/* The parsed list keeps no pointers into the text. */
	kfree(answer);

	if (IS_ERR(addrs) && addrs != ERR_PTR(-ENOMEM))
		pr_err("Failed to parse DNS data\n");

	return addrs;
}
/*
 * Merge an IPv4 entry into a fileserver address list.
 *
 * The IPv4 entries occupy the first nr_ipv4 slots of the list and are kept
 * ordered by the raw (network byte order) address value and then by the raw
 * port value.  If an entry with the same address and port is already present,
 * nothing is changed.  Otherwise the entries from the insertion point onwards
 * are shifted up by one and the new address is stored in IPv4-mapped form
 * (::ffff:a.b.c.d), after which nr_ipv4 and nr_addrs are both incremented.
 *
 * @xdr is the address in network byte order; @port is in host byte order.
 *
 * NOTE(review): no capacity check is made here, so the caller presumably
 * guarantees that the list was allocated with a spare slot - confirm.
 */
void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
{
	struct sockaddr_in6 *slot;
	__be16 net_port = htons(port);
	int pos = 0;

	/* Find the insertion point amongst the existing IPv4 entries. */
	while (pos < alist->nr_ipv4) {
		__be32 cur;

		slot = &alist->addrs[pos].transport.sin6;
		cur = slot->sin6_addr.s6_addr32[3];

		if (xdr < cur)
			break;
		if (xdr == cur) {
			if (net_port == slot->sin6_port)
				return;		/* Already listed */
			if (net_port < slot->sin6_port)
				break;
		}
		pos++;
	}

	/* Open up a gap, moving any later IPv4 and all IPv6 entries along. */
	if (pos < alist->nr_addrs)
		memmove(&alist->addrs[pos] + 1,
			&alist->addrs[pos],
			(alist->nr_addrs - pos) * sizeof(alist->addrs[0]));

	slot = &alist->addrs[pos].transport.sin6;
	slot->sin6_addr.s6_addr32[0] = 0;
	slot->sin6_addr.s6_addr32[1] = 0;
	slot->sin6_addr.s6_addr32[2] = htonl(0xffff);
	slot->sin6_addr.s6_addr32[3] = xdr;
	slot->sin6_port = net_port;

	alist->nr_ipv4++;
	alist->nr_addrs++;
}
/*
 * Merge an IPv6 entry into a fileserver address list.
 *
 * The IPv6 entries follow the IPv4 ones, occupying slots nr_ipv4 up to
 * nr_addrs, and are kept ordered by memcmp() of the 16-byte address and then
 * by the raw port value.  If an entry with the same address and port is
 * already present, nothing is changed.  Otherwise later entries are shifted up
 * by one, the new address is stored and nr_addrs is incremented.
 *
 * @xdr points to the four 32-bit words of the address in network byte order;
 * @port is in host byte order.
 *
 * NOTE(review): no capacity check is made here, so the caller presumably
 * guarantees that the list was allocated with a spare slot - confirm.
 */
void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
{
	struct sockaddr_in6 *slot;
	__be16 net_port = htons(port);
	int pos, word, order;

	/* Find the insertion point amongst the existing IPv6 entries. */
	pos = alist->nr_ipv4;
	while (pos < alist->nr_addrs) {
		slot = &alist->addrs[pos].transport.sin6;
		order = memcmp(xdr, &slot->sin6_addr, 16);

		if (order < 0)
			break;
		if (order == 0) {
			if (net_port == slot->sin6_port)
				return;		/* Already listed */
			if (net_port < slot->sin6_port)
				break;
		}
		pos++;
	}

	/* Open up a gap for the new entry. */
	if (pos < alist->nr_addrs)
		memmove(&alist->addrs[pos] + 1,
			&alist->addrs[pos],
			(alist->nr_addrs - pos) * sizeof(alist->addrs[0]));

	slot = &alist->addrs[pos].transport.sin6;
	for (word = 0; word < 4; word++)
		slot->sin6_addr.s6_addr32[word] = xdr[word];
	slot->sin6_port = net_port;

	alist->nr_addrs++;
}
/*
* Get an address to try.
*/
bool afs_iterate_addresses(struct afs_addr_cursor *ac)
{
_enter("%hu+%hd", ac->start, (short)ac->index);
if (!ac->alist)
return false;
if (ac->begun) {
ac->index++;
if (ac->index == ac->alist->nr_addrs)
ac->index = 0;
if (ac->index == ac->start) {
ac->error = -EDESTADDRREQ;
return false;
}
}
ac->begun = true;
ac->responded = false;
ac->addr = &ac->alist->addrs[ac->index];
return true;
}
/*
* Release an address list cursor.
*/
int afs_end_cursor(struct afs_addr_cursor *ac)
{
if (ac->responded && ac->index != ac->start)
WRITE_ONCE(ac->alist->index, ac->index);
afs_put_addrlist(ac->alist);
ac->alist = NULL;
return ac->error;
}
/*
 * Set the address cursor for iterating over VL servers.
 *
 * If the cell has no VL address list yet, wait (interruptibly) on the
 * AFS_CELL_FL_NO_LOOKUP_YET flag bit.  If there is still no list after that
 * and the recorded DNS expiry time has not yet been reached, the cell's stored
 * error is returned.
 *
 * Otherwise a reference is taken on the cell's current address list under
 * vl_addrs_lock and the cursor is initialised to begin at the list's
 * preferred index.
 *
 * Returns 0 on success, the negative result of the wait if it was
 * interrupted, cell->error as described above, or -EDESTADDRREQ if the cell
 * has no addresses to try.
 */
int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell)
{
	struct afs_addr_list *alist;
	int ret;

	if (!rcu_access_pointer(cell->vl_addrs)) {
		ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
				  TASK_INTERRUPTIBLE);
		if (ret < 0)
			return ret;

		if (!rcu_access_pointer(cell->vl_addrs) &&
		    ktime_get_real_seconds() < cell->dns_expiry)
			return cell->error;
	}

	read_lock(&cell->vl_addrs_lock);
	alist = rcu_dereference_protected(cell->vl_addrs,
					  lockdep_is_held(&cell->vl_addrs_lock));
	/* The list pointer may still be NULL here (no list was installed and
	 * the expiry time has passed), so check it before looking inside.
	 */
	if (alist && alist->nr_addrs > 0)
		afs_get_addrlist(alist);
	else
		alist = NULL;
	read_unlock(&cell->vl_addrs_lock);

	if (!alist)
		return -EDESTADDRREQ;

	ac->alist = alist;
	ac->addr = NULL;
	ac->start = READ_ONCE(alist->index);
	ac->index = ac->start;
	ac->error = 0;
	ac->begun = false;
	return 0;
}
......@@ -14,11 +14,14 @@
#include <linux/in.h>
#define AFS_MAXCELLNAME 64 /* maximum length of a cell name */
#define AFS_MAXVOLNAME 64 /* maximum length of a volume name */
#define AFSNAMEMAX 256 /* maximum length of a filename plus NUL */
#define AFSPATHMAX 1024 /* maximum length of a pathname plus NUL */
#define AFSOPAQUEMAX 1024 /* maximum length of an opaque field */
#define AFS_MAXCELLNAME 64 /* Maximum length of a cell name */
#define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */
#define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */
#define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */
#define AFS_MAXTYPES 3 /* Maximum number of volume types */
#define AFSNAMEMAX 256 /* Maximum length of a filename plus NUL */
#define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */
#define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */
typedef unsigned afs_volid_t;
typedef unsigned afs_vnodeid_t;
......@@ -72,6 +75,15 @@ struct afs_callback {
#define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */
/*
 * UUID as held in memory by AFS: a timestamp split into three fields, two
 * clock sequence bytes and a six-byte node ID.  The multi-byte fields are
 * declared big-endian.
 */
struct afs_uuid {
__be32 time_low; /* low part of timestamp */
__be16 time_mid; /* mid part of timestamp */
__be16 time_hi_and_version; /* high part of timestamp and version */
__s8 clock_seq_hi_and_reserved; /* clock seq hi and variant */
__s8 clock_seq_low; /* clock seq low */
__s8 node[6]; /* spatially unique node ID (MAC addr) */
};
/*
* AFS volume information
*/
......@@ -124,7 +136,6 @@ struct afs_file_status {
afs_access_t caller_access; /* access rights for authenticated caller */
afs_access_t anon_access; /* access rights for unauthenticated caller */
umode_t mode; /* UNIX mode */
struct afs_fid parent; /* parent dir ID for non-dirs only */
time_t mtime_client; /* last time client changed data */
time_t mtime_server; /* last time server changed data */
s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */
......@@ -167,4 +178,16 @@ struct afs_volume_status {
#define AFS_BLOCK_SIZE 1024
/*
 * XDR encoding of UUID in AFS.
 *
 * Every field occupies a full big-endian 32-bit word in this form, including
 * each of the six node elements.  The field names match those of struct
 * afs_uuid.
 */
struct afs_uuid__xdr {
__be32 time_low;
__be32 time_mid;
__be32 time_hi_and_version;
__be32 clock_seq_hi_and_reserved;
__be32 clock_seq_low;
__be32 node[6];
};
#endif /* AFS_H */
......@@ -37,9 +37,12 @@ enum AFS_FS_Operations {
FSLOOKUP = 161, /* AFS lookup file in directory */
FSFETCHDATA64 = 65537, /* AFS Fetch file data */
FSSTOREDATA64 = 65538, /* AFS Store file data */
FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */
FSGETCAPABILITIES = 65540, /* Probe and get the capabilities of a fileserver */
};
enum AFS_FS_Errors {
VRESTARTING = -100, /* Server is restarting */
VSALVAGE = 101, /* volume needs salvaging */
VNOVNODE = 102, /* no such file/dir (vnode) */
VNOVOL = 103, /* no such volume or volume unavailable */
......@@ -51,6 +54,9 @@ enum AFS_FS_Errors {
VOVERQUOTA = 109, /* volume's maximum quota exceeded */
VBUSY = 110, /* volume is temporarily unavailable */
VMOVED = 111, /* volume moved to new server - ask this FS where */
VIO = 112, /* I/O error in volume */
VSALVAGING = 113, /* Volume is being salvaged */
VRESTRICTED = 120, /* Volume is restricted from using */
};
#endif /* AFS_FS_H */
......@@ -16,11 +16,17 @@
#define AFS_VL_PORT 7003 /* volume location service port */
#define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */
#define YFS_VL_SERVICE 2503 /* Service ID for AuriStor upgraded VL service */
enum AFSVL_Operations {
VLGETENTRYBYID = 503, /* AFS Get Cache Entry By ID operation ID */
VLGETENTRYBYNAME = 504, /* AFS Get Cache Entry By Name operation ID */
VLPROBE = 514, /* AFS Probe Volume Location Service operation ID */
VLGETENTRYBYID = 503, /* AFS Get VLDB entry by ID */
VLGETENTRYBYNAME = 504, /* AFS Get VLDB entry by name */
VLPROBE = 514, /* AFS probe VL service */
VLGETENTRYBYIDU = 526, /* AFS Get VLDB entry by ID (UUID-variant) */
VLGETENTRYBYNAMEU = 527, /* AFS Get VLDB entry by name (UUID-variant) */
VLGETADDRSU = 533, /* AFS Get addrs for fileserver */
YVLGETENDPOINTS = 64002, /* YFS Get endpoints for file/volume server */
VLGETCAPABILITIES = 65537, /* AFS Get server capabilities */
};
enum AFSVL_Errors {
......@@ -54,6 +60,19 @@ enum AFSVL_Errors {
AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */
};
/*
 * YFS server selector values.
 * NOTE(review): presumably these choose how a server is identified in a
 * YVLGETENDPOINTS request (by index, UUID or endpoint) - confirm against the
 * code that uses them.
 */
enum {
YFS_SERVER_INDEX = 0,
YFS_SERVER_UUID = 1,
YFS_SERVER_ENDPOINT = 2,
};
/*
 * YFS endpoint address types.
 * NOTE(review): presumably these tag each endpoint in a YFS reply as an IPv4
 * or an IPv6 address - confirm against the code that uses them.
 */
enum {
YFS_ENDPOINT_IPV4 = 0,
YFS_ENDPOINT_IPV6 = 1,
};
#define YFS_MAXENDPOINTS 16
/*
* maps to "struct vldbentry" in vvl-spec.pdf
*/
......@@ -74,11 +93,57 @@ struct afs_vldbentry {
struct in_addr addr; /* server address */