return 1;
}
+enum {WALK_FOLLOW = 1, WALK_MORE = 2};
+
/*
* Do we need to follow links? We _really_ want to be able
* to do this check without having to look at inode->i_op,
* so we keep a cache of "no, this doesn't need follow_link"
* for the common case.
*/
-static inline int should_follow_link(struct nameidata *nd, struct path *link,
- int follow,
- struct inode *inode, unsigned seq)
+static inline int step_into(struct nameidata *nd, struct path *path,
+ int flags, struct inode *inode, unsigned seq)
{
- if (likely(!d_is_symlink(link->dentry)))
- return 0;
- if (!follow)
+ if (!(flags & WALK_MORE) && nd->depth)
+ put_link(nd);
+ if (likely(!d_is_symlink(path->dentry)) ||
+ !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
+ /* not a symlink or should not follow */
+ path_to_nameidata(path, nd);
+ nd->inode = inode;
+ nd->seq = seq;
return 0;
+ }
/* make sure that d_is_symlink above matches inode */
if (nd->flags & LOOKUP_RCU) {
- if (read_seqcount_retry(&link->dentry->d_seq, seq))
+ if (read_seqcount_retry(&path->dentry->d_seq, seq))
return -ECHILD;
}
- return pick_link(nd, link, inode, seq);
+ return pick_link(nd, path, inode, seq);
}
-enum {WALK_GET = 1, WALK_PUT = 2};
-
static int walk_component(struct nameidata *nd, int flags)
{
struct path path;
*/
if (unlikely(nd->last_type != LAST_NORM)) {
err = handle_dots(nd, nd->last_type);
- if (flags & WALK_PUT)
+ if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
return err;
}
inode = d_backing_inode(path.dentry);
}
- if (flags & WALK_PUT)
- put_link(nd);
- err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
- if (unlikely(err))
- return err;
- path_to_nameidata(&path, nd);
- nd->inode = inode;
- nd->seq = seq;
- return 0;
+ return step_into(nd, &path, flags, inode, seq);
}
/*
if (!name)
return 0;
/* last component of nested symlink */
- err = walk_component(nd, WALK_GET | WALK_PUT);
+ err = walk_component(nd, WALK_FOLLOW);
} else {
- err = walk_component(nd, WALK_GET);
+ /* not the last component */
+ err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
}
if (err < 0)
return err;
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
nd->flags &= ~LOOKUP_PARENT;
- return walk_component(nd,
- nd->flags & LOOKUP_FOLLOW
- ? nd->depth
- ? WALK_PUT | WALK_GET
- : WALK_GET
- : 0);
+ return walk_component(nd, 0);
}
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
}
EXPORT_SYMBOL(user_path_at_empty);
-/*
- * NB: most callers don't do anything directly with the reference to the
- * to struct filename, but the nd->last pointer points into the name string
- * allocated by getname. So we must hold the reference to it until all
- * path-walking is complete.
- */
-static inline struct filename *
-user_path_parent(int dfd, const char __user *path,
- struct path *parent,
- struct qstr *last,
- int *type,
- unsigned int flags)
-{
- /* only LOOKUP_REVAL is allowed in extra flags */
- return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
- parent, last, type);
-}
-
/**
* mountpoint_last - look up last component for umount
* @nd: pathwalk nameidata - currently pointing at parent directory of "last"
- * @path: pointer to container for result
*
* This is a special lookup_last function just for umount. In this case, we
* need to resolve the path without doing any revalidation.
*
* Returns:
* -error: if there was an error during lookup. This includes -ENOENT if the
- * lookup found a negative dentry. The nd->path reference will also be
- * put in this case.
+ * lookup found a negative dentry.
*
- * 0: if we successfully resolved nd->path and found it to not to be a
- * symlink that needs to be followed. "path" will also be populated.
- * The nd->path reference will also be put.
+ * 0: if we successfully resolved nd->last and found it not to be a
+ * symlink that needs to be followed.
*
* 1: if we successfully resolved nd->last and found it to be a symlink
- * that needs to be followed. "path" will be populated with the path
- * to the link, and nd->path will *not* be put.
+ * that needs to be followed.
*/
static int
-mountpoint_last(struct nameidata *nd, struct path *path)
+mountpoint_last(struct nameidata *nd)
{
int error = 0;
- struct dentry *dentry;
struct dentry *dir = nd->path.dentry;
+ struct path path;
/* If we're in rcuwalk, drop out of it to handle last component */
if (nd->flags & LOOKUP_RCU) {
error = handle_dots(nd, nd->last_type);
if (error)
return error;
- dentry = dget(nd->path.dentry);
+ path.dentry = dget(nd->path.dentry);
} else {
- dentry = d_lookup(dir, &nd->last);
- if (!dentry) {
+ path.dentry = d_lookup(dir, &nd->last);
+ if (!path.dentry) {
/*
* No cached dentry. Mounted dentries are pinned in the
* cache, so that means that this dentry is probably
* a symlink or the path doesn't actually point
* to a mounted dentry.
*/
- dentry = lookup_slow(&nd->last, dir,
+ path.dentry = lookup_slow(&nd->last, dir,
nd->flags | LOOKUP_NO_REVAL);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ if (IS_ERR(path.dentry))
+ return PTR_ERR(path.dentry);
}
}
- if (d_is_negative(dentry)) {
- dput(dentry);
+ if (d_is_negative(path.dentry)) {
+ dput(path.dentry);
return -ENOENT;
}
- if (nd->depth)
- put_link(nd);
- path->dentry = dentry;
- path->mnt = nd->path.mnt;
- error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
- d_backing_inode(dentry), 0);
- if (unlikely(error))
- return error;
- mntget(path->mnt);
- follow_mount(path);
- return 0;
+ path.mnt = nd->path.mnt;
+ return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
}
/**
if (IS_ERR(s))
return PTR_ERR(s);
while (!(err = link_path_walk(s, nd)) &&
- (err = mountpoint_last(nd, path)) > 0) {
+ (err = mountpoint_last(nd)) > 0) {
s = trailing_symlink(nd);
if (IS_ERR(s)) {
err = PTR_ERR(s);
break;
}
}
+ if (!err) {
+ *path = nd->path;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
+ follow_mount(path);
+ }
terminate_walk(nd);
return err;
}
seq = 0; /* out of RCU mode, so the value doesn't matter */
inode = d_backing_inode(path.dentry);
finish_lookup:
- if (nd->depth)
- put_link(nd);
- error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
- inode, seq);
+ error = step_into(nd, &path, 0, inode, seq);
if (unlikely(error))
return error;
-
- path_to_nameidata(&path, nd);
- nd->inode = inode;
- nd->seq = seq;
- /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
finish_open:
+ /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
error = complete_walk(nd);
if (error)
return error;
int type;
unsigned int lookup_flags = 0;
retry:
- name = user_path_parent(dfd, pathname,
- &path, &last, &type, lookup_flags);
+ name = filename_parentat(dfd, getname(pathname), lookup_flags,
+ &path, &last, &type);
if (IS_ERR(name))
return PTR_ERR(name);
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
- name = user_path_parent(dfd, pathname,
- &path, &last, &type, lookup_flags);
+ name = filename_parentat(dfd, getname(pathname), lookup_flags,
+ &path, &last, &type);
if (IS_ERR(name))
return PTR_ERR(name);
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
- /*
- * Check source == target.
- * On overlayfs need to look at underlying inodes.
- */
- if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
+ if (source == target)
return 0;
error = may_delete(old_dir, old_dentry, is_dir);
target_flags = 0;
retry:
- from = user_path_parent(olddfd, oldname,
- &old_path, &old_last, &old_type, lookup_flags);
+ from = filename_parentat(olddfd, getname(oldname), lookup_flags,
+ &old_path, &old_last, &old_type);
if (IS_ERR(from)) {
error = PTR_ERR(from);
goto exit;
}
- to = user_path_parent(newdfd, newname,
- &new_path, &new_last, &new_type, lookup_flags);
+ to = filename_parentat(newdfd, getname(newname), lookup_flags,
+ &new_path, &new_last, &new_type);
if (IS_ERR(to)) {
error = PTR_ERR(to);
goto exit1;
{
const struct dentry *dentry = data;
- if (f->f_inode == d_inode(dentry))
+ if (file_inode(f) == d_inode(dentry))
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
goto out_fput;
}
+ /* Try to use clone_file_range to clone up within the same fs */
+ error = vfs_clone_file_range(old_file, 0, new_file, 0, len);
+ if (!error)
+ goto out;
+ /* Couldn't clone, so now we try to copy the data */
+ error = 0;
+
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
len -= bytes;
}
-
+ out:
if (!error)
error = vfs_fsync(new_file, 0);
fput(new_file);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
- umode_t mode = stat->mode;
int err;
const struct cred *old_creds = NULL;
struct cred *new_creds = NULL;
+ struct cattr cattr = {
+ /* Can't properly set mode on creation because of the umask */
+ .mode = stat->mode & S_IFMT,
+ .rdev = stat->rdev,
+ .link = link
+ };
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (new_creds)
old_creds = override_creds(new_creds);
- /* Can't properly set mode on creation because of the umask */
- stat->mode &= S_IFMT;
- err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
- stat->mode = mode;
+ err = ovl_create_real(wdir, newdentry, &cattr, NULL, true);
if (new_creds) {
revert_creds(old_creds);
ovl_dentry_update(dentry, newdentry);
ovl_inode_update(d_inode(dentry), d_inode(newdentry));
newdentry = NULL;
-
- /*
- * Non-directores become opaque when copied up.
- */
- if (!S_ISDIR(stat->mode))
- ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
/*
* Copy up a single dentry
*
- * Directory renames only allowed on "pure upper" (already created on
- * upper filesystem, never copied up). Directories which are on lower or
- * are merged may not be renamed. For these -EXDEV is returned and
- * userspace has to deal with it. This means, when copying up a
- * directory we can rely on it and ancestors being stable.
- *
- * Non-directory renames start with copy up of source if necessary. The
- * actual rename will only proceed once the copy up was successful. Copy
- * up uses upper parent i_mutex for exclusion. Since rename can change
- * d_parent it is possible that the copy up will lock the old parent. At
- * that point the file will have already been copied up anyway.
+ * All renames start with copy up of source if necessary. The actual
+ * rename will only proceed once the copy up was successful. Copy up uses
+ * upper parent i_mutex for exclusion. Since rename can change d_parent it
+ * is possible that the copy up will lock the old parent. At that point
+ * the file will have already been copied up anyway.
*/
- int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat)
+ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ struct path *lowerpath, struct kstat *stat)
{
DEFINE_DELAYED_CALL(done);
struct dentry *workdir = ovl_workdir(dentry);
struct path parentpath;
struct dentry *lowerdentry = lowerpath->dentry;
struct dentry *upperdir;
- struct dentry *upperdentry;
const char *link = NULL;
if (WARN_ON(!workdir))
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
- upperdentry = ovl_dentry_upper(dentry);
- if (upperdentry) {
+ if (ovl_dentry_upper(dentry)) {
/* Raced with another copy-up? Nothing to do, then... */
err = 0;
goto out_unlock;
return err;
}
- int ovl_copy_up(struct dentry *dentry)
+ int ovl_copy_up_flags(struct dentry *dentry, int flags)
{
int err = 0;
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
+ /* Maybe truncate a regular file; this has no effect on dirs. */
+ if (flags & O_TRUNC)
+ stat.size = 0;
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
return err;
}
+
+ int ovl_copy_up(struct dentry *dentry)
+ {
+ return ovl_copy_up_flags(dentry, 0);
+ }
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
-#include <linux/blk_types.h>
#include <linux/workqueue.h>
#include <linux/percpu-rwsem.h>
#include <linux/delayed_call.h>
struct backing_dev_info;
struct bdi_writeback;
+struct bio;
struct export_operations;
struct hd_geometry;
struct iovec;
*/
#define CHECK_IOVEC_ONLY -1
-/*
- * The below are the various read and write flags that we support. Some of
- * them include behavioral modifiers that send information down to the
- * block layer and IO scheduler. They should be used along with a req_op.
- * Terminology:
- *
- * The block layer uses device plugging to defer IO a little bit, in
- * the hope that we will see more IO very shortly. This increases
- * coalescing of adjacent IO and thus reduces the number of IOs we
- * have to send to the device. It also allows for better queuing,
- * if the IO isn't mergeable. If the caller is going to be waiting
- * for the IO, then he must ensure that the device is unplugged so
- * that the IO is dispatched to the driver.
- *
- * All IO is handled async in Linux. This is fine for background
- * writes, but for reads or writes that someone waits for completion
- * on, we want to notify the block layer and IO scheduler so that they
- * know about it. That allows them to make better scheduling
- * decisions. So when the below references 'sync' and 'async', it
- * is referencing this priority hint.
- *
- * With that in mind, the available types are:
- *
- * READ A normal read operation. Device will be plugged.
- * READ_SYNC A synchronous read. Device is not plugged, caller can
- * immediately wait on this read without caring about
- * unplugging.
- * WRITE A normal async write. Device will be plugged.
- * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down
- * the hint that someone will be waiting on this IO
- * shortly. The write equivalent of READ_SYNC.
- * WRITE_ODIRECT Special case write for O_DIRECT only.
- * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush.
- * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on
- * non-volatile media on completion.
- * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
- * by a cache flush and data is guaranteed to be on
- * non-volatile media on completion.
- *
- */
-#define RW_MASK REQ_OP_WRITE
-
-#define READ REQ_OP_READ
-#define WRITE REQ_OP_WRITE
-
-#define READ_SYNC REQ_SYNC
-#define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE)
-#define WRITE_ODIRECT REQ_SYNC
-#define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
-#define WRITE_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FUA)
-#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
-
/*
* Attribute flags. These should be or-ed together to figure out what
* has been changed!
struct file *file_out, loff_t pos_out, u64 len);
extern int vfs_dedupe_file_range(struct file *file,
struct file_dedupe_range *same);
+
+ static inline int do_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ u64 len)
+ {
+ int ret;
+
+ sb_start_write(file_inode(file_out)->i_sb);
+ ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+ sb_end_write(file_inode(file_out)->i_sb);
+
+ return ret;
+ }
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
extern bool is_bad_inode(struct inode *);
#ifdef CONFIG_BLOCK
-static inline bool op_is_write(unsigned int op)
-{
- return op == REQ_OP_READ ? false : true;
-}
-
-/*
- * return data direction, READ or WRITE
- */
-static inline int bio_data_dir(struct bio *bio)
-{
- return op_is_write(bio_op(bio)) ? WRITE : READ;
-}
-
extern void check_disk_size_change(struct gendisk *disk,
struct block_device *bdev);
extern int revalidate_disk(struct gendisk *);
extern void inode_sb_list_add(struct inode *inode);
#ifdef CONFIG_BLOCK
-extern blk_qc_t submit_bio(struct bio *);
extern int bdev_read_only(struct block_device *);
#endif
extern int set_blocksize(struct block_device *, int);
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
- if (dentry && d_real_inode(dentry) == i) {
+ if (dentry && d_backing_inode(dentry) == i) {
sock_hold(s);
goto found;
}
err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
if (err)
goto fail;
- inode = d_real_inode(path.dentry);
+ inode = d_backing_inode(path.dentry);
err = inode_permission(inode, MAY_WRITE);
if (err)
goto put_fail;
goto out_up;
}
addr->hash = UNIX_HASH_SIZE;
- hash = d_real_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
+ hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
u->path = path;
list = &unix_socket_table[hash];
mutex_lock(&u->iolock);
skip = sk_peek_offset(sk, flags);
- skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
- &last);
+ skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
+ &err, &last);
if (skb)
break;