Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[sfrench/cifs-2.6.git] / fs / ceph / caps.c
index be5ea6af8366479b675e81e1d13e96139abed2c4..dc10c9dd36c1a2ac6264ed21d3248e5f62f1e330 100644 (file)
@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
                used |= CEPH_CAP_PIN;
        if (ci->i_rd_ref)
                used |= CEPH_CAP_FILE_RD;
-       if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
+       if (ci->i_rdcache_ref ||
+           (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+            ci->vfs_inode.i_data.nrpages))
                used |= CEPH_CAP_FILE_CACHE;
        if (ci->i_wr_ref)
                used |= CEPH_CAP_FILE_WR;
@@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 
        /* remove from session list */
        spin_lock(&session->s_cap_lock);
-       /*
-        * s_cap_reconnect is protected by s_cap_lock. no one changes
-        * s_cap_gen while session is in the reconnect state.
-        */
-       if (queue_release &&
-           (!session->s_cap_reconnect ||
-            cap->cap_gen == session->s_cap_gen))
-               __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
-                                   cap->mseq, cap->issue_seq);
-
        if (session->s_cap_iterator == cap) {
                /* not yet, we are iterating over this very cap */
                dout("__ceph_remove_cap  delaying %p removal from session %p\n",
@@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
        }
        /* protect backpointer with s_cap_lock: see iterate_session_caps */
        cap->ci = NULL;
+
+       /*
+        * s_cap_reconnect is protected by s_cap_lock. no one changes
+        * s_cap_gen while session is in the reconnect state.
+        */
+       if (queue_release &&
+           (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
+               cap->queue_release = 1;
+               if (removed) {
+                       list_add_tail(&cap->session_caps,
+                                     &session->s_cap_releases);
+                       session->s_num_cap_releases++;
+                       removed = 0;
+               }
+       } else {
+               cap->queue_release = 0;
+       }
+       cap->cap_ino = ci->i_vino.ino;
+
        spin_unlock(&session->s_cap_lock);
 
        /* remove from inode list */
@@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 static int send_cap_msg(struct ceph_mds_session *session,
                        u64 ino, u64 cid, int op,
                        int caps, int wanted, int dirty,
-                       u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
-                       u64 size, u64 max_size,
+                       u32 seq, u64 flush_tid, u64 oldest_flush_tid,
+                       u32 issue_seq, u32 mseq, u64 size, u64 max_size,
                        struct timespec *mtime, struct timespec *atime,
                        u64 time_warp_seq,
                        kuid_t uid, kgid_t gid, umode_t mode,
@@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session,
        size_t extra_len;
 
        dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
-            " seq %u/%u mseq %u follows %lld size %llu/%llu"
+            " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
             " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
             cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
             ceph_cap_string(dirty),
-            seq, issue_seq, mseq, follows, size, max_size,
+            seq, issue_seq, flush_tid, oldest_flush_tid,
+            mseq, follows, size, max_size,
             xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-       /* flock buffer size + inline version + inline data size */
-       extra_len = 4 + 8 + 4;
+       /* flock buffer size + inline version + inline data size +
+        * osd_epoch_barrier + oldest_flush_tid */
+       extra_len = 4 + 8 + 4 + 4 + 8;
        msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
                           GFP_NOFS, false);
        if (!msg)
                return -ENOMEM;
 
+       msg->hdr.version = cpu_to_le16(6);
        msg->hdr.tid = cpu_to_le64(flush_tid);
 
        fc = msg->front.iov_base;
@@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session,
        ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
        /* inline data size */
        ceph_encode_32(&p, 0);
+       /* osd_epoch_barrier */
+       ceph_encode_32(&p, 0);
+       /* oldest_flush_tid */
+       ceph_encode_64(&p, oldest_flush_tid);
 
        fc->xattr_version = cpu_to_le64(xattr_version);
        if (xattrs_buf) {
@@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session,
        return 0;
 }
 
-void __queue_cap_release(struct ceph_mds_session *session,
-                        u64 ino, u64 cap_id, u32 migrate_seq,
-                        u32 issue_seq)
-{
-       struct ceph_msg *msg;
-       struct ceph_mds_cap_release *head;
-       struct ceph_mds_cap_item *item;
-
-       BUG_ON(!session->s_num_cap_releases);
-       msg = list_first_entry(&session->s_cap_releases,
-                              struct ceph_msg, list_head);
-
-       dout(" adding %llx release to mds%d msg %p (%d left)\n",
-            ino, session->s_mds, msg, session->s_num_cap_releases);
-
-       BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
-       head = msg->front.iov_base;
-       le32_add_cpu(&head->num, 1);
-       item = msg->front.iov_base + msg->front.iov_len;
-       item->ino = cpu_to_le64(ino);
-       item->cap_id = cpu_to_le64(cap_id);
-       item->migrate_seq = cpu_to_le32(migrate_seq);
-       item->seq = cpu_to_le32(issue_seq);
-
-       session->s_num_cap_releases--;
-
-       msg->front.iov_len += sizeof(*item);
-       if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
-               dout(" release msg %p full\n", msg);
-               list_move_tail(&msg->list_head, &session->s_cap_releases_done);
-       } else {
-               dout(" release msg %p at %d/%d (%d)\n", msg,
-                    (int)le32_to_cpu(head->num),
-                    (int)CEPH_CAPS_PER_RELEASE,
-                    (int)msg->front.iov_len);
-       }
-}
-
 /*
  * Queue cap releases when an inode is dropped from our cache.  Since
  * inode is about to be destroyed, there is no need for i_ceph_lock.
@@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode)
  */
 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                      int op, int used, int want, int retain, int flushing,
-                     unsigned *pflush_tid)
+                     u64 flush_tid, u64 oldest_flush_tid)
        __releases(cap->ci->i_ceph_lock)
 {
        struct ceph_inode_info *ci = cap->ci;
@@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        u64 xattr_version = 0;
        struct ceph_buffer *xattr_blob = NULL;
        int delayed = 0;
-       u64 flush_tid = 0;
-       int i;
        int ret;
        bool inline_data;
 
@@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        cap->implemented &= cap->issued | used;
        cap->mds_wanted = want;
 
-       if (flushing) {
-               /*
-                * assign a tid for flush operations so we can avoid
-                * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
-                * clean type races.  track latest tid for every bit
-                * so we can handle flush AxFw, flush Fw, and have the
-                * first ack clean Ax.
-                */
-               flush_tid = ++ci->i_cap_flush_last_tid;
-               if (pflush_tid)
-                       *pflush_tid = flush_tid;
-               dout(" cap_flush_tid %d\n", (int)flush_tid);
-               for (i = 0; i < CEPH_CAP_BITS; i++)
-                       if (flushing & (1 << i))
-                               ci->i_cap_flush_tid[i] = flush_tid;
-
-               follows = ci->i_head_snapc->seq;
-       } else {
-               follows = 0;
-       }
+       follows = flushing ? ci->i_head_snapc->seq : 0;
 
        keep = cap->implemented;
        seq = cap->seq;
@@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        spin_unlock(&ci->i_ceph_lock);
 
        ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
-               op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+               op, keep, want, flushing, seq,
+               flush_tid, oldest_flush_tid, issue_seq, mseq,
                size, max_size, &mtime, &atime, time_warp_seq,
                uid, gid, mode, xattr_version, xattr_blob,
                follows, inline_data);
@@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
- * Unless @again is true, skip cap_snaps that were already sent to
+ * Unless @kick is true, skip cap_snaps that were already sent to
  * the MDS (i.e., during this session).
  *
  * Called under i_ceph_lock.  Takes s_mutex as needed.
  */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
                        struct ceph_mds_session **psession,
-                       int again)
+                       int kick)
                __releases(ci->i_ceph_lock)
                __acquires(ci->i_ceph_lock)
 {
@@ -1297,11 +1257,8 @@ retry:
                if (capsnap->dirty_pages || capsnap->writing)
                        break;
 
-               /*
-                * if cap writeback already occurred, we should have dropped
-                * the capsnap in ceph_put_wrbuffer_cap_refs.
-                */
-               BUG_ON(capsnap->dirty == 0);
+               /* should be removed by ceph_try_drop_cap_snap() */
+               BUG_ON(!capsnap->need_flush);
 
                /* pick mds, take s_mutex */
                if (ci->i_auth_cap == NULL) {
@@ -1310,7 +1267,7 @@ retry:
                }
 
                /* only flush each capsnap once */
-               if (!again && !list_empty(&capsnap->flushing_item)) {
+               if (!kick && !list_empty(&capsnap->flushing_item)) {
                        dout("already flushed %p, skipping\n", capsnap);
                        continue;
                }
@@ -1320,6 +1277,9 @@ retry:
 
                if (session && session->s_mds != mds) {
                        dout("oops, wrong session %p mutex\n", session);
+                       if (kick)
+                               goto out;
+
                        mutex_unlock(&session->s_mutex);
                        ceph_put_mds_session(session);
                        session = NULL;
@@ -1343,20 +1303,22 @@ retry:
                        goto retry;
                }
 
-               capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+               spin_lock(&mdsc->cap_dirty_lock);
+               capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
+               spin_unlock(&mdsc->cap_dirty_lock);
+
                atomic_inc(&capsnap->nref);
-               if (!list_empty(&capsnap->flushing_item))
-                       list_del_init(&capsnap->flushing_item);
-               list_add_tail(&capsnap->flushing_item,
-                             &session->s_cap_snaps_flushing);
+               if (list_empty(&capsnap->flushing_item))
+                       list_add_tail(&capsnap->flushing_item,
+                                     &session->s_cap_snaps_flushing);
                spin_unlock(&ci->i_ceph_lock);
 
                dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
                     inode, capsnap, capsnap->follows, capsnap->flush_tid);
                send_cap_msg(session, ceph_vino(inode).ino, 0,
                             CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
-                            capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
-                            capsnap->size, 0,
+                            capsnap->dirty, 0, capsnap->flush_tid, 0,
+                            0, mseq, capsnap->size, 0,
                             &capsnap->mtime, &capsnap->atime,
                             capsnap->time_warp_seq,
                             capsnap->uid, capsnap->gid, capsnap->mode,
@@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  * Caller is then responsible for calling __mark_inode_dirty with the
  * returned flags value.
  */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+                          struct ceph_cap_flush **pcf)
 {
        struct ceph_mds_client *mdsc =
                ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
@@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
             ceph_cap_string(was | mask));
        ci->i_dirty_caps |= mask;
        if (was == 0) {
-               if (!ci->i_head_snapc)
+               WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+               swap(ci->i_prealloc_cap_flush, *pcf);
+
+               if (!ci->i_head_snapc) {
+                       WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
                        ci->i_head_snapc = ceph_get_snap_context(
                                ci->i_snap_realm->cached_context);
+               }
                dout(" inode %p now dirty snapc %p auth cap %p\n",
                     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
                BUG_ON(!list_empty(&ci->i_dirty_item));
@@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
                        ihold(inode);
                        dirty |= I_DIRTY_SYNC;
                }
+       } else {
+               WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
        }
        BUG_ON(list_empty(&ci->i_dirty_item));
        if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
@@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
        return dirty;
 }
 
+static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
+                                       struct ceph_cap_flush *cf)
+{
+       struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
+       struct rb_node *parent = NULL;
+       struct ceph_cap_flush *other = NULL;
+
+       while (*p) {
+               parent = *p;
+               other = rb_entry(parent, struct ceph_cap_flush, i_node);
+
+               if (cf->tid < other->tid)
+                       p = &(*p)->rb_left;
+               else if (cf->tid > other->tid)
+                       p = &(*p)->rb_right;
+               else
+                       BUG();
+       }
+
+       rb_link_node(&cf->i_node, parent, p);
+       rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
+}
+
+static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
+                                      struct ceph_cap_flush *cf)
+{
+       struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
+       struct rb_node *parent = NULL;
+       struct ceph_cap_flush *other = NULL;
+
+       while (*p) {
+               parent = *p;
+               other = rb_entry(parent, struct ceph_cap_flush, g_node);
+
+               if (cf->tid < other->tid)
+                       p = &(*p)->rb_left;
+               else if (cf->tid > other->tid)
+                       p = &(*p)->rb_right;
+               else
+                       BUG();
+       }
+
+       rb_link_node(&cf->g_node, parent, p);
+       rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
+}
+
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+       return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+       if (cf)
+               kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
+{
+       struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
+       if (n) {
+               struct ceph_cap_flush *cf =
+                       rb_entry(n, struct ceph_cap_flush, g_node);
+               return cf->tid;
+       }
+       return 0;
+}
+
 /*
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
@@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
  * Called under i_ceph_lock.
  */
 static int __mark_caps_flushing(struct inode *inode,
-                                struct ceph_mds_session *session)
+                               struct ceph_mds_session *session,
+                               u64 *flush_tid, u64 *oldest_flush_tid)
 {
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_cap_flush *cf = NULL;
        int flushing;
 
        BUG_ON(ci->i_dirty_caps == 0);
        BUG_ON(list_empty(&ci->i_dirty_item));
+       BUG_ON(!ci->i_prealloc_cap_flush);
 
        flushing = ci->i_dirty_caps;
        dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
@@ -1463,22 +1504,31 @@ static int __mark_caps_flushing(struct inode *inode,
        ci->i_dirty_caps = 0;
        dout(" inode %p now !dirty\n", inode);
 
+       swap(cf, ci->i_prealloc_cap_flush);
+       cf->caps = flushing;
+       cf->kick = false;
+
        spin_lock(&mdsc->cap_dirty_lock);
        list_del_init(&ci->i_dirty_item);
 
+       cf->tid = ++mdsc->last_cap_flush_tid;
+       __add_cap_flushing_to_mdsc(mdsc, cf);
+       *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+
        if (list_empty(&ci->i_flushing_item)) {
-               ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
                list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
                mdsc->num_cap_flushing++;
-               dout(" inode %p now flushing seq %lld\n", inode,
-                    ci->i_cap_flush_seq);
+               dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
        } else {
                list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
-               dout(" inode %p now flushing (more) seq %lld\n", inode,
-                    ci->i_cap_flush_seq);
+               dout(" inode %p now flushing (more) tid %llu\n",
+                    inode, cf->tid);
        }
        spin_unlock(&mdsc->cap_dirty_lock);
 
+       __add_cap_flushing_to_inode(ci, cf);
+
+       *flush_tid = cf->tid;
        return flushing;
 }
 
@@ -1524,6 +1574,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap *cap;
+       u64 flush_tid, oldest_flush_tid;
        int file_wanted, used, cap_used;
        int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
        int issued, implemented, want, retain, revoking, flushing = 0;
@@ -1553,13 +1604,13 @@ retry:
 retry_locked:
        file_wanted = __ceph_caps_file_wanted(ci);
        used = __ceph_caps_used(ci);
-       want = file_wanted | used;
        issued = __ceph_caps_issued(ci, &implemented);
        revoking = implemented & ~issued;
 
-       retain = want | CEPH_CAP_PIN;
+       want = file_wanted;
+       retain = file_wanted | used | CEPH_CAP_PIN;
        if (!mdsc->stopping && inode->i_nlink > 0) {
-               if (want) {
+               if (file_wanted) {
                        retain |= CEPH_CAP_ANY;       /* be greedy */
                } else if (S_ISDIR(inode->i_mode) &&
                           (issued & CEPH_CAP_FILE_SHARED) &&
@@ -1602,9 +1653,10 @@ retry_locked:
         * If we fail, it's because pages are locked.... try again later.
         */
        if ((!is_delayed || mdsc->stopping) &&
-           ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-           inode->i_data.nrpages &&                 /* have cached pages */
-           (file_wanted == 0 ||                     /* no open files */
+           !S_ISDIR(inode->i_mode) &&          /* ignore readdir cache */
+           ci->i_wrbuffer_ref == 0 &&          /* no dirty pages... */
+           inode->i_data.nrpages &&            /* have cached pages */
+           (file_wanted == 0 ||                /* no open files */
             (revoking & (CEPH_CAP_FILE_CACHE|
                          CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
            !tried_invalidate) {
@@ -1742,17 +1794,25 @@ ack:
                        took_snap_rwsem = 1;
                }
 
-               if (cap == ci->i_auth_cap && ci->i_dirty_caps)
-                       flushing = __mark_caps_flushing(inode, session);
-               else
+               if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
+                       flushing = __mark_caps_flushing(inode, session,
+                                                       &flush_tid,
+                                                       &oldest_flush_tid);
+               } else {
                        flushing = 0;
+                       flush_tid = 0;
+                       spin_lock(&mdsc->cap_dirty_lock);
+                       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+                       spin_unlock(&mdsc->cap_dirty_lock);
+               }
 
                mds = cap->mds;  /* remember mds, so we don't repeat */
                sent++;
 
                /* __send_cap drops i_ceph_lock */
                delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
-                                     want, retain, flushing, NULL);
+                                     want, retain, flushing,
+                                     flush_tid, oldest_flush_tid);
                goto retry; /* retake i_ceph_lock and restart our cap scan. */
        }
 
@@ -1781,12 +1841,13 @@ ack:
 /*
  * Try to flush dirty caps back to the auth mds.
  */
-static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, u64 *ptid)
 {
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       int flushing = 0;
        struct ceph_mds_session *session = NULL;
+       int flushing = 0;
+       u64 flush_tid = 0, oldest_flush_tid = 0;
 
 retry:
        spin_lock(&ci->i_ceph_lock);
@@ -1811,42 +1872,54 @@ retry:
                if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
                        goto out;
 
-               flushing = __mark_caps_flushing(inode, session);
+               flushing = __mark_caps_flushing(inode, session, &flush_tid,
+                                               &oldest_flush_tid);
 
                /* __send_cap drops i_ceph_lock */
                delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
-                                    cap->issued | cap->implemented, flushing,
-                                    flush_tid);
-               if (!delayed)
-                       goto out_unlocked;
+                                    (cap->issued | cap->implemented),
+                                    flushing, flush_tid, oldest_flush_tid);
 
-               spin_lock(&ci->i_ceph_lock);
-               __cap_delay_requeue(mdsc, ci);
+               if (delayed) {
+                       spin_lock(&ci->i_ceph_lock);
+                       __cap_delay_requeue(mdsc, ci);
+                       spin_unlock(&ci->i_ceph_lock);
+               }
+       } else {
+               struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
+               if (n) {
+                       struct ceph_cap_flush *cf =
+                               rb_entry(n, struct ceph_cap_flush, i_node);
+                       flush_tid = cf->tid;
+               }
+               flushing = ci->i_flushing_caps;
+               spin_unlock(&ci->i_ceph_lock);
        }
 out:
-       spin_unlock(&ci->i_ceph_lock);
-out_unlocked:
        if (session)
                mutex_unlock(&session->s_mutex);
+
+       *ptid = flush_tid;
        return flushing;
 }
 
 /*
  * Return true if we've flushed caps through the given flush_tid.
  */
-static int caps_are_flushed(struct inode *inode, unsigned tid)
+static int caps_are_flushed(struct inode *inode, u64 flush_tid)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       int i, ret = 1;
+       struct ceph_cap_flush *cf;
+       struct rb_node *n;
+       int ret = 1;
 
        spin_lock(&ci->i_ceph_lock);
-       for (i = 0; i < CEPH_CAP_BITS; i++)
-               if ((ci->i_flushing_caps & (1 << i)) &&
-                   ci->i_cap_flush_tid[i] <= tid) {
-                       /* still flushing this bit */
+       n = rb_first(&ci->i_cap_flush_tree);
+       if (n) {
+               cf = rb_entry(n, struct ceph_cap_flush, i_node);
+               if (cf->tid <= flush_tid)
                        ret = 0;
-                       break;
-               }
+       }
        spin_unlock(&ci->i_ceph_lock);
        return ret;
 }
@@ -1864,13 +1937,16 @@ static void sync_write_wait(struct inode *inode)
        struct ceph_osd_request *req;
        u64 last_tid;
 
+       if (!S_ISREG(inode->i_mode))
+               return;
+
        spin_lock(&ci->i_unsafe_lock);
        if (list_empty(head))
                goto out;
 
        /* set upper bound as _last_ entry in chain */
-       req = list_entry(head->prev, struct ceph_osd_request,
-                        r_unsafe_item);
+       req = list_last_entry(head, struct ceph_osd_request,
+                             r_unsafe_item);
        last_tid = req->r_tid;
 
        do {
@@ -1888,18 +1964,64 @@ static void sync_write_wait(struct inode *inode)
                 */
                if (list_empty(head))
                        break;
-               req = list_entry(head->next, struct ceph_osd_request,
-                                r_unsafe_item);
+               req = list_first_entry(head, struct ceph_osd_request,
+                                      r_unsafe_item);
        } while (req->r_tid < last_tid);
 out:
        spin_unlock(&ci->i_unsafe_lock);
 }
 
+/*
+ * wait for any uncommitted directory operations to commit.
+ */
+static int unsafe_dirop_wait(struct inode *inode)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct list_head *head = &ci->i_unsafe_dirops;
+       struct ceph_mds_request *req;
+       u64 last_tid;
+       int ret = 0;
+
+       if (!S_ISDIR(inode->i_mode))
+               return 0;
+
+       spin_lock(&ci->i_unsafe_lock);
+       if (list_empty(head))
+               goto out;
+
+       req = list_last_entry(head, struct ceph_mds_request,
+                             r_unsafe_dir_item);
+       last_tid = req->r_tid;
+
+       do {
+               ceph_mdsc_get_request(req);
+               spin_unlock(&ci->i_unsafe_lock);
+
+               dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
+                    inode, req->r_tid, last_tid);
+               ret = !wait_for_completion_timeout(&req->r_safe_completion,
+                                       ceph_timeout_jiffies(req->r_timeout));
+               if (ret)
+                       ret = -EIO;  /* timed out */
+
+               ceph_mdsc_put_request(req);
+
+               spin_lock(&ci->i_unsafe_lock);
+               if (ret || list_empty(head))
+                       break;
+               req = list_first_entry(head, struct ceph_mds_request,
+                                      r_unsafe_dir_item);
+       } while (req->r_tid < last_tid);
+out:
+       spin_unlock(&ci->i_unsafe_lock);
+       return ret;
+}
+
 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
        struct inode *inode = file->f_mapping->host;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       unsigned flush_tid;
+       u64 flush_tid;
        int ret;
        int dirty;
 
@@ -1908,25 +2030,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret < 0)
-               return ret;
+               goto out;
+
+       if (datasync)
+               goto out;
+
        mutex_lock(&inode->i_mutex);
 
        dirty = try_flush_caps(inode, &flush_tid);
        dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
+       ret = unsafe_dirop_wait(inode);
+
        /*
         * only wait on non-file metadata writeback (the mds
         * can recover size and mtime, so we don't need to
         * wait for that)
         */
-       if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
-               dout("fsync waiting for flush_tid %u\n", flush_tid);
+       if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
                ret = wait_event_interruptible(ci->i_cap_wq,
-                                      caps_are_flushed(inode, flush_tid));
+                                       caps_are_flushed(inode, flush_tid));
        }
-
-       dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
        mutex_unlock(&inode->i_mutex);
+out:
+       dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
        return ret;
 }
 
@@ -1939,7 +2066,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       unsigned flush_tid;
+       u64 flush_tid;
        int err = 0;
        int dirty;
        int wait = wbc->sync_mode == WB_SYNC_ALL;
@@ -1994,6 +2121,104 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
        }
 }
 
+static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
+                               struct ceph_mds_session *session,
+                               struct ceph_inode_info *ci,
+                               bool kick_all)
+{
+       struct inode *inode = &ci->vfs_inode;
+       struct ceph_cap *cap;
+       struct ceph_cap_flush *cf;
+       struct rb_node *n;
+       int delayed = 0;
+       u64 first_tid = 0;
+       u64 oldest_flush_tid;
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+       spin_unlock(&mdsc->cap_dirty_lock);
+
+       while (true) {
+               spin_lock(&ci->i_ceph_lock);
+               cap = ci->i_auth_cap;
+               if (!(cap && cap->session == session)) {
+                       pr_err("%p auth cap %p not mds%d ???\n", inode,
+                                       cap, session->s_mds);
+                       spin_unlock(&ci->i_ceph_lock);
+                       break;
+               }
+
+               for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
+                       cf = rb_entry(n, struct ceph_cap_flush, i_node);
+                       if (cf->tid < first_tid)
+                               continue;
+                       if (kick_all || cf->kick)
+                               break;
+               }
+               if (!n) {
+                       spin_unlock(&ci->i_ceph_lock);
+                       break;
+               }
+
+               cf = rb_entry(n, struct ceph_cap_flush, i_node);
+               cf->kick = false;
+
+               first_tid = cf->tid + 1;
+
+               dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
+                    cap, cf->tid, ceph_cap_string(cf->caps));
+               delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+                                     __ceph_caps_used(ci),
+                                     __ceph_caps_wanted(ci),
+                                     cap->issued | cap->implemented,
+                                     cf->caps, cf->tid, oldest_flush_tid);
+       }
+       return delayed;
+}
+
+void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+                                  struct ceph_mds_session *session)
+{
+       struct ceph_inode_info *ci;
+       struct ceph_cap *cap;
+       struct ceph_cap_flush *cf;
+       struct rb_node *n;
+
+       dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+       list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+               spin_lock(&ci->i_ceph_lock);
+               cap = ci->i_auth_cap;
+               if (!(cap && cap->session == session)) {
+                       pr_err("%p auth cap %p not mds%d ???\n",
+                               &ci->vfs_inode, cap, session->s_mds);
+                       spin_unlock(&ci->i_ceph_lock);
+                       continue;
+               }
+
+
+               /*
+                * if flushing caps were revoked, we re-send the cap flush
+                * in client reconnect stage. This guarantees MDS processes
+                * the cap flush message before issuing the flushing caps to
+                * other client.
+                */
+               if ((cap->issued & ci->i_flushing_caps) !=
+                   ci->i_flushing_caps) {
+                       spin_unlock(&ci->i_ceph_lock);
+                       if (!__kick_flushing_caps(mdsc, session, ci, true))
+                               continue;
+                       spin_lock(&ci->i_ceph_lock);
+               }
+
+               for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
+                       cf = rb_entry(n, struct ceph_cap_flush, i_node);
+                       cf->kick = true;
+               }
+
+               spin_unlock(&ci->i_ceph_lock);
+       }
+}
+
 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                             struct ceph_mds_session *session)
 {
@@ -2003,28 +2228,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 
        dout("kick_flushing_caps mds%d\n", session->s_mds);
        list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
-               struct inode *inode = &ci->vfs_inode;
-               struct ceph_cap *cap;
-               int delayed = 0;
-
-               spin_lock(&ci->i_ceph_lock);
-               cap = ci->i_auth_cap;
-               if (cap && cap->session == session) {
-                       dout("kick_flushing_caps %p cap %p %s\n", inode,
-                            cap, ceph_cap_string(ci->i_flushing_caps));
-                       delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-                                            __ceph_caps_used(ci),
-                                            __ceph_caps_wanted(ci),
-                                            cap->issued | cap->implemented,
-                                            ci->i_flushing_caps, NULL);
-                       if (delayed) {
-                               spin_lock(&ci->i_ceph_lock);
-                               __cap_delay_requeue(mdsc, ci);
-                               spin_unlock(&ci->i_ceph_lock);
-                       }
-               } else {
-                       pr_err("%p auth cap %p not mds%d ???\n", inode,
-                              cap, session->s_mds);
+               int delayed = __kick_flushing_caps(mdsc, session, ci, false);
+               if (delayed) {
+                       spin_lock(&ci->i_ceph_lock);
+                       __cap_delay_requeue(mdsc, ci);
                        spin_unlock(&ci->i_ceph_lock);
                }
        }
@@ -2036,26 +2243,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_cap *cap;
-       int delayed = 0;
 
        spin_lock(&ci->i_ceph_lock);
        cap = ci->i_auth_cap;
-       dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
-            ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+       dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+            ceph_cap_string(ci->i_flushing_caps));
 
        __ceph_flush_snaps(ci, &session, 1);
 
        if (ci->i_flushing_caps) {
+               int delayed;
+
                spin_lock(&mdsc->cap_dirty_lock);
                list_move_tail(&ci->i_flushing_item,
                               &cap->session->s_cap_flushing);
                spin_unlock(&mdsc->cap_dirty_lock);
 
-               delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-                                    __ceph_caps_used(ci),
-                                    __ceph_caps_wanted(ci),
-                                    cap->issued | cap->implemented,
-                                    ci->i_flushing_caps, NULL);
+               spin_unlock(&ci->i_ceph_lock);
+
+               delayed = __kick_flushing_caps(mdsc, session, ci, true);
                if (delayed) {
                        spin_lock(&ci->i_ceph_lock);
                        __cap_delay_requeue(mdsc, ci);
@@ -2073,7 +2279,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
  *
  * Protected by i_ceph_lock.
  */
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+                           bool snap_rwsem_locked)
 {
        if (got & CEPH_CAP_PIN)
                ci->i_pin_ref++;
@@ -2081,8 +2288,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
                ci->i_rd_ref++;
        if (got & CEPH_CAP_FILE_CACHE)
                ci->i_rdcache_ref++;
-       if (got & CEPH_CAP_FILE_WR)
+       if (got & CEPH_CAP_FILE_WR) {
+               if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+                       BUG_ON(!snap_rwsem_locked);
+                       ci->i_head_snapc = ceph_get_snap_context(
+                                       ci->i_snap_realm->cached_context);
+               }
                ci->i_wr_ref++;
+       }
        if (got & CEPH_CAP_FILE_BUFFER) {
                if (ci->i_wb_ref == 0)
                        ihold(&ci->vfs_inode);
@@ -2100,16 +2313,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
  * requested from the MDS.
  */
 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
-                           loff_t endoff, int *got, int *check_max, int *err)
+                           loff_t endoff, bool nonblock, int *got, int *err)
 {
        struct inode *inode = &ci->vfs_inode;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        int ret = 0;
        int have, implemented;
        int file_wanted;
+       bool snap_rwsem_locked = false;
 
        dout("get_cap_refs %p need %s want %s\n", inode,
             ceph_cap_string(need), ceph_cap_string(want));
 
+again:
        spin_lock(&ci->i_ceph_lock);
 
        /* make sure file is actually open */
@@ -2125,6 +2341,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
        /* finish pending truncate */
        while (ci->i_truncate_pending) {
                spin_unlock(&ci->i_ceph_lock);
+               if (snap_rwsem_locked) {
+                       up_read(&mdsc->snap_rwsem);
+                       snap_rwsem_locked = false;
+               }
                __ceph_do_pending_vmtruncate(inode);
                spin_lock(&ci->i_ceph_lock);
        }
@@ -2136,7 +2356,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
                        dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
                             inode, endoff, ci->i_max_size);
                        if (endoff > ci->i_requested_max_size) {
-                               *check_max = 1;
+                               *err = -EAGAIN;
                                ret = 1;
                        }
                        goto out_unlock;
@@ -2164,8 +2384,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
                     inode, ceph_cap_string(have), ceph_cap_string(not),
                     ceph_cap_string(revoking));
                if ((revoking & not) == 0) {
+                       if (!snap_rwsem_locked &&
+                           !ci->i_head_snapc &&
+                           (need & CEPH_CAP_FILE_WR)) {
+                               if (!down_read_trylock(&mdsc->snap_rwsem)) {
+                                       /*
+                                        * we can not call down_read() when
+                                        * task isn't in TASK_RUNNING state
+                                        */
+                                       if (nonblock) {
+                                               *err = -EAGAIN;
+                                               ret = 1;
+                                               goto out_unlock;
+                                       }
+
+                                       spin_unlock(&ci->i_ceph_lock);
+                                       down_read(&mdsc->snap_rwsem);
+                                       snap_rwsem_locked = true;
+                                       goto again;
+                               }
+                               snap_rwsem_locked = true;
+                       }
                        *got = need | (have & want);
-                       __take_cap_refs(ci, *got);
+                       __take_cap_refs(ci, *got, true);
                        ret = 1;
                }
        } else {
@@ -2189,6 +2430,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
        }
 out_unlock:
        spin_unlock(&ci->i_ceph_lock);
+       if (snap_rwsem_locked)
+               up_read(&mdsc->snap_rwsem);
 
        dout("get_cap_refs %p ret %d got %s\n", inode,
             ret, ceph_cap_string(*got));
@@ -2231,50 +2474,70 @@ static void check_max_size(struct inode *inode, loff_t endoff)
 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                  loff_t endoff, int *got, struct page **pinned_page)
 {
-       int _got, check_max, ret, err = 0;
+       int _got, ret, err = 0;
 
-retry:
-       if (endoff > 0)
-               check_max_size(&ci->vfs_inode, endoff);
-       _got = 0;
-       check_max = 0;
-       ret = wait_event_interruptible(ci->i_cap_wq,
-                               try_get_cap_refs(ci, need, want, endoff,
-                                                &_got, &check_max, &err));
-       if (err)
-               ret = err;
+       ret = ceph_pool_perm_check(ci, need);
        if (ret < 0)
                return ret;
 
-       if (check_max)
-               goto retry;
+       while (true) {
+               if (endoff > 0)
+                       check_max_size(&ci->vfs_inode, endoff);
 
-       if (ci->i_inline_version != CEPH_INLINE_NONE &&
-           (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
-           i_size_read(&ci->vfs_inode) > 0) {
-               struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
-               if (page) {
-                       if (PageUptodate(page)) {
-                               *pinned_page = page;
-                               goto out;
-                       }
-                       page_cache_release(page);
-               }
-               /*
-                * drop cap refs first because getattr while holding
-                * caps refs can cause deadlock.
-                */
-               ceph_put_cap_refs(ci, _got);
+               err = 0;
                _got = 0;
+               ret = try_get_cap_refs(ci, need, want, endoff,
+                                      false, &_got, &err);
+               if (ret) {
+                       if (err == -EAGAIN)
+                               continue;
+                       if (err < 0)
+                               return err;
+               } else {
+                       ret = wait_event_interruptible(ci->i_cap_wq,
+                                       try_get_cap_refs(ci, need, want, endoff,
+                                                        true, &_got, &err));
+                       if (err == -EAGAIN)
+                               continue;
+                       if (err < 0)
+                               ret = err;
+                       if (ret < 0)
+                               return ret;
+               }
 
-               /* getattr request will bring inline data into page cache */
-               ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
-                                       CEPH_STAT_CAP_INLINE_DATA, true);
-               if (ret < 0)
-                       return ret;
-               goto retry;
+               if (ci->i_inline_version != CEPH_INLINE_NONE &&
+                   (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+                   i_size_read(&ci->vfs_inode) > 0) {
+                       struct page *page =
+                               find_get_page(ci->vfs_inode.i_mapping, 0);
+                       if (page) {
+                               if (PageUptodate(page)) {
+                                       *pinned_page = page;
+                                       break;
+                               }
+                               page_cache_release(page);
+                       }
+                       /*
+                        * drop cap refs first because getattr while
+                        * holding caps refs can cause deadlock.
+                        */
+                       ceph_put_cap_refs(ci, _got);
+                       _got = 0;
+
+                       /*
+                        * getattr request will bring inline data into
+                        * page cache
+                        */
+                       ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+                                               CEPH_STAT_CAP_INLINE_DATA,
+                                               true);
+                       if (ret < 0)
+                               return ret;
+                       continue;
+               }
+               break;
        }
-out:
+
        *got = _got;
        return 0;
 }
@@ -2286,10 +2549,31 @@ out:
 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
 {
        spin_lock(&ci->i_ceph_lock);
-       __take_cap_refs(ci, caps);
+       __take_cap_refs(ci, caps, false);
        spin_unlock(&ci->i_ceph_lock);
 }
 
+
+/*
+ * drop cap_snap that is not associated with any snapshot.
+ * we don't need to send FLUSHSNAP message for it.
+ */
+static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+{
+       if (!capsnap->need_flush &&
+           !capsnap->writing && !capsnap->dirty_pages) {
+
+               dout("dropping cap_snap %p follows %llu\n",
+                    capsnap, capsnap->follows);
+               ceph_put_snap_context(capsnap->context);
+               list_del(&capsnap->ci_item);
+               list_del(&capsnap->flushing_item);
+               ceph_put_cap_snap(capsnap);
+               return 1;
+       }
+       return 0;
+}
+
 /*
  * Release cap refs.
  *
@@ -2303,7 +2587,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 {
        struct inode *inode = &ci->vfs_inode;
        int last = 0, put = 0, flushsnaps = 0, wake = 0;
-       struct ceph_cap_snap *capsnap;
 
        spin_lock(&ci->i_ceph_lock);
        if (had & CEPH_CAP_PIN)
@@ -2325,17 +2608,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
        if (had & CEPH_CAP_FILE_WR)
                if (--ci->i_wr_ref == 0) {
                        last++;
-                       if (!list_empty(&ci->i_cap_snaps)) {
-                               capsnap = list_first_entry(&ci->i_cap_snaps,
-                                                    struct ceph_cap_snap,
-                                                    ci_item);
-                               if (capsnap->writing) {
-                                       capsnap->writing = 0;
-                                       flushsnaps =
-                                               __ceph_finish_cap_snap(ci,
-                                                                      capsnap);
-                                       wake = 1;
-                               }
+                       if (__ceph_have_pending_cap_snap(ci)) {
+                               struct ceph_cap_snap *capsnap =
+                                       list_last_entry(&ci->i_cap_snaps,
+                                                       struct ceph_cap_snap,
+                                                       ci_item);
+                               capsnap->writing = 0;
+                               if (ceph_try_drop_cap_snap(capsnap))
+                                       put++;
+                               else if (__ceph_finish_cap_snap(ci, capsnap))
+                                       flushsnaps = 1;
+                               wake = 1;
+                       }
+                       if (ci->i_wrbuffer_ref_head == 0 &&
+                           ci->i_dirty_caps == 0 &&
+                           ci->i_flushing_caps == 0) {
+                               BUG_ON(!ci->i_head_snapc);
+                               ceph_put_snap_context(ci->i_head_snapc);
+                               ci->i_head_snapc = NULL;
                        }
                        /* see comment in __ceph_remove_cap() */
                        if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
@@ -2352,7 +2642,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
                ceph_flush_snaps(ci);
        if (wake)
                wake_up_all(&ci->i_cap_wq);
-       if (put)
+       while (put-- > 0)
                iput(inode);
 }
 
@@ -2380,7 +2670,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
        if (ci->i_head_snapc == snapc) {
                ci->i_wrbuffer_ref_head -= nr;
                if (ci->i_wrbuffer_ref_head == 0 &&
-                   ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+                   ci->i_wr_ref == 0 &&
+                   ci->i_dirty_caps == 0 &&
+                   ci->i_flushing_caps == 0) {
                        BUG_ON(!ci->i_head_snapc);
                        ceph_put_snap_context(ci->i_head_snapc);
                        ci->i_head_snapc = NULL;
@@ -2401,25 +2693,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                capsnap->dirty_pages -= nr;
                if (capsnap->dirty_pages == 0) {
                        complete_capsnap = 1;
-                       if (capsnap->dirty == 0)
-                               /* cap writeback completed before we created
-                                * the cap_snap; no FLUSHSNAP is needed */
-                               drop_capsnap = 1;
+                       drop_capsnap = ceph_try_drop_cap_snap(capsnap);
                }
                dout("put_wrbuffer_cap_refs on %p cap_snap %p "
-                    " snap %lld %d/%d -> %d/%d %s%s%s\n",
+                    " snap %lld %d/%d -> %d/%d %s%s\n",
                     inode, capsnap, capsnap->context->seq,
                     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
                     ci->i_wrbuffer_ref, capsnap->dirty_pages,
                     last ? " (wrbuffer last)" : "",
-                    complete_capsnap ? " (complete capsnap)" : "",
-                    drop_capsnap ? " (drop capsnap)" : "");
-               if (drop_capsnap) {
-                       ceph_put_snap_context(capsnap->context);
-                       list_del(&capsnap->ci_item);
-                       list_del(&capsnap->flushing_item);
-                       ceph_put_cap_snap(capsnap);
-               }
+                    complete_capsnap ? " (complete capsnap)" : "");
        }
 
        spin_unlock(&ci->i_ceph_lock);
@@ -2526,7 +2808,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
         * try to invalidate (once).  (If there are dirty buffers, we
         * will invalidate _after_ writeback.)
         */
-       if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+       if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+           ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
            (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
            !ci->i_wrbuffer_ref) {
                if (try_nonblocking_invalidate(inode)) {
@@ -2732,16 +3015,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_cap_flush *cf;
+       struct rb_node *n;
+       LIST_HEAD(to_remove);
        unsigned seq = le32_to_cpu(m->seq);
        int dirty = le32_to_cpu(m->dirty);
        int cleaned = 0;
        int drop = 0;
-       int i;
 
-       for (i = 0; i < CEPH_CAP_BITS; i++)
-               if ((dirty & (1 << i)) &&
-                   (u16)flush_tid == ci->i_cap_flush_tid[i])
-                       cleaned |= 1 << i;
+       n = rb_first(&ci->i_cap_flush_tree);
+       while (n) {
+               cf = rb_entry(n, struct ceph_cap_flush, i_node);
+               n = rb_next(&cf->i_node);
+               if (cf->tid == flush_tid)
+                       cleaned = cf->caps;
+               if (cf->tid <= flush_tid) {
+                       rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+                       list_add_tail(&cf->list, &to_remove);
+               } else {
+                       cleaned &= ~cf->caps;
+                       if (!cleaned)
+                               break;
+               }
+       }
 
        dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
             " flushing %s -> %s\n",
@@ -2749,12 +3045,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
             ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
             ceph_cap_string(ci->i_flushing_caps & ~cleaned));
 
-       if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+       if (list_empty(&to_remove) && !cleaned)
                goto out;
 
        ci->i_flushing_caps &= ~cleaned;
 
        spin_lock(&mdsc->cap_dirty_lock);
+
+       if (!list_empty(&to_remove)) {
+               list_for_each_entry(cf, &to_remove, list)
+                       rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
+               n = rb_first(&mdsc->cap_flush_tree);
+               cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+               if (!cf || cf->tid > flush_tid)
+                       wake_up_all(&mdsc->cap_flushing_wq);
+       }
+
        if (ci->i_flushing_caps == 0) {
                list_del_init(&ci->i_flushing_item);
                if (!list_empty(&session->s_cap_flushing))
@@ -2764,14 +3071,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                                         struct ceph_inode_info,
                                         i_flushing_item)->vfs_inode);
                mdsc->num_cap_flushing--;
-               wake_up_all(&mdsc->cap_flushing_wq);
                dout(" inode %p now !flushing\n", inode);
 
                if (ci->i_dirty_caps == 0) {
                        dout(" inode %p now clean\n", inode);
                        BUG_ON(!list_empty(&ci->i_dirty_item));
                        drop = 1;
-                       if (ci->i_wrbuffer_ref_head == 0) {
+                       if (ci->i_wr_ref == 0 &&
+                           ci->i_wrbuffer_ref_head == 0) {
                                BUG_ON(!ci->i_head_snapc);
                                ceph_put_snap_context(ci->i_head_snapc);
                                ci->i_head_snapc = NULL;
@@ -2785,6 +3092,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 
 out:
        spin_unlock(&ci->i_ceph_lock);
+
+       while (!list_empty(&to_remove)) {
+               cf = list_first_entry(&to_remove,
+                                     struct ceph_cap_flush, list);
+               list_del(&cf->list);
+               ceph_free_cap_flush(cf);
+       }
        if (drop)
                iput(inode);
 }
@@ -2800,6 +3114,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
                                     struct ceph_mds_session *session)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        u64 follows = le64_to_cpu(m->snap_follows);
        struct ceph_cap_snap *capsnap;
        int drop = 0;
@@ -2823,6 +3138,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
                        list_del(&capsnap->ci_item);
                        list_del(&capsnap->flushing_item);
                        ceph_put_cap_snap(capsnap);
+                       wake_up_all(&mdsc->cap_flushing_wq);
                        drop = 1;
                        break;
                } else {
@@ -2971,7 +3287,6 @@ retry:
                        mutex_lock_nested(&session->s_mutex,
                                          SINGLE_DEPTH_NESTING);
                }
-               ceph_add_cap_releases(mdsc, tsession);
                new_cap = ceph_get_cap(mdsc, NULL);
        } else {
                WARN_ON(1);
@@ -3167,16 +3482,20 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
             (unsigned)seq);
 
-       if (op == CEPH_CAP_OP_IMPORT)
-               ceph_add_cap_releases(mdsc, session);
-
        if (!inode) {
                dout(" i don't have ino %llx\n", vino.ino);
 
                if (op == CEPH_CAP_OP_IMPORT) {
+                       cap = ceph_get_cap(mdsc, NULL);
+                       cap->cap_ino = vino.ino;
+                       cap->queue_release = 1;
+                       cap->cap_id = cap_id;
+                       cap->mseq = mseq;
+                       cap->seq = seq;
                        spin_lock(&session->s_cap_lock);
-                       __queue_cap_release(session, vino.ino, cap_id,
-                                           mseq, seq);
+                       list_add_tail(&cap->session_caps,
+                                       &session->s_cap_releases);
+                       session->s_num_cap_releases++;
                        spin_unlock(&session->s_cap_lock);
                }
                goto flush_cap_releases;
@@ -3252,11 +3571,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 flush_cap_releases:
        /*
-        * send any full release message to try to move things
+        * send any cap release message to try to move things
         * along for the mds (who clearly thinks we still have this
         * cap).
         */
-       ceph_add_cap_releases(mdsc, session);
        ceph_send_cap_releases(mdsc, session);
 
 done: