Merge tag 'for-4.21/block-20190102' of git://git.kernel.dk/linux-block
[sfrench/cifs-2.6.git] / drivers / block / drbd / drbd_receiver.c
index ccfcf00f2798d3c7cbad526517fcdd86c2f02314..c7ad88d91a09e7dd427be0792a48ef0f9989178b 100644 (file)
@@ -50,7 +50,7 @@
 #include "drbd_req.h"
 #include "drbd_vli.h"
 
-#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME)
+#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES)
 
 struct packet_info {
        enum drbd_packet cmd;
@@ -1490,14 +1490,129 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
                drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
-static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
+/*
+ * Mapping "discard" to ZEROOUT with UNMAP does not work for us:
+ * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it
+ * will directly go to fallback mode, submitting normal writes, and
+ * never even try to UNMAP.
+ *
+ * And dm-thin does not do this (yet), mostly because in general it has
+ * to assume that "skip_block_zeroing" is set.  See also:
+ * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html
+ * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html
+ *
+ * We *may* ignore the discard-zeroes-data setting, if so configured.
+ *
+ * Assumption is that this "discard_zeroes_data=0" is only because the backend
+ * may ignore partial unaligned discards.
+ *
+ * LVM/DM thin as of at least
+ *   LVM version:     2.02.115(2)-RHEL7 (2015-01-28)
+ *   Library version: 1.02.93-RHEL7 (2015-01-28)
+ *   Driver version:  4.29.0
+ * still behaves this way.
+ *
+ * For unaligned (wrt. alignment and granularity) or too small discards,
+ * we zero-out the initial (and/or) trailing unaligned partial chunks,
+ * but discard all the aligned full chunks.
+ *
+ * At least for LVM/DM thin, with skip_block_zeroing=false,
+ * the result is effectively "discard_zeroes_data=1".
+ */
+/* flags: EE_TRIM|EE_ZEROOUT */
+int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags)
 {
        struct block_device *bdev = device->ldev->backing_bdev;
+       struct request_queue *q = bdev_get_queue(bdev);
+       sector_t tmp, nr;
+       unsigned int max_discard_sectors, granularity;
+       int alignment;
+       int err = 0;
 
-       if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9,
-                       GFP_NOIO, 0))
-               peer_req->flags |= EE_WAS_ERROR;
+       if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM))
+               goto zero_out;
+
+       /* Zero-sector (unknown) and one-sector granularities are the same.  */
+       granularity = max(q->limits.discard_granularity >> 9, 1U);
+       alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+       max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
+       max_discard_sectors -= max_discard_sectors % granularity;
+       if (unlikely(!max_discard_sectors))
+               goto zero_out;
+
+       if (nr_sectors < granularity)
+               goto zero_out;
+
+       tmp = start;
+       if (sector_div(tmp, granularity) != alignment) {
+               if (nr_sectors < 2*granularity)
+                       goto zero_out;
+               /* start + gran - (start + gran - align) % gran */
+               tmp = start + granularity - alignment;
+               tmp = start + granularity - sector_div(tmp, granularity);
+
+               nr = tmp - start;
+               /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many
+                * layers are below us, some may have smaller granularity */
+               err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
+               nr_sectors -= nr;
+               start = tmp;
+       }
+       while (nr_sectors >= max_discard_sectors) {
+               err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0);
+               nr_sectors -= max_discard_sectors;
+               start += max_discard_sectors;
+       }
+       if (nr_sectors) {
+               /* max_discard_sectors is unsigned int (and a multiple of
+                * granularity, we made sure of that above already);
+                * nr_sectors is < max_discard_sectors here, and so is nr;
+                * I don't need sector_div here, even though nr is sector_t */
+               nr = nr_sectors;
+               nr -= (unsigned int)nr % granularity;
+               if (nr) {
+                       err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
+                       nr_sectors -= nr;
+                       start += nr;
+               }
+       }
+ zero_out:
+       if (nr_sectors) {
+               err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
+                               (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP);
+       }
+       return err != 0;
+}
 
+static bool can_do_reliable_discards(struct drbd_device *device)
+{
+       struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+       struct disk_conf *dc;
+       bool can_do;
+
+       if (!blk_queue_discard(q))
+               return false;
+
+       rcu_read_lock();
+       dc = rcu_dereference(device->ldev->disk_conf);
+       can_do = dc->discard_zeroes_if_aligned;
+       rcu_read_unlock();
+       return can_do;
+}
+
+static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+       /* If the backend cannot discard, or does not guarantee
+        * read-back zeroes in discarded ranges, we fall back to
+        * zero-out.  Unless configuration specifically requested
+        * otherwise. */
+       if (!can_do_reliable_discards(device))
+               peer_req->flags |= EE_ZEROOUT;
+
+       if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
+           peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM)))
+               peer_req->flags |= EE_WAS_ERROR;
        drbd_endio_write_sec_final(peer_req);
 }
 
@@ -1550,7 +1665,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
         * Correctness first, performance later.  Next step is to code an
         * asynchronous variant of the same.
         */
-       if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
+       if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) {
                /* wait for all pending IO completions, before we start
                 * zeroing things out. */
                conn_wait_active_ee_empty(peer_req->peer_device->connection);
@@ -1567,8 +1682,8 @@ int drbd_submit_peer_request(struct drbd_device *device,
                        spin_unlock_irq(&device->resource->req_lock);
                }
 
-               if (peer_req->flags & EE_IS_TRIM)
-                       drbd_issue_peer_discard(device, peer_req);
+               if (peer_req->flags & (EE_TRIM|EE_ZEROOUT))
+                       drbd_issue_peer_discard_or_zero_out(device, peer_req);
                else /* EE_WRITE_SAME */
                        drbd_issue_peer_wsame(device, peer_req);
                return 0;
@@ -1765,6 +1880,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
        void *dig_vv = peer_device->connection->int_dig_vv;
        unsigned long *data;
        struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
+       struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL;
        struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
 
        digest_size = 0;
@@ -1786,6 +1902,10 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
                if (!expect(data_size == 0))
                        return NULL;
                ds = be32_to_cpu(trim->size);
+       } else if (zeroes) {
+               if (!expect(data_size == 0))
+                       return NULL;
+               ds = be32_to_cpu(zeroes->size);
        } else if (wsame) {
                if (data_size != queue_logical_block_size(device->rq_queue)) {
                        drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
@@ -1802,7 +1922,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 
        if (!expect(IS_ALIGNED(ds, 512)))
                return NULL;
-       if (trim || wsame) {
+       if (trim || wsame || zeroes) {
                if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
                        return NULL;
        } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
@@ -1827,7 +1947,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 
        peer_req->flags |= EE_WRITE;
        if (trim) {
-               peer_req->flags |= EE_IS_TRIM;
+               peer_req->flags |= EE_TRIM;
+               return peer_req;
+       }
+       if (zeroes) {
+               peer_req->flags |= EE_ZEROOUT;
                return peer_req;
        }
        if (wsame)
@@ -2326,8 +2450,12 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf)
 
 static unsigned long wire_flags_to_bio_op(u32 dpf)
 {
-       if (dpf & DP_DISCARD)
+       if (dpf & DP_ZEROES)
                return REQ_OP_WRITE_ZEROES;
+       if (dpf & DP_DISCARD)
+               return REQ_OP_DISCARD;
+       if (dpf & DP_WSAME)
+               return REQ_OP_WRITE_SAME;
        else
                return REQ_OP_WRITE;
 }
@@ -2517,9 +2645,20 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
        op = wire_flags_to_bio_op(dp_flags);
        op_flags = wire_flags_to_bio_flags(dp_flags);
        if (pi->cmd == P_TRIM) {
+               D_ASSERT(peer_device, peer_req->i.size > 0);
+               D_ASSERT(peer_device, op == REQ_OP_DISCARD);
+               D_ASSERT(peer_device, peer_req->pages == NULL);
+               /* need to play safe: an older DRBD sender
+                * may mean zero-out while sending P_TRIM. */
+               if (0 == (connection->agreed_features & DRBD_FF_WZEROES))
+                       peer_req->flags |= EE_ZEROOUT;
+       } else if (pi->cmd == P_ZEROES) {
                D_ASSERT(peer_device, peer_req->i.size > 0);
                D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
                D_ASSERT(peer_device, peer_req->pages == NULL);
+               /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
+               if (dp_flags & DP_DISCARD)
+                       peer_req->flags |= EE_TRIM;
        } else if (peer_req->pages == NULL) {
                D_ASSERT(device, peer_req->i.size == 0);
                D_ASSERT(device, dp_flags & DP_FLUSH);
@@ -2587,7 +2726,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
         * we wait for all pending requests, respectively wait for
         * active_ee to become empty in drbd_submit_peer_request();
         * better not add ourselves here. */
-       if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
+       if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0)
                list_add_tail(&peer_req->w.list, &device->active_ee);
        spin_unlock_irq(&device->resource->req_lock);
 
@@ -3364,7 +3503,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
        enum drbd_conns rv = C_MASK;
        enum drbd_disk_state mydisk;
        struct net_conf *nc;
-       int hg, rule_nr, rr_conflict, tentative;
+       int hg, rule_nr, rr_conflict, tentative, always_asbp;
 
        mydisk = device->state.disk;
        if (mydisk == D_NEGOTIATING)
@@ -3415,8 +3554,12 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
 
        rcu_read_lock();
        nc = rcu_dereference(peer_device->connection->net_conf);
+       always_asbp = nc->always_asbp;
+       rr_conflict = nc->rr_conflict;
+       tentative = nc->tentative;
+       rcu_read_unlock();
 
-       if (hg == 100 || (hg == -100 && nc->always_asbp)) {
+       if (hg == 100 || (hg == -100 && always_asbp)) {
                int pcount = (device->state.role == R_PRIMARY)
                           + (peer_role == R_PRIMARY);
                int forced = (hg == -100);
@@ -3455,9 +3598,6 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
                             "Sync from %s node\n",
                             (hg < 0) ? "peer" : "this");
        }
-       rr_conflict = nc->rr_conflict;
-       tentative = nc->tentative;
-       rcu_read_unlock();
 
        if (hg == -100) {
                /* FIXME this log message is not correct if we end up here
@@ -3980,6 +4120,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
        struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
        enum determine_dev_size dd = DS_UNCHANGED;
        sector_t p_size, p_usize, p_csize, my_usize;
+       sector_t new_size, cur_size;
        int ldsc = 0; /* local disk size changed */
        enum dds_flags ddsf;
 
@@ -3987,6 +4128,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
        if (!peer_device)
                return config_unknown_volume(connection, pi);
        device = peer_device->device;
+       cur_size = drbd_get_capacity(device->this_bdev);
 
        p_size = be64_to_cpu(p->d_size);
        p_usize = be64_to_cpu(p->u_size);
@@ -3997,7 +4139,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
        device->p_size = p_size;
 
        if (get_ldev(device)) {
-               sector_t new_size, cur_size;
                rcu_read_lock();
                my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
                rcu_read_unlock();
@@ -4012,13 +4153,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
                if (device->state.conn == C_WF_REPORT_PARAMS)
                        p_usize = min_not_zero(my_usize, p_usize);
 
-               /* Never shrink a device with usable data during connect.
-                  But allow online shrinking if we are connected. */
+               /* Never shrink a device with usable data during connect,
+                * or "attach" on the peer.
+                * But allow online shrinking if we are connected. */
                new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
-               cur_size = drbd_get_capacity(device->this_bdev);
                if (new_size < cur_size &&
                    device->state.disk >= D_OUTDATED &&
-                   device->state.conn < C_CONNECTED) {
+                   (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) {
                        drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
                                        (unsigned long long)new_size, (unsigned long long)cur_size);
                        conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
@@ -4046,8 +4187,8 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
                        synchronize_rcu();
                        kfree(old_disk_conf);
 
-                       drbd_info(device, "Peer sets u_size to %lu sectors\n",
-                                (unsigned long)my_usize);
+                       drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n",
+                                (unsigned long)p_usize, (unsigned long)my_usize);
                }
 
                put_ldev(device);
@@ -4080,9 +4221,36 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
                 *
                 * However, if he sends a zero current size,
                 * take his (user-capped or) backing disk size anyways.
+                *
+                * Unless of course he does not have a disk himself.
+                * In which case we ignore this completely.
                 */
+               sector_t new_size = p_csize ?: p_usize ?: p_size;
                drbd_reconsider_queue_parameters(device, NULL, o);
-               drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
+               if (new_size == 0) {
+                       /* Ignore, peer does not know anything. */
+               } else if (new_size == cur_size) {
+                       /* nothing to do */
+               } else if (cur_size != 0 && p_size == 0) {
+                       drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
+                                       (unsigned long long)new_size, (unsigned long long)cur_size);
+               } else if (new_size < cur_size && device->state.role == R_PRIMARY) {
+                       drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
+                                       (unsigned long long)new_size, (unsigned long long)cur_size);
+                       conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+                       return -EIO;
+               } else {
+                       /* I believe the peer, if
+                        *  - I don't have a current size myself
+                        *  - we agree on the size anyways
+                        *  - I do have a current size, am Secondary,
+                        *    and he has the only disk
+                        *  - I do have a current size, am Primary,
+                        *    and he has the only disk,
+                        *    which is larger than my current size
+                        */
+                       drbd_set_my_capacity(device, new_size);
+               }
        }
 
        if (get_ldev(device)) {
@@ -4142,7 +4310,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info
        kfree(device->p_uuid);
        device->p_uuid = p_uuid;
 
-       if (device->state.conn < C_CONNECTED &&
+       if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) &&
            device->state.disk < D_INCONSISTENT &&
            device->state.role == R_PRIMARY &&
            (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
@@ -4368,6 +4536,25 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
        if (peer_state.conn == C_AHEAD)
                ns.conn = C_BEHIND;
 
+       /* TODO:
+        * if (primary and diskless and peer uuid != effective uuid)
+        *     abort attach on peer;
+        *
+        * If this node does not have good data, was already connected, but
+        * the peer did a late attach only now, trying to "negotiate" with me,
+        * AND I am currently Primary, possibly frozen, with some specific
+        * "effective" uuid, this should never be reached, really, because
+        * we first send the uuids, then the current state.
+        *
+        * In this scenario, we already dropped the connection hard
+        * when we received the unsuitable uuids (receive_uuids()).
+        *
+        * Should we want to change this, that is: not drop the connection in
+        * receive_uuids() already, then we would need to add a branch here
+        * that aborts the attach of "unsuitable uuids" on the peer in case
+        * this node is currently Diskless Primary.
+        */
+
        if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
            get_ldev_if_state(device, D_NEGOTIATING)) {
                int cr; /* consider resync */
@@ -4380,7 +4567,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
                       (peer_state.disk == D_NEGOTIATING ||
                        os.disk == D_NEGOTIATING));
                /* if we have both been inconsistent, and the peer has been
-                * forced to be UpToDate with --overwrite-data */
+                * forced to be UpToDate with --force */
                cr |= test_bit(CONSIDER_RESYNC, &device->flags);
                /* if we had been plain connected, and the admin requested to
                 * start a sync by "invalidate" or "invalidate-remote" */
@@ -4845,7 +5032,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
 
                peer_req->w.cb = e_end_resync_block;
                peer_req->submit_jif = jiffies;
-               peer_req->flags |= EE_IS_TRIM;
+               peer_req->flags |= EE_TRIM;
 
                spin_lock_irq(&device->resource->req_lock);
                list_add_tail(&peer_req->w.list, &device->sync_ee);
@@ -4913,6 +5100,7 @@ static struct data_cmd drbd_cmd_handler[] = {
        [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
        [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
        [P_TRIM]            = { 0, sizeof(struct p_trim), receive_Data },
+       [P_ZEROES]          = { 0, sizeof(struct p_trim), receive_Data },
        [P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
        [P_WSAME]           = { 1, sizeof(struct p_wsame), receive_Data },
 };
@@ -5197,11 +5385,12 @@ static int drbd_do_features(struct drbd_connection *connection)
        drbd_info(connection, "Handshake successful: "
             "Agreed network protocol version %d\n", connection->agreed_pro_version);
 
-       drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n",
+       drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n",
                  connection->agreed_features,
                  connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
                  connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
-                 connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" :
+                 connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "",
+                 connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" :
                  connection->agreed_features ? "" : " none");
 
        return 1;
@@ -5284,7 +5473,7 @@ static int drbd_do_auth(struct drbd_connection *connection)
        if (pi.cmd != P_AUTH_CHALLENGE) {
                drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
                         cmdname(pi.cmd), pi.cmd);
-               rv = 0;
+               rv = -1;
                goto fail;
        }