fs/zonefs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Simple file system for zoned block devices exposing zones as files.
   4  *
   5  * Copyright (C) 2022 Western Digital Corporation or its affiliates.
   6  */
   7 #include <linux/module.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/iomap.h>
  10 #include <linux/init.h>
  11 #include <linux/slab.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/statfs.h>
  14 #include <linux/writeback.h>
  15 #include <linux/quotaops.h>
  16 #include <linux/seq_file.h>
  17 #include <linux/parser.h>
  18 #include <linux/uio.h>
  19 #include <linux/mman.h>
  20 #include <linux/sched/mm.h>
  21 #include <linux/task_io_accounting_ops.h>
  22
  23 #include "zonefs.h"
  24
  25 #include "trace.h"
  26
  27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
  28                                    loff_t length, unsigned int flags,
  29                                    struct iomap *iomap, struct iomap *srcmap)
  30 {
  31         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  32         struct zonefs_zone *z = zonefs_inode_zone(inode);
  33         struct super_block *sb = inode->i_sb;
  34         loff_t isize;
  35
  36         /*
  37          * All blocks are always mapped below EOF. If reading past EOF,
  38          * act as if there is a hole up to the file maximum size.
  39          */
  40         mutex_lock(&zi->i_truncate_mutex);
  41         iomap->bdev = inode->i_sb->s_bdev;
  42         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  43         isize = i_size_read(inode);
  44         if (iomap->offset >= isize) {
  45                 iomap->type = IOMAP_HOLE;
  46                 iomap->addr = IOMAP_NULL_ADDR;
  47                 iomap->length = length;
  48         } else {
  49                 iomap->type = IOMAP_MAPPED;
  50                 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  51                 iomap->length = isize - iomap->offset;
  52         }
  53         mutex_unlock(&zi->i_truncate_mutex);
  54
  55         trace_zonefs_iomap_begin(inode, iomap);
  56
  57         return 0;
  58 }
  59
  60 static const struct iomap_ops zonefs_read_iomap_ops = {
  61         .iomap_begin    = zonefs_read_iomap_begin,
  62 };
  63
  64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
  65                                     loff_t length, unsigned int flags,
  66                                     struct iomap *iomap, struct iomap *srcmap)
  67 {
  68         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  69         struct zonefs_zone *z = zonefs_inode_zone(inode);
  70         struct super_block *sb = inode->i_sb;
  71         loff_t isize;
  72
  73         /* All write I/Os should always be within the file maximum size */
  74         if (WARN_ON_ONCE(offset + length > z->z_capacity))
  75                 return -EIO;
  76
  77         /*
  78          * Sequential zones can only accept direct writes. This is already
  79          * checked when writes are issued, so warn if we see a page writeback
  80          * operation.
  81          */
  82         if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
  83                 return -EIO;
  84
  85         /*
  86          * For conventional zones, all blocks are always mapped. For sequential
  87          * zones, all blocks after always mapped below the inode size (zone
  88          * write pointer) and unwriten beyond.
  89          */
  90         mutex_lock(&zi->i_truncate_mutex);
  91         iomap->bdev = inode->i_sb->s_bdev;
  92         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  93         iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  94         isize = i_size_read(inode);
  95         if (iomap->offset >= isize) {
  96                 iomap->type = IOMAP_UNWRITTEN;
  97                 iomap->length = z->z_capacity - iomap->offset;
  98         } else {
  99                 iomap->type = IOMAP_MAPPED;
 100                 iomap->length = isize - iomap->offset;
 101         }
 102         mutex_unlock(&zi->i_truncate_mutex);
 103
 104         trace_zonefs_iomap_begin(inode, iomap);
 105
 106         return 0;
 107 }
 108
 109 static const struct iomap_ops zonefs_write_iomap_ops = {
 110         .iomap_begin    = zonefs_write_iomap_begin,
 111 };
 112
 113 static int zonefs_read_folio(struct file *unused, struct folio *folio)
 114 {
 115         return iomap_read_folio(folio, &zonefs_read_iomap_ops);
 116 }
 117
 118 static void zonefs_readahead(struct readahead_control *rac)
 119 {
 120         iomap_readahead(rac, &zonefs_read_iomap_ops);
 121 }
 122
 123 /*
 124  * Map blocks for page writeback. This is used only on conventional zone files,
 125  * which implies that the page range can only be within the fixed inode size.
 126  */
 127 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
 128                                    struct inode *inode, loff_t offset,
 129                                    unsigned int len)
 130 {
 131         struct zonefs_zone *z = zonefs_inode_zone(inode);
 132
 133         if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
 134                 return -EIO;
 135         if (WARN_ON_ONCE(offset >= i_size_read(inode)))
 136                 return -EIO;
 137
 138         /* If the mapping is already OK, nothing needs to be done */
 139         if (offset >= wpc->iomap.offset &&
 140             offset < wpc->iomap.offset + wpc->iomap.length)
 141                 return 0;
 142
 143         return zonefs_write_iomap_begin(inode, offset,
 144                                         z->z_capacity - offset,
 145                                         IOMAP_WRITE, &wpc->iomap, NULL);
 146 }
 147
 148 static const struct iomap_writeback_ops zonefs_writeback_ops = {
 149         .map_blocks             = zonefs_write_map_blocks,
 150 };
 151
 152 static int zonefs_writepages(struct address_space *mapping,
 153                              struct writeback_control *wbc)
 154 {
 155         struct iomap_writepage_ctx wpc = { };
 156
 157         return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
 158 }
 159
 160 static int zonefs_swap_activate(struct swap_info_struct *sis,
 161                                 struct file *swap_file, sector_t *span)
 162 {
 163         struct inode *inode = file_inode(swap_file);
 164
 165         if (zonefs_inode_is_seq(inode)) {
 166                 zonefs_err(inode->i_sb,
 167                            "swap file: not a conventional zone file\n");
 168                 return -EINVAL;
 169         }
 170
 171         return iomap_swapfile_activate(sis, swap_file, span,
 172                                        &zonefs_read_iomap_ops);
 173 }
 174
 175 const struct address_space_operations zonefs_file_aops = {
 176         .read_folio             = zonefs_read_folio,
 177         .readahead              = zonefs_readahead,
 178         .writepages             = zonefs_writepages,
 179         .dirty_folio            = iomap_dirty_folio,
 180         .release_folio          = iomap_release_folio,
 181         .invalidate_folio       = iomap_invalidate_folio,
 182         .migrate_folio          = filemap_migrate_folio,
 183         .is_partially_uptodate  = iomap_is_partially_uptodate,
 184         .error_remove_folio     = generic_error_remove_folio,
 185         .swap_activate          = zonefs_swap_activate,
 186 };
 187
 188 int zonefs_file_truncate(struct inode *inode, loff_t isize)
 189 {
 190         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 191         struct zonefs_zone *z = zonefs_inode_zone(inode);
 192         loff_t old_isize;
 193         enum req_op op;
 194         int ret = 0;
 195
 196         /*
 197          * Only sequential zone files can be truncated and truncation is allowed
 198          * only down to a 0 size, which is equivalent to a zone reset, and to
 199          * the maximum file size, which is equivalent to a zone finish.
 200          */
 201         if (!zonefs_zone_is_seq(z))
 202                 return -EPERM;
 203
 204         if (!isize)
 205                 op = REQ_OP_ZONE_RESET;
 206         else if (isize == z->z_capacity)
 207                 op = REQ_OP_ZONE_FINISH;
 208         else
 209                 return -EPERM;
 210
 211         inode_dio_wait(inode);
 212
 213         /* Serialize against page faults */
 214         filemap_invalidate_lock(inode->i_mapping);
 215
 216         /* Serialize against zonefs_iomap_begin() */
 217         mutex_lock(&zi->i_truncate_mutex);
 218
 219         old_isize = i_size_read(inode);
 220         if (isize == old_isize)
 221                 goto unlock;
 222
 223         ret = zonefs_inode_zone_mgmt(inode, op);
 224         if (ret)
 225                 goto unlock;
 226
 227         /*
 228          * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
 229          * take care of open zones.
 230          */
 231         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 232                 /*
 233                  * Truncating a zone to EMPTY or FULL is the equivalent of
 234                  * closing the zone. For a truncation to 0, we need to
 235                  * re-open the zone to ensure new writes can be processed.
 236                  * For a truncation to the maximum file size, the zone is
 237                  * closed and writes cannot be accepted anymore, so clear
 238                  * the open flag.
 239                  */
 240                 if (!isize)
 241                         ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 242                 else
 243                         z->z_flags &= ~ZONEFS_ZONE_OPEN;
 244         }
 245
 246         zonefs_update_stats(inode, isize);
 247         truncate_setsize(inode, isize);
 248         z->z_wpoffset = isize;
 249         zonefs_inode_account_active(inode);
 250
 251 unlock:
 252         mutex_unlock(&zi->i_truncate_mutex);
 253         filemap_invalidate_unlock(inode->i_mapping);
 254
 255         return ret;
 256 }
 257
 258 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
 259                              int datasync)
 260 {
 261         struct inode *inode = file_inode(file);
 262         int ret = 0;
 263
 264         if (unlikely(IS_IMMUTABLE(inode)))
 265                 return -EPERM;
 266
 267         /*
 268          * Since only direct writes are allowed in sequential files, page cache
 269          * flush is needed only for conventional zone files.
 270          */
 271         if (zonefs_inode_is_cnv(inode))
 272                 ret = file_write_and_wait_range(file, start, end);
 273         if (!ret)
 274                 ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 275
 276         if (ret)
 277                 zonefs_io_error(inode, true);
 278
 279         return ret;
 280 }
 281
 282 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
 283 {
 284         struct inode *inode = file_inode(vmf->vma->vm_file);
 285         vm_fault_t ret;
 286
 287         if (unlikely(IS_IMMUTABLE(inode)))
 288                 return VM_FAULT_SIGBUS;
 289
 290         /*
 291          * Sanity check: only conventional zone files can have shared
 292          * writeable mappings.
 293          */
 294         if (zonefs_inode_is_seq(inode))
 295                 return VM_FAULT_NOPAGE;
 296
 297         sb_start_pagefault(inode->i_sb);
 298         file_update_time(vmf->vma->vm_file);
 299
 300         /* Serialize against truncates */
 301         filemap_invalidate_lock_shared(inode->i_mapping);
 302         ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
 303         filemap_invalidate_unlock_shared(inode->i_mapping);
 304
 305         sb_end_pagefault(inode->i_sb);
 306         return ret;
 307 }
 308
 309 static const struct vm_operations_struct zonefs_file_vm_ops = {
 310         .fault          = filemap_fault,
 311         .map_pages      = filemap_map_pages,
 312         .page_mkwrite   = zonefs_filemap_page_mkwrite,
 313 };
 314
 315 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
 316 {
 317         /*
 318          * Conventional zones accept random writes, so their files can support
 319          * shared writable mappings. For sequential zone files, only read
 320          * mappings are possible since there are no guarantees for write
 321          * ordering between msync() and page cache writeback.
 322          */
 323         if (zonefs_inode_is_seq(file_inode(file)) &&
 324             (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 325                 return -EINVAL;
 326
 327         file_accessed(file);
 328         vma->vm_ops = &zonefs_file_vm_ops;
 329
 330         return 0;
 331 }
 332
 333 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
 334 {
 335         loff_t isize = i_size_read(file_inode(file));
 336
 337         /*
 338          * Seeks are limited to below the zone size for conventional zones
 339          * and below the zone write pointer for sequential zones. In both
 340          * cases, this limit is the inode size.
 341          */
 342         return generic_file_llseek_size(file, offset, whence, isize, isize);
 343 }
 344
 345 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
 346                                         int error, unsigned int flags)
 347 {
 348         struct inode *inode = file_inode(iocb->ki_filp);
 349         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 350
 351         if (error) {
 352                 /*
 353                  * For Sync IOs, error recovery is called from
 354                  * zonefs_file_dio_write().
 355                  */
 356                 if (!is_sync_kiocb(iocb))
 357                         zonefs_io_error(inode, true);
 358                 return error;
 359         }
 360
 361         if (size && zonefs_inode_is_seq(inode)) {
 362                 /*
 363                  * Note that we may be seeing completions out of order,
 364                  * but that is not a problem since a write completed
 365                  * successfully necessarily means that all preceding writes
 366                  * were also successful. So we can safely increase the inode
 367                  * size to the write end location.
 368                  */
 369                 mutex_lock(&zi->i_truncate_mutex);
 370                 if (i_size_read(inode) < iocb->ki_pos + size) {
 371                         zonefs_update_stats(inode, iocb->ki_pos + size);
 372                         zonefs_i_size_write(inode, iocb->ki_pos + size);
 373                 }
 374                 mutex_unlock(&zi->i_truncate_mutex);
 375         }
 376
 377         return 0;
 378 }
 379
 380 static const struct iomap_dio_ops zonefs_write_dio_ops = {
 381         .end_io         = zonefs_file_write_dio_end_io,
 382 };
 383
 384 /*
 385  * Do not exceed the LFS limits nor the file zone size. If pos is under the
 386  * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 387  */
 388 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
 389                                         loff_t count)
 390 {
 391         struct inode *inode = file_inode(file);
 392         struct zonefs_zone *z = zonefs_inode_zone(inode);
 393         loff_t limit = rlimit(RLIMIT_FSIZE);
 394         loff_t max_size = z->z_capacity;
 395
 396         if (limit != RLIM_INFINITY) {
 397                 if (pos >= limit) {
 398                         send_sig(SIGXFSZ, current, 0);
 399                         return -EFBIG;
 400                 }
 401                 count = min(count, limit - pos);
 402         }
 403
 404         if (!(file->f_flags & O_LARGEFILE))
 405                 max_size = min_t(loff_t, MAX_NON_LFS, max_size);
 406
 407         if (unlikely(pos >= max_size))
 408                 return -EFBIG;
 409
 410         return min(count, max_size - pos);
 411 }
 412
 413 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
 414 {
 415         struct file *file = iocb->ki_filp;
 416         struct inode *inode = file_inode(file);
 417         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 418         struct zonefs_zone *z = zonefs_inode_zone(inode);
 419         loff_t count;
 420
 421         if (IS_SWAPFILE(inode))
 422                 return -ETXTBSY;
 423
 424         if (!iov_iter_count(from))
 425                 return 0;
 426
 427         if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 428                 return -EINVAL;
 429
 430         if (iocb->ki_flags & IOCB_APPEND) {
 431                 if (zonefs_zone_is_cnv(z))
 432                         return -EINVAL;
 433                 mutex_lock(&zi->i_truncate_mutex);
 434                 iocb->ki_pos = z->z_wpoffset;
 435                 mutex_unlock(&zi->i_truncate_mutex);
 436         }
 437
 438         count = zonefs_write_check_limits(file, iocb->ki_pos,
 439                                           iov_iter_count(from));
 440         if (count < 0)
 441                 return count;
 442
 443         iov_iter_truncate(from, count);
 444         return iov_iter_count(from);
 445 }
 446
 447 /*
 448  * Handle direct writes. For sequential zone files, this is the only possible
 449  * write path. For these files, check that the user is issuing writes
 450  * sequentially from the end of the file. This code assumes that the block layer
 451  * delivers write requests to the device in sequential order. This is always the
 452  * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 453  * elevator feature is being used (e.g. mq-deadline). The block layer always
 454  * automatically select such an elevator for zoned block devices during the
 455  * device initialization.
 456  */
 457 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 458 {
 459         struct inode *inode = file_inode(iocb->ki_filp);
 460         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 461         struct zonefs_zone *z = zonefs_inode_zone(inode);
 462         struct super_block *sb = inode->i_sb;
 463         ssize_t ret, count;
 464
 465         /*
 466          * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
 467          * as this can cause write reordering (e.g. the first aio gets EAGAIN
 468          * on the inode lock but the second goes through but is now unaligned).
 469          */
 470         if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
 471             (iocb->ki_flags & IOCB_NOWAIT))
 472                 return -EOPNOTSUPP;
 473
 474         if (iocb->ki_flags & IOCB_NOWAIT) {
 475                 if (!inode_trylock(inode))
 476                         return -EAGAIN;
 477         } else {
 478                 inode_lock(inode);
 479         }
 480
 481         count = zonefs_write_checks(iocb, from);
 482         if (count <= 0) {
 483                 ret = count;
 484                 goto inode_unlock;
 485         }
 486
 487         if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 488                 ret = -EINVAL;
 489                 goto inode_unlock;
 490         }
 491
 492         /* Enforce sequential writes (append only) in sequential zones */
 493         if (zonefs_zone_is_seq(z)) {
 494                 mutex_lock(&zi->i_truncate_mutex);
 495                 if (iocb->ki_pos != z->z_wpoffset) {
 496                         mutex_unlock(&zi->i_truncate_mutex);
 497                         ret = -EINVAL;
 498                         goto inode_unlock;
 499                 }
 500                 /*
 501                  * Advance the zone write pointer offset. This assumes that the
 502                  * IO will succeed, which is OK to do because we do not allow
 503                  * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
 504                  * fails, the error path will correct the write pointer offset.
 505                  */
 506                 z->z_wpoffset += count;
 507                 zonefs_inode_account_active(inode);
 508                 mutex_unlock(&zi->i_truncate_mutex);
 509         }
 510
 511         /*
 512          * iomap_dio_rw() may return ENOTBLK if there was an issue with
 513          * page invalidation. Overwrite that error code with EBUSY so that
 514          * the user can make sense of the error.
 515          */
 516         ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
 517                            &zonefs_write_dio_ops, 0, NULL, 0);
 518         if (ret == -ENOTBLK)
 519                 ret = -EBUSY;
 520
 521         /*
 522          * For a failed IO or partial completion, trigger error recovery
 523          * to update the zone write pointer offset to a correct value.
 524          * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
 525          * have executed error recovery if the IO already completed when we
 526          * reach here. However, we cannot know that and execute error recovery
 527          * again (that will not change anything).
 528          */
 529         if (zonefs_zone_is_seq(z)) {
 530                 if (ret > 0 && ret != count)
 531                         ret = -EIO;
 532                 if (ret < 0 && ret != -EIOCBQUEUED)
 533                         zonefs_io_error(inode, true);
 534         }
 535
 536 inode_unlock:
 537         inode_unlock(inode);
 538
 539         return ret;
 540 }
 541
 542 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 543                                           struct iov_iter *from)
 544 {
 545         struct inode *inode = file_inode(iocb->ki_filp);
 546         ssize_t ret;
 547
 548         /*
 549          * Direct IO writes are mandatory for sequential zone files so that the
 550          * write IO issuing order is preserved.
 551          */
 552         if (zonefs_inode_is_seq(inode))
 553                 return -EIO;
 554
 555         if (iocb->ki_flags & IOCB_NOWAIT) {
 556                 if (!inode_trylock(inode))
 557                         return -EAGAIN;
 558         } else {
 559                 inode_lock(inode);
 560         }
 561
 562         ret = zonefs_write_checks(iocb, from);
 563         if (ret <= 0)
 564                 goto inode_unlock;
 565
 566         ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
 567         if (ret == -EIO)
 568                 zonefs_io_error(inode, true);
 569
 570 inode_unlock:
 571         inode_unlock(inode);
 572         if (ret > 0)
 573                 ret = generic_write_sync(iocb, ret);
 574
 575         return ret;
 576 }
 577
 578 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 579 {
 580         struct inode *inode = file_inode(iocb->ki_filp);
 581         struct zonefs_zone *z = zonefs_inode_zone(inode);
 582
 583         if (unlikely(IS_IMMUTABLE(inode)))
 584                 return -EPERM;
 585
 586         if (sb_rdonly(inode->i_sb))
 587                 return -EROFS;
 588
 589         /* Write operations beyond the zone capacity are not allowed */
 590         if (iocb->ki_pos >= z->z_capacity)
 591                 return -EFBIG;
 592
 593         if (iocb->ki_flags & IOCB_DIRECT) {
 594                 ssize_t ret = zonefs_file_dio_write(iocb, from);
 595
 596                 if (ret != -ENOTBLK)
 597                         return ret;
 598         }
 599
 600         return zonefs_file_buffered_write(iocb, from);
 601 }
 602
 603 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
 604                                        int error, unsigned int flags)
 605 {
 606         if (error) {
 607                 zonefs_io_error(file_inode(iocb->ki_filp), false);
 608                 return error;
 609         }
 610
 611         return 0;
 612 }
 613
 614 static const struct iomap_dio_ops zonefs_read_dio_ops = {
 615         .end_io                 = zonefs_file_read_dio_end_io,
 616 };
 617
 618 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 619 {
 620         struct inode *inode = file_inode(iocb->ki_filp);
 621         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 622         struct zonefs_zone *z = zonefs_inode_zone(inode);
 623         struct super_block *sb = inode->i_sb;
 624         loff_t isize;
 625         ssize_t ret;
 626
 627         /* Offline zones cannot be read */
 628         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 629                 return -EPERM;
 630
 631         if (iocb->ki_pos >= z->z_capacity)
 632                 return 0;
 633
 634         if (iocb->ki_flags & IOCB_NOWAIT) {
 635                 if (!inode_trylock_shared(inode))
 636                         return -EAGAIN;
 637         } else {
 638                 inode_lock_shared(inode);
 639         }
 640
 641         /* Limit read operations to written data */
 642         mutex_lock(&zi->i_truncate_mutex);
 643         isize = i_size_read(inode);
 644         if (iocb->ki_pos >= isize) {
 645                 mutex_unlock(&zi->i_truncate_mutex);
 646                 ret = 0;
 647                 goto inode_unlock;
 648         }
 649         iov_iter_truncate(to, isize - iocb->ki_pos);
 650         mutex_unlock(&zi->i_truncate_mutex);
 651
 652         if (iocb->ki_flags & IOCB_DIRECT) {
 653                 size_t count = iov_iter_count(to);
 654
 655                 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 656                         ret = -EINVAL;
 657                         goto inode_unlock;
 658                 }
 659                 file_accessed(iocb->ki_filp);
 660                 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
 661                                    &zonefs_read_dio_ops, 0, NULL, 0);
 662         } else {
 663                 ret = generic_file_read_iter(iocb, to);
 664                 if (ret == -EIO)
 665                         zonefs_io_error(inode, false);
 666         }
 667
 668 inode_unlock:
 669         inode_unlock_shared(inode);
 670
 671         return ret;
 672 }
 673
 674 static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos,
 675                                        struct pipe_inode_info *pipe,
 676                                        size_t len, unsigned int flags)
 677 {
 678         struct inode *inode = file_inode(in);
 679         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 680         struct zonefs_zone *z = zonefs_inode_zone(inode);
 681         loff_t isize;
 682         ssize_t ret = 0;
 683
 684         /* Offline zones cannot be read */
 685         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 686                 return -EPERM;
 687
 688         if (*ppos >= z->z_capacity)
 689                 return 0;
 690
 691         inode_lock_shared(inode);
 692
 693         /* Limit read operations to written data */
 694         mutex_lock(&zi->i_truncate_mutex);
 695         isize = i_size_read(inode);
 696         if (*ppos >= isize)
 697                 len = 0;
 698         else
 699                 len = min_t(loff_t, len, isize - *ppos);
 700         mutex_unlock(&zi->i_truncate_mutex);
 701
 702         if (len > 0) {
 703                 ret = filemap_splice_read(in, ppos, pipe, len, flags);
 704                 if (ret == -EIO)
 705                         zonefs_io_error(inode, false);
 706         }
 707
 708         inode_unlock_shared(inode);
 709         return ret;
 710 }
 711
 712 /*
 713  * Write open accounting is done only for sequential files.
 714  */
 715 static inline bool zonefs_seq_file_need_wro(struct inode *inode,
 716                                             struct file *file)
 717 {
 718         if (zonefs_inode_is_cnv(inode))
 719                 return false;
 720
 721         if (!(file->f_mode & FMODE_WRITE))
 722                 return false;
 723
 724         return true;
 725 }
 726
 727 static int zonefs_seq_file_write_open(struct inode *inode)
 728 {
 729         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 730         struct zonefs_zone *z = zonefs_inode_zone(inode);
 731         int ret = 0;
 732
 733         mutex_lock(&zi->i_truncate_mutex);
 734
 735         if (!zi->i_wr_refcnt) {
 736                 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 737                 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
 738
 739                 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
 740
 741                         if (sbi->s_max_wro_seq_files
 742                             && wro > sbi->s_max_wro_seq_files) {
 743                                 atomic_dec(&sbi->s_wro_seq_files);
 744                                 ret = -EBUSY;
 745                                 goto unlock;
 746                         }
 747
 748                         if (i_size_read(inode) < z->z_capacity) {
 749                                 ret = zonefs_inode_zone_mgmt(inode,
 750                                                              REQ_OP_ZONE_OPEN);
 751                                 if (ret) {
 752                                         atomic_dec(&sbi->s_wro_seq_files);
 753                                         goto unlock;
 754                                 }
 755                                 z->z_flags |= ZONEFS_ZONE_OPEN;
 756                                 zonefs_inode_account_active(inode);
 757                         }
 758                 }
 759         }
 760
 761         zi->i_wr_refcnt++;
 762
 763 unlock:
 764         mutex_unlock(&zi->i_truncate_mutex);
 765
 766         return ret;
 767 }
 768
 769 static int zonefs_file_open(struct inode *inode, struct file *file)
 770 {
 771         int ret;
 772
 773         file->f_mode |= FMODE_CAN_ODIRECT;
 774         ret = generic_file_open(inode, file);
 775         if (ret)
 776                 return ret;
 777
 778         if (zonefs_seq_file_need_wro(inode, file))
 779                 return zonefs_seq_file_write_open(inode);
 780
 781         return 0;
 782 }
 783
 784 static void zonefs_seq_file_write_close(struct inode *inode)
 785 {
 786         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 787         struct zonefs_zone *z = zonefs_inode_zone(inode);
 788         struct super_block *sb = inode->i_sb;
 789         struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 790         int ret = 0;
 791
 792         mutex_lock(&zi->i_truncate_mutex);
 793
 794         zi->i_wr_refcnt--;
 795         if (zi->i_wr_refcnt)
 796                 goto unlock;
 797
 798         /*
 799          * The file zone may not be open anymore (e.g. the file was truncated to
 800          * its maximum size or it was fully written). For this case, we only
 801          * need to decrement the write open count.
 802          */
 803         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 804                 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
 805                 if (ret) {
 806                         __zonefs_io_error(inode, false);
 807                         /*
 808                          * Leaving zones explicitly open may lead to a state
 809                          * where most zones cannot be written (zone resources
 810                          * exhausted). So take preventive action by remounting
 811                          * read-only.
 812                          */
 813                         if (z->z_flags & ZONEFS_ZONE_OPEN &&
 814                             !(sb->s_flags & SB_RDONLY)) {
 815                                 zonefs_warn(sb,
 816                                         "closing zone at %llu failed %d\n",
 817                                         z->z_sector, ret);
 818                                 zonefs_warn(sb,
 819                                         "remounting filesystem read-only\n");
 820                                 sb->s_flags |= SB_RDONLY;
 821                         }
 822                         goto unlock;
 823                 }
 824
 825                 z->z_flags &= ~ZONEFS_ZONE_OPEN;
 826                 zonefs_inode_account_active(inode);
 827         }
 828
 829         atomic_dec(&sbi->s_wro_seq_files);
 830
 831 unlock:
 832         mutex_unlock(&zi->i_truncate_mutex);
 833 }
 834
 835 static int zonefs_file_release(struct inode *inode, struct file *file)
 836 {
 837         /*
 838          * If we explicitly open a zone we must close it again as well, but the
 839          * zone management operation can fail (either due to an IO error or as
 840          * the zone has gone offline or read-only). Make sure we don't fail the
 841          * close(2) for user-space.
 842          */
 843         if (zonefs_seq_file_need_wro(inode, file))
 844                 zonefs_seq_file_write_close(inode);
 845
 846         return 0;
 847 }
 848
 849 const struct file_operations zonefs_file_operations = {
 850         .open           = zonefs_file_open,
 851         .release        = zonefs_file_release,
 852         .fsync          = zonefs_file_fsync,
 853         .mmap           = zonefs_file_mmap,
 854         .llseek         = zonefs_file_llseek,
 855         .read_iter      = zonefs_file_read_iter,
 856         .write_iter     = zonefs_file_write_iter,
 857         .splice_read    = zonefs_file_splice_read,
 858         .splice_write   = iter_file_splice_write,
 859         .iopoll         = iocb_bio_iopoll,
 860 };