fs/zonefs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Simple file system for zoned block devices exposing zones as files.
   4  *
   5  * Copyright (C) 2022 Western Digital Corporation or its affiliates.
   6  */
   7 #include <linux/module.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/iomap.h>
  10 #include <linux/init.h>
  11 #include <linux/slab.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/statfs.h>
  14 #include <linux/writeback.h>
  15 #include <linux/quotaops.h>
  16 #include <linux/seq_file.h>
  17 #include <linux/parser.h>
  18 #include <linux/uio.h>
  19 #include <linux/mman.h>
  20 #include <linux/sched/mm.h>
  21 #include <linux/task_io_accounting_ops.h>
  22
  23 #include "zonefs.h"
  24
  25 #include "trace.h"
  26
  27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
  28                                    loff_t length, unsigned int flags,
  29                                    struct iomap *iomap, struct iomap *srcmap)
  30 {
  31         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  32         struct zonefs_zone *z = zonefs_inode_zone(inode);
  33         struct super_block *sb = inode->i_sb;
  34         loff_t isize;
  35
  36         /*
  37          * All blocks are always mapped below EOF. If reading past EOF,
  38          * act as if there is a hole up to the file maximum size.
  39          */
  40         mutex_lock(&zi->i_truncate_mutex);
  41         iomap->bdev = inode->i_sb->s_bdev;
  42         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  43         isize = i_size_read(inode);
  44         if (iomap->offset >= isize) {
  45                 iomap->type = IOMAP_HOLE;
  46                 iomap->addr = IOMAP_NULL_ADDR;
  47                 iomap->length = length;
  48         } else {
  49                 iomap->type = IOMAP_MAPPED;
  50                 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  51                 iomap->length = isize - iomap->offset;
  52         }
  53         mutex_unlock(&zi->i_truncate_mutex);
  54
  55         trace_zonefs_iomap_begin(inode, iomap);
  56
  57         return 0;
  58 }
  59
  60 static const struct iomap_ops zonefs_read_iomap_ops = {
  61         .iomap_begin    = zonefs_read_iomap_begin,
  62 };
  63
  64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
  65                                     loff_t length, unsigned int flags,
  66                                     struct iomap *iomap, struct iomap *srcmap)
  67 {
  68         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  69         struct zonefs_zone *z = zonefs_inode_zone(inode);
  70         struct super_block *sb = inode->i_sb;
  71         loff_t isize;
  72
  73         /* All write I/Os should always be within the file maximum size */
  74         if (WARN_ON_ONCE(offset + length > z->z_capacity))
  75                 return -EIO;
  76
  77         /*
  78          * Sequential zones can only accept direct writes. This is already
  79          * checked when writes are issued, so warn if we see a page writeback
  80          * operation.
  81          */
  82         if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
  83                 return -EIO;
  84
  85         /*
  86          * For conventional zones, all blocks are always mapped. For sequential
  87          * zones, all blocks after always mapped below the inode size (zone
  88          * write pointer) and unwriten beyond.
  89          */
  90         mutex_lock(&zi->i_truncate_mutex);
  91         iomap->bdev = inode->i_sb->s_bdev;
  92         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  93         iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  94         isize = i_size_read(inode);
  95         if (iomap->offset >= isize) {
  96                 iomap->type = IOMAP_UNWRITTEN;
  97                 iomap->length = z->z_capacity - iomap->offset;
  98         } else {
  99                 iomap->type = IOMAP_MAPPED;
 100                 iomap->length = isize - iomap->offset;
 101         }
 102         mutex_unlock(&zi->i_truncate_mutex);
 103
 104         trace_zonefs_iomap_begin(inode, iomap);
 105
 106         return 0;
 107 }
 108
 109 static const struct iomap_ops zonefs_write_iomap_ops = {
 110         .iomap_begin    = zonefs_write_iomap_begin,
 111 };
 112
 113 static int zonefs_read_folio(struct file *unused, struct folio *folio)
 114 {
 115         return iomap_read_folio(folio, &zonefs_read_iomap_ops);
 116 }
 117
 118 static void zonefs_readahead(struct readahead_control *rac)
 119 {
 120         iomap_readahead(rac, &zonefs_read_iomap_ops);
 121 }
 122
 123 /*
 124  * Map blocks for page writeback. This is used only on conventional zone files,
 125  * which implies that the page range can only be within the fixed inode size.
 126  */
 127 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
 128                                    struct inode *inode, loff_t offset)
 129 {
 130         struct zonefs_zone *z = zonefs_inode_zone(inode);
 131
 132         if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
 133                 return -EIO;
 134         if (WARN_ON_ONCE(offset >= i_size_read(inode)))
 135                 return -EIO;
 136
 137         /* If the mapping is already OK, nothing needs to be done */
 138         if (offset >= wpc->iomap.offset &&
 139             offset < wpc->iomap.offset + wpc->iomap.length)
 140                 return 0;
 141
 142         return zonefs_write_iomap_begin(inode, offset,
 143                                         z->z_capacity - offset,
 144                                         IOMAP_WRITE, &wpc->iomap, NULL);
 145 }
 146
 147 static const struct iomap_writeback_ops zonefs_writeback_ops = {
 148         .map_blocks             = zonefs_write_map_blocks,
 149 };
 150
 151 static int zonefs_writepages(struct address_space *mapping,
 152                              struct writeback_control *wbc)
 153 {
 154         struct iomap_writepage_ctx wpc = { };
 155
 156         return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
 157 }
 158
 159 static int zonefs_swap_activate(struct swap_info_struct *sis,
 160                                 struct file *swap_file, sector_t *span)
 161 {
 162         struct inode *inode = file_inode(swap_file);
 163
 164         if (zonefs_inode_is_seq(inode)) {
 165                 zonefs_err(inode->i_sb,
 166                            "swap file: not a conventional zone file\n");
 167                 return -EINVAL;
 168         }
 169
 170         return iomap_swapfile_activate(sis, swap_file, span,
 171                                        &zonefs_read_iomap_ops);
 172 }
 173
 174 const struct address_space_operations zonefs_file_aops = {
 175         .read_folio             = zonefs_read_folio,
 176         .readahead              = zonefs_readahead,
 177         .writepages             = zonefs_writepages,
 178         .dirty_folio            = iomap_dirty_folio,
 179         .release_folio          = iomap_release_folio,
 180         .invalidate_folio       = iomap_invalidate_folio,
 181         .migrate_folio          = filemap_migrate_folio,
 182         .is_partially_uptodate  = iomap_is_partially_uptodate,
 183         .error_remove_folio     = generic_error_remove_folio,
 184         .swap_activate          = zonefs_swap_activate,
 185 };
 186
 187 int zonefs_file_truncate(struct inode *inode, loff_t isize)
 188 {
 189         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 190         struct zonefs_zone *z = zonefs_inode_zone(inode);
 191         loff_t old_isize;
 192         enum req_op op;
 193         int ret = 0;
 194
 195         /*
 196          * Only sequential zone files can be truncated and truncation is allowed
 197          * only down to a 0 size, which is equivalent to a zone reset, and to
 198          * the maximum file size, which is equivalent to a zone finish.
 199          */
 200         if (!zonefs_zone_is_seq(z))
 201                 return -EPERM;
 202
 203         if (!isize)
 204                 op = REQ_OP_ZONE_RESET;
 205         else if (isize == z->z_capacity)
 206                 op = REQ_OP_ZONE_FINISH;
 207         else
 208                 return -EPERM;
 209
 210         inode_dio_wait(inode);
 211
 212         /* Serialize against page faults */
 213         filemap_invalidate_lock(inode->i_mapping);
 214
 215         /* Serialize against zonefs_iomap_begin() */
 216         mutex_lock(&zi->i_truncate_mutex);
 217
 218         old_isize = i_size_read(inode);
 219         if (isize == old_isize)
 220                 goto unlock;
 221
 222         ret = zonefs_inode_zone_mgmt(inode, op);
 223         if (ret)
 224                 goto unlock;
 225
 226         /*
 227          * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
 228          * take care of open zones.
 229          */
 230         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 231                 /*
 232                  * Truncating a zone to EMPTY or FULL is the equivalent of
 233                  * closing the zone. For a truncation to 0, we need to
 234                  * re-open the zone to ensure new writes can be processed.
 235                  * For a truncation to the maximum file size, the zone is
 236                  * closed and writes cannot be accepted anymore, so clear
 237                  * the open flag.
 238                  */
 239                 if (!isize)
 240                         ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 241                 else
 242                         z->z_flags &= ~ZONEFS_ZONE_OPEN;
 243         }
 244
 245         zonefs_update_stats(inode, isize);
 246         truncate_setsize(inode, isize);
 247         z->z_wpoffset = isize;
 248         zonefs_inode_account_active(inode);
 249
 250 unlock:
 251         mutex_unlock(&zi->i_truncate_mutex);
 252         filemap_invalidate_unlock(inode->i_mapping);
 253
 254         return ret;
 255 }
 256
 257 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
 258                              int datasync)
 259 {
 260         struct inode *inode = file_inode(file);
 261         int ret = 0;
 262
 263         if (unlikely(IS_IMMUTABLE(inode)))
 264                 return -EPERM;
 265
 266         /*
 267          * Since only direct writes are allowed in sequential files, page cache
 268          * flush is needed only for conventional zone files.
 269          */
 270         if (zonefs_inode_is_cnv(inode))
 271                 ret = file_write_and_wait_range(file, start, end);
 272         if (!ret)
 273                 ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 274
 275         if (ret)
 276                 zonefs_io_error(inode, true);
 277
 278         return ret;
 279 }
 280
 281 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
 282 {
 283         struct inode *inode = file_inode(vmf->vma->vm_file);
 284         vm_fault_t ret;
 285
 286         if (unlikely(IS_IMMUTABLE(inode)))
 287                 return VM_FAULT_SIGBUS;
 288
 289         /*
 290          * Sanity check: only conventional zone files can have shared
 291          * writeable mappings.
 292          */
 293         if (zonefs_inode_is_seq(inode))
 294                 return VM_FAULT_NOPAGE;
 295
 296         sb_start_pagefault(inode->i_sb);
 297         file_update_time(vmf->vma->vm_file);
 298
 299         /* Serialize against truncates */
 300         filemap_invalidate_lock_shared(inode->i_mapping);
 301         ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
 302         filemap_invalidate_unlock_shared(inode->i_mapping);
 303
 304         sb_end_pagefault(inode->i_sb);
 305         return ret;
 306 }
 307
 308 static const struct vm_operations_struct zonefs_file_vm_ops = {
 309         .fault          = filemap_fault,
 310         .map_pages      = filemap_map_pages,
 311         .page_mkwrite   = zonefs_filemap_page_mkwrite,
 312 };
 313
 314 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
 315 {
 316         /*
 317          * Conventional zones accept random writes, so their files can support
 318          * shared writable mappings. For sequential zone files, only read
 319          * mappings are possible since there are no guarantees for write
 320          * ordering between msync() and page cache writeback.
 321          */
 322         if (zonefs_inode_is_seq(file_inode(file)) &&
 323             (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 324                 return -EINVAL;
 325
 326         file_accessed(file);
 327         vma->vm_ops = &zonefs_file_vm_ops;
 328
 329         return 0;
 330 }
 331
 332 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
 333 {
 334         loff_t isize = i_size_read(file_inode(file));
 335
 336         /*
 337          * Seeks are limited to below the zone size for conventional zones
 338          * and below the zone write pointer for sequential zones. In both
 339          * cases, this limit is the inode size.
 340          */
 341         return generic_file_llseek_size(file, offset, whence, isize, isize);
 342 }
 343
 344 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
 345                                         int error, unsigned int flags)
 346 {
 347         struct inode *inode = file_inode(iocb->ki_filp);
 348         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 349
 350         if (error) {
 351                 /*
 352                  * For Sync IOs, error recovery is called from
 353                  * zonefs_file_dio_write().
 354                  */
 355                 if (!is_sync_kiocb(iocb))
 356                         zonefs_io_error(inode, true);
 357                 return error;
 358         }
 359
 360         if (size && zonefs_inode_is_seq(inode)) {
 361                 /*
 362                  * Note that we may be seeing completions out of order,
 363                  * but that is not a problem since a write completed
 364                  * successfully necessarily means that all preceding writes
 365                  * were also successful. So we can safely increase the inode
 366                  * size to the write end location.
 367                  */
 368                 mutex_lock(&zi->i_truncate_mutex);
 369                 if (i_size_read(inode) < iocb->ki_pos + size) {
 370                         zonefs_update_stats(inode, iocb->ki_pos + size);
 371                         zonefs_i_size_write(inode, iocb->ki_pos + size);
 372                 }
 373                 mutex_unlock(&zi->i_truncate_mutex);
 374         }
 375
 376         return 0;
 377 }
 378
 379 static const struct iomap_dio_ops zonefs_write_dio_ops = {
 380         .end_io         = zonefs_file_write_dio_end_io,
 381 };
 382
 383 /*
 384  * Do not exceed the LFS limits nor the file zone size. If pos is under the
 385  * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 386  */
 387 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
 388                                         loff_t count)
 389 {
 390         struct inode *inode = file_inode(file);
 391         struct zonefs_zone *z = zonefs_inode_zone(inode);
 392         loff_t limit = rlimit(RLIMIT_FSIZE);
 393         loff_t max_size = z->z_capacity;
 394
 395         if (limit != RLIM_INFINITY) {
 396                 if (pos >= limit) {
 397                         send_sig(SIGXFSZ, current, 0);
 398                         return -EFBIG;
 399                 }
 400                 count = min(count, limit - pos);
 401         }
 402
 403         if (!(file->f_flags & O_LARGEFILE))
 404                 max_size = min_t(loff_t, MAX_NON_LFS, max_size);
 405
 406         if (unlikely(pos >= max_size))
 407                 return -EFBIG;
 408
 409         return min(count, max_size - pos);
 410 }
 411
 412 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
 413 {
 414         struct file *file = iocb->ki_filp;
 415         struct inode *inode = file_inode(file);
 416         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 417         struct zonefs_zone *z = zonefs_inode_zone(inode);
 418         loff_t count;
 419
 420         if (IS_SWAPFILE(inode))
 421                 return -ETXTBSY;
 422
 423         if (!iov_iter_count(from))
 424                 return 0;
 425
 426         if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 427                 return -EINVAL;
 428
 429         if (iocb->ki_flags & IOCB_APPEND) {
 430                 if (zonefs_zone_is_cnv(z))
 431                         return -EINVAL;
 432                 mutex_lock(&zi->i_truncate_mutex);
 433                 iocb->ki_pos = z->z_wpoffset;
 434                 mutex_unlock(&zi->i_truncate_mutex);
 435         }
 436
 437         count = zonefs_write_check_limits(file, iocb->ki_pos,
 438                                           iov_iter_count(from));
 439         if (count < 0)
 440                 return count;
 441
 442         iov_iter_truncate(from, count);
 443         return iov_iter_count(from);
 444 }
 445
 446 /*
 447  * Handle direct writes. For sequential zone files, this is the only possible
 448  * write path. For these files, check that the user is issuing writes
 449  * sequentially from the end of the file. This code assumes that the block layer
 450  * delivers write requests to the device in sequential order. This is always the
 451  * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 452  * elevator feature is being used (e.g. mq-deadline). The block layer always
 453  * automatically select such an elevator for zoned block devices during the
 454  * device initialization.
 455  */
 456 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 457 {
 458         struct inode *inode = file_inode(iocb->ki_filp);
 459         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 460         struct zonefs_zone *z = zonefs_inode_zone(inode);
 461         struct super_block *sb = inode->i_sb;
 462         ssize_t ret, count;
 463
 464         /*
 465          * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
 466          * as this can cause write reordering (e.g. the first aio gets EAGAIN
 467          * on the inode lock but the second goes through but is now unaligned).
 468          */
 469         if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
 470             (iocb->ki_flags & IOCB_NOWAIT))
 471                 return -EOPNOTSUPP;
 472
 473         if (iocb->ki_flags & IOCB_NOWAIT) {
 474                 if (!inode_trylock(inode))
 475                         return -EAGAIN;
 476         } else {
 477                 inode_lock(inode);
 478         }
 479
 480         count = zonefs_write_checks(iocb, from);
 481         if (count <= 0) {
 482                 ret = count;
 483                 goto inode_unlock;
 484         }
 485
 486         if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 487                 ret = -EINVAL;
 488                 goto inode_unlock;
 489         }
 490
 491         /* Enforce sequential writes (append only) in sequential zones */
 492         if (zonefs_zone_is_seq(z)) {
 493                 mutex_lock(&zi->i_truncate_mutex);
 494                 if (iocb->ki_pos != z->z_wpoffset) {
 495                         mutex_unlock(&zi->i_truncate_mutex);
 496                         ret = -EINVAL;
 497                         goto inode_unlock;
 498                 }
 499                 /*
 500                  * Advance the zone write pointer offset. This assumes that the
 501                  * IO will succeed, which is OK to do because we do not allow
 502                  * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
 503                  * fails, the error path will correct the write pointer offset.
 504                  */
 505                 z->z_wpoffset += count;
 506                 zonefs_inode_account_active(inode);
 507                 mutex_unlock(&zi->i_truncate_mutex);
 508         }
 509
 510         /*
 511          * iomap_dio_rw() may return ENOTBLK if there was an issue with
 512          * page invalidation. Overwrite that error code with EBUSY so that
 513          * the user can make sense of the error.
 514          */
 515         ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
 516                            &zonefs_write_dio_ops, 0, NULL, 0);
 517         if (ret == -ENOTBLK)
 518                 ret = -EBUSY;
 519
 520         /*
 521          * For a failed IO or partial completion, trigger error recovery
 522          * to update the zone write pointer offset to a correct value.
 523          * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
 524          * have executed error recovery if the IO already completed when we
 525          * reach here. However, we cannot know that and execute error recovery
 526          * again (that will not change anything).
 527          */
 528         if (zonefs_zone_is_seq(z)) {
 529                 if (ret > 0 && ret != count)
 530                         ret = -EIO;
 531                 if (ret < 0 && ret != -EIOCBQUEUED)
 532                         zonefs_io_error(inode, true);
 533         }
 534
 535 inode_unlock:
 536         inode_unlock(inode);
 537
 538         return ret;
 539 }
 540
 541 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 542                                           struct iov_iter *from)
 543 {
 544         struct inode *inode = file_inode(iocb->ki_filp);
 545         ssize_t ret;
 546
 547         /*
 548          * Direct IO writes are mandatory for sequential zone files so that the
 549          * write IO issuing order is preserved.
 550          */
 551         if (zonefs_inode_is_seq(inode))
 552                 return -EIO;
 553
 554         if (iocb->ki_flags & IOCB_NOWAIT) {
 555                 if (!inode_trylock(inode))
 556                         return -EAGAIN;
 557         } else {
 558                 inode_lock(inode);
 559         }
 560
 561         ret = zonefs_write_checks(iocb, from);
 562         if (ret <= 0)
 563                 goto inode_unlock;
 564
 565         ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
 566         if (ret == -EIO)
 567                 zonefs_io_error(inode, true);
 568
 569 inode_unlock:
 570         inode_unlock(inode);
 571         if (ret > 0)
 572                 ret = generic_write_sync(iocb, ret);
 573
 574         return ret;
 575 }
 576
 577 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 578 {
 579         struct inode *inode = file_inode(iocb->ki_filp);
 580         struct zonefs_zone *z = zonefs_inode_zone(inode);
 581
 582         if (unlikely(IS_IMMUTABLE(inode)))
 583                 return -EPERM;
 584
 585         if (sb_rdonly(inode->i_sb))
 586                 return -EROFS;
 587
 588         /* Write operations beyond the zone capacity are not allowed */
 589         if (iocb->ki_pos >= z->z_capacity)
 590                 return -EFBIG;
 591
 592         if (iocb->ki_flags & IOCB_DIRECT) {
 593                 ssize_t ret = zonefs_file_dio_write(iocb, from);
 594
 595                 if (ret != -ENOTBLK)
 596                         return ret;
 597         }
 598
 599         return zonefs_file_buffered_write(iocb, from);
 600 }
 601
 602 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
 603                                        int error, unsigned int flags)
 604 {
 605         if (error) {
 606                 zonefs_io_error(file_inode(iocb->ki_filp), false);
 607                 return error;
 608         }
 609
 610         return 0;
 611 }
 612
 613 static const struct iomap_dio_ops zonefs_read_dio_ops = {
 614         .end_io                 = zonefs_file_read_dio_end_io,
 615 };
 616
 617 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 618 {
 619         struct inode *inode = file_inode(iocb->ki_filp);
 620         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 621         struct zonefs_zone *z = zonefs_inode_zone(inode);
 622         struct super_block *sb = inode->i_sb;
 623         loff_t isize;
 624         ssize_t ret;
 625
 626         /* Offline zones cannot be read */
 627         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 628                 return -EPERM;
 629
 630         if (iocb->ki_pos >= z->z_capacity)
 631                 return 0;
 632
 633         if (iocb->ki_flags & IOCB_NOWAIT) {
 634                 if (!inode_trylock_shared(inode))
 635                         return -EAGAIN;
 636         } else {
 637                 inode_lock_shared(inode);
 638         }
 639
 640         /* Limit read operations to written data */
 641         mutex_lock(&zi->i_truncate_mutex);
 642         isize = i_size_read(inode);
 643         if (iocb->ki_pos >= isize) {
 644                 mutex_unlock(&zi->i_truncate_mutex);
 645                 ret = 0;
 646                 goto inode_unlock;
 647         }
 648         iov_iter_truncate(to, isize - iocb->ki_pos);
 649         mutex_unlock(&zi->i_truncate_mutex);
 650
 651         if (iocb->ki_flags & IOCB_DIRECT) {
 652                 size_t count = iov_iter_count(to);
 653
 654                 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 655                         ret = -EINVAL;
 656                         goto inode_unlock;
 657                 }
 658                 file_accessed(iocb->ki_filp);
 659                 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
 660                                    &zonefs_read_dio_ops, 0, NULL, 0);
 661         } else {
 662                 ret = generic_file_read_iter(iocb, to);
 663                 if (ret == -EIO)
 664                         zonefs_io_error(inode, false);
 665         }
 666
 667 inode_unlock:
 668         inode_unlock_shared(inode);
 669
 670         return ret;
 671 }
 672
 673 static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos,
 674                                        struct pipe_inode_info *pipe,
 675                                        size_t len, unsigned int flags)
 676 {
 677         struct inode *inode = file_inode(in);
 678         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 679         struct zonefs_zone *z = zonefs_inode_zone(inode);
 680         loff_t isize;
 681         ssize_t ret = 0;
 682
 683         /* Offline zones cannot be read */
 684         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 685                 return -EPERM;
 686
 687         if (*ppos >= z->z_capacity)
 688                 return 0;
 689
 690         inode_lock_shared(inode);
 691
 692         /* Limit read operations to written data */
 693         mutex_lock(&zi->i_truncate_mutex);
 694         isize = i_size_read(inode);
 695         if (*ppos >= isize)
 696                 len = 0;
 697         else
 698                 len = min_t(loff_t, len, isize - *ppos);
 699         mutex_unlock(&zi->i_truncate_mutex);
 700
 701         if (len > 0) {
 702                 ret = filemap_splice_read(in, ppos, pipe, len, flags);
 703                 if (ret == -EIO)
 704                         zonefs_io_error(inode, false);
 705         }
 706
 707         inode_unlock_shared(inode);
 708         return ret;
 709 }
 710
 711 /*
 712  * Write open accounting is done only for sequential files.
 713  */
 714 static inline bool zonefs_seq_file_need_wro(struct inode *inode,
 715                                             struct file *file)
 716 {
 717         if (zonefs_inode_is_cnv(inode))
 718                 return false;
 719
 720         if (!(file->f_mode & FMODE_WRITE))
 721                 return false;
 722
 723         return true;
 724 }
 725
 726 static int zonefs_seq_file_write_open(struct inode *inode)
 727 {
 728         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 729         struct zonefs_zone *z = zonefs_inode_zone(inode);
 730         int ret = 0;
 731
 732         mutex_lock(&zi->i_truncate_mutex);
 733
 734         if (!zi->i_wr_refcnt) {
 735                 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 736                 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
 737
 738                 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
 739
 740                         if (sbi->s_max_wro_seq_files
 741                             && wro > sbi->s_max_wro_seq_files) {
 742                                 atomic_dec(&sbi->s_wro_seq_files);
 743                                 ret = -EBUSY;
 744                                 goto unlock;
 745                         }
 746
 747                         if (i_size_read(inode) < z->z_capacity) {
 748                                 ret = zonefs_inode_zone_mgmt(inode,
 749                                                              REQ_OP_ZONE_OPEN);
 750                                 if (ret) {
 751                                         atomic_dec(&sbi->s_wro_seq_files);
 752                                         goto unlock;
 753                                 }
 754                                 z->z_flags |= ZONEFS_ZONE_OPEN;
 755                                 zonefs_inode_account_active(inode);
 756                         }
 757                 }
 758         }
 759
 760         zi->i_wr_refcnt++;
 761
 762 unlock:
 763         mutex_unlock(&zi->i_truncate_mutex);
 764
 765         return ret;
 766 }
 767
 768 static int zonefs_file_open(struct inode *inode, struct file *file)
 769 {
 770         int ret;
 771
 772         file->f_mode |= FMODE_CAN_ODIRECT;
 773         ret = generic_file_open(inode, file);
 774         if (ret)
 775                 return ret;
 776
 777         if (zonefs_seq_file_need_wro(inode, file))
 778                 return zonefs_seq_file_write_open(inode);
 779
 780         return 0;
 781 }
 782
 783 static void zonefs_seq_file_write_close(struct inode *inode)
 784 {
 785         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 786         struct zonefs_zone *z = zonefs_inode_zone(inode);
 787         struct super_block *sb = inode->i_sb;
 788         struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 789         int ret = 0;
 790
 791         mutex_lock(&zi->i_truncate_mutex);
 792
 793         zi->i_wr_refcnt--;
 794         if (zi->i_wr_refcnt)
 795                 goto unlock;
 796
 797         /*
 798          * The file zone may not be open anymore (e.g. the file was truncated to
 799          * its maximum size or it was fully written). For this case, we only
 800          * need to decrement the write open count.
 801          */
 802         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 803                 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
 804                 if (ret) {
 805                         __zonefs_io_error(inode, false);
 806                         /*
 807                          * Leaving zones explicitly open may lead to a state
 808                          * where most zones cannot be written (zone resources
 809                          * exhausted). So take preventive action by remounting
 810                          * read-only.
 811                          */
 812                         if (z->z_flags & ZONEFS_ZONE_OPEN &&
 813                             !(sb->s_flags & SB_RDONLY)) {
 814                                 zonefs_warn(sb,
 815                                         "closing zone at %llu failed %d\n",
 816                                         z->z_sector, ret);
 817                                 zonefs_warn(sb,
 818                                         "remounting filesystem read-only\n");
 819                                 sb->s_flags |= SB_RDONLY;
 820                         }
 821                         goto unlock;
 822                 }
 823
 824                 z->z_flags &= ~ZONEFS_ZONE_OPEN;
 825                 zonefs_inode_account_active(inode);
 826         }
 827
 828         atomic_dec(&sbi->s_wro_seq_files);
 829
 830 unlock:
 831         mutex_unlock(&zi->i_truncate_mutex);
 832 }
 833
 834 static int zonefs_file_release(struct inode *inode, struct file *file)
 835 {
 836         /*
 837          * If we explicitly open a zone we must close it again as well, but the
 838          * zone management operation can fail (either due to an IO error or as
 839          * the zone has gone offline or read-only). Make sure we don't fail the
 840          * close(2) for user-space.
 841          */
 842         if (zonefs_seq_file_need_wro(inode, file))
 843                 zonefs_seq_file_write_close(inode);
 844
 845         return 0;
 846 }
 847
 848 const struct file_operations zonefs_file_operations = {
 849         .open           = zonefs_file_open,
 850         .release        = zonefs_file_release,
 851         .fsync          = zonefs_file_fsync,
 852         .mmap           = zonefs_file_mmap,
 853         .llseek         = zonefs_file_llseek,
 854         .read_iter      = zonefs_file_read_iter,
 855         .write_iter     = zonefs_file_write_iter,
 856         .splice_read    = zonefs_file_splice_read,
 857         .splice_write   = iter_file_splice_write,
 858         .iopoll         = iocb_bio_iopoll,
 859 };