s3 onefs: Add missing newlines to debug statements in the onefs module
[ira/wip.git] / source3 / modules / onefs_system.c
index 4ebdf12a50342ac96ae968bd5baf65ce948aee61..bc2ed469bf808080d333a6d5efdb221e5a4bd840 100644 (file)
  * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
+#include "includes.h"
 #include "onefs.h"
+#include "onefs_config.h"
+#include "oplock_onefs.h"
 
 #include <ifs/ifs_syscalls.h>
 #include <isi_acl/isi_acl_util.h>
+#include <sys/isi_acl.h>
 
 /*
  * Initialize the sm_lock struct before passing it to ifs_createfile.
@@ -68,58 +72,6 @@ static void smlock_dump(int debuglevel, const struct sm_lock *sml)
               (int)sml->sm_timeout.tv_usec));
 }
 
-/*
- * Return string value of onefs oplock types.
- */
-static const char *onefs_oplock_str(enum oplock_type onefs_oplock_type)
-{
-       switch (onefs_oplock_type) {
-       case OPLOCK_NONE:
-               return "OPLOCK_NONE";
-       case OPLOCK_EXCLUSIVE:
-               return "OPLOCK_EXCLUSIVE";
-       case OPLOCK_BATCH:
-               return "OPLOCK_BATCH";
-       case OPLOCK_SHARED:
-               return "OPLOCK_SHARED";
-       default:
-               break;
-       }
-       return "UNKNOWN";
-}
-
-/*
- * Convert from onefs to samba oplock.
- */
-static int onefs_oplock_to_samba_oplock(enum oplock_type onefs_oplock)
-{
-       switch (onefs_oplock) {
-       case OPLOCK_NONE:
-               return NO_OPLOCK;
-       case OPLOCK_EXCLUSIVE:
-               return EXCLUSIVE_OPLOCK;
-       case OPLOCK_BATCH:
-               return BATCH_OPLOCK;
-       case OPLOCK_SHARED:
-               return LEVEL_II_OPLOCK;
-       default:
-               DEBUG(0, ("unknown oplock type %d found\n", onefs_oplock));
-               break;
-       }
-       return NO_OPLOCK;
-}
-
-/*
- * Convert from samba to onefs oplock.
- */
-static enum oplock_type onefs_samba_oplock_to_oplock(int samba_oplock_type)
-{
-       if (BATCH_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_BATCH;
-       if (EXCLUSIVE_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_EXCLUSIVE;
-       if (LEVEL_II_OPLOCK_TYPE(samba_oplock_type)) return OPLOCK_SHARED;
-       return OPLOCK_NONE;
-}
-
 /**
  * External interface to ifs_createfile
  */
@@ -145,6 +97,9 @@ int onefs_sys_create_file(connection_struct *conn,
        int secinfo = 0;
        int ret_fd = -1;
        uint32_t onefs_dos_attributes;
+       struct ifs_createfile_flags cf_flags = CF_FLAGS_NONE;
+
+       START_PROFILE(syscall_createfile);
 
        /* Setup security descriptor and get secinfo. */
        if (sd != NULL) {
@@ -155,7 +110,7 @@ int onefs_sys_create_file(connection_struct *conn,
                status = onefs_samba_sd_to_sd(secinfo, sd, &ifs_sd, SNUM(conn));
 
                if (!NT_STATUS_IS_OK(status)) {
-                       DEBUG(1, ("SD initialization failure: %s",
+                       DEBUG(1, ("SD initialization failure: %s\n",
                                  nt_errstr(status)));
                        errno = EINVAL;
                        goto out;
@@ -164,25 +119,63 @@ int onefs_sys_create_file(connection_struct *conn,
                pifs_sd = &ifs_sd;
        }
 
+       /* Stripping off private bits will be done for us. */
        onefs_oplock = onefs_samba_oplock_to_oplock(oplock_request);
 
-       /* Temporary until oplock work is added to vfs_onefs */
-       onefs_oplock = OPLOCK_NONE;
+       if (!lp_oplocks(SNUM(conn))) {
+               SMB_ASSERT(onefs_oplock == OPLOCK_NONE);
+       }
 
        /* Convert samba dos flags to UF_DOS_* attributes. */
        onefs_dos_attributes = dos_attributes_to_stat_dos_flags(dos_flags);
 
-       DEBUG(10,("onefs_sys_create_file: base_fd = %d, "
-                 "open_access_mask = 0x%x, flags = 0x%x, mode = 0x%x, "
+       /**
+        * Deal with kernel creating Default ACLs. (Isilon bug 47447.)
+        *
+        * 1) "nt acl support = no", default_acl = no
+        * 2) "inherit permissions = yes", default_acl = no
+        */
+       if (lp_nt_acl_support(SNUM(conn)) && !lp_inherit_perms(SNUM(conn)))
+               cf_flags = cf_flags_or(cf_flags, CF_FLAGS_DEFAULT_ACL);
+
+       /*
+        * Some customer workflows require the execute bit to be ignored.
+        */
+       if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+                        PARM_ALLOW_EXECUTE_ALWAYS,
+                        PARM_ALLOW_EXECUTE_ALWAYS_DEFAULT) &&
+           (open_access_mask & FILE_EXECUTE)) {
+
+               DEBUG(3, ("Stripping execute bit from %s: (0x%x)\n", path,
+                         open_access_mask));
+
+               /* Strip execute. */
+               open_access_mask &= ~FILE_EXECUTE;
+
+               /*
+                * Add READ_DATA, so we're not left with desired_access=0. An
+                * execute call should imply the client will read the data.
+                */
+               open_access_mask |= FILE_READ_DATA;
+
+               DEBUGADD(3, ("New stripped access mask: 0x%x\n",
+                            open_access_mask));
+       }
+
+       DEBUG(10,("onefs_sys_create_file: base_fd = %d, fname = %s"
+                 "open_access_mask = 0x%x, flags = 0x%x, mode = 0%o, "
                  "desired_oplock = %s, id = 0x%x, secinfo = 0x%x, sd = %p, "
-                 "dos_attributes = 0x%x, path = %s\n", base_fd,
+                 "dos_attributes = 0x%x, path = %s, "
+                 "default_acl=%s\n", base_fd, path,
                  (unsigned int)open_access_mask,
                  (unsigned int)flags,
                  (unsigned int)mode,
                  onefs_oplock_str(onefs_oplock),
                  (unsigned int)id,
                  (unsigned int)secinfo, sd,
-                 (unsigned int)onefs_dos_attributes, path));
+                 (unsigned int)onefs_dos_attributes, path,
+                 cf_flags_and_bool(cf_flags, CF_FLAGS_DEFAULT_ACL) ?
+                     "true" : "false"));
 
        /* Initialize smlock struct for files/dirs but not internal opens */
        if (!(oplock_request & INTERNAL_OPEN_ONLY)) {
@@ -196,7 +189,7 @@ int onefs_sys_create_file(connection_struct *conn,
        ret_fd = ifs_createfile(base_fd, path,
            (enum ifs_ace_rights)open_access_mask, flags & ~O_ACCMODE, mode,
            onefs_oplock, id, psml, secinfo, pifs_sd, onefs_dos_attributes,
-           &onefs_granted_oplock);
+           cf_flags, &onefs_granted_oplock);
 
        DEBUG(10,("onefs_sys_create_file(%s): ret_fd = %d, "
                  "onefs_granted_oplock = %s\n",
@@ -209,7 +202,465 @@ int onefs_sys_create_file(connection_struct *conn,
        }
 
  out:
+       END_PROFILE(syscall_createfile);
        aclu_free_sd(pifs_sd, false);
 
        return ret_fd;
 }
+
+/**
+ * FreeBSD based sendfile implementation that allows for atomic semantics.
+ *
+ * Sends the optional header followed by count bytes read from fromfd,
+ * starting at offset, to tofd.  The sendfile() call is repeated until the
+ * header and all file bytes have been accounted for.
+ *
+ * @param tofd    descriptor the data is sent to
+ * @param fromfd  descriptor the file data is read from
+ * @param header  optional blob sent ahead of the file data; may be NULL
+ * @param offset  file offset at which reading starts
+ * @param count   number of file bytes to send
+ * @param atomic  if true, SF_ATOMIC is passed to sendfile()
+ *
+ * @return count plus the header length on success; -1 if sendfile() fails
+ *         or an atomic send turns out to be partial; if sendfile() reports
+ *         zero bytes written, 0 for an atomic send and -1 otherwise.
+ */
+static ssize_t onefs_sys_do_sendfile(int tofd, int fromfd,
+    const DATA_BLOB *header, SMB_OFF_T offset, size_t count, bool atomic)
+{
+       size_t total=0;
+       struct sf_hdtr hdr;
+       struct iovec hdtrl;
+       size_t hdr_len = 0;
+       int flags = 0;
+
+       if (atomic) {
+               flags = SF_ATOMIC;
+       }
+
+       hdr.headers = &hdtrl;
+       hdr.hdr_cnt = 1;
+       hdr.trailers = NULL;
+       hdr.trl_cnt = 0;
+
+       /* Set up the header iovec. */
+       if (header) {
+               hdtrl.iov_base = header->data;
+               hdtrl.iov_len = hdr_len = header->length;
+       } else {
+               hdtrl.iov_base = NULL;
+               hdtrl.iov_len = 0;
+       }
+
+       total = count;
+       while (total + hdtrl.iov_len) {
+               SMB_OFF_T nwritten = 0;
+               int ret;
+
+               /*
+                * FreeBSD sendfile returns 0 on success, -1 on error.
+                * Remember, the tofd and fromfd are reversed..... :-).
+                * nwritten includes the header data sent.
+                */
+
+               do {
+                       ret = sendfile(fromfd, tofd, offset, total, &hdr,
+                                      &nwritten, flags);
+               } while (ret == -1 && errno == EINTR);
+
+               /* On error we're done. */
+               if (ret == -1) {
+                       return -1;
+               }
+
+               /*
+                * If this was an ATOMIC sendfile, nwritten doesn't
+                * necessarily indicate an error.  It could mean count > than
+                * what sendfile can handle atomically (usually 64K) or that
+                * there was a short read due to the file being truncated.
+                */
+               if (nwritten == 0) {
+                       return atomic ? 0 : -1;
+               }
+
+               /*
+                * An atomic sendfile should never send partial data!
+                */
+               if (atomic && nwritten != total + hdtrl.iov_len) {
+                       /*
+                        * Cast both values so the arguments match the
+                        * %llu conversions regardless of the widths of
+                        * SMB_OFF_T and size_t.
+                        */
+                       DEBUG(0,("Atomic sendfile() sent partial data: "
+                                "%llu of %llu\n",
+                                (unsigned long long)nwritten,
+                                (unsigned long long)(total +
+                                                     hdtrl.iov_len)));
+                       return -1;
+               }
+
+               /*
+                * If this was a short (signal interrupted) write we may need
+                * to subtract it from the header data, or null out the header
+                * data altogether if we wrote more than hdtrl.iov_len bytes.
+                * We change nwritten to be the number of file bytes written.
+                */
+
+               if (hdtrl.iov_base && hdtrl.iov_len) {
+                       if (nwritten >= hdtrl.iov_len) {
+                               nwritten -= hdtrl.iov_len;
+                               hdtrl.iov_base = NULL;
+                               hdtrl.iov_len = 0;
+                       } else {
+                               hdtrl.iov_base =
+                                   (caddr_t)hdtrl.iov_base + nwritten;
+                               hdtrl.iov_len -= nwritten;
+                               nwritten = 0;
+                       }
+               }
+               total -= nwritten;
+               offset += nwritten;
+       }
+       return count + hdr_len;
+}
+
+/**
+ * Handles the subtleties of using sendfile with CIFS.
+ *
+ * Reads the per-share "atomic sendfile" setting and performs the send via
+ * onefs_sys_do_sendfile().  For atomic sends it additionally handles the
+ * truncated-file and too-large-request cases described in the body.
+ *
+ * @param conn    connection whose share parameters are consulted
+ * @param tofd    descriptor the data is sent to
+ * @param fromfd  descriptor the file data is read from
+ * @param header  optional blob sent ahead of the file data; may be NULL
+ * @param offset  file offset at which reading starts
+ * @param count   number of file bytes to send
+ *
+ * @return the value returned by onefs_sys_do_sendfile(), except that 0 is
+ *         returned when an atomic send fails with E2BIG and large reads are
+ *         disabled, and -1 is returned for a short non-atomic large send
+ *         when the "sendfile safe" parameter is enabled.
+ */
+ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
+                          const DATA_BLOB *header, SMB_OFF_T offset,
+                          size_t count)
+{
+       bool atomic = false;
+       ssize_t ret = 0;
+       /* header is optional, so never dereference it unconditionally. */
+       size_t hdr_len = (header != NULL) ? header->length : 0;
+
+       START_PROFILE_BYTES(syscall_sendfile, count);
+
+       if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+                        PARM_ATOMIC_SENDFILE,
+                        PARM_ATOMIC_SENDFILE_DEFAULT)) {
+               atomic = true;
+       }
+
+       /* Try the sendfile */
+       ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset, count,
+                                   atomic);
+
+       /* If the sendfile wasn't atomic, we're done. */
+       if (!atomic) {
+               DEBUG(10, ("non-atomic sendfile read %ld bytes\n",
+                          (long)ret));
+               END_PROFILE(syscall_sendfile);
+               return ret;
+       }
+
+       /*
+        * Atomic sendfile takes care to not write anything to the socket
+        * until all of the requested bytes have been read from the file.
+        * There are two atomic cases that need to be handled.
+        *
+        *  1. The file was truncated causing less data to be read than was
+        *     requested.  In this case, we return back to the caller to
+        *     indicate 0 bytes were written to the socket.  This should
+        *     prompt the caller to fallback to the standard read path: read
+        *     the data, create a header that indicates how many bytes were
+        *     actually read, and send the header/data back to the client.
+        *
+        *     This saves us from standard sendfile behavior of sending a
+        *     header promising more data than will actually be sent.  The
+        *     only two options are to close the socket and kill the client
+        *     connection, or write a bunch of 0s.  Closing the client
+        *     connection is bad because there could actually be multiple
+        *     sessions multiplexed from the same client that are all dropped
+        *     because of a truncate.  Writing the remaining data as 0s also
+        *     isn't good, because the client will have an incorrect version
+        *     of the file.  If the file is written back to the server, the 0s
+        *     will be written back.  Fortunately, atomic sendfile allows us
+        *     to avoid making this choice in most cases.
+        *
+        *  2. One downside of atomic sendfile, is that there is a limit on
+        *     the number of bytes that can be sent atomically.  The kernel
+        *     has a limited amount of mbuf space that it can read file data
+        *     into without exhausting the system's mbufs, so a buffer of
+        *     length xfsize is used.  The xfsize at the time of writing this
+        *     is 64K.  xfsize bytes are read from the file, and subsequently
+        *     written to the socket.  This makes it impossible to do the
+        *     sendfile atomically for a byte count > xfsize.
+        *
+        *     To cope with large requests, atomic sendfile returns -1 with
+        *     errno set to E2BIG.  Since windows maxes out at 64K writes,
+        *     this is currently only a concern with non-windows clients.
+        *     Posix extensions allow the full 24bit bytecount field to be
+        *     used in ReadAndX, and clients such as smbclient and the linux
+        *     cifs client can request up to 16MB reads!  There are a few
+        *     options for handling large sendfile requests.
+        *
+        *      a. Fall back to the standard read path.  This is unacceptable
+        *         because it would require prohibitively large mallocs.
+        *
+        *      b. Fall back to using samba's fake_send_file which emulates
+        *         the kernel sendfile in userspace.  This still has the same
+        *         problem of sending the header before all of the data has
+        *         been read, so it doesn't buy us anything, and has worse
+        *         performance than the kernel's zero-copy sendfile.
+        *
+        *      c. Use non-atomic sendfile syscall to attempt a zero copy
+        *         read, and hope that there isn't a short read due to
+        *         truncation.  In the case of a short read, there are two
+        *         options:
+        *
+        *          1. Kill the client connection
+        *
+        *          2. Write zeros to the socket for the remaining bytes
+        *             promised in the header.
+        *
+        *         It is safer from a data corruption perspective to kill the
+        *         client connection, so this is our default behavior, but if
+        *         this causes problems this can be configured to write zeros
+        *         via smb.conf.
+        */
+
+       /* Handle case 1: short read -> truncated file. */
+       if (ret == 0) {
+               END_PROFILE(syscall_sendfile);
+               return ret;
+       }
+
+       /* Handle case 2: large read. */
+       if (ret == -1 && errno == E2BIG) {
+
+               if (!lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+                                PARM_SENDFILE_LARGE_READS,
+                                PARM_SENDFILE_LARGE_READS_DEFAULT)) {
+                       DEBUG(3, ("Not attempting non-atomic large sendfile: "
+                                 "%lu bytes\n", (unsigned long)count));
+                       END_PROFILE(syscall_sendfile);
+                       return 0;
+               }
+
+               if (count < 0x10000) {
+                       DEBUG(0, ("Count < 2^16 and E2BIG was returned! %lu\n",
+                                 (unsigned long)count));
+               }
+
+               DEBUG(10, ("attempting non-atomic large sendfile: %lu bytes\n",
+                          (unsigned long)count));
+
+               /* Try a non-atomic sendfile. */
+               ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset,
+                                           count, false);
+               /* Real error: kill the client connection. */
+               if (ret == -1) {
+                       DEBUG(1, ("error on non-atomic large sendfile "
+                                 "(%lu bytes): %s\n", (unsigned long)count,
+                                 strerror(errno)));
+                       END_PROFILE(syscall_sendfile);
+                       return ret;
+               }
+
+               /*
+                * Short read: kill the client connection.  ret is known to
+                * be non-negative here, so the cast to size_t is safe.
+                */
+               if ((size_t)ret != count + hdr_len) {
+                       DEBUG(1, ("short read on non-atomic large sendfile "
+                                 "(%lu of %lu bytes): %s\n",
+                                 (unsigned long)ret, (unsigned long)count,
+                                 strerror(errno)));
+
+                       /*
+                        * Returning ret here would cause us to drop into the
+                        * codepath that calls sendfile_short_send, which
+                        * sends the client a bunch of zeros instead.
+                        * Returning -1 kills the connection.
+                        */
+                       if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+                               PARM_SENDFILE_SAFE,
+                               PARM_SENDFILE_SAFE_DEFAULT)) {
+                               END_PROFILE(syscall_sendfile);
+                               return -1;
+                       }
+
+                       END_PROFILE(syscall_sendfile);
+                       return ret;
+               }
+
+               DEBUG(10, ("non-atomic large sendfile successful\n"));
+       }
+
+       /* There was an error in the atomic sendfile. */
+       if (ret == -1) {
+               DEBUG(1, ("error on %s sendfile (%lu bytes): %s\n",
+                         atomic ? "atomic" : "non-atomic",
+                         (unsigned long)count, strerror(errno)));
+       }
+
+       END_PROFILE(syscall_sendfile);
+       return ret;
+}
+
+/**
+ * Only talloc the spill buffer once (reallocing when necessary).
+ *
+ * The buffer and its size are kept in function-local statics and reused
+ * across calls; the buffer only ever grows.
+ *
+ * @param new_count  minimum number of bytes the caller needs
+ *
+ * @return a buffer of at least new_count bytes, or NULL if allocation
+ *         fails.  On a failed grow the previously allocated buffer and its
+ *         recorded size are left untouched so later calls still work.
+ */
+static char *get_spill_buffer(size_t new_count)
+{
+       /* size_t so the comparison with new_count is not mixed-sign. */
+       static size_t cur_count = 0;
+       static char *spill_buffer = NULL;
+       char *new_buffer;
+
+       /* If a sufficiently sized buffer exists, just return. */
+       if (spill_buffer != NULL && new_count <= cur_count) {
+               return spill_buffer;
+       }
+
+       /* Allocate the first time. */
+       if (spill_buffer == NULL) {
+               new_buffer = talloc_array(NULL, char, new_count);
+       } else {
+               /*
+                * A buffer exists, but it's not big enough, so realloc.
+                * Go through a temporary: overwriting spill_buffer with a
+                * NULL result would lose the only reference to the old
+                * buffer while cur_count still claimed it existed.
+                */
+               new_buffer = talloc_realloc(NULL, spill_buffer, char,
+                                           new_count);
+       }
+
+       if (new_buffer == NULL) {
+               return NULL;
+       }
+
+       spill_buffer = new_buffer;
+       cur_count = new_count;
+       return spill_buffer;
+}
+
+/**
+ * recvfile does zero-copy writes given an fd to write to, and a socket with
+ * some data to write.  If recvfile read more than it was able to write, it
+ * spills the data into a buffer.  After first reading any additional data
+ * from the socket into the buffer, the spill buffer is then written with a
+ * standard pwrite.
+ *
+ * @param fromfd  socket the data is read from
+ * @param tofd    file descriptor the data is written to
+ * @param offset  file offset at which writing starts
+ * @param count   number of bytes to transfer
+ *
+ * @return the total number of bytes written on success (0 when count is
+ *         0); -1 if the spill buffer cannot be obtained or the pwrite of
+ *         spilled data fails; the sys_read() result (0 or negative) if
+ *         reading the remaining socket data fails.  Except when the socket
+ *         read itself failed, any bytes still unread are drained from the
+ *         socket before returning, with errno preserved.
+ */
+ssize_t onefs_sys_recvfile(int fromfd, int tofd, SMB_OFF_T offset,
+                          size_t count)
+{
+       char *spill_buffer = NULL;
+       bool socket_drained = false;
+       /* ssize_t: holds sys_read/sys_pwrite results and the return value. */
+       ssize_t ret;
+       off_t total_rbytes = 0;
+       off_t total_wbytes = 0;
+       off_t rbytes;
+       off_t wbytes;
+       /* Bytes of the spill buffer already written out by sys_pwrite. */
+       size_t spill_off = 0;
+
+       START_PROFILE_BYTES(syscall_recvfile, count);
+
+       DEBUG(10,("onefs_recvfile: from = %d, to = %d, offset=%llu, count = "
+                 "%lu\n", fromfd, tofd, (unsigned long long)offset,
+                 (unsigned long)count));
+
+       if (count == 0) {
+               END_PROFILE(syscall_recvfile);
+               return 0;
+       }
+
+       /*
+        * Setup up a buffer for recvfile to spill data that has been read
+        * from the socket but not written.
+        */
+       spill_buffer = get_spill_buffer(count);
+       if (spill_buffer == NULL) {
+               ret = -1;
+               goto out;
+       }
+
+       /*
+        * Keep trying recvfile until:
+        *  - There is no data left to read on the socket, or
+        *  - bytes read != bytes written, or
+        *  - An error is returned that isn't EINTR/EAGAIN
+        */
+       do {
+               /* Keep track of bytes read/written for recvfile */
+               rbytes = 0;
+               wbytes = 0;
+
+               DEBUG(10, ("calling recvfile loop, offset + total_wbytes = "
+                          "%llu, count - total_rbytes = %llu\n",
+                          (unsigned long long)(offset + total_wbytes),
+                          (unsigned long long)(count - total_rbytes)));
+
+               ret = recvfile(tofd, fromfd, offset + total_wbytes,
+                              count - total_wbytes, &rbytes, &wbytes, 0,
+                              spill_buffer);
+
+               DEBUG(10, ("recvfile ret = %d, errno = %d, rbytes = %llu, "
+                          "wbytes = %llu\n", (int)ret,
+                          ret >= 0 ? 0 : errno,
+                          (unsigned long long)rbytes,
+                          (unsigned long long)wbytes));
+
+               /* Update our progress so far */
+               total_rbytes += rbytes;
+               total_wbytes += wbytes;
+
+       } while ((count - total_rbytes) && (rbytes == wbytes) &&
+                (ret == -1 && (errno == EINTR || errno == EAGAIN)));
+
+       DEBUG(10, ("total_rbytes = %llu, total_wbytes = %llu\n",
+                  (unsigned long long)total_rbytes,
+                  (unsigned long long)total_wbytes));
+
+       /* Log if recvfile didn't write everything it read. */
+       if (total_rbytes != total_wbytes) {
+               DEBUG(0, ("partial recvfile: total_rbytes=%llu but "
+                         "total_wbytes=%llu, diff = %llu\n",
+                         (unsigned long long)total_rbytes,
+                         (unsigned long long)total_wbytes,
+                         (unsigned long long)(total_rbytes -
+                                              total_wbytes)));
+               SMB_ASSERT(total_rbytes > total_wbytes);
+       }
+
+       /*
+        * If there is still data on the socket, read it off.
+        */
+       while (total_rbytes < count) {
+
+               DEBUG(0, ("shallow recvfile (%s), reading %llu\n",
+                         strerror(errno),
+                         (unsigned long long)(count - total_rbytes)));
+
+               /*
+                * Read the remaining data into the spill buffer.  recvfile
+                * may already have some data in the spill buffer, so start
+                * filling the buffer at total_rbytes - total_wbytes.
+                */
+               ret = sys_read(fromfd,
+                              spill_buffer + (total_rbytes - total_wbytes),
+                              count - total_rbytes);
+
+               if (ret <= 0) {
+                       if (ret == 0) {
+                               DEBUG(0, ("shallow recvfile read: EOF\n"));
+                       } else {
+                               DEBUG(0, ("shallow recvfile read failed: %s\n",
+                                         strerror(errno)));
+                       }
+                       /* Socket is dead, so treat as if it were drained. */
+                       socket_drained = true;
+                       goto out;
+               }
+
+               /* Data was read so update the rbytes */
+               total_rbytes += ret;
+       }
+
+       if (total_rbytes != count) {
+               smb_panic("Unread recvfile data still on the socket!");
+       }
+
+       /*
+        * Now write any spilled data + the extra data read off the socket.
+        * spill_off advances with every write so that a short pwrite
+        * resumes where it stopped instead of rewriting the start of the
+        * spill buffer at a later file offset.
+        */
+       while (total_wbytes < count) {
+
+               DEBUG(0, ("partial recvfile, writing %llu\n",
+                         (unsigned long long)(count - total_wbytes)));
+
+               ret = sys_pwrite(tofd, spill_buffer + spill_off,
+                                count - total_wbytes,
+                                offset + total_wbytes);
+
+               if (ret == -1) {
+                       DEBUG(0, ("partial recvfile write failed: %s\n",
+                                 strerror(errno)));
+                       goto out;
+               }
+
+               /* Data was written so update the wbytes */
+               total_wbytes += ret;
+               spill_off += ret;
+       }
+
+       /* Success! */
+       ret = total_wbytes;
+
+out:
+
+       END_PROFILE(syscall_recvfile);
+
+       /* Make sure we always try to drain the socket. */
+       if (!socket_drained && count - total_rbytes) {
+               int saved_errno = errno;
+
+               if (drain_socket(fromfd, count - total_rbytes) !=
+                   count - total_rbytes) {
+                       /* Socket is dead! */
+                       DEBUG(0, ("drain socket failed: %d\n", errno));
+               }
+               errno = saved_errno;
+       }
+
+       return ret;
+}