From 953b51b7a398b15bc172e1743181b29b5235b2df Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 13 Jun 2018 15:21:35 -0400 Subject: nfsd: fix corrupted reply to badly ordered compound [ Upstream commit 5b7b15aee641904ae269be9846610a3950cbd64c ] We're encoding a single op in the reply but leaving the number of ops zero, so the reply makes no sense. Somewhat academic as this isn't a case any real client will hit, though in theory perhaps that could change in a future protocol extension. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index bfbee8ddf978..c67064d94096 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1632,6 +1632,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (status) { op = &args->ops[0]; op->status = status; + resp->opcnt = 1; goto encode_op; } -- cgit v1.2.3 From cd3d6463759d21f4093d3434effacc358dd0caf8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 16 Jun 2018 15:40:48 -0400 Subject: ext4: never move the system.data xattr out of the inode body commit 8cdb5240ec5928b20490a2bb34cb87e9a5f40226 upstream. When expanding the extra isize space, we must never move the system.data xattr out of the inode body. For performance reasons, it doesn't make any sense, and the inline data implementation assumes that system.data xattr is never in the external xattr block. This addresses CVE-2018-10880 https://bugzilla.kernel.org/show_bug.cgi?id=200005 Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Zubin Mithra Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 9fb2a751fce4..b51bb73b06a6 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1386,6 +1386,11 @@ retry: /* Find the entry best suited to be pushed into EA block */ entry = NULL; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + /* never move system.data out of the inode */ + if ((last->e_name_len == 4) && + (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && + !memcmp(last->e_name, "data", 4)) + continue; total_size = EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + EXT4_XATTR_LEN(last->e_name_len); -- cgit v1.2.3 From 2b2ccb29f307c382ce4c19c3cf5733be2abd2dba Mon Sep 17 00:00:00 2001 From: Jon Kuhn Date: Mon, 9 Jul 2018 14:33:14 +0000 Subject: fs/cifs: don't translate SFM_SLASH (U+F026) to backslash [ Upstream commit c15e3f19a6d5c89b1209dc94b40e568177cb0921 ] When a Mac client saves an item containing a backslash to a file server the backslash is represented in the CIFS/SMB protocol as as U+F026. Before this change, listing a directory containing an item with a backslash in its name will return that item with the backslash represented with a true backslash character (U+005C) because convert_sfm_character mapped U+F026 to U+005C when interpretting the CIFS/SMB protocol response. However, attempting to open or stat the path using a true backslash will result in an error because convert_to_sfm_char does not map U+005C back to U+F026 causing the CIFS/SMB request to be made with the backslash represented as U+005C. This change simply prevents the U+F026 to U+005C conversion from happenning. This is analogous to how the code does not do any translation of UNI_SLASH (U+F000). Signed-off-by: Jon Kuhn Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifs_unicode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index a0b3e7d1be48..211ac472cb9d 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -101,9 +101,6 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_LESSTHAN: *target = '<'; break; - case SFM_SLASH: - *target = '\\'; - break; case SFM_SPACE: *target = ' '; break; -- cgit v1.2.3 From 8c47defad8510820d609a66d1c8d3cac64913ac5 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 3 Sep 2018 13:15:58 +1000 Subject: fs/cifs: suppress a string overflow warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit bcfb84a996f6fa90b5e6e2954b2accb7a4711097 ] A powerpc build of cifs with gcc v8.2.0 produces this warning: fs/cifs/cifssmb.c: In function ‘CIFSSMBNegotiate’: fs/cifs/cifssmb.c:605:3: warning: ‘strncpy’ writing 16 bytes into a region of size 1 overflows the destination [-Wstringop-overflow=] strncpy(pSMB->DialectsArray+count, protocols[i].name, 16); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Since we are already doing a strlen() on the source, change the strncpy to a memcpy(). Signed-off-by: Stephen Rothwell Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifssmb.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 63aea21e6298..b9b8f19dce0e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -577,10 +577,15 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) } count = 0; + /* + * We know that all the name entries in the protocols array + * are short (< 16 bytes anyway) and are NUL terminated. + */ for (i = 0; i < CIFS_NUM_PROT; i++) { - strncpy(pSMB->DialectsArray+count, protocols[i].name, 16); - count += strlen(protocols[i].name) + 1; - /* null at end of source and target buffers anyway */ + size_t len = strlen(protocols[i].name) + 1; + + memcpy(pSMB->DialectsArray+count, protocols[i].name, len); + count += len; } inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); -- cgit v1.2.3 From cd65a43f4dfb75a6bb5211ce28e65e5429b7c499 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Sep 2018 12:47:01 +0300 Subject: cifs: read overflow in is_valid_oplock_break() [ Upstream commit 097f5863b1a0c9901f180bbd56ae7d630655faaa ] We need to verify that the "data_offset" is within bounds. Reported-by: Dr Silvio Cesare of InfoSect Signed-off-by: Dan Carpenter Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/cifs/misc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 0cc699d9b932..61a09ab2752e 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -406,9 +406,17 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) (struct smb_com_transaction_change_notify_rsp *)buf; struct file_notify_information *pnotify; __u32 data_offset = 0; + size_t len = srv->total_read - sizeof(pSMBr->hdr.smb_buf_length); + if (get_bcc(buf) > sizeof(struct file_notify_information)) { data_offset = le32_to_cpu(pSMBr->DataOffset); + if (data_offset > + len - sizeof(struct file_notify_information)) { + cifs_dbg(FYI, "invalid data_offset %u\n", + data_offset); + return true; + } pnotify = (struct file_notify_information *) ((char *)&pSMBr->hdr.Protocol + data_offset); cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n", -- cgit v1.2.3 From ec2a4f06e31f410e027ff10237c8c824f36fa259 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 17 May 2018 16:35:07 +0200 Subject: smb2: fix missing files in root share directory listing commit 0595751f267994c3c7027377058e4185b3a28e75 upstream. When mounting a Windows share that is the root of a drive (eg. C$) the server does not return . and .. directory entries. This results in the smb2 code path erroneously skipping the 2 first entries. Pseudo-code of the readdir() code path: cifs_readdir(struct file, struct dir_context) initiate_cifs_search <-- if no reponse cached yet server->ops->query_dir_first dir_emit_dots dir_emit <-- adds "." and ".." if we're at pos=0 find_cifs_entry initiate_cifs_search <-- if pos < start of current response (restart search) server->ops->query_dir_next <-- if pos > end of current response (fetch next search res) for(...) <-- loops over cur response entries starting at pos cifs_filldir <-- skip . and .., emit entry cifs_fill_dirent dir_emit pos++ A) dir_emit_dots() always adds . & .. and sets the current dir pos to 2 (0 and 1 are done). Therefore we always want the index_to_find to be 2 regardless of if the response has . and .. B) smb1 code initializes index_of_last_entry with a +2 offset in cifssmb.c CIFSFindFirst(): psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; Later in find_cifs_entry() we want to find the next dir entry at pos=2 as a result of (A) first_entry_in_buffer = cfile->srch_inf.index_of_last_entry - cfile->srch_inf.entries_in_buffer; This var is the dir pos that the first entry in the buffer will have therefore it must be 2 in the first call. If we don't offset index_of_last_entry by 2 (like in (B)), first_entry_in_buffer=0 but we were instructed to get pos=2 so this code in find_cifs_entry() skips the 2 first which is ok for non-root shares, as it skips . and .. from the response but is not ok for root shares where the 2 first are actual files pos_in_buf = index_to_find - first_entry_in_buffer; // pos_in_buf=2 // we skip 2 first response entries :( for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) { /* go entry by entry figuring out which is first */ cur_ent = nxt_dir_entry(cur_ent, end_of_smb, cfile->srch_inf.info_level); } C) cifs_filldir() skips . and .. so we can safely ignore them for now. Sample program: int main(int argc, char **argv) { const char *path = argc >= 2 ? argv[1] : "."; DIR *dh; struct dirent *de; printf("listing path <%s>\n", path); dh = opendir(path); if (!dh) { printf("opendir error %d\n", errno); return 1; } while (1) { de = readdir(dh); if (!de) { if (errno) { printf("readdir error %d\n", errno); return 1; } printf("end of listing\n"); break; } printf("off=%lu <%s>\n", de->d_off, de->d_name); } return 0; } Before the fix with SMB1 on root shares: <.> off=1 <..> off=2 <$Recycle.Bin> off=3 off=4 and on non-root shares: <.> off=1 <..> off=4 <-- after adding .., the offsets jumps to +2 because <2536> off=5 we skipped . and .. from response buffer (C) <411> off=6 but still incremented pos off=7 off=8 Therefore the fix for smb2 is to mimic smb1 behaviour and offset the index_of_last_entry by 2. Test results comparing smb1 and smb2 before/after the fix on root share, non-root shares and on large directories (ie. multi-response dir listing): PRE FIX ======= pre-1-root VS pre-2-root: ERR pre-2-root is missing [bootmgr, $Recycle.Bin] pre-1-nonroot VS pre-2-nonroot: OK~ same files, same order, different offsets pre-1-nonroot-large VS pre-2-nonroot-large: OK~ same files, same order, different offsets POST FIX ======== post-1-root VS post-2-root: OK same files, same order, same offsets post-1-nonroot VS post-2-nonroot: OK same files, same order, same offsets post-1-nonroot-large VS post-2-nonroot-large: OK same files, same order, same offsets REGRESSION? =========== pre-1-root VS post-1-root: OK same files, same order, same offsets pre-1-nonroot VS post-1-nonroot: OK same files, same order, same offsets BugLink: https://bugzilla.samba.org/show_bug.cgi?id=13107 Signed-off-by: Aurelien Aptel Signed-off-by: Paulo Alcantara Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e6b1795fbf2a..2725085a3f9f 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -914,7 +914,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, } srch_inf->entries_in_buffer = 0; - srch_inf->index_of_last_entry = 0; + srch_inf->index_of_last_entry = 2; rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); -- cgit v1.2.3 From 574757073482f77ec10caea5e57726190a2837fa Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Oct 2018 15:51:58 -0700 Subject: proc: restrict kernel stack dumps to root commit f8a00cef17206ecd1b30d3d9f99e10d9fa707aa7 upstream. Currently, you can use /proc/self/task/*/stack to cause a stack walk on a task you control while it is running on another CPU. That means that the stack can change under the stack walker. The stack walker does have guards against going completely off the rails and into random kernel memory, but it can interpret random data from your kernel stack as instruction pointers and stack pointers. This can cause exposure of kernel stack contents to userspace. Restrict the ability to inspect kernel stacks of arbitrary tasks to root in order to prevent a local attacker from exploiting racy stack unwinding to leak kernel task stack contents. See the added comment for a longer rationale. There don't seem to be any users of this userspace API that can't gracefully bail out if reading from the file fails. Therefore, I believe that this change is unlikely to break things. In the case that this patch does end up needing a revert, the next-best solution might be to fake a single-entry stack based on wchan. Link: http://lkml.kernel.org/r/20180927153316.200286-1-jannh@google.com Fixes: 2ec220e27f50 ("proc: add /proc/*/stack") Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Alexey Dobriyan Cc: Ken Chen Cc: Will Deacon Cc: Laura Abbott Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Josh Poimboeuf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/proc/base.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5f9cec2db6c3..4beed301e224 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -471,6 +471,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, int err; int i; + /* + * The ability to racily run the kernel stack unwinder on a running task + * and then observe the unwinder output is scary; while it is useful for + * debugging kernel issues, it can also allow an attacker to leak kernel + * stack contents. + * Doing this in a manner that is at least safe from races would require + * some work to ensure that the remote task can not be scheduled; and + * even then, this would still expose the unwinder as local attack + * surface. + * Therefore, this interface is restricted to root. + */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); if (!entries) return -ENOMEM; -- cgit v1.2.3 From 20ba8a53a1ef1cffd0f795206d69181572ae84e8 Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Fri, 5 Oct 2018 15:52:15 -0700 Subject: ocfs2: fix locking for res->tracking and dlm->tracking_list commit cbe355f57c8074bc4f452e5b6e35509044c6fa23 upstream. In dlm_init_lockres() we access and modify res->tracking and dlm->tracking_list without holding dlm->track_lock. This can cause list corruptions and can end up in kernel panic. Fix this by locking res->tracking and dlm->tracking_list with dlm->track_lock instead of dlm->spinlock. Link: http://lkml.kernel.org/r/1529951192-4686-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant Reviewed-by: Changwei Ge Acked-by: Joseph Qi Acked-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/dlm/dlmmaster.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4e2162b355db..0cefb036a17e 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -589,9 +589,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->last_used = 0; - spin_lock(&dlm->spinlock); + spin_lock(&dlm->track_lock); list_add_tail(&res->tracking, &dlm->tracking_list); - spin_unlock(&dlm->spinlock); + spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); memset(res->refmap, 0, sizeof(res->refmap)); -- cgit v1.2.3 From fb751efb29d037ee051f326b6f622881ca0b9a14 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 13 Jun 2018 00:51:28 -0400 Subject: ext4: always verify the magic number in xattr blocks commit 513f86d73855ce556ea9522b6bfd79f87356dc3a upstream. If there an inode points to a block which is also some other type of metadata block (such as a block allocation bitmap), the buffer_verified flag can be set when it was validated as that other metadata block type; however, it would make a really terrible external attribute block. The reason why we use the verified flag is to avoid constantly reverifying the block. However, it doesn't take much overhead to make sure the magic number of the xattr block is correct, and this will avoid potential crashes. This addresses CVE-2018-10879. https://bugzilla.kernel.org/show_bug.cgi?id=200001 Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger [Backported to 4.4: adjust context] Signed-off-by: Daniel Rosenberg Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b51bb73b06a6..d0aaf338fa9f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -220,12 +220,12 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) { int error; - if (buffer_verified(bh)) - return 0; - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EFSCORRUPTED; + if (buffer_verified(bh)) + return 0; + if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) return -EFSBADCRC; error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, -- cgit v1.2.3 From c4c84454902516b7648a18d5173304da7dffa9d6 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 3 Sep 2018 23:06:23 +0200 Subject: ubifs: Check for name being NULL while mounting commit 37f31b6ca4311b94d985fb398a72e5399ad57925 upstream. The requested device name can be NULL or an empty string. Check for that and refuse to continue. UBIFS has to do this manually since we cannot use mount_bdev(), which checks for this condition. Fixes: 1e51764a3c2ac ("UBIFS: add new flash file system") Reported-by: syzbot+38bd0f7865e5c6379280@syzkaller.appspotmail.com Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0bb6de356451..7968b7a5e787 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1918,6 +1918,9 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) int dev, vol; char *endptr; + if (!name || !*name) + return ERR_PTR(-EINVAL); + /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) -- cgit v1.2.3 From 7a3d844f83d47c432fba7d28570bc68b5015ecd0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 13 Jun 2018 00:23:11 -0400 Subject: ext4: add corruption check in ext4_xattr_set_entry() commit 5369a762c882c0b6e9599e4ebbb3a9ba9eee7e2d upstream. In theory this should have been caught earlier when the xattr list was verified, but in case it got missed, it's simple enough to add check to make sure we don't overrun the xattr buffer. This addresses CVE-2018-10879. https://bugzilla.kernel.org/show_bug.cgi?id=200001 Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger [bwh: Backported to 3.16: - Add inode parameter to ext4_xattr_set_entry() and update callers - Adjust context] Signed-off-by: Ben Hutchings [adjusted for 4.4 context] Signed-off-by: Daniel Rosenberg Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d0aaf338fa9f..d6bae37489af 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -638,14 +638,20 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, } static int -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, + struct inode *inode) { - struct ext4_xattr_entry *last; + struct ext4_xattr_entry *last, *next; size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); /* Compute min_offs and last. */ last = s->first; - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + for (; !IS_LAST_ENTRY(last); last = next) { + next = EXT4_XATTR_NEXT(last); + if ((void *)next >= s->end) { + EXT4_ERROR_INODE(inode, "corrupted xattr entries"); + return -EFSCORRUPTED; + } if (!last->e_value_block && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) @@ -825,7 +831,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ce = NULL; } ea_bdebug(bs->bh, "modifying in-place"); - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, inode); if (!error) { if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), @@ -875,7 +881,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, inode); if (error == -EFSCORRUPTED) goto bad_block; if (error) @@ -1037,7 +1043,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, inode); if (error) { if (error == -ENOSPC && ext4_has_inline_data(inode)) { @@ -1049,7 +1055,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_find(inode, i, is); if (error) return error; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, inode); } if (error) return error; @@ -1075,7 +1081,7 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; - error = ext4_xattr_set_entry(i, s); + error = ext4_xattr_set_entry(i, s, inode); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); -- cgit v1.2.3 From 5ac147ebfa1c50cc0f25e3c7f42862cfe6b40d0d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 12 Oct 2018 14:01:26 +0800 Subject: jffs2: return -ERANGE when xattr buffer is too small When a file have multiple xattrs and the passed buffer is smaller than the required size, jffs2_listxattr() should return -ERANGE instead of continue, else Oops may occur due to memory corruption. Also remove the unnecessary check ("rc < 0"), because xhandle->list(...) will not return an error number. Spotted by generic/377 in xfstests-dev. NB: The problem had been fixed by commit 764a5c6b1fa4 ("xattr handlers: Simplify list operation") in v4.5-rc1, but the modification in that commit may be too much because it modifies all file-systems which implement xattr, so I create a single patch for jffs2 to fix the problem. Signed-off-by: Hou Tao Cc: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- fs/jffs2/xattr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4c2c03663533..8e1427762eeb 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -1004,12 +1004,14 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) rc = xhandle->list(xhandle, dentry, buffer + len, size - len, xd->xname, xd->name_len); + if (rc > size - len) { + rc = -ERANGE; + goto out; + } } else { rc = xhandle->list(xhandle, dentry, NULL, 0, xd->xname, xd->name_len); } - if (rc < 0) - goto out; len += rc; } rc = len; -- cgit v1.2.3 From c1504091b9805d131abb36a1f01af7a5c99ac9fc Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 17 Aug 2016 21:58:33 -0400 Subject: btrfs: don't create or leak aliased root while cleaning up orphans [ Upstream commit 35bbb97fc898aeb874cb7c8b746f091caa359994 ] commit 909c3a22da3 (Btrfs: fix loading of orphan roots leading to BUG_ON) avoids the BUG_ON but can add an aliased root to the dead_roots list or leak the root. Since we've already been loading roots into the radix tree, we should use it before looking the root up on disk. Cc: # 4.5 Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Chris Mason Signed-off-by: Sasha Levin --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/disk-io.h | 2 ++ fs/btrfs/root-tree.c | 27 ++++++++++++++++++--------- 3 files changed, 22 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ae6e3a30e61e..8dbb00fbb00b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1608,8 +1608,8 @@ fail: return ret; } -static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index adeb31830b9c..3c9819403487 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 2c849b08a91b..6a6efb26d52f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid = key.offset; key.offset++; + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + root = btrfs_lookup_fs_root(tree_root->fs_info, + root_key.objectid); + if (root) { + WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state)); + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); + continue; + } + root = btrfs_read_fs_root(tree_root, &root_key); err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { @@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); - /* - * The root might have been inserted already, as before we look - * for orphan roots, log replay might have happened, which - * triggers a transaction commit and qgroup accounting, which - * in turn reads and inserts fs roots while doing backref - * walking. - */ - if (err == -EEXIST) - err = 0; if (err) { + BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } -- cgit v1.2.3 From 22979776093abc47dcbc6a06eab41d24800ec32d Mon Sep 17 00:00:00 2001 From: Mark Syms Date: Tue, 29 Nov 2016 11:36:46 +0000 Subject: CIFS: handle guest access errors to Windows shares [ Upstream commit 40920c2bb119fd49ba03e2f97a172171781be442 ] Commit 1a967d6c9b39c226be1b45f13acd4d8a5ab3dc44 ("correctly to anonymous authentication for the NTLM(v2) authentication") introduces a regression in handling errors related to attempting a guest connection to a Windows share which requires authentication. This should result in a permission denied error but actually causes the kernel module to enter a never-ending loop trying to follow a DFS referal which doesn't exist. The base cause of this is the failure now occurs later in the process during tree connect and not at the session setup setup and all errors in tree connect are interpreted as needing to follow the DFS paths which isn't in this case correct. So, check the returned error against EACCES and fail if this is returned error. Feedback from Aurelien: PS> net user guest /activate:no PS> mkdir C:\guestshare PS> icacls C:\guestshare /grant 'Everyone:(OI)(CI)F' PS> new-smbshare -name guestshare -path C:\guestshare -fullaccess Everyone I've tested v3.10, v4.4, master, master+your patch using default options (empty or no user "NU") and user=abc (U). NT_LOGON_FAILURE in session setup: LF This is what you seem to have in 3.10. NT_ACCESS_DENIED in tree connect to the share: AD This is what you get before your infinite loop. | NU U -------------------------------- 3.10 | LF LF 4.4 | LF LF master | AD LF master+patch | AD LF No infinite DFS loop :( All these issues result in mount failing very fast with permission denied. I guess it could be from either the Windows version or the share/folder ACL. A deeper analysis of the packets might reveal more. In any case I did not notice any issues for on a basic DFS setup with the patch so I don't think it introduced any regressions, which is probably all that matters. It still bothers me a little I couldn't hit the bug. I've included kernel output w/ debugging output and network capture of my tests if anyone want to have a look at it. (master+patch = ml-guestfix). Signed-off-by: Mark Syms Reviewed-by: Aurelien Aptel Tested-by: Aurelien Aptel Acked-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/connect.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 077ad3a06c9a..1eeb4780c3ed 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3674,6 +3674,9 @@ try_mount_again: if (IS_ERR(tcon)) { rc = PTR_ERR(tcon); tcon = NULL; + if (rc == -EACCES) + goto mount_fail_check; + goto remote_path_check; } -- cgit v1.2.3 From 1b6a863ff29ccd7b65b0112bbec1cda26b7f9c4e Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Wed, 12 Jul 2017 19:26:58 -0700 Subject: fuse: Dont call set_page_dirty_lock() for ITER_BVEC pages for async_dio [ Upstream commit 61c12b49e1c9c77d7a1bcc161de540d0fd21cf0c ] Commit 8fba54aebbdf ("fuse: direct-io: don't dirty ITER_BVEC pages") fixes the ITER_BVEC page deadlock for direct io in fuse by checking in fuse_direct_io(), whether the page is a bvec page or not, before locking it. However, this check is missed when the "async_dio" mount option is enabled. In this case, set_page_dirty_lock() is called from the req->end callback in request_end(), when the fuse thread is returning from userspace to respond to the read request. This will cause the same deadlock because the bvec condition is not checked in this path. Here is the stack of the deadlocked thread, while returning from userspace: [13706.656686] INFO: task glusterfs:3006 blocked for more than 120 seconds. [13706.657808] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [13706.658788] glusterfs D ffffffff816c80f0 0 3006 1 0x00000080 [13706.658797] ffff8800d6713a58 0000000000000086 ffff8800d9ad7000 ffff8800d9ad5400 [13706.658799] ffff88011ffd5cc0 ffff8800d6710008 ffff88011fd176c0 7fffffffffffffff [13706.658801] 0000000000000002 ffffffff816c80f0 ffff8800d6713a78 ffffffff816c790e [13706.658803] Call Trace: [13706.658809] [] ? bit_wait_io_timeout+0x80/0x80 [13706.658811] [] schedule+0x3e/0x90 [13706.658813] [] schedule_timeout+0x1b5/0x210 [13706.658816] [] ? gup_pud_range+0x1db/0x1f0 [13706.658817] [] ? kvm_clock_read+0x1e/0x20 [13706.658819] [] ? kvm_clock_get_cycles+0x9/0x10 [13706.658822] [] ? ktime_get+0x52/0xc0 [13706.658824] [] io_schedule_timeout+0xa4/0x110 [13706.658826] [] bit_wait_io+0x36/0x50 [13706.658828] [] __wait_on_bit_lock+0x76/0xb0 [13706.658831] [] ? lock_request+0x46/0x70 [fuse] [13706.658834] [] __lock_page+0xaa/0xb0 [13706.658836] [] ? wake_atomic_t_function+0x40/0x40 [13706.658838] [] set_page_dirty_lock+0x58/0x60 [13706.658841] [] fuse_release_user_pages+0x58/0x70 [fuse] [13706.658844] [] ? fuse_aio_complete+0x190/0x190 [fuse] [13706.658847] [] fuse_aio_complete_req+0x29/0x90 [fuse] [13706.658849] [] request_end+0xd9/0x190 [fuse] [13706.658852] [] fuse_dev_do_write+0x336/0x490 [fuse] [13706.658854] [] fuse_dev_write+0x6e/0xa0 [fuse] [13706.658857] [] ? security_file_permission+0x23/0x90 [13706.658859] [] do_iter_readv_writev+0x60/0x90 [13706.658862] [] ? fuse_dev_splice_write+0x350/0x350 [fuse] [13706.658863] [] do_readv_writev+0x171/0x1f0 [13706.658866] [] ? try_to_wake_up+0x210/0x210 [13706.658868] [] vfs_writev+0x41/0x50 [13706.658870] [] SyS_writev+0x56/0xf0 [13706.658872] [] ? syscall_trace_leave+0xf1/0x160 [13706.658874] [] system_call_fastpath+0x12/0x71 Fix this by making should_dirty a fuse_io_priv parameter that can be checked in fuse_aio_complete_req(). Reported-by: Tiger Yang Signed-off-by: Ashish Samant Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin --- fs/fuse/file.c | 6 +++--- fs/fuse/fuse_i.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8577f3ba6dc6..7014318f6d18 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -625,7 +625,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) struct fuse_io_priv *io = req->io; ssize_t pos = -1; - fuse_release_user_pages(req, !io->write); + fuse_release_user_pages(req, io->should_dirty); if (io->write) { if (req->misc.write.in.size != req->misc.write.out.size) @@ -1333,7 +1333,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; - bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1362,6 +1361,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, mutex_unlock(&inode->i_mutex); } + io->should_dirty = !write && iter_is_iovec(iter); while (count) { size_t nres; fl_owner_t owner = current->files; @@ -1378,7 +1378,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, should_dirty); + fuse_release_user_pages(req, io->should_dirty); if (req->out.h.error) { if (!res) res = req->out.h.error; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7aafe9acc6c0..c6eb35a95fcc 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -252,6 +252,7 @@ struct fuse_io_priv { size_t size; __u64 offset; bool write; + bool should_dirty; int err; struct kiocb *iocb; struct file *file; -- cgit v1.2.3 From fa2075baa95c1957b477c830415d4d6b72282ddc Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Fri, 12 Oct 2018 21:34:40 -0700 Subject: fs/fat/fatent.c: add cond_resched() to fat_count_free_clusters() [ Upstream commit ac081c3be3fae6d0cc3e1862507fca3862d30b67 ] On non-preempt kernels this loop can take a long time (more than 50 ticks) processing through entries. Link: http://lkml.kernel.org/r/20181010172623.57033-1-khazhy@google.com Signed-off-by: Khazhismel Kumykov Acked-by: OGAWA Hirofumi Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- fs/fat/fatent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index a70e37c47a78..e3fc477728b3 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -681,6 +681,7 @@ int fat_count_free_clusters(struct super_block *sb) if (ops->ent_get(&fatent) == FAT_ENT_FREE) free++; } while (fat_ent_next(sbi, &fatent)); + cond_resched(); } sbi->free_clusters = free; sbi->free_clus_valid = 1; -- cgit v1.2.3 From d1ce094c3c90434844c6580be7290517762422d6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Oct 2018 15:23:26 +0100 Subject: cachefiles: fix the race between cachefiles_bury_object() and rmdir(2) commit 169b803397499be85bdd1e3d07d6f5e3d4bd669e upstream. the victim might've been rmdir'ed just before the lock_rename(); unlike the normal callers, we do not look the source up after the parents are locked - we know it beforehand and just recheck that it's still the child of what used to be its parent. Unfortunately, the check is too weak - we don't spot a dead directory since its ->d_parent is unchanged, dentry is positive, etc. So we sail all the way to ->rename(), with hosting filesystems _not_ expecting to be asked renaming an rmdir'ed subdirectory. The fix is easy, fortunately - the lock on parent is sufficient for making IS_DEADDIR() on child safe. Cc: stable@vger.kernel.org Fixes: 9ae326a69004 (CacheFiles: A cache that backs onto a mounted filesystem) Signed-off-by: Al Viro Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- fs/cachefiles/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c43b4b08546b..a5f59eed8287 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -317,7 +317,7 @@ try_again: trap = lock_rename(cache->graveyard, dir); /* do some checks before getting the grave dentry */ - if (rep->d_parent != dir) { + if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) { /* the entry was probably culled when we dropped the parent dir * lock */ unlock_rename(cache->graveyard, dir); -- cgit v1.2.3 From 85b89ccf86cececb3e562b071d078a3cf50a54cc Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 6 Oct 2018 17:09:35 +0800 Subject: jffs2: free jffs2_sb_info through jffs2_kill_sb() commit 92e2921f7eee63450a5f953f4b15dc6210219430 upstream. When an invalid mount option is passed to jffs2, jffs2_parse_options() will fail and jffs2_sb_info will be freed, but then jffs2_sb_info will be used (use-after-free) and freeed (double-free) in jffs2_kill_sb(). Fix it by removing the buggy invocation of kfree() when getting invalid mount options. Fixes: 92abc475d8de ("jffs2: implement mount option parsing and compression overriding") Cc: stable@kernel.org Signed-off-by: Hou Tao Reviewed-by: Richard Weinberger Signed-off-by: Boris Brezillon Signed-off-by: Greg Kroah-Hartman --- fs/jffs2/super.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 600da1a4df29..1544f530ccd0 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -285,10 +285,8 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = c; ret = jffs2_parse_options(c, data); - if (ret) { - kfree(c); + if (ret) return -EINVAL; - } /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ -- cgit v1.2.3 From c87fb36e0dfd35c3c2c454a3f8c6d7018fcde400 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 2 Oct 2018 01:34:44 -0400 Subject: ext4: fix argument checking in EXT4_IOC_MOVE_EXT [ Upstream commit f18b2b83a727a3db208308057d2c7945f368e625 ] If the starting block number of either the source or destination file exceeds the EOF, EXT4_IOC_MOVE_EXT should return EINVAL. Also fixed the helper function mext_check_coverage() so that if the logical block is beyond EOF, make it return immediately, instead of looping until the block number wraps all the away around. This takes long enough that if there are multiple threads trying to do pound on an the same inode doing non-sensical things, it can end up triggering the kernel's soft lockup detector. Reported-by: syzbot+c61979f6f2cba5cb3c06@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/ext4/move_extent.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 05048fcfd602..6b5e2eddd8d7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -526,9 +526,13 @@ mext_check_arguments(struct inode *orig_inode, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_eof < orig_start + *len - 1) + if (orig_eof <= orig_start) + *len = 0; + else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; - if (donor_eof < donor_start + *len - 1) + if (donor_eof <= donor_start) + *len = 0; + else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " -- cgit v1.2.3 From f4af4c7329886024c122cdd88559db4d5b93dc81 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 5 Oct 2018 18:44:40 -0400 Subject: jbd2: fix use after free in jbd2_log_do_checkpoint() commit ccd3c4373eacb044eb3832966299d13d2631f66f upstream. The code cleaning transaction's lists of checkpoint buffers has a bug where it increases bh refcount only after releasing journal->j_list_lock. Thus the following race is possible: CPU0 CPU1 jbd2_log_do_checkpoint() jbd2_journal_try_to_free_buffers() __journal_try_to_free_buffer(bh) ... while (transaction->t_checkpoint_io_list) ... if (buffer_locked(bh)) { <-- IO completes now, buffer gets unlocked --> spin_unlock(&journal->j_list_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); try_to_free_buffers(page); get_bh(bh) <-- accesses freed bh Fix the problem by grabbing bh reference before unlocking journal->j_list_lock. Fixes: dc6e8d669cf5 ("jbd2: don't call get_bh() before calling __jbd2_journal_remove_checkpoint()") Fixes: be1158cc615f ("jbd2: fold __process_buffer() into jbd2_log_do_checkpoint()") Reported-by: syzbot+7f4a27091759e2fe7453@syzkaller.appspotmail.com CC: stable@vger.kernel.org Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 684996c8a3a4..4d5a5a4cc017 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -254,8 +254,8 @@ restart: bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -336,8 +336,8 @@ restart2: jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); get_bh(bh); + spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); -- cgit v1.2.3 From 82a76725f960854d275991bb0c1f8adb44b3c1eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Oct 2018 00:19:13 -0400 Subject: gfs2_meta: ->mount() can get NULL dev_name commit 3df629d873f8683af6f0d34dfc743f637966d483 upstream. get in sync with mount_bdev() handling of the same Reported-by: syzbot+c54f8e94e6bba03b04e9@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/ops_fstype.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index baab99b69d8a..d9178388cf48 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1353,6 +1353,9 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, struct path path; int error; + if (!dev_name || !*dev_name) + return ERR_PTR(-EINVAL); + error = kern_path(dev_name, LOOKUP_FOLLOW, &path); if (error) { pr_warn("path_lookup on %s returned error %d\n", -- cgit v1.2.3 From d396e5395271f59bbe9069ebcc14a0fa5d9c85e7 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 2 Oct 2018 21:18:45 -0400 Subject: ext4: initialize retries variable in ext4_da_write_inline_data_begin() commit 625ef8a3acd111d5f496d190baf99d1a815bd03e upstream. Variable retries is not initialized in ext4_da_write_inline_data_begin() which can lead to nondeterministic number of retries in case we hit ENOSPC. Initialize retries to zero as we do everywhere else. Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Fixes: bc0ca9df3b2a ("ext4: retry allocation when inline->extent conversion failed") Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1e7a9774119c..34c365f0e69e 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -859,7 +859,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; - int retries; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) -- cgit v1.2.3 From 39d6c4cdcf67e3b7f91822d138b7d7f870c295be Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 15 Sep 2018 23:04:41 -0500 Subject: smb3: allow stats which track session and share reconnects to be reset commit 2c887635cd6ab3af619dc2be94e5bf8f2e172b78 upstream. Currently, "echo 0 > /proc/fs/cifs/Stats" resets all of the stats except the session and share reconnect counts. Fix it to reset those as well. CC: Stable Signed-off-by: Steve French Reviewed-by: Aurelien Aptel Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifs_debug.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 0e72a14228f8..7bc6d27d47a4 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -285,6 +285,9 @@ static ssize_t cifs_stats_proc_write(struct file *file, atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); #endif /* CONFIG_CIFS_STATS2 */ + atomic_set(&tcpSesReconnectCount, 0); + atomic_set(&tconInfoReconnectCount, 0); + spin_lock(&GlobalMid_Lock); GlobalMaxActiveXid = 0; GlobalCurrentXid = 0; -- cgit v1.2.3 From ae83508da422b1c6ec5a1409ca15c0f5f33cff31 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 19 Oct 2018 00:45:21 -0500 Subject: smb3: do not attempt cifs operation in smb3 query info error path commit 1e77a8c204c9d1b655c61751b8ad0fde22421dbb upstream. If backupuid mount option is sent, we can incorrectly retry (on access denied on query info) with a cifs (FindFirst) operation on an smb3 mount which causes the server to force the session close. We set backup intent on open so no need for this fallback. See kernel bugzilla 201435 Signed-off-by: Steve French CC: Stable Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 36c8594bb147..5c3187df9ab9 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -756,7 +756,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, sb); rc = 0; - } else if (rc == -EACCES && backup_cred(cifs_sb)) { + } else if ((rc == -EACCES) && backup_cred(cifs_sb) && + (strcmp(server->vals->version_string, SMB1_VERSION_STRING) + == 0)) { + /* + * For SMB2 and later the backup intent flag is already + * sent if needed on open and there is no path based + * FindFirst operation to use to retry with + */ + srchinf = kzalloc(sizeof(struct cifs_search_info), GFP_KERNEL); if (srchinf == NULL) { -- cgit v1.2.3 From aa21d67d7fe74897df53cbd22c8be21f8928e07d Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 28 Oct 2018 13:13:23 -0500 Subject: smb3: on kerberos mount if server doesn't specify auth type use krb5 commit 926674de6705f0f1dbf29a62fd758d0977f535d6 upstream. Some servers (e.g. Azure) do not include a spnego blob in the SMB3 negotiate protocol response, so on kerberos mounts ("sec=krb5") we can fail, as we expected the server to list its supported auth types (OIDs in the spnego blob in the negprot response). Change this so that on krb5 mounts we default to trying krb5 if the server doesn't list its supported protocol mechanisms. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifs_spnego.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 6908080e9b6d..e3f2b7370bd8 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -143,8 +143,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) sprintf(dp, ";sec=krb5"); else if (server->sec_mskerberos) sprintf(dp, ";sec=mskrb5"); - else - goto out; + else { + cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n"); + sprintf(dp, ";sec=krb5"); + } dp = description + strlen(description); sprintf(dp, ";uid=0x%x", -- cgit v1.2.3 From a88fd5847b939a87f4814b41814c74ec1d2a5309 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Sep 2018 10:07:44 -0400 Subject: NFSv4.1: Fix the r/wsize checking commit 943cff67b842839f4f35364ba2db5c2d3f025d94 upstream. The intention of nfs4_session_set_rwsize() was to cap the r/wsize to the buffer sizes negotiated by the CREATE_SESSION. The initial code had a bug whereby we would not check the values negotiated by nfs_probe_fsinfo() (the assumption being that CREATE_SESSION will always negotiate buffer values that are sane w.r.t. the server's preferred r/wsizes) but would only check values set by the user in the 'mount' command. The code was changed in 4.11 to _always_ set the r/wsize, meaning that we now never use the server preferred r/wsizes. This is the regression that this patch fixes. Also rename the function to nfs4_session_limit_rwsize() in order to avoid future confusion. Fixes: 033853325fe3 (NFSv4.1 respect server's max size in CREATE_SESSION") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4client.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 63498e1a542a..ae91d1e450be 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -879,10 +879,10 @@ EXPORT_SYMBOL_GPL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. - * Set the mount rsize and wsize with negotiated fore channel - * attributes which will be bound checked in nfs_server_set_fsinfo. + * Limit the mount rsize, wsize and dtsize using negotiated fore + * channel attributes. */ -static void nfs4_session_set_rwsize(struct nfs_server *server) +static void nfs4_session_limit_rwsize(struct nfs_server *server) { #ifdef CONFIG_NFS_V4_1 struct nfs4_session *sess; @@ -895,9 +895,11 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - if (!server->rsize || server->rsize > server_resp_sz) + if (server->dtsize > server_resp_sz) + server->dtsize = server_resp_sz; + if (server->rsize > server_resp_sz) server->rsize = server_resp_sz; - if (!server->wsize || server->wsize > server_rqst_sz) + if (server->wsize > server_rqst_sz) server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } @@ -944,12 +946,12 @@ static int nfs4_server_common_setup(struct nfs_server *server, (unsigned long long) server->fsid.minor); nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); - nfs4_session_set_rwsize(server); - error = nfs_probe_fsinfo(server, mntfh, fattr); if (error < 0) goto out; + nfs4_session_limit_rwsize(server); + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; -- cgit v1.2.3 From 637276555f3a9c35dcbe136d1bc108f5f708d500 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 28 Sep 2018 20:41:48 +0300 Subject: lockd: fix access beyond unterminated strings in prints commit 93f38b6fae0ea8987e22d9e6c38f8dfdccd867ee upstream. printk format used %*s instead of %.*s, so hostname_len does not limit the number of bytes accessed from hostname. Signed-off-by: Amir Goldstein Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/lockd/host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index d716c9993a26..c7eb47f2fb6c 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -340,7 +340,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, }; struct lockd_net *ln = net_generic(net, lockd_net_id); - dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + dprintk("lockd: %s(host='%.*s', vers=%u, proto=%s)\n", __func__, (int)hostname_len, hostname, rqstp->rq_vers, (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); -- cgit v1.2.3 From 4492b0b5f0a46859b12d67126fd2f68ae2674936 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 16 Jun 2018 23:41:59 -0400 Subject: ext4: avoid running out of journal credits when appending to an inline file commit 8bc1379b82b8e809eef77a9fedbb75c6c297be19 upstream. Use a separate journal transaction if it turns out that we need to convert an inline file to use an data block. Otherwise we could end up failing due to not having journal credits. This addresses CVE-2018-10883. https://bugzilla.kernel.org/show_bug.cgi?id=200071 Signed-off-by: Theodore Ts'o Cc: stable@kernel.org [fengc@google.com: 4.4 backport: adjust context] Signed-off-by: Chenbo Feng Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 3 --- fs/ext4/inline.c | 38 +------------------------------------- fs/ext4/xattr.c | 18 ++---------------- 3 files changed, 3 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f5d9f82b173a..b6e25d771eea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3039,9 +3039,6 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len); -extern int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed); extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 34c365f0e69e..1aec46733ef8 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -888,11 +888,11 @@ retry_journal: flags |= AOP_FLAG_NOFS; if (ret == -ENOSPC) { + ext4_journal_stop(handle); ret = ext4_da_convert_inline_data_to_extent(mapping, inode, flags, fsdata); - ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; @@ -1867,42 +1867,6 @@ out: return (error < 0 ? error : 0); } -/* - * Called during xattr set, and if we can sparse space 'needed', - * just create the extent tree evict the data to the outer block. - * - * We use jbd2 instead of page cache to move data to the 1st block - * so that the whole transaction can be committed as a whole and - * the data isn't lost because of the delayed page cache write. - */ -int ext4_try_to_evict_inline_data(handle_t *handle, - struct inode *inode, - int needed) -{ - int error; - struct ext4_xattr_entry *entry; - struct ext4_inode *raw_inode; - struct ext4_iloc iloc; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - - raw_inode = ext4_raw_inode(&iloc); - entry = (struct ext4_xattr_entry *)((void *)raw_inode + - EXT4_I(inode)->i_inline_off); - if (EXT4_XATTR_LEN(entry->e_name_len) + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { - error = -ENOSPC; - goto out; - } - - error = ext4_convert_inline_data_nolock(handle, inode, &iloc); -out: - brelse(iloc.bh); - return error; -} - void ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d6bae37489af..16cf24dc9c34 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1044,22 +1044,8 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, if (EXT4_I(inode)->i_extra_isize == 0) return -ENOSPC; error = ext4_xattr_set_entry(i, s, inode); - if (error) { - if (error == -ENOSPC && - ext4_has_inline_data(inode)) { - error = ext4_try_to_evict_inline_data(handle, inode, - EXT4_XATTR_LEN(strlen(i->name) + - EXT4_XATTR_SIZE(i->value_len))); - if (error) - return error; - error = ext4_xattr_ibody_find(inode, i, is); - if (error) - return error; - error = ext4_xattr_set_entry(i, s, inode); - } - if (error) - return error; - } + if (error) + return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); -- cgit v1.2.3 From ae6e27402c216840e2d361675a684803b319aa27 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 30 Oct 2018 13:26:15 -0400 Subject: Cramfs: fix abad comparison when wrap-arounds occur commit 672ca9dd13f1aca0c17516f76fc5b0e8344b3e46 upstream. It is possible for corrupted filesystem images to produce very large block offsets that may wrap when a length is added, and wrongly pass the buffer size test. Reported-by: Anatoly Trosinenko Signed-off-by: Nicolas Pitre Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/cramfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 355c522f3585..a6c9c2d66af1 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -185,7 +185,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i continue; blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT; blk_offset += offset; - if (blk_offset + len > BUFFER_SIZE) + if (blk_offset > BUFFER_SIZE || + blk_offset + len > BUFFER_SIZE) continue; return read_buffers[i] + blk_offset; } -- cgit v1.2.3 From dc0a989b23a65501827d02bb94dd6a8d6d3fc8c8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 21 Aug 2018 09:42:03 +0800 Subject: btrfs: Handle owner mismatch gracefully when walking up tree commit 65c6e82becec33731f48786e5a30f98662c86b16 upstream. [BUG] When mounting certain crafted image, btrfs will trigger kernel BUG_ON() when trying to recover balance: kernel BUG at fs/btrfs/extent-tree.c:8956! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 662 Comm: mount Not tainted 4.18.0-rc1-custom+ #10 RIP: 0010:walk_up_proc+0x336/0x480 [btrfs] RSP: 0018:ffffb53540c9b890 EFLAGS: 00010202 Call Trace: walk_up_tree+0x172/0x1f0 [btrfs] btrfs_drop_snapshot+0x3a4/0x830 [btrfs] merge_reloc_roots+0xe1/0x1d0 [btrfs] btrfs_recover_relocation+0x3ea/0x420 [btrfs] open_ctree+0x1af3/0x1dd0 [btrfs] btrfs_mount_root+0x66b/0x740 [btrfs] mount_fs+0x3b/0x16a vfs_kern_mount.part.9+0x54/0x140 btrfs_mount+0x16d/0x890 [btrfs] mount_fs+0x3b/0x16a vfs_kern_mount.part.9+0x54/0x140 do_mount+0x1fd/0xda0 ksys_mount+0xba/0xd0 __x64_sys_mount+0x21/0x30 do_syscall_64+0x60/0x210 entry_SYSCALL_64_after_hwframe+0x49/0xbe [CAUSE] Extent tree corruption. In this particular case, reloc tree root's owner is DATA_RELOC_TREE (should be TREE_RELOC), thus its backref is corrupted and we failed the owner check in walk_up_tree(). [FIX] It's pretty hard to take care of every extent tree corruption, but at least we can remove such BUG_ON() and exit more gracefully. And since in this particular image, DATA_RELOC_TREE and TREE_RELOC share the same root (which is obviously invalid), we needs to make __del_reloc_root() more robust to detect such invalid sharing to avoid possible NULL dereference as root->node can be NULL in this case. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200411 Reported-by: Xu Wen CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 18 ++++++++++++------ fs/btrfs/relocation.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a72f941ca750..722e11b318a7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8704,15 +8704,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(eb)); + else if (root->root_key.objectid != btrfs_header_owner(eb)) + goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level + 1])); + else if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])) + goto owner_mismatch; } btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); @@ -8720,6 +8719,11 @@ out: wc->refs[level] = 0; wc->flags[level] = 0; return 0; + +owner_mismatch: + btrfs_err_rl(root->fs_info, "unexpected tree owner, have %llu expect %llu", + btrfs_header_owner(eb), root->root_key.objectid); + return -EUCLEAN; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@ -8773,6 +8777,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, ret = walk_up_proc(trans, root, path, wc); if (ret > 0) return 0; + if (ret < 0) + return ret; if (path->locks[level]) { btrfs_tree_unlock_rw(path->nodes[level], diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cfe913d2d3df..d6ccfb31aef0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1318,7 +1318,7 @@ static void __del_reloc_root(struct btrfs_root *root) struct mapping_node *node = NULL; struct reloc_control *rc = root->fs_info->reloc_ctl; - if (rc) { + if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, root->node->start); -- cgit v1.2.3 From ff58ad5f344d9cd6b706271781b2d11baa32e18d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 21 Aug 2018 09:53:47 +0800 Subject: btrfs: locking: Add extra check in btrfs_init_new_buffer() to avoid deadlock commit b72c3aba09a53fc7c1824250d71180ca154517a7 upstream. [BUG] For certain crafted image, whose csum root leaf has missing backref, if we try to trigger write with data csum, it could cause deadlock with the following kernel WARN_ON(): WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230 btrfs_tree_lock+0x3e2/0x400 CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8 Workqueue: btrfs-endio-write btrfs_endio_write_helper RIP: 0010:btrfs_tree_lock+0x3e2/0x400 Call Trace: btrfs_alloc_tree_block+0x39f/0x770 __btrfs_cow_block+0x285/0x9e0 btrfs_cow_block+0x191/0x2e0 btrfs_search_slot+0x492/0x1160 btrfs_lookup_csum+0xec/0x280 btrfs_csum_file_blocks+0x2be/0xa60 add_pending_csums+0xaf/0xf0 btrfs_finish_ordered_io+0x74b/0xc90 finish_ordered_fn+0x15/0x20 normal_work_helper+0xf6/0x500 btrfs_endio_write_helper+0x12/0x20 process_one_work+0x302/0x770 worker_thread+0x81/0x6d0 kthread+0x180/0x1d0 ret_from_fork+0x35/0x40 [CAUSE] That crafted image has missing backref for csum tree root leaf. And when we try to allocate new tree block, since there is no EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot and use it. The extent tree of the image looks like: Normal image | This fuzzed image ----------------------------------+-------------------------------- BG 29360128 | BG 29360128 One empty slot | One empty slot 29364224: backref to UUID tree | 29364224: backref to UUID tree Two empty slots | Two empty slots 29376512: backref to CSUM tree | One empty slot (bad type) <<< 29380608: backref to D_RELOC tree | 29380608: backref to D_RELOC tree ... | ... Since bytenr 29376512 has no METADATA/EXTENT_ITEM, when btrfs try to alloc tree block, it's an valid slot for btrfs. And for finish_ordered_write, when we need to insert csum, we try to CoW csum tree root. By accident, empty slots at bytenr BG_OFFSET, BG_OFFSET + 8K, BG_OFFSET + 12K is already used by tree block COW for other trees, the next empty slot is BG_OFFSET + 16K, which should be the backref for CSUM tree. But due to the bad type, btrfs can recognize it and still consider it as an empty slot, and will try to use it for csum tree CoW. Then in the following call trace, we will try to lock the new tree block, which turns out to be the old csum tree root which is already locked: btrfs_search_slot() called on csum tree root, which is at 29376512 |- btrfs_cow_block() |- btrfs_set_lock_block() | |- Now locks tree block 29376512 (old csum tree root) |- __btrfs_cow_block() |- btrfs_alloc_tree_block() |- btrfs_reserve_extent() | Now it returns tree block 29376512, which extent tree | shows its empty slot, but it's already hold by csum tree |- btrfs_init_new_buffer() |- btrfs_tree_lock() | Triggers WARN_ON(eb->lock_owner == current->pid) |- wait_event() Wait lock owner to release the lock, but it's locked by ourself, so it will deadlock [FIX] This patch will do the lock_owner and current->pid check at btrfs_init_new_buffer(). So above deadlock can be avoided. Since such problem can only happen in crafted image, we will still trigger kernel warning for later aborted transaction, but with a little more meaningful warning message. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405 Reported-by: Xu Wen CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 722e11b318a7..6f0ecaf09962 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7835,6 +7835,20 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return ERR_PTR(-ENOMEM); + + /* + * Extra safety check in case the extent tree is corrupted and extent + * allocator chooses to use a tree block which is already used and + * locked. + */ + if (buf->lock_owner == current->pid) { + btrfs_err_rl(root->fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + buf->start, btrfs_header_owner(buf), current->pid); + free_extent_buffer(buf); + return ERR_PTR(-EUCLEAN); + } + btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); -- cgit v1.2.3 From 98edddde5a2cccf6daf6428d9d03f82027ab62d8 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 6 Sep 2018 17:18:14 -0400 Subject: btrfs: iterate all devices during trim, instead of fs_devices::alloc_list commit d4e329de5e5e21594df2e0dd59da9acee71f133b upstream. btrfs_trim_fs iterates over the fs_devices->alloc_list while holding the device_list_mutex. The problem is that ->alloc_list is protected by the chunk mutex. We don't want to hold the chunk mutex over the trim of the entire file system. Fortunately, the ->dev_list list is protected by the dev_list mutex and while it will give us all devices, including read-only devices, we already just skip the read-only devices. Then we can continue to take and release the chunk mutex while scanning each device. Fixes: 499f377f49f ("btrfs: iterate over unused chunk space in FITRIM") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6f0ecaf09962..4ebfe130bee0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10751,8 +10751,8 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) } mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - devices = &root->fs_info->fs_devices->alloc_list; - list_for_each_entry(device, devices, dev_alloc_list) { + devices = &root->fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { ret = btrfs_trim_free_extents(device, range->minlen, &group_trimmed); if (ret) -- cgit v1.2.3 From 639a61c6bab798193e9e0a60dc2b988ae66847fd Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 6 Sep 2018 17:18:15 -0400 Subject: btrfs: don't attempt to trim devices that don't support it commit 0be88e367fd8fbdb45257615d691f4675dda062f upstream. We check whether any device the file system is using supports discard in the ioctl call, but then we attempt to trim free extents on every device regardless of whether discard is supported. Due to the way we mask off EOPNOTSUPP, we can end up issuing the trim operations on each free range on devices that don't support it, just wasting time. Fixes: 499f377f49f08 ("btrfs: iterate over unused chunk space in FITRIM") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4ebfe130bee0..59f0664de70f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10629,6 +10629,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, *trimmed = 0; + /* Discard not supported = nothing to do. */ + if (!blk_queue_discard(bdev_get_queue(device->bdev))) + return 0; + /* Not writeable = nothing to do. */ if (!device->writeable) return 0; -- cgit v1.2.3 From 6968f018a95ad377b644b2f4b449aa8a4f6620d8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 12 Sep 2018 10:45:45 -0400 Subject: btrfs: wait on caching when putting the bg cache commit 3aa7c7a31c26321696b92841d5103461c6f3f517 upstream. While testing my backport I noticed there was a panic if I ran generic/416 generic/417 generic/418 all in a row. This just happened to uncover a race where we had outstanding IO after we destroy all of our workqueues, and then we'd go to queue the endio work on those free'd workqueues. This is because we aren't waiting for the caching threads to be done before freeing everything up, so to fix this make sure we wait on any outstanding caching that's being done before we free up the block group, so we're sure to be done with all IO by the time we get to btrfs_stop_all_workers(). This fixes the panic I was seeing consistently in testing. ------------[ cut here ]------------ kernel BUG at fs/btrfs/volumes.c:6112! SMP PTI Modules linked in: CPU: 1 PID: 27165 Comm: kworker/u4:7 Not tainted 4.16.0-02155-g3553e54a578d-dirty #875 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 Workqueue: btrfs-cache btrfs_cache_helper RIP: 0010:btrfs_map_bio+0x346/0x370 RSP: 0000:ffffc900061e79d0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff880071542e00 RCX: 0000000000533000 RDX: ffff88006bb74380 RSI: 0000000000000008 RDI: ffff880078160000 RBP: 0000000000000001 R08: ffff8800781cd200 R09: 0000000000503000 R10: ffff88006cd21200 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: ffff8800781cd200 R15: ffff880071542e00 FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000817ffc4 CR3: 0000000078314000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btree_submit_bio_hook+0x8a/0xd0 submit_one_bio+0x5d/0x80 read_extent_buffer_pages+0x18a/0x320 btree_read_extent_buffer_pages+0xbc/0x200 ? alloc_extent_buffer+0x359/0x3e0 read_tree_block+0x3d/0x60 read_block_for_search.isra.30+0x1a5/0x360 btrfs_search_slot+0x41b/0xa10 btrfs_next_old_leaf+0x212/0x470 caching_thread+0x323/0x490 normal_work_helper+0xc5/0x310 process_one_work+0x141/0x340 worker_thread+0x44/0x3c0 kthread+0xf8/0x130 ? process_one_work+0x340/0x340 ? kthread_bind+0x10/0x10 ret_from_fork+0x35/0x40 RIP: btrfs_map_bio+0x346/0x370 RSP: ffffc900061e79d0 ---[ end trace 827eb13e50846033 ]--- Kernel panic - not syncing: Fatal exception Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: Omar Sandoval Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 59f0664de70f..528b68cdc350 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9521,6 +9521,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group = btrfs_lookup_first_block_group(info, last); while (block_group) { + wait_block_group_cache_done(block_group); spin_lock(&block_group->lock); if (block_group->iref) break; -- cgit v1.2.3 From 5e7a422384626abbd41ff229bbf18b7119053b9e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2018 07:18:00 -0400 Subject: btrfs: reset max_extent_size on clear in a bitmap commit 553cceb49681d60975d00892877d4c871bf220f9 upstream. We need to clear the max_extent_size when we clear bits from a bitmap since it could have been from the range that contains the max_extent_size. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/free-space-cache.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 45934deacfd7..2b4c5f298325 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1699,6 +1699,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; + if (info->max_extent_size > ctl->unit) + info->max_extent_size = 0; } static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, -- cgit v1.2.3 From da36a0a52348e6301dba61c5d764ac8275d173a1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 28 Sep 2018 07:18:02 -0400 Subject: btrfs: make sure we create all new block groups commit 545e3366db823dc3342ca9d7fea803f829c9062f upstream. Allocating new chunks modifies both the extent and chunk tree, which can trigger new chunk allocations. So instead of doing list_for_each_safe, just do while (!list_empty()) so we make sure we don't exit with other pending bg's still on our list. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Omar Sandoval Reviewed-by: Liu Bo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 528b68cdc350..80cd28456f08 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9912,7 +9912,7 @@ error: void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_block_group_cache *block_group; struct btrfs_root *extent_root = root->fs_info->extent_root; struct btrfs_block_group_item item; struct btrfs_key key; @@ -9920,7 +9920,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, bool can_flush_pending_bgs = trans->can_flush_pending_bgs; trans->can_flush_pending_bgs = false; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + while (!list_empty(&trans->new_bgs)) { + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group_cache, + bg_list); if (ret) goto next; -- cgit v1.2.3 From b1223028e6882ba36430cfb328f66605c7034f1e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 9 Oct 2018 15:05:29 +0100 Subject: Btrfs: fix wrong dentries after fsync of file that got its parent replaced commit 0f375eed92b5a407657532637ed9652611a682f5 upstream. In a scenario like the following: mkdir /mnt/A # inode 258 mkdir /mnt/B # inode 259 touch /mnt/B/bar # inode 260 sync mv /mnt/B/bar /mnt/A/bar mv -T /mnt/A /mnt/B fsync /mnt/B/bar After replaying the log we end up with file bar having 2 hard links, both with the name 'bar' and one in the directory with inode number 258 and the other in the directory with inode number 259. Also, we end up with the directory inode 259 still existing and with the directory inode 258 still named as 'A', instead of 'B'. In this scenario, file 'bar' should only have one hard link, located at directory inode 258, the directory inode 259 should not exist anymore and the name for directory inode 258 should be 'B'. This incorrect behaviour happens because when attempting to log the old parents of an inode, we skip any parents that no longer exist. Fix this by forcing a full commit if an old parent no longer exists. A test case for fstests follows soon. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2c7f9a5f8717..63f59f17c97e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5240,9 +5240,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, dir_inode = btrfs_iget(root->fs_info->sb, &inode_key, root, NULL); - /* If parent inode was deleted, skip it. */ - if (IS_ERR(dir_inode)) - continue; + /* + * If the parent inode was deleted, return an error to + * fallback to a transaction commit. This is to prevent + * getting an inode that was moved from one parent A to + * a parent B, got its former parent A deleted and then + * it got fsync'ed, from existing at both parents after + * a log replay (and the old parent still existing). + * Example: + * + * mkdir /mnt/A + * mkdir /mnt/B + * touch /mnt/B/bar + * sync + * mv /mnt/B/bar /mnt/A/bar + * mv -T /mnt/A /mnt/B + * fsync /mnt/B/bar + * + * + * If we ignore the old parent B which got deleted, + * after a log replay we would have file bar linked + * at both parents and the old parent B would still + * exist. + */ + if (IS_ERR(dir_inode)) { + ret = PTR_ERR(dir_inode); + goto out; + } ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); -- cgit v1.2.3 From b5bb62e56134180d802649a2954163454e551b7a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 10 Aug 2018 10:20:26 +0800 Subject: btrfs: qgroup: Dirty all qgroups before rescan commit 9c7b0c2e8dbfbcd80a71e2cbfe02704f26c185c6 upstream. [BUG] In the following case, rescan won't zero out the number of qgroup 1/0: $ mkfs.btrfs -fq $DEV $ mount $DEV /mnt $ btrfs quota enable /mnt $ btrfs qgroup create 1/0 /mnt $ btrfs sub create /mnt/sub $ btrfs qgroup assign 0/257 1/0 /mnt $ dd if=/dev/urandom of=/mnt/sub/file bs=1k count=1000 $ btrfs sub snap /mnt/sub /mnt/snap $ btrfs quota rescan -w /mnt $ btrfs qgroup show -pcre /mnt qgroupid rfer excl max_rfer max_excl parent child -------- ---- ---- -------- -------- ------ ----- 0/5 16.00KiB 16.00KiB none none --- --- 0/257 1016.00KiB 16.00KiB none none 1/0 --- 0/258 1016.00KiB 16.00KiB none none --- --- 1/0 1016.00KiB 16.00KiB none none --- 0/257 So far so good, but: $ btrfs qgroup remove 0/257 1/0 /mnt WARNING: quotas may be inconsistent, rescan needed $ btrfs quota rescan -w /mnt $ btrfs qgroup show -pcre /mnt qgoupid rfer excl max_rfer max_excl parent child -------- ---- ---- -------- -------- ------ ----- 0/5 16.00KiB 16.00KiB none none --- --- 0/257 1016.00KiB 16.00KiB none none --- --- 0/258 1016.00KiB 16.00KiB none none --- --- 1/0 1016.00KiB 16.00KiB none none --- --- ^^^^^^^^^^ ^^^^^^^^ not cleared [CAUSE] Before rescan we call qgroup_rescan_zero_tracking() to zero out all qgroups' accounting numbers. However we don't mark all qgroups dirty, but rely on rescan to do so. If we have any high level qgroup without children, it won't be marked dirty during rescan, since we cannot reach that qgroup. This will cause QGROUP_INFO items of childless qgroups never get updated in the quota tree, thus their numbers will stay the same in "btrfs qgroup show" output. [FIX] Just mark all qgroups dirty in qgroup_rescan_zero_tracking(), so even if we have childless qgroups, their QGROUP_INFO items will still get updated during rescan. Reported-by: Misono Tomohiro CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: Misono Tomohiro Tested-by: Misono Tomohiro Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/qgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a751937dded5..90e29d40aa82 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2446,6 +2446,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) qgroup->rfer_cmpr = 0; qgroup->excl = 0; qgroup->excl_cmpr = 0; + qgroup_dirty(fs_info, qgroup); } spin_unlock(&fs_info->qgroup_lock); } -- cgit v1.2.3 From b38ace86f35a95e7e91f73dd2524da1fd7c3bc35 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 13 Oct 2018 00:37:25 +0100 Subject: Btrfs: fix null pointer dereference on compressed write path error commit 3527a018c00e5dbada2f9d7ed5576437b6dd5cfb upstream. At inode.c:compress_file_range(), under the "free_pages_out" label, we can end up dereferencing the "pages" pointer when it has a NULL value. This case happens when "start" has a value of 0 and we fail to allocate memory for the "pages" pointer. When that happens we jump to the "cont" label and then enter the "if (start == 0)" branch where we immediately call the cow_file_range_inline() function. If that function returns 0 (success creating an inline extent) or an error (like -ENOMEM for example) we jump to the "free_pages_out" label and then access "pages[i]" leading to a NULL pointer dereference, since "nr_pages" has a value greater than zero at that point. Fix this by setting "nr_pages" to 0 when we fail to allocate memory for the "pages" pointer. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201119 Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Liu Bo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b895be3d4311..383717ccecc7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -481,6 +481,7 @@ again: pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ + nr_pages = 0; goto cont; } -- cgit v1.2.3 From bddfd0a2d61e3f627b50d88b713769583f7d7b03 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 12 Oct 2018 15:32:33 -0400 Subject: btrfs: set max_extent_size properly commit ad22cf6ea47fa20fbe11ac324a0a15c0a9a4a2a9 upstream. We can't use entry->bytes if our entry is a bitmap entry, we need to use entry->max_extent_size in that case. Fix up all the logic to make this consistent. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/free-space-cache.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2b4c5f298325..1aa897dd9ce3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1784,6 +1784,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, return -1; } +static inline u64 get_max_extent_size(struct btrfs_free_space *entry) +{ + if (entry->bitmap) + return entry->max_extent_size; + return entry->bytes; +} + /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, @@ -1805,8 +1812,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, for (node = &entry->offset_index; node; node = rb_next(node)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bytes < *bytes) { - if (entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); continue; } @@ -1824,8 +1831,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bytes < *bytes + align_off) { - if (entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); continue; } @@ -1837,8 +1844,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, *offset = tmp; *bytes = size; return entry; - } else if (size > *max_extent_size) { - *max_extent_size = size; + } else { + *max_extent_size = + max(get_max_extent_size(entry), + *max_extent_size); } continue; } @@ -2696,8 +2705,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { - if (search_bytes > *max_extent_size) - *max_extent_size = search_bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); return 0; } @@ -2734,8 +2743,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); while (1) { - if (entry->bytes < bytes && entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + if (entry->bytes < bytes) + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); if (entry->bytes < bytes || (!entry->bitmap && entry->offset < min_start)) { -- cgit v1.2.3 From 55e464966baf3e57374540f058ab68d6dce15f32 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Sat, 8 Sep 2018 01:18:43 +0900 Subject: 9p locks: fix glock.client_id leak in do_lock [ Upstream commit b4dc44b3cac9e8327e0655f530ed0c46f2e6214c ] the 9p client code overwrites our glock.client_id pointing to a static buffer by an allocated string holding the network provided value which we do not care about; free and reset the value as appropriate. This is almost identical to the leak in v9fs_file_getlock() fixed by Al Viro in commit ce85dd58ad5a6 ("9p: we are leaking glock.client_id in v9fs_file_getlock()"), which was returned as an error by a coverity false positive -- while we are here attempt to make the code slightly more robust to future change of the net/9p/client code and hopefully more clear to coverity that there is no problem. Link: http://lkml.kernel.org/r/1536339057-21974-5-git-send-email-asmadeus@codewreck.org Signed-off-by: Dominique Martinet Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_file.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 12ceaf52dae6..e7b3d2c4472d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -204,6 +204,14 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) break; + /* + * p9_client_lock_dotl overwrites flock.client_id with the + * server message, free and reuse the client name + */ + if (flock.client_id != fid->clnt->name) { + kfree(flock.client_id); + flock.client_id = fid->clnt->name; + } } /* map 9p status to VFS status */ @@ -235,6 +243,8 @@ out_unlock: locks_lock_file_wait(filp, fl); fl->fl_type = fl_type; } + if (flock.client_id != fid->clnt->name) + kfree(flock.client_id); out: return res; } @@ -269,7 +279,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, &glock); if (res < 0) - return res; + goto out; /* map 9p lock type to os lock type */ switch (glock.type) { case P9_LOCK_TYPE_RDLCK: @@ -290,7 +300,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = glock.proc_id; } - kfree(glock.client_id); +out: + if (glock.client_id != fid->clnt->name) + kfree(glock.client_id); return res; } -- cgit v1.2.3 From 7574afe0cfc0e103f309a721880d195f38b292e0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Sep 2018 12:28:55 +0300 Subject: fuse: Fix use-after-free in fuse_dev_do_read() commit bc78abbd55dd28e2287ec6d6502b842321a17c87 upstream. We may pick freed req in this way: [cpu0] [cpu1] fuse_dev_do_read() fuse_dev_do_write() list_move_tail(&req->list, ...); ... spin_unlock(&fpq->lock); ... ... request_end(fc, req); ... fuse_put_request(fc, req); if (test_bit(FR_INTERRUPTED, ...)) queue_interrupt(fiq, req); Fix that by keeping req alive until we finish all manipulations. Reported-by: syzbot+4e975615ca01f2277bdd@syzkaller.appspotmail.com Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi Fixes: 46c34a348b0a ("fuse: no fc->lock for pqueue parts") Cc: # v4.2 Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2671e922c720..f47253a0502b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1328,12 +1328,14 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto out_end; } list_move_tail(&req->list, &fpq->processing); + __fuse_get_request(req); spin_unlock(&fpq->lock); set_bit(FR_SENT, &req->flags); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) queue_interrupt(fiq, req); + fuse_put_request(fc, req); return reqsize; -- cgit v1.2.3 From 8bb4354af373a8af395450acd298f25ddb79d93b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Sep 2018 12:52:42 +0300 Subject: fuse: Fix use-after-free in fuse_dev_do_write() commit d2d2d4fb1f54eff0f3faa9762d84f6446a4bc5d0 upstream. After we found req in request_find() and released the lock, everything may happen with the req in parallel: cpu0 cpu1 fuse_dev_do_write() fuse_dev_do_write() req = request_find(fpq, ...) ... spin_unlock(&fpq->lock) ... ... req = request_find(fpq, oh.unique) ... spin_unlock(&fpq->lock) queue_interrupt(&fc->iq, req); ... ... ... ... ... request_end(fc, req); fuse_put_request(fc, req); ... queue_interrupt(&fc->iq, req); Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi Fixes: 46c34a348b0a ("fuse: no fc->lock for pqueue parts") Cc: # v4.2 Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f47253a0502b..78225f1f1902 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1924,16 +1924,20 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, /* Is it an interrupt reply? */ if (req->intr_unique == oh.unique) { + __fuse_get_request(req); spin_unlock(&fpq->lock); err = -EINVAL; - if (nbytes != sizeof(struct fuse_out_header)) + if (nbytes != sizeof(struct fuse_out_header)) { + fuse_put_request(fc, req); goto err_finish; + } if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) queue_interrupt(&fc->iq, req); + fuse_put_request(fc, req); fuse_copy_finish(cs); return nbytes; -- cgit v1.2.3 From 2fe23468dae467043183e301829827c17f65f45f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Sep 2018 16:43:22 +0200 Subject: fuse: fix blocked_waitq wakeup commit 908a572b80f6e9577b45e81b3dfe2e22111286b8 upstream. Using waitqueue_active() is racy. Make sure we issue a wake_up() unconditionally after storing into fc->blocked. After that it's okay to optimize with waitqueue_active() since the first wake up provides the necessary barrier for all waiters, not the just the woken one. Signed-off-by: Miklos Szeredi Fixes: 3c18ef8117f0 ("fuse: optimize wake_up") Cc: # v3.10 Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 78225f1f1902..da31954e1a99 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -402,12 +402,19 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (test_bit(FR_BACKGROUND, &req->flags)) { spin_lock(&fc->lock); clear_bit(FR_BACKGROUND, &req->flags); - if (fc->num_background == fc->max_background) + if (fc->num_background == fc->max_background) { fc->blocked = 0; - - /* Wake up next waiter, if any */ - if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); + } else if (!fc->blocked) { + /* + * Wake up next waiter, if any. It's okay to use + * waitqueue_active(), as we've already synced up + * fc->blocked with waiters with the wake_up() call + * above. + */ + if (waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + } if (fc->num_background == fc->congestion_threshold && fc->connected && fc->bdi_initialized) { -- cgit v1.2.3 From f04651b97aed95cc82fc49997de1e5b1a6990e97 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Sep 2018 16:43:22 +0200 Subject: fuse: set FR_SENT while locked commit 4c316f2f3ff315cb48efb7435621e5bfb81df96d upstream. Otherwise fuse_dev_do_write() could come in and finish off the request, and the set_bit(FR_SENT, ...) could trigger the WARN_ON(test_bit(FR_SENT, ...)) in request_end(). Signed-off-by: Miklos Szeredi Reported-by: syzbot+ef054c4d3f64cd7f7cec@syzkaller.appspotmai Fixes: 46c34a348b0a ("fuse: no fc->lock for pqueue parts") Cc: # v4.2 Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index da31954e1a99..18e2f54737d1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1336,8 +1336,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, } list_move_tail(&req->list, &fpq->processing); __fuse_get_request(req); - spin_unlock(&fpq->lock); set_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) -- cgit v1.2.3 From cac9339052667de0805dc079b7fd74432acff515 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:39 -0700 Subject: mm, elf: handle vm_brk error commit ecc2bc8ac03884266cf73f8a2a42b911465b2fbc upstream. load_elf_library doesn't handle vm_brk failure although nothing really indicates it cannot do that because the function is allowed to fail due to vm_mmap failures already. This might be not a problem now but later patch will make vm_brk killable (resp. mmap_sem for write waiting will become killable) and so the failure will be more probable. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ben Hutchings Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 62bc72001fce..70ea4b9c6dd9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1215,8 +1215,11 @@ static int load_elf_library(struct file *file) len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1); bss = eppnt->p_memsz + eppnt->p_vaddr; - if (bss > len) - vm_brk(len, bss - len); + if (bss > len) { + error = vm_brk(len, bss - len); + if (BAD_ADDR(error)) + goto out_free_ph; + } error = 0; out_free_ph: -- cgit v1.2.3 From 4a0c08d709536d12f0dc2a9e6360826a56c9bceb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 2 Aug 2016 14:04:51 -0700 Subject: binfmt_elf: fix calculations for bss padding commit 0036d1f7eb95bcc52977f15507f00dd07018e7e2 upstream. A double-bug exists in the bss calculation code, where an overflow can happen in the "last_bss - elf_bss" calculation, but vm_brk internally aligns the argument, underflowing it, wrapping back around safe. We shouldn't depend on these bugs staying in sync, so this cleans up the bss padding handling to avoid the overflow. This moves the bss padzero() before the last_bss > elf_bss case, since the zero-filling of the ELF_PAGE should have nothing to do with the relationship of last_bss and elf_bss: any trailing portion should be zeroed, and a zero size is already handled by padzero(). Then it handles the math on elf_bss vs last_bss correctly. These need to both be ELF_PAGE aligned to get the comparison correct, since that's the expected granularity of the mappings. Since elf_bss already had alignment-based padding happen in padzero(), the "start" of the new vm_brk() should be moved forward as done in the original code. However, since the "end" of the vm_brk() area will already become PAGE_ALIGNed in vm_brk() then last_bss should get aligned here to avoid hiding it as a side-effect. Additionally makes a cosmetic change to the initial last_bss calculation so it's easier to read in comparison to the load_addr calculation above it (i.e. the only difference is p_filesz vs p_memsz). Link: http://lkml.kernel.org/r/1468014494-25291-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Reported-by: Hector Marco-Gisbert Cc: Ismael Ripoll Ripoll Cc: Alexander Viro Cc: "Kirill A. Shutemov" Cc: Oleg Nesterov Cc: Chen Gang Cc: Michal Hocko Cc: Konstantin Khlebnikov Cc: Andrea Arcangeli Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ben Hutchings Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 70ea4b9c6dd9..2963a23f7a80 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -604,28 +604,30 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * Do the same thing for the memory mapping - between * elf_bss and last_bss is the bss section. */ - k = load_addr + eppnt->p_memsz + eppnt->p_vaddr; + k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; if (k > last_bss) last_bss = k; } } + /* + * Now fill out the bss section: first pad the last page from + * the file up to the page boundary, and zero it from elf_bss + * up to the end of the page. + */ + if (padzero(elf_bss)) { + error = -EFAULT; + goto out; + } + /* + * Next, align both the file and mem bss up to the page size, + * since this is where elf_bss was just zeroed up to, and where + * last_bss will end after the vm_brk() below. + */ + elf_bss = ELF_PAGEALIGN(elf_bss); + last_bss = ELF_PAGEALIGN(last_bss); + /* Finally, if there is still more bss to allocate, do it. */ if (last_bss > elf_bss) { - /* - * Now fill out the bss section. First pad the last page up - * to the page boundary, and then perform a mmap to make sure - * that there are zero-mapped pages up to and including the - * last bss page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out; - } - - /* What we have mapped so far */ - elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); - - /* Map the last of the bss segment */ error = vm_brk(elf_bss, last_bss - elf_bss); if (BAD_ADDR(error)) goto out; -- cgit v1.2.3 From a0b71d1b894556ae8e346596046e4f8bb0689f43 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 13 Jul 2018 16:59:13 -0700 Subject: fs, elf: make sure to page align bss in load_elf_library commit 24962af7e1041b7e50c1bc71d8d10dc678c556b5 upstream. The current code does not make sure to page align bss before calling vm_brk(), and this can lead to a VM_BUG_ON() in __mm_populate() due to the requested lenght not being correctly aligned. Let us make sure to align it properly. Kees: only applicable to CONFIG_USELIB kernels: 32-bit and configured for libc5. Link: http://lkml.kernel.org/r/20180705145539.9627-1-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reported-by: syzbot+5dcb560fe12aa5091c06@syzkaller.appspotmail.com Tested-by: Tetsuo Handa Acked-by: Kees Cook Cc: Michal Hocko Cc: Nicolas Pitre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Ben Hutchings Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2963a23f7a80..f010d6c8dd14 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1214,9 +1214,8 @@ static int load_elf_library(struct file *file) goto out_free_ph; } - len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + - ELF_MIN_ALIGN - 1); - bss = eppnt->p_memsz + eppnt->p_vaddr; + len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr); + bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr); if (bss > len) { error = vm_brk(len, bss - len); if (BAD_ADDR(error)) -- cgit v1.2.3 From 298ed64f1e889b87172e2f9ba807318dadc893d7 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Fri, 2 Nov 2018 15:48:15 -0700 Subject: ocfs2: fix a misuse a of brelse after failing ocfs2_check_dir_entry commit 29aa30167a0a2e6045a0d6d2e89d8168132333d5 upstream. Somehow, file system metadata was corrupted, which causes ocfs2_check_dir_entry() to fail in function ocfs2_dir_foreach_blk_el(). According to the original design intention, if above happens we should skip the problematic block and continue to retrieve dir entry. But there is obviouse misuse of brelse around related code. After failure of ocfs2_check_dir_entry(), current code just moves to next position and uses the problematic buffer head again and again during which the problematic buffer head is released for multiple times. I suppose, this a serious issue which is long-lived in ocfs2. This may cause other file systems which is also used in a the same host insane. So we should also consider about bakcporting this patch into linux -stable. Link: http://lkml.kernel.org/r/HK2PR06MB045211675B43EED794E597B6D56E0@HK2PR06MB0452.apcprd06.prod.outlook.com Signed-off-by: Changwei Ge Suggested-by: Changkuo Shi Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/dir.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ffecf89c8c1c..49af618e410d 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1896,8 +1896,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, /* On error, skip the f_pos to the next block. */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse(bh); - continue; + break; } if (le64_to_cpu(de->inode)) { unsigned char d_type = DT_UNKNOWN; -- cgit v1.2.3 From f02c3c34bda23e96df909d23625073539db19e3d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 5 Nov 2018 11:14:17 +0000 Subject: Btrfs: fix data corruption due to cloning of eof block commit ac765f83f1397646c11092a032d4f62c3d478b81 upstream. We currently allow cloning a range from a file which includes the last block of the file even if the file's size is not aligned to the block size. This is fine and useful when the destination file has the same size, but when it does not and the range ends somewhere in the middle of the destination file, it leads to corruption because the bytes between the EOF and the end of the block have undefined data (when there is support for discard/trimming they have a value of 0x00). Example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ export foo_size=$((256 * 1024 + 100)) $ xfs_io -f -c "pwrite -S 0x3c 0 $foo_size" /mnt/foo $ xfs_io -f -c "pwrite -S 0xb5 0 1M" /mnt/bar $ xfs_io -c "reflink /mnt/foo 0 512K $foo_size" /mnt/bar $ od -A d -t x1 /mnt/bar 0000000 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 * 0524288 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c * 0786528 3c 3c 3c 3c 00 00 00 00 00 00 00 00 00 00 00 00 0786544 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0790528 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 * 1048576 The bytes in the range from 786532 (512Kb + 256Kb + 100 bytes) to 790527 (512Kb + 256Kb + 4Kb - 1) got corrupted, having now a value of 0x00 instead of 0xb5. This is similar to the problem we had for deduplication that got recently fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when deduplicating between different files"). Fix this by not allowing such operations to be performed and return the errno -EINVAL to user space. This is what XFS is doing as well at the VFS level. This change however now makes us return -EINVAL instead of -EOPNOTSUPP for cases where the source range maps to an inline extent and the destination range's end is smaller then the destination file's size, since the detection of inline extents is done during the actual process of dropping file extent items (at __btrfs_drop_extents()). Returning the -EINVAL error is done early on and solely based on the input parameters (offsets and length) and destination file's size. This makes us consistent with XFS and anyone else supporting cloning since this case is now checked at a higher level in the VFS and is where the -EINVAL will be returned from starting with kernel 4.20 (the VFS changed was introduced in 4.20-rc1 by commit 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF block"). So this change is more geared towards stable kernels, as it's unlikely the new VFS checks get removed intentionally. A test case for fstests follows soon, as well as an update to filter existing tests that expect -EOPNOTSUPP to accept -EINVAL as well. CC: # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6caeb946fc1d..150d3c891815 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3950,9 +3950,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_unlock; if (len == 0) olen = len = src->i_size - off; - /* if we extend to eof, continue to block boundary */ - if (off + len == src->i_size) + /* + * If we extend to eof, continue to block boundary if and only if the + * destination end offset matches the destination file's size, otherwise + * we would be corrupting data by placing the eof block into the middle + * of a file. + */ + if (off + len == src->i_size) { + if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size) + goto out_unlock; len = ALIGN(src->i_size, bs) - off; + } if (len == 0) { ret = 0; -- cgit v1.2.3 From 2eefc69471f17b4b65fc8ba8356b9ee3a7c523d0 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 3 Nov 2018 17:11:19 -0400 Subject: ext4: add missing brelse() update_backups()'s error path commit ea0abbb648452cdb6e1734b702b6330a7448fcf8 upstream. Fixes: ac27a0ec112a ("ext4: initial copy of files from ext3") Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 2.6.19 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 783280ebc2fe..550eb2dfe357 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1095,8 +1095,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, backup_block, backup_block - ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, bh))) + if ((err = ext4_journal_get_write_access(handle, bh))) { + brelse(bh); break; + } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) -- cgit v1.2.3 From 88cc2d51d221bf5bbc3622a47b07ef0947cdbfa1 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 3 Nov 2018 16:22:10 -0400 Subject: ext4: add missing brelse() in set_flexbg_block_bitmap()'s error path commit cea5794122125bf67559906a0762186cf417099c upstream. Fixes: 33afdcc5402d ("ext4: add a function which sets up group blocks ...") Cc: stable@kernel.org # 3.3 Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 550eb2dfe357..9646d4b325cb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -442,16 +442,18 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); - if (err) + if (err) { + brelse(bh); return err; + } ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, block - start, count2); ext4_set_bits(bh->b_data, block - start, count2); err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); if (unlikely(err)) return err; - brelse(bh); } return 0; -- cgit v1.2.3 From 47aa49f64f0f2b8db7de8eba2e89490e8371fd7e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 3 Nov 2018 16:50:08 -0400 Subject: ext4: add missing brelse() add_new_gdb_meta_bg()'s error path commit 61a9c11e5e7a0dab5381afa5d9d4dd5ebf18f7a0 upstream. Fixes: 01f795f9e0d6 ("ext4: add online resizing support for meta_bg ...") Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 3.7 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9646d4b325cb..f87dc9e4999d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -899,6 +899,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, sizeof(struct buffer_head *), GFP_NOFS); if (!n_group_desc) { + brelse(gdb_bh); err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); @@ -914,8 +915,6 @@ static int add_new_gdb_meta_bg(struct super_block *sb, kvfree(o_group_desc); BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); - if (unlikely(err)) - brelse(gdb_bh); return err; } -- cgit v1.2.3 From d507dfb5e4fe3e5b01259055c7a4fcaba114f89e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Sat, 3 Nov 2018 16:13:17 -0400 Subject: ext4: avoid potential extra brelse in setup_new_flex_group_blocks() commit 9e4028935cca3f9ef9b6a90df9da6f1f94853536 upstream. Currently bh is set to NULL only during first iteration of for cycle, then this pointer is not cleared after end of using. Therefore rollback after errors can lead to extra brelse(bh) call, decrements bh counter and later trigger an unexpected warning in __brelse() Patch moves brelse() calls in body of cycle to exclude requirement of brelse() call in rollback. Fixes: 33afdcc5402d ("ext4: add a function which sets up group blocks ...") Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 3.3+ Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f87dc9e4999d..37f275375089 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -590,7 +590,6 @@ handle_bb: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); - bh = NULL; goto out; } overhead = ext4_group_overhead_blocks(sb, group); @@ -602,9 +601,9 @@ handle_bb: ext4_mark_bitmap_end(group_data[i].blocks_count, sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); if (err) goto out; - brelse(bh); handle_ib: if (bg_flags[i] & EXT4_BG_INODE_UNINIT) @@ -619,18 +618,16 @@ handle_ib: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); - bh = NULL; goto out; } ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); if (err) goto out; - brelse(bh); } - bh = NULL; /* Mark group tables in block bitmap */ for (j = 0; j < GROUP_TABLE_COUNT; j++) { @@ -661,7 +658,6 @@ handle_ib: } out: - brelse(bh); err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; -- cgit v1.2.3 From 36ab0ab9b88d85fcb7b6a86db4b13cbbd52c208a Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 6 Nov 2018 16:20:40 -0500 Subject: ext4: fix possible inode leak in the retry loop of ext4_resize_fs() commit db6aee62406d9fbb53315fcddd81f1dc271d49fa upstream. Fixes: 1c6bd7173d66 ("ext4: convert file system to meta_bg if needed ...") Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 3.7 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 37f275375089..52c28d37112e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -2026,6 +2026,10 @@ retry: n_blocks_count_retry = 0; free_flex_gd(flex_gd); flex_gd = NULL; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } goto retry; } -- cgit v1.2.3 From e85d624f7164b51b3a9c081d0d9d7e3b5187dab8 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 6 Nov 2018 17:01:36 -0500 Subject: ext4: avoid buffer leak in ext4_orphan_add() after prior errors commit feaf264ce7f8d54582e2f66eb82dd9dd124c94f3 upstream. Fixes: d745a8c20c1f ("ext4: reduce contention on s_orphan_lock") Fixes: 6e3617e579e0 ("ext4: Handle non empty on-disk orphan link") Cc: Dmitry Monakhov Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 2.6.34 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a1f1e53d0e25..f6a63029b24d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2830,7 +2830,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } - } + } else + brelse(iloc.bh); + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); -- cgit v1.2.3 From 7f168837b121b47de9ce22201293819d65f5d6ac Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 6 Nov 2018 16:16:01 -0500 Subject: ext4: fix missing cleanup if ext4_alloc_flex_bg_array() fails while resizing commit f348e2241fb73515d65b5d77dd9c174128a7fbf2 upstream. Fixes: 117fff10d7f1 ("ext4: grow the s_flex_groups array as needed ...") Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 3.7 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 52c28d37112e..82df738e5403 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1990,7 +1990,7 @@ retry: err = ext4_alloc_flex_bg_array(sb, n_group + 1); if (err) - return err; + goto out; err = ext4_mb_alloc_groupinfo(sb, n_group + 1); if (err) -- cgit v1.2.3 From 6371a05a1580651c8cd21bb1e49c4985f702c63d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Nov 2018 17:18:17 -0500 Subject: ext4: avoid possible double brelse() in add_new_gdb() on error path commit 4f32c38b4662312dd3c5f113d8bdd459887fb773 upstream. Fixes: b40971426a83 ("ext4: add error checking to calls to ...") Reported-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 2.6.38 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 82df738e5403..bad13f049fb0 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -844,6 +844,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); + iloc.bh = NULL; goto exit_inode; } brelse(dind); -- cgit v1.2.3 From 761ff77def544b52954334db2923feb1a947cebf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 7 Nov 2018 10:32:53 -0500 Subject: ext4: fix possible leak of sbi->s_group_desc_leak in error path commit 9e463084cdb22e0b56b2dfbc50461020409a5fd3 upstream. Fixes: bfe0a5f47ada ("ext4: add more mount time checks of the superblock") Reported-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 4.18 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a3d905abbaa9..cd9cd581fd92 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3731,6 +3731,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != + le32_to_cpu(es->s_inodes_count)) { + ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", + le32_to_cpu(es->s_inodes_count), + ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); + ret = -EINVAL; + goto failed_mount; + } db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { @@ -3750,14 +3758,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto failed_mount; } - if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != - le32_to_cpu(es->s_inodes_count)) { - ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", - le32_to_cpu(es->s_inodes_count), - ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); - ret = -EINVAL; - goto failed_mount; - } bgl_lock_init(sbi->s_blockgroup_lock); -- cgit v1.2.3 From 0d9902d43fb696d460c29ef4ee0b86ce6ddc9a15 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 7 Nov 2018 11:07:01 -0500 Subject: ext4: release bs.bh before re-using in ext4_xattr_block_find() commit 45ae932d246f721e6584430017176cbcadfde610 upstream. bs.bh was taken in previous ext4_xattr_block_find() call, it should be released before re-using Fixes: 7e01c8e5420b ("ext3/4: fix uninitialized bs in ...") Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 2.6.26 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 16cf24dc9c34..1022c17fc1a1 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1161,6 +1161,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + brelse(bs.bh); + bs.bh = NULL; error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; -- cgit v1.2.3 From 27acd8f869ba92e70408168ed2bed504a99b154c Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 7 Nov 2018 11:10:21 -0500 Subject: ext4: fix buffer leak in ext4_xattr_move_to_block() on error path commit 6bdc9977fcdedf47118d2caf7270a19f4b6d8a8f upstream. Fixes: 3f2571c1f91f ("ext4: factor out xattr moving") Fixes: 6dd4ee7cab7e ("ext4: Expand extra_inodes space per ...") Reviewed-by: Jan Kara Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 2.6.23 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1022c17fc1a1..53679716baca 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1490,6 +1490,8 @@ cleanup: kfree(buffer); if (is) brelse(is->iloc.bh); + if (bs) + brelse(bs->bh); kfree(is); kfree(bs); brelse(bh); -- cgit v1.2.3 From 954ff0468f8bc17baed4673b2458cf8377f0b9ea Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 7 Nov 2018 22:36:23 -0500 Subject: ext4: fix buffer leak in __ext4_read_dirblock() on error path commit de59fae0043f07de5d25e02ca360f7d57bfa5866 upstream. Fixes: dc6982ff4db1 ("ext4: refactor code to read directory blocks ...") Signed-off-by: Vasily Averin Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 3.9 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f6a63029b24d..aa08e129149d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -124,6 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); + brelse(bh); return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_metadata_csum(inode->i_sb) || -- cgit v1.2.3 From 5becfca9f9714df49b06562b3b6ab434bbcfd3ae Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Oct 2018 10:21:38 -0500 Subject: mount: Retest MNT_LOCKED in do_umount commit 25d202ed820ee347edec0bf3bf553544556bf64b upstream. It was recently pointed out that the one instance of testing MNT_LOCKED outside of the namespace_sem is in ksys_umount. Fix that by adding a test inside of do_umount with namespace_sem and the mount_lock held. As it helps to fail fails the existing test is maintained with an additional comment pointing out that it may be racy because the locks are not held. Cc: stable@vger.kernel.org Reported-by: Al Viro Fixes: 5ff9d8a65ce8 ("vfs: Lock in place mounts from more privileged users") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index b56b50e3da11..416496e2f648 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1584,8 +1584,13 @@ static int do_umount(struct mount *mnt, int flags) namespace_lock(); lock_mount_hash(); - event++; + /* Recheck MNT_LOCKED with the locks held */ + retval = -EINVAL; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + goto out; + + event++; if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); @@ -1599,6 +1604,7 @@ static int do_umount(struct mount *mnt, int flags) retval = 0; } } +out: unlock_mount_hash(); namespace_unlock(); return retval; @@ -1681,7 +1687,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; if (!check_mnt(mnt)) goto dput_and_out; - if (mnt->mnt.mnt_flags & MNT_LOCKED) + if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ goto dput_and_out; retval = -EPERM; if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) -- cgit v1.2.3 From 6258099fd2c9573cde4e3201bef5ef41a56618c5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Oct 2018 09:04:18 -0500 Subject: mount: Don't allow copying MNT_UNBINDABLE|MNT_LOCKED mounts commit df7342b240185d58d3d9665c0bbf0a0f5570ec29 upstream. Jonathan Calmels from NVIDIA reported that he's able to bypass the mount visibility security check in place in the Linux kernel by using a combination of the unbindable property along with the private mount propagation option to allow a unprivileged user to see a path which was purposefully hidden by the root user. Reproducer: # Hide a path to all users using a tmpfs root@castiana:~# mount -t tmpfs tmpfs /sys/devices/ root@castiana:~# # As an unprivileged user, unshare user namespace and mount namespace stgraber@castiana:~$ unshare -U -m -r # Confirm the path is still not accessible root@castiana:~# ls /sys/devices/ # Make /sys recursively unbindable and private root@castiana:~# mount --make-runbindable /sys root@castiana:~# mount --make-private /sys # Recursively bind-mount the rest of /sys over to /mnnt root@castiana:~# mount --rbind /sys/ /mnt # Access our hidden /sys/device as an unprivileged user root@castiana:~# ls /mnt/devices/ breakpoint cpu cstate_core cstate_pkg i915 intel_pt isa kprobe LNXSYSTM:00 msr pci0000:00 platform pnp0 power software system tracepoint uncore_arb uncore_cbox_0 uncore_cbox_1 uprobe virtual Solve this by teaching copy_tree to fail if a mount turns out to be both unbindable and locked. Cc: stable@vger.kernel.org Fixes: 5ff9d8a65ce8 ("vfs: Lock in place mounts from more privileged users") Reported-by: Jonathan Calmels Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 416496e2f648..2d718d552a37 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1765,8 +1765,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, for (s = r; s; s = next_mnt(s, r)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { - s = skip_mnt_tree(s); - continue; + if (s->mnt.mnt_flags & MNT_LOCKED) { + /* Both unbindable and locked. */ + q = ERR_PTR(-EPERM); + goto out; + } else { + s = skip_mnt_tree(s); + continue; + } } if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(s->mnt.mnt_root)) { -- cgit v1.2.3 From f7e6ee2e18cc745f05a9fe678de65ac24a6adcfd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 25 Oct 2018 12:05:11 -0500 Subject: mount: Prevent MNT_DETACH from disconnecting locked mounts commit 9c8e0a1b683525464a2abe9fb4b54404a50ed2b4 upstream. Timothy Baldwin wrote: > As per mount_namespaces(7) unprivileged users should not be able to look under mount points: > > Mounts that come as a single unit from more privileged mount are locked > together and may not be separated in a less privileged mount namespace. > > However they can: > > 1. Create a mount namespace. > 2. In the mount namespace open a file descriptor to the parent of a mount point. > 3. Destroy the mount namespace. > 4. Use the file descriptor to look under the mount point. > > I have reproduced this with Linux 4.16.18 and Linux 4.18-rc8. > > The setup: > > $ sudo sysctl kernel.unprivileged_userns_clone=1 > kernel.unprivileged_userns_clone = 1 > $ mkdir -p A/B/Secret > $ sudo mount -t tmpfs hide A/B > > > "Secret" is indeed hidden as expected: > > $ ls -lR A > A: > total 0 > drwxrwxrwt 2 root root 40 Feb 12 21:08 B > > A/B: > total 0 > > > The attack revealing "Secret": > > $ unshare -Umr sh -c "exec unshare -m ls -lR /proc/self/fd/4/ 4 /proc/self/fd/4/: > total 0 > drwxr-xr-x 3 root root 60 Feb 12 21:08 B > > /proc/self/fd/4/B: > total 0 > drwxr-xr-x 2 root root 40 Feb 12 21:08 Secret > > /proc/self/fd/4/B/Secret: > total 0 I tracked this down to put_mnt_ns running passing UMOUNT_SYNC and disconnecting all of the mounts in a mount namespace. Fix this by factoring drop_mounts out of drop_collected_mounts and passing 0 instead of UMOUNT_SYNC. There are two possible behavior differences that result from this. - No longer setting UMOUNT_SYNC will no longer set MNT_SYNC_UMOUNT on the vfsmounts being unmounted. This effects the lazy rcu walk by kicking the walk out of rcu mode and forcing it to be a non-lazy walk. - No longer disconnecting locked mounts will keep some mounts around longer as they stay because the are locked to other mounts. There are only two users of drop_collected mounts: audit_tree.c and put_mnt_ns. In audit_tree.c the mounts are private and there are no rcu lazy walks only calls to iterate_mounts. So the changes should have no effect except for a small timing effect as the connected mounts are disconnected. In put_mnt_ns there may be references from process outside the mount namespace to the mounts. So the mounts remaining connected will be the bug fix that is needed. That rcu walks are allowed to continue appears not to be a problem especially as the rcu walk change was about an implementation detail not about semantics. Cc: stable@vger.kernel.org Fixes: 5ff9d8a65ce8 ("vfs: Lock in place mounts from more privileged users") Reported-by: Timothy Baldwin Tested-by: Timothy Baldwin Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 2d718d552a37..88c5d5bddf74 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1825,7 +1825,7 @@ void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); lock_mount_hash(); - umount_tree(real_mount(mnt), UMOUNT_SYNC); + umount_tree(real_mount(mnt), 0); unlock_mount_hash(); namespace_unlock(); } -- cgit v1.2.3 From 6023d16fdb84e849d4aa60c2bc1ea294a217d6cb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 9 Nov 2018 15:52:16 +0100 Subject: fuse: fix leaked notify reply commit 7fabaf303458fcabb694999d6fa772cc13d4e217 upstream. fuse_request_send_notify_reply() may fail if the connection was reset for some reason (e.g. fs was unmounted). Don't leak request reference in this case. Besides leaking memory, this resulted in fc->num_waiting not being decremented and hence fuse_wait_aborted() left in a hanging and unkillable state. Fixes: 2d45ba381a74 ("fuse: add retrieve request") Fixes: b8f95e5d13f5 ("fuse: umount should wait for all requests") Reported-and-tested-by: syzbot+6339eda9cb4ebbc4c37b@syzkaller.appspotmail.com Signed-off-by: Miklos Szeredi Cc: #v2.6.36 Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 18e2f54737d1..e566652ac922 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1771,8 +1771,10 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.args[1].size = total_len; err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); - if (err) + if (err) { fuse_retrieve_end(fc, req); + fuse_put_request(fc, req); + } return err; } -- cgit v1.2.3 From 9546d6609e1095af6a5c8728238234e1b6b4d85c Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 1 Jul 2018 13:56:54 -0700 Subject: configfs: replace strncpy with memcpy commit 1823342a1f2b47a4e6f5667f67cd28ab6bc4d6cd upstream. gcc 8.1.0 complains: fs/configfs/symlink.c:67:3: warning: 'strncpy' output truncated before terminating nul copying as many bytes from a string as its length fs/configfs/symlink.c: In function 'configfs_get_link': fs/configfs/symlink.c:63:13: note: length computed here Using strncpy() is indeed less than perfect since the length of data to be copied has already been determined with strlen(). Replace strncpy() with memcpy() to address the warning and optimize the code a little. Signed-off-by: Guenter Roeck Signed-off-by: Christoph Hellwig Signed-off-by: Nobuhiro Iwamatsu Signed-off-by: Greg Kroah-Hartman --- fs/configfs/symlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 0525ebc3aea2..66e8c5d58b21 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -64,7 +64,7 @@ static void fill_item_path(struct config_item * item, char * buffer, int length) /* back up enough to print this bus id with '/' */ length -= cur; - strncpy(buffer + length,config_item_name(p),cur); + memcpy(buffer + length, config_item_name(p), cur); *(buffer + --length) = '/'; } } -- cgit v1.2.3 From 235fbd9c76b5b96bf4b483b65c42e1c2388b3ae8 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 30 Oct 2018 15:06:38 -0700 Subject: reiserfs: propagate errors from fill_with_dentries() properly [ Upstream commit b10298d56c9623f9b173f19959732d3184b35f4f ] fill_with_dentries() failed to propagate errors up to reiserfs_for_each_xattr() properly. Plumb them through. Note that reiserfs_for_each_xattr() is only used by reiserfs_delete_xattrs() and reiserfs_chown_xattrs(). The result of reiserfs_delete_xattrs() is discarded anyway, the only difference there is whether a warning is printed to dmesg. The result of reiserfs_chown_xattrs() does matter because it can block chowning of the file to which the xattrs belong; but either way, the resulting state can have misaligned ownership, so my patch doesn't improve things greatly. Credit for making me look at this code goes to Al Viro, who pointed out that the ->actor calling convention is suboptimal and should be changed. Link: http://lkml.kernel.org/r/20180802163335.83312-1-jannh@google.com Signed-off-by: Jann Horn Reviewed-by: Andrew Morton Cc: Jeff Mahoney Cc: Eric Biggers Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/reiserfs/xattr.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8b32fdaad468..d424b3d4bf3b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -184,6 +184,7 @@ struct reiserfs_dentry_buf { struct dir_context ctx; struct dentry *xadir; int count; + int err; struct dentry *dentries[8]; }; @@ -206,6 +207,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { + dbuf->err = PTR_ERR(dentry); return PTR_ERR(dentry); } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? */ @@ -214,6 +216,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, "not found for file %pd.\n", dentry, dbuf->xadir); dput(dentry); + dbuf->err = -EIO; return -EIO; } @@ -261,6 +264,10 @@ static int reiserfs_for_each_xattr(struct inode *inode, err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); if (err) break; + if (buf.err) { + err = buf.err; + break; + } if (!buf.count) break; for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { -- cgit v1.2.3 From b4cb4ecd6af40b77b39f8028325f36ceff8292fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Tue, 30 Oct 2018 15:06:07 -0700 Subject: hfs: prevent btree data loss on root split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d057c036672f33d43a5f7344acbb08cf3a8a0c09 ] This bug is triggered whenever hfs_brec_update_parent() needs to split the root node. The height of the btree is not increased, which leaves the new node orphaned and its records lost. It is not possible for this to happen on a valid hfs filesystem because the index nodes have fixed length keys. For reasons I ignore, the hfs module does have support for a number of hfsplus features. A corrupt btree header may report variable length keys and trigger this bug, so it's better to fix it. Link: http://lkml.kernel.org/r/9750b1415685c4adca10766895f6d5ef12babdb0.1535682463.git.ernesto.mnd.fernandez@gmail.com Signed-off-by: Ernesto A. Fernández Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/hfs/brec.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 2a6f3c67cb3f..2e713673df42 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -424,6 +424,10 @@ skip: if (new_node) { __be32 cnid; + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index key and entry */ hfs_bnode_read_key(new_node, fd->search_key, 14); -- cgit v1.2.3 From 5f427ca479c33907abceb3e767cf1a76a5f5bffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Tue, 30 Oct 2018 15:06:00 -0700 Subject: hfsplus: prevent btree data loss on root split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 0a3021d4f5295aa073c7bf5c5e4de60a2e292578 ] Creating, renaming or deleting a file may cause catalog corruption and data loss. This bug is randomly triggered by xfstests generic/027, but here is a faster reproducer: truncate -s 50M fs.iso mkfs.hfsplus fs.iso mount fs.iso /mnt i=100 while [ $i -le 150 ]; do touch /mnt/$i &>/dev/null ((++i)) done i=100 while [ $i -le 150 ]; do mv /mnt/$i /mnt/$(perl -e "print $i x82") &>/dev/null ((++i)) done umount /mnt fsck.hfsplus -n fs.iso The bug is triggered whenever hfs_brec_update_parent() needs to split the root node. The height of the btree is not increased, which leaves the new node orphaned and its records lost. Link: http://lkml.kernel.org/r/26d882184fc43043a810114258f45277752186c7.1535682461.git.ernesto.mnd.fernandez@gmail.com Signed-off-by: Ernesto A. Fernández Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/hfsplus/brec.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 754fdf8c6356..1002a0c08319 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -427,6 +427,10 @@ skip: if (new_node) { __be32 cnid; + if (!new_node->parent) { + hfs_btree_inc_height(tree); + new_node->parent = tree->root; + } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index key and entry */ hfs_bnode_read_key(new_node, fd->search_key, 14); -- cgit v1.2.3 From b45cb18117893965cbe3191f4f308aa7dd3cd689 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 13 Jun 2018 12:05:13 +0800 Subject: fs/exofs: fix potential memory leak in mount option parsing [ Upstream commit 515f1867addaba49c1c6ac73abfaffbc192c1db4 ] There are some cases can cause memory leak when parsing option 'osdname'. Signed-off-by: Chengguang Xu Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/exofs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index b795c567b5e1..360ba74e04e6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -100,6 +100,7 @@ static int parse_options(char *options, struct exofs_mountopt *opts) token = match_token(p, tokens, args); switch (token) { case Opt_name: + kfree(opts->dev_name); opts->dev_name = match_strdup(&args[0]); if (unlikely(!opts->dev_name)) { EXOFS_ERR("Error allocating dev_name"); @@ -868,8 +869,10 @@ static struct dentry *exofs_mount(struct file_system_type *type, int ret; ret = parse_options(data, &opts); - if (ret) + if (ret) { + kfree(opts.dev_name); return ERR_PTR(ret); + } if (!opts.dev_name) opts.dev_name = dev_name; -- cgit v1.2.3 From 00112fda0835dda03eb88ad989d735334af9ecbf Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 19 Nov 2018 17:24:36 +0100 Subject: gfs2: Put bitmap buffers in put_super commit 10283ea525d30f2e99828978fd04d8427876a7ad upstream. gfs2_put_super calls gfs2_clear_rgrpd to destroy the gfs2_rgrpd objects attached to the resource group glocks. That function should release the buffers attached to the gfs2_bitmap objects (bi_bh), but the call to gfs2_rgrp_brelse for doing that is missing. When gfs2_releasepage later runs across these buffers which are still referenced, it refuses to free them. This causes the pages the buffers are attached to to remain referenced as well. With enough mount/unmount cycles, the system will eventually run out of memory. Fix this by adding the missing call to gfs2_rgrp_brelse in gfs2_clear_rgrpd. (Also fix a gfs2_rgrp_relse -> gfs2_rgrp_brelse typo in a comment.) Fixes: 39b0f1e92908 ("GFS2: Don't brelse rgrp buffer_heads every allocation") Cc: stable@vger.kernel.org # v4.4 Signed-off-by: Andreas Gruenbacher Signed-off-by: Sasha Levin --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c134c0462cee..ef24894edecc 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -732,6 +732,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) spin_lock(&gl->gl_lockref.lock); gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); + gfs2_rgrp_brelse(rgd); gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } @@ -1139,7 +1140,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd) * @rgd: the struct gfs2_rgrpd describing the RG to read in * * Read in all of a Resource Group's header and bitmap blocks. - * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. + * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps. * * Returns: errno */ -- cgit v1.2.3 From 37944cba362d17a3d8ff1180f9fd2047299e9f57 Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Tue, 20 Nov 2018 11:25:10 +0800 Subject: btrfs: fix pinned underflow after transaction aborted commit fcd5e74288f7d36991b1f0fb96b8c57079645e38 upstream. When running generic/475, we may get the following warning in dmesg: [ 6902.102154] WARNING: CPU: 3 PID: 18013 at fs/btrfs/extent-tree.c:9776 btrfs_free_block_groups+0x2af/0x3b0 [btrfs] [ 6902.109160] CPU: 3 PID: 18013 Comm: umount Tainted: G W O 4.19.0-rc8+ #8 [ 6902.110971] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 [ 6902.112857] RIP: 0010:btrfs_free_block_groups+0x2af/0x3b0 [btrfs] [ 6902.118921] RSP: 0018:ffffc9000459bdb0 EFLAGS: 00010286 [ 6902.120315] RAX: ffff880175050bb0 RBX: ffff8801124a8000 RCX: 0000000000170007 [ 6902.121969] RDX: 0000000000000002 RSI: 0000000000170007 RDI: ffffffff8125fb74 [ 6902.123716] RBP: ffff880175055d10 R08: 0000000000000000 R09: 0000000000000000 [ 6902.125417] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880175055d88 [ 6902.127129] R13: ffff880175050bb0 R14: 0000000000000000 R15: dead000000000100 [ 6902.129060] FS: 00007f4507223780(0000) GS:ffff88017ba00000(0000) knlGS:0000000000000000 [ 6902.130996] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6902.132558] CR2: 00005623599cac78 CR3: 000000014b700001 CR4: 00000000003606e0 [ 6902.134270] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 6902.135981] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 6902.137836] Call Trace: [ 6902.138939] close_ctree+0x171/0x330 [btrfs] [ 6902.140181] ? kthread_stop+0x146/0x1f0 [ 6902.141277] generic_shutdown_super+0x6c/0x100 [ 6902.142517] kill_anon_super+0x14/0x30 [ 6902.143554] btrfs_kill_super+0x13/0x100 [btrfs] [ 6902.144790] deactivate_locked_super+0x2f/0x70 [ 6902.146014] cleanup_mnt+0x3b/0x70 [ 6902.147020] task_work_run+0x9e/0xd0 [ 6902.148036] do_syscall_64+0x470/0x600 [ 6902.149142] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 6902.150375] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 6902.151640] RIP: 0033:0x7f45077a6a7b [ 6902.157324] RSP: 002b:00007ffd589f3e68 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 6902.159187] RAX: 0000000000000000 RBX: 000055e8eec732b0 RCX: 00007f45077a6a7b [ 6902.160834] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000055e8eec73490 [ 6902.162526] RBP: 0000000000000000 R08: 000055e8eec734b0 R09: 00007ffd589f26c0 [ 6902.164141] R10: 0000000000000000 R11: 0000000000000246 R12: 000055e8eec73490 [ 6902.165815] R13: 00007f4507ac61a4 R14: 0000000000000000 R15: 00007ffd589f40d8 [ 6902.167553] irq event stamp: 0 [ 6902.168998] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 6902.170731] hardirqs last disabled at (0): [] copy_process.part.55+0x3b0/0x1f00 [ 6902.172773] softirqs last enabled at (0): [] copy_process.part.55+0x3b0/0x1f00 [ 6902.174671] softirqs last disabled at (0): [<0000000000000000>] (null) [ 6902.176407] ---[ end trace 463138c2986b275c ]--- [ 6902.177636] BTRFS info (device dm-3): space_info 4 has 273465344 free, is not full [ 6902.179453] BTRFS info (device dm-3): space_info total=276824064, used=4685824, pinned=18446744073708158976, reserved=0, may_use=0, readonly=65536 In the above line there's "pinned=18446744073708158976" which is an unsigned u64 value of -1392640, an obvious underflow. When transaction_kthread is running cleanup_transaction(), another fsstress is running btrfs_commit_transaction(). The btrfs_finish_extent_commit() may get the same range as btrfs_destroy_pinned_extent() got, which causes the pinned underflow. Fixes: d4b450cd4b33 ("Btrfs: fix race between transaction commit and empty block group removal") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Lu Fengqi Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/disk-io.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8dbb00fbb00b..b0875ef48522 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4333,6 +4333,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, static int btrfs_destroy_pinned_extent(struct btrfs_root *root, struct extent_io_tree *pinned_extents) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; u64 start; u64 end; @@ -4342,21 +4343,31 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, unpin = pinned_extents; again: while (1) { + /* + * The btrfs_finish_extent_commit() may get the same range as + * ours between find_first_extent_bit and clear_extent_dirty. + * Hence, hold the unused_bg_unpin_mutex to avoid double unpin + * the same extent range. + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); - if (ret) + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } clear_extent_dirty(unpin, start, end, GFP_NOFS); btrfs_error_unpin_extent_range(root, start, end); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } if (loop) { - if (unpin == &root->fs_info->freed_extents[0]) - unpin = &root->fs_info->freed_extents[1]; + if (unpin == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; else - unpin = &root->fs_info->freed_extents[0]; + unpin = &fs_info->freed_extents[0]; loop = false; goto again; } -- cgit v1.2.3 From ebd55db37fa7856ddc182da5587dd2189940d987 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Mon, 27 Aug 2018 15:12:05 +0900 Subject: v9fs_dir_readdir: fix double-free on p9stat_read error commit 81c99089bce693b94b775b6eb888115d2d540086 upstream. p9stat_read will call p9stat_free on error, we should only free the struct content on success. There also is no need to "p9stat_init" st as the read function will zero the whole struct for us anyway, so clean up the code a bit while we are here. Link: http://lkml.kernel.org/r/1535410108-20650-1-git-send-email-asmadeus@codewreck.org Signed-off-by: Dominique Martinet Reported-by: syzbot+d4252148d198410b864f@syzkaller.appspotmail.com Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_dir.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 5cc00e56206e..7d889f56b8e7 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -76,15 +76,6 @@ static inline int dt_type(struct p9_wstat *mistat) return rettype; } -static void p9stat_init(struct p9_wstat *stbuf) -{ - stbuf->name = NULL; - stbuf->uid = NULL; - stbuf->gid = NULL; - stbuf->muid = NULL; - stbuf->extension = NULL; -} - /** * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir * @filp: opened file structure @@ -145,12 +136,10 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) rdir->tail = n; } while (rdir->head < rdir->tail) { - p9stat_init(&st); err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); if (err) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - p9stat_free(&st); return -EIO; } reclen = st.size+2; -- cgit v1.2.3 From 9891fda685f1a9da69012bb39aad81b67870cf5e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 2 Nov 2018 15:48:42 -0700 Subject: bfs: add sanity check at bfs_fill_super() commit 9f2df09a33aa2c76ce6385d382693f98d7f2f07e upstream. syzbot is reporting too large memory allocation at bfs_fill_super() [1]. Since file system image is corrupted such that bfs_sb->s_start == 0, bfs_fill_super() is trying to allocate 8MB of continuous memory. Fix this by adding a sanity check on bfs_sb->s_start, __GFP_NOWARN and printf(). [1] https://syzkaller.appspot.com/bug?id=16a87c236b951351374a84c8a32f40edbc034e96 Link: http://lkml.kernel.org/r/1525862104-3407-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reported-by: syzbot Reviewed-by: Andrew Morton Cc: Tigran Aivazian Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/bfs/inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fdcb4d69f430..4714c55c1ae5 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -350,7 +350,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_magic = BFS_MAGIC; - if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || + le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) { printf("Superblock is corrupted\n"); goto out1; } @@ -359,9 +360,11 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; imap_len = (info->si_lasti / 8) + 1; - info->si_imap = kzalloc(imap_len, GFP_KERNEL); - if (!info->si_imap) + info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN); + if (!info->si_imap) { + printf("Cannot allocate %u bytes\n", imap_len); goto out1; + } for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); -- cgit v1.2.3 From b689a81a4c2ac139a3fdb92b08aab6dd73bc6108 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Mon, 8 Oct 2018 07:52:43 -0500 Subject: gfs2: Don't leave s_fs_info pointing to freed memory in init_sbd commit 4c62bd9cea7bcf10292f7e4c57a2bca332942697 upstream. When alloc_percpu() fails, sdp gets freed but sb->s_fs_info still points to the same address. Move the assignment after that error check so that s_fs_info can only point to a valid sdp or NULL, which is checked for later in the error path, in gfs2_kill_super(). Reported-by: syzbot+dcb8b3587445007f5808@syzkaller.appspotmail.com Signed-off-by: Andrew Price Signed-off-by: Bob Peterson Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/ops_fstype.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index d9178388cf48..de7143e2b361 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -71,13 +71,13 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) if (!sdp) return NULL; - sb->s_fs_info = sdp; sdp->sd_vfs = sb; sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats); if (!sdp->sd_lkstats) { kfree(sdp); return NULL; } + sb->s_fs_info = sdp; set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); -- cgit v1.2.3 From 522ac5232ad7b9cd644ebbd9d7ec476e0c148284 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 7 Sep 2018 14:16:24 +0800 Subject: btrfs: Ensure btrfs_trim_fs can trim the whole filesystem commit 6ba9fc8e628becf0e3ec94083450d089b0dec5f5 upstream. [BUG] fstrim on some btrfs only trims the unallocated space, not trimming any space in existing block groups. [CAUSE] Before fstrim_range passed to btrfs_trim_fs(), it gets truncated to range [0, super->total_bytes). So later btrfs_trim_fs() will only be able to trim block groups in range [0, super->total_bytes). While for btrfs, any bytenr aligned to sectorsize is valid, since btrfs uses its logical address space, there is nothing limiting the location where we put block groups. For filesystem with frequent balance, it's quite easy to relocate all block groups and bytenr of block groups will start beyond super->total_bytes. In that case, btrfs will not trim existing block groups. [FIX] Just remove the truncation in btrfs_ioctl_fitrim(), so btrfs_trim_fs() can get the unmodified range, which is normally set to [0, U64_MAX]. Reported-by: Chris Murphy Fixes: f4c697e6406d ("btrfs: return EINVAL if start > total_bytes in fitrim ioctl") CC: # v4.4+ Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 10 +--------- fs/btrfs/ioctl.c | 11 +++++++---- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 80cd28456f08..13ff0fdae03e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10708,17 +10708,9 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret = 0; - /* - * try to trim all FS space, our block group may start from non-zero. - */ - if (range->len == total_bytes) - cache = btrfs_lookup_first_block_group(fs_info, range->start); - else - cache = btrfs_lookup_block_group(fs_info, range->start); - + cache = btrfs_lookup_first_block_group(fs_info, range->start); while (cache) { if (cache->key.objectid >= (range->start + range->len)) { btrfs_put_block_group(cache); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 150d3c891815..3379490ce54d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,7 +378,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -402,11 +401,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return -EOPNOTSUPP; if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; - if (range.start > total_bytes || - range.len < fs_info->sb->s_blocksize) + + /* + * NOTE: Don't truncate the range using super->total_bytes. Bytenr of + * block group is in the logical address space, which can be any + * sectorsize aligned bytenr in the range [0, U64_MAX]. + */ + if (range.len < fs_info->sb->s_blocksize) return -EINVAL; - range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info->tree_root, &range); if (ret < 0) -- cgit v1.2.3 From 3658ccbbac39cc634e357ee08ff46d0893cbc111 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Thu, 23 Aug 2018 17:00:35 -0700 Subject: namei: allow restricted O_CREAT of FIFOs and regular files commit 30aba6656f61ed44cba445a3c0d38b296fa9e8f5 upstream. Disallows open of FIFOs or regular files not owned by the user in world writable sticky directories, unless the owner is the same as that of the directory or the file is opened without the O_CREAT flag. The purpose is to make data spoofing attacks harder. This protection can be turned on and off separately for FIFOs and regular files via sysctl, just like the symlinks/hardlinks protection. This patch is based on Openwall's "HARDEN_FIFO" feature by Solar Designer. This is a brief list of old vulnerabilities that could have been prevented by this feature, some of them even allow for privilege escalation: CVE-2000-1134 CVE-2007-3852 CVE-2008-0525 CVE-2009-0416 CVE-2011-4834 CVE-2015-1838 CVE-2015-7442 CVE-2016-7489 This list is not meant to be complete. It's difficult to track down all vulnerabilities of this kind because they were often reported without any mention of this particular attack vector. In fact, before hardlinks/symlinks restrictions, fifos/regular files weren't the favorite vehicle to exploit them. [s.mesoraca16@gmail.com: fix bug reported by Dan Carpenter] Link: https://lkml.kernel.org/r/20180426081456.GA7060@mwanda Link: http://lkml.kernel.org/r/1524829819-11275-1-git-send-email-s.mesoraca16@gmail.com [keescook@chromium.org: drop pr_warn_ratelimited() in favor of audit changes in the future] [keescook@chromium.org: adjust commit subjet] Link: http://lkml.kernel.org/r/20180416175918.GA13494@beast Signed-off-by: Salvatore Mesoraca Signed-off-by: Kees Cook Suggested-by: Solar Designer Suggested-by: Kees Cook Cc: Al Viro Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Loic Signed-off-by: Greg Kroah-Hartman --- fs/namei.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index de57dd59d95f..40049d61ef37 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -869,6 +869,8 @@ static inline void put_link(struct nameidata *nd) int sysctl_protected_symlinks __read_mostly = 0; int sysctl_protected_hardlinks __read_mostly = 0; +int sysctl_protected_fifos __read_mostly; +int sysctl_protected_regular __read_mostly; /** * may_follow_link - Check symlink following for unsafe situations @@ -982,6 +984,45 @@ static int may_linkat(struct path *link) return -EPERM; } +/** + * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory + * should be allowed, or not, on files that already + * exist. + * @dir: the sticky parent directory + * @inode: the inode of the file to open + * + * Block an O_CREAT open of a FIFO (or a regular file) when: + * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled + * - the file already exists + * - we are in a sticky directory + * - we don't own the file + * - the owner of the directory doesn't own the file + * - the directory is world writable + * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 + * the directory doesn't have to be world writable: being group writable will + * be enough. + * + * Returns 0 if the open is allowed, -ve on error. + */ +static int may_create_in_sticky(struct dentry * const dir, + struct inode * const inode) +{ + if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || + (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || + likely(!(dir->d_inode->i_mode & S_ISVTX)) || + uid_eq(inode->i_uid, dir->d_inode->i_uid) || + uid_eq(current_fsuid(), inode->i_uid)) + return 0; + + if (likely(dir->d_inode->i_mode & 0002) || + (dir->d_inode->i_mode & 0020 && + ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || + (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { + return -EACCES; + } + return 0; +} + static __always_inline const char *get_link(struct nameidata *nd) { @@ -3166,9 +3207,15 @@ finish_open: error = -ELOOP; goto out; } - error = -EISDIR; - if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) - goto out; + if (open_flag & O_CREAT) { + error = -EISDIR; + if (d_is_dir(nd->path.dentry)) + goto out; + error = may_create_in_sticky(dir, + d_backing_inode(nd->path.dentry)); + if (unlikely(error)) + goto out; + } error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; -- cgit v1.2.3