summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c61
-rw-r--r--fs/9p/fid.h6
-rw-r--r--fs/9p/vfs_addr.c4
-rw-r--r--fs/9p/vfs_dentry.c4
-rw-r--r--fs/9p/vfs_dir.c2
-rw-r--r--fs/9p/vfs_file.c9
-rw-r--r--fs/9p/vfs_inode.c89
-rw-r--r--fs/9p/vfs_inode_dotl.c82
-rw-r--r--fs/9p/vfs_super.c8
-rw-r--r--fs/9p/xattr.c8
-rw-r--r--fs/Kconfig12
-rw-r--r--fs/afs/cell.c61
-rw-r--r--fs/afs/cmservice.c4
-rw-r--r--fs/afs/flock.c2
-rw-r--r--fs/afs/fsclient.c2
-rw-r--r--fs/afs/inode.c2
-rw-r--r--fs/afs/internal.h19
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/proc.c6
-rw-r--r--fs/afs/rxrpc.c38
-rw-r--r--fs/afs/server.c46
-rw-r--r--fs/afs/vl_list.c19
-rw-r--r--fs/afs/volume.c21
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/afs/yfsclient.c3
-rw-r--r--fs/attr.c2
-rw-r--r--fs/autofs/autofs_i.h7
-rw-r--r--fs/autofs/expire.c2
-rw-r--r--fs/autofs/inode.c1
-rw-r--r--fs/autofs/root.c108
-rw-r--r--fs/btrfs/block-group.c51
-rw-r--r--fs/btrfs/block-group.h4
-rw-r--r--fs/btrfs/ctree.c3
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/dev-replace.c5
-rw-r--r--fs/btrfs/disk-io.c83
-rw-r--r--fs/btrfs/disk-io.h10
-rw-r--r--fs/btrfs/extent-tree.c48
-rw-r--r--fs/btrfs/extent_io.c44
-rw-r--r--fs/btrfs/file.c2
-rw-r--r--fs/btrfs/inode.c21
-rw-r--r--fs/btrfs/locking.c91
-rw-r--r--fs/btrfs/locking.h14
-rw-r--r--fs/btrfs/relocation.c9
-rw-r--r--fs/btrfs/root-tree.c5
-rw-r--r--fs/btrfs/space-info.c2
-rw-r--r--fs/btrfs/super.c2
-rw-r--r--fs/btrfs/tree-checker.c25
-rw-r--r--fs/btrfs/tree-log.c8
-rw-r--r--fs/btrfs/volumes.c8
-rw-r--r--fs/btrfs/xattr.c3
-rw-r--r--fs/btrfs/zoned.c99
-rw-r--r--fs/cachefiles/internal.h1
-rw-r--r--fs/cachefiles/ondemand.c22
-rw-r--r--fs/ceph/addr.c61
-rw-r--r--fs/ceph/caps.c38
-rw-r--r--fs/ceph/dir.c79
-rw-r--r--fs/ceph/file.c132
-rw-r--r--fs/ceph/inode.c13
-rw-r--r--fs/ceph/mds_client.c165
-rw-r--r--fs/ceph/mds_client.h13
-rw-r--r--fs/ceph/mdsmap.c22
-rw-r--r--fs/ceph/super.c19
-rw-r--r--fs/ceph/super.h31
-rw-r--r--fs/ceph/xattr.c12
-rw-r--r--fs/cifs/Makefile6
-rw-r--r--fs/cifs/cached_dir.c388
-rw-r--r--fs/cifs/cached_dir.h64
-rw-r--r--fs/cifs/cifs_debug.c74
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifsencrypt.c9
-rw-r--r--fs/cifs/cifsfs.c80
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h186
-rw-r--r--fs/cifs/cifsproto.h13
-rw-r--r--fs/cifs/cifsroot.c2
-rw-r--r--fs/cifs/cifssmb.c477
-rw-r--r--fs/cifs/connect.c326
-rw-r--r--fs/cifs/dfs_cache.c8
-rw-r--r--fs/cifs/dir.c8
-rw-r--r--fs/cifs/file.c316
-rw-r--r--fs/cifs/fs_context.c9
-rw-r--r--fs/cifs/fs_context.h8
-rw-r--r--fs/cifs/fscache.h16
-rw-r--r--fs/cifs/inode.c65
-rw-r--r--fs/cifs/ioctl.c2
-rw-r--r--fs/cifs/link.c8
-rw-r--r--fs/cifs/misc.c64
-rw-r--r--fs/cifs/netmisc.c4
-rw-r--r--fs/cifs/readdir.c11
-rw-r--r--fs/cifs/sess.c5
-rw-r--r--fs/cifs/smb1ops.c10
-rw-r--r--fs/cifs/smb2file.c1
-rw-r--r--fs/cifs/smb2inode.c11
-rw-r--r--fs/cifs/smb2misc.c62
-rw-r--r--fs/cifs/smb2ops.c552
-rw-r--r--fs/cifs/smb2pdu.c85
-rw-r--r--fs/cifs/smb2proto.h16
-rw-r--r--fs/cifs/smb2transport.c38
-rw-r--r--fs/cifs/transport.c341
-rw-r--r--fs/cifs/xattr.c5
-rw-r--r--fs/crypto/fname.c36
-rw-r--r--fs/crypto/fscrypt_private.h9
-rw-r--r--fs/crypto/hooks.c6
-rw-r--r--fs/crypto/policy.c35
-rw-r--r--fs/dax.c401
-rw-r--r--fs/dcache.c87
-rw-r--r--fs/direct-io.c5
-rw-r--r--fs/erofs/fscache.c8
-rw-r--r--fs/erofs/internal.h29
-rw-r--r--fs/erofs/super.c10
-rw-r--r--fs/erofs/utils.c2
-rw-r--r--fs/erofs/zmap.c16
-rw-r--r--fs/eventpoll.c22
-rw-r--r--fs/exec.c17
-rw-r--r--fs/exfat/exfat_fs.h19
-rw-r--r--fs/exfat/fatent.c2
-rw-r--r--fs/exfat/file.c82
-rw-r--r--fs/exfat/inode.c41
-rw-r--r--fs/exfat/misc.c17
-rw-r--r--fs/exfat/namei.c22
-rw-r--r--fs/exfat/nls.c4
-rw-r--r--fs/exfat/super.c4
-rw-r--r--fs/ext2/super.c7
-rw-r--r--fs/ext2/xattr.c170
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/ext4.h16
-rw-r--r--fs/ext4/ext4_jbd2.c3
-rw-r--r--fs/ext4/extents_status.c3
-rw-r--r--fs/ext4/fast_commit.c44
-rw-r--r--fs/ext4/indirect.c4
-rw-r--r--fs/ext4/inline.c33
-rw-r--r--fs/ext4/inode.c26
-rw-r--r--fs/ext4/ioctl.c105
-rw-r--r--fs/ext4/mballoc.c31
-rw-r--r--fs/ext4/migrate.c4
-rw-r--r--fs/ext4/namei.c30
-rw-r--r--fs/ext4/orphan.c24
-rw-r--r--fs/ext4/resize.c39
-rw-r--r--fs/ext4/super.c22
-rw-r--r--fs/ext4/symlink.c15
-rw-r--r--fs/ext4/xattr.c168
-rw-r--r--fs/ext4/xattr.h16
-rw-r--r--fs/f2fs/compress.c229
-rw-r--r--fs/f2fs/data.c82
-rw-r--r--fs/f2fs/debug.c2
-rw-r--r--fs/f2fs/f2fs.h102
-rw-r--r--fs/f2fs/file.c79
-rw-r--r--fs/f2fs/gc.c11
-rw-r--r--fs/f2fs/gc.h21
-rw-r--r--fs/f2fs/inode.c3
-rw-r--r--fs/f2fs/node.c14
-rw-r--r--fs/f2fs/segment.c79
-rw-r--r--fs/f2fs/segment.h11
-rw-r--r--fs/f2fs/super.c92
-rw-r--r--fs/f2fs/sysfs.c56
-rw-r--r--fs/fat/namei_vfat.c231
-rw-r--r--fs/fs-writeback.c12
-rw-r--r--fs/fscache/cookie.c9
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/dax.c2
-rw-r--r--fs/fuse/dev.c7
-rw-r--r--fs/fuse/dir.c16
-rw-r--r--fs/fuse/file.c44
-rw-r--r--fs/fuse/inode.c16
-rw-r--r--fs/fuse/ioctl.c15
-rw-r--r--fs/fuse/virtio_fs.c9
-rw-r--r--fs/gfs2/aops.c26
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/gfs2/file.c5
-rw-r--r--fs/gfs2/glock.c202
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c31
-rw-r--r--fs/gfs2/incore.h6
-rw-r--r--fs/gfs2/lock_dlm.c2
-rw-r--r--fs/gfs2/log.c5
-rw-r--r--fs/gfs2/main.c3
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/gfs2/quota.c28
-rw-r--r--fs/gfs2/rgrp.c12
-rw-r--r--fs/gfs2/rgrp.h5
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hugetlbfs/inode.c44
-rw-r--r--fs/inode.c58
-rw-r--r--fs/iomap/buffered-io.c15
-rw-r--r--fs/iomap/direct-io.c2
-rw-r--r--fs/jbd2/checkpoint.c6
-rw-r--r--fs/jbd2/commit.c32
-rw-r--r--fs/jbd2/journal.c44
-rw-r--r--fs/jbd2/recovery.c30
-rw-r--r--fs/jbd2/revoke.c8
-rw-r--r--fs/jbd2/transaction.c40
-rw-r--r--fs/kernel_read_file.c38
-rw-r--r--fs/ksmbd/auth.c56
-rw-r--r--fs/ksmbd/auth.h11
-rw-r--r--fs/ksmbd/connection.c9
-rw-r--r--fs/ksmbd/connection.h10
-rw-r--r--fs/ksmbd/ksmbd_netlink.h2
-rw-r--r--fs/ksmbd/mgmt/share_config.c20
-rw-r--r--fs/ksmbd/mgmt/share_config.h3
-rw-r--r--fs/ksmbd/mgmt/tree_connect.c21
-rw-r--r--fs/ksmbd/mgmt/tree_connect.h4
-rw-r--r--fs/ksmbd/mgmt/user_session.c95
-rw-r--r--fs/ksmbd/mgmt/user_session.h13
-rw-r--r--fs/ksmbd/oplock.c46
-rw-r--r--fs/ksmbd/server.c8
-rw-r--r--fs/ksmbd/smb2misc.c12
-rw-r--r--fs/ksmbd/smb2pdu.c147
-rw-r--r--fs/ksmbd/smb_common.h2
-rw-r--r--fs/ksmbd/smbacl.c130
-rw-r--r--fs/ksmbd/smbacl.h2
-rw-r--r--fs/ksmbd/vfs.c8
-rw-r--r--fs/ksmbd/vfs_cache.c2
-rw-r--r--fs/lockd/svc4proc.c12
-rw-r--r--fs/lockd/svclock.c10
-rw-r--r--fs/lockd/svcproc.c5
-rw-r--r--fs/lockd/xdr4.c19
-rw-r--r--fs/locks.c1
-rw-r--r--fs/mbcache.c125
-rw-r--r--fs/namei.c80
-rw-r--r--fs/namespace.c7
-rw-r--r--fs/nfs/blocklayout/dev.c42
-rw-r--r--fs/nfs/client.c13
-rw-r--r--fs/nfs/dir.c83
-rw-r--r--fs/nfs/direct.c58
-rw-r--r--fs/nfs/file.c17
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c4
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c6
-rw-r--r--fs/nfs/fs_context.c26
-rw-r--r--fs/nfs/inode.c1
-rw-r--r--fs/nfs/internal.h51
-rw-r--r--fs/nfs/nfs3client.c1
-rw-r--r--fs/nfs/nfs42xattr.c7
-rw-r--r--fs/nfs/nfs42xdr.c170
-rw-r--r--fs/nfs/nfs4client.c4
-rw-r--r--fs/nfs/nfs4file.c6
-rw-r--r--fs/nfs/nfs4idmap.c46
-rw-r--r--fs/nfs/nfs4proc.c32
-rw-r--r--fs/nfs/nfstrace.h215
-rw-r--r--fs/nfs/pnfs.c1
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/write.c64
-rw-r--r--fs/nfsd/acl.h6
-rw-r--r--fs/nfsd/filecache.c751
-rw-r--r--fs/nfsd/filecache.h11
-rw-r--r--fs/nfsd/netns.h3
-rw-r--r--fs/nfsd/nfs2acl.c6
-rw-r--r--fs/nfsd/nfs3acl.c4
-rw-r--r--fs/nfsd/nfs3proc.c35
-rw-r--r--fs/nfsd/nfs4acl.c46
-rw-r--r--fs/nfsd/nfs4callback.c37
-rw-r--r--fs/nfsd/nfs4proc.c330
-rw-r--r--fs/nfsd/nfs4state.c127
-rw-r--r--fs/nfsd/nfs4xdr.c123
-rw-r--r--fs/nfsd/nfscache.c3
-rw-r--r--fs/nfsd/nfsctl.c21
-rw-r--r--fs/nfsd/nfsd.h6
-rw-r--r--fs/nfsd/nfsfh.c27
-rw-r--r--fs/nfsd/nfsfh.h58
-rw-r--r--fs/nfsd/nfsproc.c27
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/trace.h327
-rw-r--r--fs/nfsd/vfs.c256
-rw-r--r--fs/nfsd/vfs.h33
-rw-r--r--fs/nfsd/xdr4.h60
-rw-r--r--fs/ntfs3/attrib.c557
-rw-r--r--fs/ntfs3/bitmap.c12
-rw-r--r--fs/ntfs3/file.c110
-rw-r--r--fs/ntfs3/frecord.c128
-rw-r--r--fs/ntfs3/fslog.c4
-rw-r--r--fs/ntfs3/fsntfs.c92
-rw-r--r--fs/ntfs3/index.c33
-rw-r--r--fs/ntfs3/inode.c19
-rw-r--r--fs/ntfs3/namei.c6
-rw-r--r--fs/ntfs3/ntfs_fs.h16
-rw-r--r--fs/ntfs3/record.c27
-rw-r--r--fs/ntfs3/run.c108
-rw-r--r--fs/ntfs3/super.c17
-rw-r--r--fs/ntfs3/xattr.c51
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c14
-rw-r--r--fs/ocfs2/dlmglue.c8
-rw-r--r--fs/ocfs2/heartbeat.c27
-rw-r--r--fs/ocfs2/namei.c1
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/super.c3
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/inode.c15
-rw-r--r--fs/overlayfs/namei.c4
-rw-r--r--fs/overlayfs/overlayfs.h6
-rw-r--r--fs/overlayfs/super.c13
-rw-r--r--fs/posix_acl.c15
-rw-r--r--fs/proc/array.c5
-rw-r--r--fs/proc/base.c46
-rw-r--r--fs/proc/inode.c22
-rw-r--r--fs/proc/kmsg.c1
-rw-r--r--fs/proc/nommu.c1
-rw-r--r--fs/proc/proc_net.c9
-rw-r--r--fs/proc/proc_tty.c2
-rw-r--r--fs/proc/root.c8
-rw-r--r--fs/proc/task_mmu.c14
-rw-r--r--fs/proc/vmcore.c1
-rw-r--r--fs/proc_namespace.c2
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/read_write.c6
-rw-r--r--fs/remap_range.c31
-rw-r--r--fs/splice.c54
-rw-r--r--fs/squashfs/Makefile4
-rw-r--r--fs/squashfs/block.c10
-rw-r--r--fs/squashfs/decompressor.h1
-rw-r--r--fs/squashfs/file.c133
-rw-r--r--fs/squashfs/file_direct.c90
-rw-r--r--fs/squashfs/lz4_wrapper.c7
-rw-r--r--fs/squashfs/lzo_wrapper.c7
-rw-r--r--fs/squashfs/page_actor.c53
-rw-r--r--fs/squashfs/page_actor.h62
-rw-r--r--fs/squashfs/super.c33
-rw-r--r--fs/squashfs/xz_wrapper.c11
-rw-r--r--fs/squashfs/zlib_wrapper.c12
-rw-r--r--fs/squashfs/zstd_wrapper.c12
-rw-r--r--fs/super.c39
-rw-r--r--fs/tracefs/inode.c31
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/userfaultfd.c10
-rw-r--r--fs/xfs/Makefile6
-rw-r--r--fs/xfs/libxfs/xfs_ag.c171
-rw-r--r--fs/xfs/libxfs/xfs_ag.h75
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c145
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h58
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_attr.c22
-rw-r--r--fs/xfs/libxfs/xfs_attr.h10
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c28
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c15
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c84
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c10
-rw-r--r--fs/xfs/libxfs/xfs_btree.c29
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c6
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c8
-rw-r--r--fs/xfs/libxfs/xfs_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c86
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h25
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c20
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c15
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c65
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h27
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c19
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c8
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_types.c73
-rw-r--r--fs/xfs/libxfs/xfs_types.h9
-rw-r--r--fs/xfs/scrub/agheader.c25
-rw-r--r--fs/xfs/scrub/agheader_repair.c21
-rw-r--r--fs/xfs/scrub/alloc.c7
-rw-r--r--fs/xfs/scrub/bmap.c16
-rw-r--r--fs/xfs/scrub/btree.c2
-rw-r--r--fs/xfs/scrub/common.c6
-rw-r--r--fs/xfs/scrub/dabtree.c2
-rw-r--r--fs/xfs/scrub/dir.c2
-rw-r--r--fs/xfs/scrub/fscounters.c4
-rw-r--r--fs/xfs/scrub/health.c2
-rw-r--r--fs/xfs/scrub/ialloc.c12
-rw-r--r--fs/xfs/scrub/quota.c2
-rw-r--r--fs/xfs/scrub/refcount.c9
-rw-r--r--fs/xfs/scrub/repair.c49
-rw-r--r--fs/xfs/scrub/rmap.c6
-rw-r--r--fs/xfs/scrub/symlink.c6
-rw-r--r--fs/xfs/xfs_attr_inactive.c23
-rw-r--r--fs/xfs/xfs_attr_list.c9
-rw-r--r--fs/xfs/xfs_bmap_util.c37
-rw-r--r--fs/xfs/xfs_buf.c301
-rw-r--r--fs/xfs/xfs_buf.h27
-rw-r--r--fs/xfs/xfs_dir2_readdir.c2
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c18
-rw-r--r--fs/xfs/xfs_file.c57
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fsmap.c3
-rw-r--r--fs/xfs/xfs_fsops.c16
-rw-r--r--fs/xfs/xfs_icache.c16
-rw-r--r--fs/xfs/xfs_inode.c693
-rw-r--r--fs/xfs/xfs_inode.h70
-rw-r--r--fs/xfs/xfs_inode_item.c58
-rw-r--r--fs/xfs/xfs_ioctl.c10
-rw-r--r--fs/xfs/xfs_iomap.c38
-rw-r--r--fs/xfs/xfs_iomap.h1
-rw-r--r--fs/xfs/xfs_iops.c13
-rw-r--r--fs/xfs/xfs_iops.h3
-rw-r--r--fs/xfs/xfs_itable.c4
-rw-r--r--fs/xfs/xfs_iunlink_item.c180
-rw-r--r--fs/xfs/xfs_iunlink_item.h27
-rw-r--r--fs/xfs/xfs_log.c69
-rw-r--r--fs/xfs/xfs_log.h3
-rw-r--r--fs/xfs/xfs_log_cil.c472
-rw-r--r--fs/xfs/xfs_log_priv.h58
-rw-r--r--fs/xfs/xfs_log_recover.c196
-rw-r--r--fs/xfs/xfs_mount.c3
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_notify_failure.c226
-rw-r--r--fs/xfs/xfs_qm.c17
-rw-r--r--fs/xfs/xfs_reflink.c256
-rw-r--r--fs/xfs/xfs_reflink.h3
-rw-r--r--fs/xfs/xfs_super.c39
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_symlink.c2
-rw-r--r--fs/xfs/xfs_trace.h3
-rw-r--r--fs/xfs/xfs_trans.c95
-rw-r--r--fs/xfs/xfs_trans.h7
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/zonefs/super.c8
417 files changed, 11373 insertions, 7432 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index baf2b152229e..23cf9b2fbfe4 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -28,14 +28,18 @@ static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
/**
* v9fs_fid_add - add a fid to a dentry
* @dentry: dentry that the fid is being added to
- * @fid: fid to add
+ * @pfid: fid to add, NULLed out
*
*/
-void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid **pfid)
{
+ struct p9_fid *fid = *pfid;
+
spin_lock(&dentry->d_lock);
__add_fid(dentry, fid);
spin_unlock(&dentry->d_lock);
+
+ *pfid = NULL;
}
/**
@@ -56,7 +60,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
h = (struct hlist_head *)&inode->i_private;
hlist_for_each_entry(fid, h, ilist) {
if (uid_eq(fid->uid, uid)) {
- refcount_inc(&fid->count);
+ p9_fid_get(fid);
ret = fid;
break;
}
@@ -68,15 +72,19 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
/**
* v9fs_open_fid_add - add an open fid to an inode
* @inode: inode that the fid is being added to
- * @fid: fid to add
+ * @pfid: fid to add, NULLed out
*
*/
-void v9fs_open_fid_add(struct inode *inode, struct p9_fid *fid)
+void v9fs_open_fid_add(struct inode *inode, struct p9_fid **pfid)
{
+ struct p9_fid *fid = *pfid;
+
spin_lock(&inode->i_lock);
hlist_add_head(&fid->ilist, (struct hlist_head *)&inode->i_private);
spin_unlock(&inode->i_lock);
+
+ *pfid = NULL;
}
@@ -104,7 +112,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
hlist_for_each_entry(fid, h, dlist) {
if (any || uid_eq(fid->uid, uid)) {
ret = fid;
- refcount_inc(&ret->count);
+ p9_fid_get(ret);
break;
}
}
@@ -150,9 +158,9 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
{
struct dentry *ds;
const unsigned char **wnames, *uname;
- int i, n, l, clone, access;
+ int i, n, l, access;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *old_fid;
+ struct p9_fid *fid, *root_fid, *old_fid;
v9ses = v9fs_dentry2v9ses(dentry);
access = v9ses->flags & V9FS_ACCESS_MASK;
@@ -169,17 +177,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = v9fs_fid_find(ds, uid, any);
if (fid) {
/* Found the parent fid do a lookup with that */
- struct p9_fid *ofid = fid;
+ old_fid = fid;
- fid = p9_client_walk(ofid, 1, &dentry->d_name.name, 1);
- p9_client_clunk(ofid);
+ fid = p9_client_walk(old_fid, 1, &dentry->d_name.name, 1);
+ p9_fid_put(old_fid);
goto fid_out;
}
up_read(&v9ses->rename_sem);
/* start from the root and try to do a lookup */
- fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
- if (!fid) {
+ root_fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+ if (!root_fid) {
/* the user is not attached to the fs yet */
if (access == V9FS_ACCESS_SINGLE)
return ERR_PTR(-EPERM);
@@ -194,12 +202,13 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
if (IS_ERR(fid))
return fid;
- refcount_inc(&fid->count);
- v9fs_fid_add(dentry->d_sb->s_root, fid);
+ root_fid = p9_fid_get(fid);
+ v9fs_fid_add(dentry->d_sb->s_root, &fid);
}
/* If we are root ourself just return that */
if (dentry->d_sb->s_root == dentry)
- return fid;
+ return root_fid;
+
/*
* Do a multipath walk with attached root.
* When walking parent we need to make sure we
@@ -211,19 +220,20 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = ERR_PTR(n);
goto err_out;
}
- old_fid = fid;
- clone = 1;
+ fid = root_fid;
+ old_fid = root_fid;
i = 0;
while (i < n) {
l = min(n - i, P9_MAXWELEM);
/*
* We need to hold rename lock when doing a multipath
- * walk to ensure none of the patch component change
+ * walk to ensure none of the path components change
*/
- fid = p9_client_walk(fid, l, &wnames[i], clone);
+ fid = p9_client_walk(old_fid, l, &wnames[i],
+ old_fid == root_fid /* clone */);
/* non-cloning walk will return the same fid */
if (fid != old_fid) {
- p9_client_clunk(old_fid);
+ p9_fid_put(old_fid);
old_fid = fid;
}
if (IS_ERR(fid)) {
@@ -231,7 +241,6 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
goto err_out;
}
i += l;
- clone = 0;
}
kfree(wnames);
fid_out:
@@ -239,11 +248,11 @@ fid_out:
spin_lock(&dentry->d_lock);
if (d_unhashed(dentry)) {
spin_unlock(&dentry->d_lock);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
fid = ERR_PTR(-ENOENT);
} else {
__add_fid(dentry, fid);
- refcount_inc(&fid->count);
+ p9_fid_get(fid);
spin_unlock(&dentry->d_lock);
}
}
@@ -300,7 +309,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
fid = clone_fid(ofid);
if (IS_ERR(fid))
goto error_out;
- p9_client_clunk(ofid);
+ p9_fid_put(ofid);
/*
* writeback fid will only be used to write back the
* dirty pages. We always request for the open fid in read-write
@@ -309,7 +318,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
*/
err = p9_client_open(fid, O_RDWR);
if (err < 0) {
- p9_client_clunk(fid);
+ p9_fid_put(fid);
fid = ERR_PTR(err);
goto error_out;
}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index f7f33509e169..8a4e8cd12ca2 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -13,9 +13,9 @@ static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry)
{
return v9fs_fid_lookup(dentry->d_parent);
}
-void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid);
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
-void v9fs_open_fid_add(struct inode *inode, struct p9_fid *fid);
+void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid);
static inline struct p9_fid *clone_fid(struct p9_fid *fid)
{
return IS_ERR(fid) ? fid : p9_client_walk(fid, 0, NULL, 1);
@@ -29,7 +29,7 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
return fid;
nfid = clone_fid(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return nfid;
}
#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index d0833fa69faf..47b9a1122f34 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -73,7 +73,7 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
BUG_ON(!fid);
}
- refcount_inc(&fid->count);
+ p9_fid_get(fid);
rreq->netfs_priv = fid;
return 0;
}
@@ -86,7 +86,7 @@ static void v9fs_free_request(struct netfs_io_request *rreq)
{
struct p9_fid *fid = rreq->netfs_priv;
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
/**
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 1c609e99d280..f89f01734587 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -54,7 +54,7 @@ static void v9fs_dentry_release(struct dentry *dentry)
p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
dentry, dentry);
hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
- p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
+ p9_fid_put(hlist_entry(p, struct p9_fid, dlist));
dentry->d_fsdata = NULL;
}
@@ -85,7 +85,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
retval = v9fs_refresh_inode_dotl(fid, inode);
else
retval = v9fs_refresh_inode(fid, inode);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval == -ENOENT)
return 0;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 958680f7f23e..000fbaae9b18 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -218,7 +218,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
spin_lock(&inode->i_lock);
hlist_del(&fid->ilist);
spin_unlock(&inode->i_lock);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
if ((filp->f_mode & FMODE_WRITE)) {
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2573c08f335c..aec43ba83799 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -63,15 +63,16 @@ int v9fs_file_open(struct inode *inode, struct file *file)
err = p9_client_open(fid, omode);
if (err < 0) {
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return err;
}
if ((file->f_flags & O_APPEND) &&
(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
+
+ file->private_data = fid;
}
- file->private_data = fid;
mutex_lock(&v9inode->v_mutex);
if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
!v9inode->writeback_fid &&
@@ -95,10 +96,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, fid);
+ v9fs_open_fid_add(inode, &fid);
return 0;
out_error:
- p9_client_clunk(file->private_data);
+ p9_fid_put(file->private_data);
file->private_data = NULL;
return err;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3d8297714772..4d1a4a8d9277 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -399,10 +399,8 @@ void v9fs_evict_inode(struct inode *inode)
fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
/* clunk the fid stashed in writeback_fid */
- if (v9inode->writeback_fid) {
- p9_client_clunk(v9inode->writeback_fid);
- v9inode->writeback_fid = NULL;
- }
+ p9_fid_put(v9inode->writeback_fid);
+ v9inode->writeback_fid = NULL;
}
static int v9fs_test_inode(struct inode *inode, void *data)
@@ -569,7 +567,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
if (v9fs_proto_dotl(v9ses))
retval = p9_client_unlinkat(dfid, dentry->d_name.name,
v9fs_at_to_dotl_flags(flags));
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
if (retval == -EOPNOTSUPP) {
/* Try the one based on path */
v9fid = v9fs_fid_clone(dentry);
@@ -633,14 +631,12 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- p9_client_clunk(dfid);
- return ERR_PTR(err);
+ goto error;
}
err = p9_client_fcreate(ofid, name, perm, mode, extension);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
- p9_client_clunk(dfid);
goto error;
}
@@ -651,8 +647,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS,
"p9_client_walk failed %d\n", err);
- fid = NULL;
- p9_client_clunk(dfid);
goto error;
}
/*
@@ -663,21 +657,17 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS,
"inode creation failed %d\n", err);
- p9_client_clunk(dfid);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
}
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return ofid;
error:
- if (ofid)
- p9_client_clunk(ofid);
-
- if (fid)
- p9_client_clunk(fid);
-
+ p9_fid_put(dfid);
+ p9_fid_put(ofid);
+ p9_fid_put(fid);
return ERR_PTR(err);
}
@@ -708,7 +698,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -744,7 +734,7 @@ static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
}
if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return err;
}
@@ -785,7 +775,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
*/
name = dentry->d_name.name;
fid = p9_client_walk(dfid, 1, &name, 1);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
if (fid == ERR_PTR(-ENOENT))
inode = NULL;
else if (IS_ERR(fid))
@@ -804,11 +794,11 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
res = d_splice_alias(inode, dentry);
if (!IS_ERR(fid)) {
if (!res)
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
else if (!IS_ERR(res))
- v9fs_fid_add(res, fid);
+ v9fs_fid_add(res, &fid);
else
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
return res;
}
@@ -847,7 +837,6 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9fs_proto_dotu(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- fid = NULL;
goto error;
}
@@ -882,7 +871,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, fid);
+ v9fs_open_fid_add(inode, &fid);
file->f_mode |= FMODE_CREATED;
out:
@@ -890,8 +879,7 @@ out:
return err;
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
goto out;
}
@@ -939,9 +927,9 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct inode *old_inode;
struct inode *new_inode;
struct v9fs_session_info *v9ses;
- struct p9_fid *oldfid, *dfid;
- struct p9_fid *olddirfid;
- struct p9_fid *newdirfid;
+ struct p9_fid *oldfid = NULL, *dfid = NULL;
+ struct p9_fid *olddirfid = NULL;
+ struct p9_fid *newdirfid = NULL;
struct p9_wstat wstat;
if (flags)
@@ -958,21 +946,22 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
dfid = v9fs_parent_fid(old_dentry);
olddirfid = clone_fid(dfid);
- if (dfid && !IS_ERR(dfid))
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
+ dfid = NULL;
if (IS_ERR(olddirfid)) {
retval = PTR_ERR(olddirfid);
- goto done;
+ goto error;
}
dfid = v9fs_parent_fid(new_dentry);
newdirfid = clone_fid(dfid);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
+ dfid = NULL;
if (IS_ERR(newdirfid)) {
retval = PTR_ERR(newdirfid);
- goto clunk_olddir;
+ goto error;
}
down_write(&v9ses->rename_sem);
@@ -983,7 +972,7 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
retval = p9_client_rename(oldfid, newdirfid,
new_dentry->d_name.name);
if (retval != -EOPNOTSUPP)
- goto clunk_newdir;
+ goto error_locked;
}
if (old_dentry->d_parent != new_dentry->d_parent) {
/*
@@ -992,14 +981,14 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
retval = -EXDEV;
- goto clunk_newdir;
+ goto error_locked;
}
v9fs_blank_wstat(&wstat);
wstat.muid = v9ses->uname;
wstat.name = new_dentry->d_name.name;
retval = p9_client_wstat(oldfid, &wstat);
-clunk_newdir:
+error_locked:
if (!retval) {
if (new_inode) {
if (S_ISDIR(new_inode->i_mode))
@@ -1020,13 +1009,11 @@ clunk_newdir:
d_move(old_dentry, new_dentry);
}
up_write(&v9ses->rename_sem);
- p9_client_clunk(newdirfid);
-
-clunk_olddir:
- p9_client_clunk(olddirfid);
-done:
- p9_client_clunk(oldfid);
+error:
+ p9_fid_put(newdirfid);
+ p9_fid_put(olddirfid);
+ p9_fid_put(oldfid);
return retval;
}
@@ -1060,7 +1047,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
return PTR_ERR(fid);
st = p9_client_stat(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return PTR_ERR(st);
@@ -1136,7 +1123,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
retval = p9_client_wstat(fid, &wstat);
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval < 0)
return retval;
@@ -1261,7 +1248,7 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry,
return ERR_CAST(fid);
st = p9_client_stat(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return ERR_CAST(st);
@@ -1308,7 +1295,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -1364,7 +1351,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
v9fs_refresh_inode(oldfid, d_inode(old_dentry));
v9fs_invalidate_inode_attr(dir);
}
- p9_client_clunk(oldfid);
+ p9_fid_put(oldfid);
return retval;
}
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index b6eb1160296c..5cfa4b4f070f 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -238,7 +238,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
struct inode *inode;
struct p9_fid *fid = NULL;
struct v9fs_inode *v9inode;
- struct p9_fid *dfid, *ofid, *inode_fid;
+ struct p9_fid *dfid = NULL, *ofid = NULL, *inode_fid = NULL;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
struct dentry *res = NULL;
@@ -274,7 +274,6 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- p9_client_clunk(dfid);
goto out;
}
@@ -286,38 +285,34 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err) {
p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
err);
- p9_client_clunk(dfid);
- goto error;
+ goto out;
}
err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
mode, gid, &qid);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
err);
- p9_client_clunk(dfid);
- goto error;
+ goto out;
}
v9fs_invalidate_inode_attr(dir);
/* instantiate inode and assign the unopened fid to the dentry */
fid = p9_client_walk(dfid, 1, &name, 1);
- p9_client_clunk(dfid);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- fid = NULL;
- goto error;
+ goto out;
}
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
- goto error;
+ goto out;
}
/* Now set the ACL based on the default value */
v9fs_set_create_acl(inode, fid, dacl, pacl);
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
v9inode = V9FS_I(inode);
@@ -336,7 +331,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode_fid)) {
err = PTR_ERR(inode_fid);
mutex_unlock(&v9inode->v_mutex);
- goto err_clunk_old_fid;
+ goto out;
}
v9inode->writeback_fid = (void *) inode_fid;
}
@@ -344,25 +339,20 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
/* Since we are opening a file, assign the open fid to the file */
err = finish_open(file, dentry, generic_file_open);
if (err)
- goto err_clunk_old_fid;
+ goto out;
file->private_data = ofid;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, ofid);
+ v9fs_open_fid_add(inode, &ofid);
file->f_mode |= FMODE_CREATED;
out:
+ p9_fid_put(dfid);
+ p9_fid_put(ofid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
dput(res);
return err;
-
-error:
- if (fid)
- p9_client_clunk(fid);
-err_clunk_old_fid:
- if (ofid)
- p9_client_clunk(ofid);
- goto out;
}
/**
@@ -400,7 +390,6 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
- dfid = NULL;
goto error;
}
@@ -422,7 +411,6 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -435,10 +423,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
err);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/*
@@ -457,10 +444,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
inc_nlink(dir);
v9fs_invalidate_inode_attr(dir);
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return err;
}
@@ -489,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns,
*/
st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return PTR_ERR(st);
@@ -603,7 +589,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
retval = p9_client_setattr(fid, &p9attr);
if (retval < 0) {
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return retval;
}
@@ -619,12 +605,12 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
retval = v9fs_acl_chmod(inode, fid);
if (retval < 0) {
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return retval;
}
}
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -743,7 +729,6 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -755,9 +740,8 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/* Not in cached mode. No need to populate inode with stat */
@@ -770,10 +754,8 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
}
error:
- if (fid)
- p9_client_clunk(fid);
-
- p9_client_clunk(dfid);
+ p9_fid_put(fid);
+ p9_fid_put(dfid);
return err;
}
@@ -803,14 +785,14 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
oldfid = v9fs_fid_lookup(old_dentry);
if (IS_ERR(oldfid)) {
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return PTR_ERR(oldfid);
}
err = p9_client_link(dfid, oldfid, dentry->d_name.name);
- p9_client_clunk(dfid);
- p9_client_clunk(oldfid);
+ p9_fid_put(dfid);
+ p9_fid_put(oldfid);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
return err;
@@ -826,7 +808,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
return PTR_ERR(fid);
v9fs_refresh_inode_dotl(fid, d_inode(old_dentry));
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
ihold(d_inode(old_dentry));
d_instantiate(dentry, d_inode(old_dentry));
@@ -866,7 +848,6 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
- dfid = NULL;
goto error;
}
@@ -891,7 +872,6 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -905,9 +885,8 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
goto error;
}
v9fs_set_create_acl(inode, fid, dacl, pacl);
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/*
@@ -923,10 +902,9 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
d_instantiate(dentry, inode);
}
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return err;
}
@@ -956,7 +934,7 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
if (IS_ERR(fid))
return ERR_CAST(fid);
retval = p9_client_readlink(fid, &target);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval)
return ERR_PTR(retval);
set_delayed_call(done, kfree_link, target);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 97e23b4e6982..2d9ee073d12c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -184,13 +184,13 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = v9fs_get_acl(inode, fid);
if (retval)
goto release_sb;
- v9fs_fid_add(root, fid);
+ v9fs_fid_add(root, &fid);
p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
return dget(sb->s_root);
clunk_fid:
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_session_close(v9ses);
free_session:
kfree(v9ses);
@@ -203,7 +203,7 @@ release_sb:
* attached the fid to dentry so it won't get clunked
* automatically.
*/
- p9_client_clunk(fid);
+ p9_fid_put(fid);
deactivate_locked_super(sb);
return ERR_PTR(retval);
}
@@ -270,7 +270,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
res = simple_statfs(dentry, buf);
done:
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return res;
}
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index a824441b95a2..1f9298a4bd42 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -44,7 +44,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
if (err)
retval = err;
}
- p9_client_clunk(attr_fid);
+ p9_fid_put(attr_fid);
return retval;
}
@@ -71,7 +71,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
if (IS_ERR(fid))
return PTR_ERR(fid);
ret = v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return ret;
}
@@ -98,7 +98,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
if (IS_ERR(fid))
return PTR_ERR(fid);
ret = v9fs_fid_xattr_set(fid, name, value, value_len, flags);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return ret;
}
@@ -128,7 +128,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
retval);
else
p9_client_write(fid, 0, &from, &retval);
- err = p9_client_clunk(fid);
+ err = p9_fid_put(fid);
if (!retval && err)
retval = err;
return retval;
diff --git a/fs/Kconfig b/fs/Kconfig
index 5976eb33535f..a547307c1ae8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -247,8 +247,7 @@ config HUGETLB_PAGE
#
# Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of minimizing overhead of struct page associated with
-# each HugeTLB page.
+# to enable the feature of HugeTLB Vmemmap Optimization (HVO).
#
config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
bool
@@ -259,14 +258,13 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
- bool "Default optimizing vmemmap pages of HugeTLB to on"
+ bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
default n
depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
help
- When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
- pages associated with each HugeTLB page is default off. Say Y here
- to enable optimizing vmemmap pages of HugeTLB by default. It can then
- be disabled on the command line via hugetlb_free_vmemmap=off.
+ The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to
+ enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
+ (boot command line) or hugetlb_optimize_vmemmap (sysctl).
config MEMFD_CREATE
def_bool TMPFS || HUGETLBFS
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 07ad744eef77..988c2ac7cece 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -158,7 +158,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->name[i] = tolower(name[i]);
cell->name[i] = 0;
- atomic_set(&cell->ref, 1);
+ refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
INIT_WORK(&cell->manager, afs_manage_cell_work);
cell->volumes = RB_ROOT;
@@ -287,7 +287,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
cell = candidate;
candidate = NULL;
atomic_set(&cell->active, 2);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 2, afs_cell_trace_insert);
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
rb_link_node_rcu(&cell->net_node, parent, pp);
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
@@ -295,7 +295,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
afs_queue_cell(cell, afs_cell_trace_get_queue_new);
wait_for_cell:
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), atomic_read(&cell->active),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
afs_cell_trace_wait);
_debug("wait_for_cell");
wait_var_event(&cell->state,
@@ -490,13 +490,13 @@ static void afs_cell_destroy(struct rcu_head *rcu)
{
struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu);
struct afs_net *net = cell->net;
- int u;
+ int r;
_enter("%p{%s}", cell, cell->name);
- u = atomic_read(&cell->ref);
- ASSERTCMP(u, ==, 0);
- trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), afs_cell_trace_free);
+ r = refcount_read(&cell->ref);
+ ASSERTCMP(r, ==, 0);
+ trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
@@ -539,13 +539,10 @@ void afs_cells_timer(struct timer_list *timer)
*/
struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u;
+ int r;
- if (atomic_read(&cell->ref) <= 0)
- BUG();
-
- u = atomic_inc_return(&cell->ref);
- trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), reason);
+ __refcount_inc(&cell->ref, &r);
+ trace_afs_cell(cell->debug_id, r + 1, atomic_read(&cell->active), reason);
return cell;
}
@@ -556,12 +553,14 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
if (cell) {
unsigned int debug_id = cell->debug_id;
- unsigned int u, a;
+ unsigned int a;
+ bool zero;
+ int r;
a = atomic_read(&cell->active);
- u = atomic_dec_return(&cell->ref);
- trace_afs_cell(debug_id, u, a, reason);
- if (u == 0) {
+ zero = __refcount_dec_and_test(&cell->ref, &r);
+ trace_afs_cell(debug_id, r - 1, a, reason);
+ if (zero) {
a = atomic_read(&cell->active);
WARN(a != 0, "Cell active count %u > 0\n", a);
call_rcu(&cell->rcu, afs_cell_destroy);
@@ -574,14 +573,12 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
*/
struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u, a;
-
- if (atomic_read(&cell->ref) <= 0)
- BUG();
+ int r, a;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
+ WARN_ON(r == 0);
a = atomic_inc_return(&cell->active);
- trace_afs_cell(cell->debug_id, u, a, reason);
+ trace_afs_cell(cell->debug_id, r, a, reason);
return cell;
}
@@ -593,7 +590,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
{
unsigned int debug_id;
time64_t now, expire_delay;
- int u, a;
+ int r, a;
if (!cell)
return;
@@ -607,9 +604,9 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
expire_delay = afs_cell_gc_delay;
debug_id = cell->debug_id;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
a = atomic_dec_return(&cell->active);
- trace_afs_cell(debug_id, u, a, reason);
+ trace_afs_cell(debug_id, r, a, reason);
WARN_ON(a == 0);
if (a == 1)
/* 'cell' may now be garbage collected. */
@@ -621,11 +618,11 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
*/
void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u, a;
+ int r, a;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
a = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, u, a, reason);
+ trace_afs_cell(cell->debug_id, r, a, reason);
}
/*
@@ -739,7 +736,7 @@ again:
active = 1;
if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
rb_erase(&cell->net_node, &net->cells);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 0,
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
afs_cell_trace_unuse_delete);
smp_store_release(&cell->state, AFS_CELL_REMOVED);
}
@@ -866,7 +863,7 @@ void afs_manage_cells(struct work_struct *work)
bool sched_cell = false;
active = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
active, afs_cell_trace_manage);
ASSERTCMP(active, >=, 1);
@@ -874,7 +871,7 @@ void afs_manage_cells(struct work_struct *work)
if (purging) {
if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
active = atomic_dec_return(&cell->active);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
active, afs_cell_trace_unuse_pin);
}
}
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3f5de28be79..0a090d614e76 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -212,8 +212,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
* to maintain cache coherency.
*/
if (call->server) {
- trace_afs_server(call->server,
- atomic_read(&call->server->ref),
+ trace_afs_server(call->server->debug_id,
+ refcount_read(&call->server->ref),
atomic_read(&call->server->active),
afs_server_trace_callback);
afs_break_callbacks(call->server, call->count, call->request);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index c4210a3964d8..bbcc5afd1576 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -76,7 +76,7 @@ void afs_lock_op_done(struct afs_call *call)
if (call->error == 0) {
spin_lock(&vnode->lock);
trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0);
- vnode->locked_at = call->reply_time;
+ vnode->locked_at = call->issue_time;
afs_schedule_lock_extension(vnode);
spin_unlock(&vnode->lock);
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 4943413d9c5f..7d37f63ef0f0 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -131,7 +131,7 @@ bad:
static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
{
- return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry;
+ return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry;
}
static void xdr_decode_AFSCallBack(const __be32 **_bp,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 64dab70d4a4f..6d3a3dbe4928 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -104,12 +104,14 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
inode->i_mapping->a_ops = &afs_file_aops;
+ mapping_set_large_folios(inode->i_mapping);
break;
case AFS_FTYPE_DIR:
inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO);
inode->i_op = &afs_dir_inode_operations;
inode->i_fop = &afs_dir_file_operations;
inode->i_mapping->a_ops = &afs_dir_aops;
+ mapping_set_large_folios(inode->i_mapping);
break;
case AFS_FTYPE_SYMLINK:
/* Symlinks with a mode of 0644 are actually mountpoints. */
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a6f25d9e75b5..723d162078a3 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -122,7 +122,7 @@ struct afs_call {
};
struct afs_operation *op;
unsigned int server_index;
- atomic_t usage;
+ refcount_t ref;
enum afs_call_state state;
spinlock_t state_lock;
int error; /* error code */
@@ -137,7 +137,6 @@ struct afs_call {
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
- bool have_reply_time; /* T if have got reply_time */
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
u16 service_id; /* Actual service ID (after upgrade) */
@@ -151,7 +150,7 @@ struct afs_call {
} __attribute__((packed));
__be64 tmp64;
};
- ktime_t reply_time; /* Time of first reply packet */
+ ktime_t issue_time; /* Time of issue of operation */
};
struct afs_call_type {
@@ -365,7 +364,7 @@ struct afs_cell {
struct hlist_node proc_link; /* /proc cell list link */
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
- atomic_t ref; /* Struct refcount */
+ refcount_t ref; /* Struct refcount */
atomic_t active; /* Active usage counter */
unsigned long flags;
#define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */
@@ -410,7 +409,7 @@ struct afs_vlserver {
#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */
rwlock_t lock; /* Lock on addresses */
- atomic_t usage;
+ refcount_t ref;
unsigned int rtt; /* Server's current RTT in uS */
/* Probe state */
@@ -446,7 +445,7 @@ struct afs_vlserver_entry {
struct afs_vlserver_list {
struct rcu_head rcu;
- atomic_t usage;
+ refcount_t ref;
u8 nr_servers;
u8 index; /* Server currently in use */
u8 preferred; /* Preferred server */
@@ -517,7 +516,7 @@ struct afs_server {
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */
#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */
- atomic_t ref; /* Object refcount */
+ refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
unsigned int rtt; /* Server's current RTT in uS */
@@ -571,7 +570,7 @@ struct afs_volume {
struct rcu_head rcu;
afs_volid_t vid; /* volume ID */
};
- atomic_t usage;
+ refcount_t ref;
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
@@ -1493,14 +1492,14 @@ extern int afs_end_vlserver_operation(struct afs_vl_cursor *);
*/
static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver)
{
- atomic_inc(&vlserver->usage);
+ refcount_inc(&vlserver->ref);
return vlserver;
}
static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist)
{
if (vllist)
- atomic_inc(&vllist->usage);
+ refcount_inc(&vllist->ref);
return vllist;
}
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 933e67fcdab1..805328ca5428 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -69,6 +69,7 @@ int afs_abort_to_error(u32 abort_code)
/* Unified AFS error table */
case UAEPERM: return -EPERM;
case UAENOENT: return -ENOENT;
+ case UAEAGAIN: return -EAGAIN;
case UAEACCES: return -EACCES;
case UAEBUSY: return -EBUSY;
case UAEEXIST: return -EEXIST;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e1b863449296..2a0c83d71565 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -47,7 +47,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
/* display one cell per line on subsequent lines */
seq_printf(m, "%3u %3u %6lld %2u %2u %s\n",
- atomic_read(&cell->ref),
+ refcount_read(&cell->ref),
atomic_read(&cell->active),
cell->dns_expiry - ktime_get_real_seconds(),
vllist ? vllist->nr_servers : 0,
@@ -217,7 +217,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
}
seq_printf(m, "%3d %08llx %s %s\n",
- atomic_read(&vol->usage), vol->vid,
+ refcount_read(&vol->ref), vol->vid,
afs_vol_types[vol->type],
vol->name);
@@ -388,7 +388,7 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
alist = rcu_dereference(server->addresses);
seq_printf(m, "%pU %3d %3d\n",
&server->uuid,
- atomic_read(&server->ref),
+ refcount_read(&server->ref),
atomic_read(&server->active));
seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n",
server->flags, server->rtt, server->cb_s_break);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index a5434f3e57c6..eccc3cd0cb70 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -145,14 +145,14 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
call->type = type;
call->net = net;
call->debug_id = atomic_inc_return(&rxrpc_debug_id);
- atomic_set(&call->usage, 1);
+ refcount_set(&call->ref, 1);
INIT_WORK(&call->async_work, afs_process_async_call);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
call->iter = &call->def_iter;
o = atomic_inc_return(&net->nr_outstanding_calls);
- trace_afs_call(call, afs_call_trace_alloc, 1, o,
+ trace_afs_call(call->debug_id, afs_call_trace_alloc, 1, o,
__builtin_return_address(0));
return call;
}
@@ -163,14 +163,16 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
void afs_put_call(struct afs_call *call)
{
struct afs_net *net = call->net;
- int n = atomic_dec_return(&call->usage);
- int o = atomic_read(&net->nr_outstanding_calls);
+ unsigned int debug_id = call->debug_id;
+ bool zero;
+ int r, o;
- trace_afs_call(call, afs_call_trace_put, n, o,
+ zero = __refcount_dec_and_test(&call->ref, &r);
+ o = atomic_read(&net->nr_outstanding_calls);
+ trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
__builtin_return_address(0));
- ASSERTCMP(n, >=, 0);
- if (n == 0) {
+ if (zero) {
ASSERT(!work_pending(&call->async_work));
ASSERT(call->type->name != NULL);
@@ -185,7 +187,7 @@ void afs_put_call(struct afs_call *call)
afs_put_addrlist(call->alist);
kfree(call->request);
- trace_afs_call(call, afs_call_trace_free, 0, o,
+ trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
__builtin_return_address(0));
kfree(call);
@@ -198,9 +200,11 @@ void afs_put_call(struct afs_call *call)
static struct afs_call *afs_get_call(struct afs_call *call,
enum afs_call_trace why)
{
- int u = atomic_inc_return(&call->usage);
+ int r;
- trace_afs_call(call, why, u,
+ __refcount_inc(&call->ref, &r);
+
+ trace_afs_call(call->debug_id, why, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
return call;
@@ -347,6 +351,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
if (call->max_lifespan)
rxrpc_kernel_set_max_life(call->net->socket, rxcall,
call->max_lifespan);
+ call->issue_time = ktime_get_real();
/* send the request */
iov[0].iov_base = call->request;
@@ -497,12 +502,6 @@ static void afs_deliver_to_call(struct afs_call *call)
return;
}
- if (!call->have_reply_time &&
- rxrpc_kernel_get_reply_time(call->net->socket,
- call->rxcall,
- &call->reply_time))
- call->have_reply_time = true;
-
ret = call->type->deliver(call);
state = READ_ONCE(call->state);
if (ret == 0 && call->unmarshalling_error)
@@ -668,14 +667,13 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
unsigned long call_user_ID)
{
struct afs_call *call = (struct afs_call *)call_user_ID;
- int u;
+ int r;
trace_afs_notify_call(rxcall, call);
call->need_attention = true;
- u = atomic_fetch_add_unless(&call->usage, 1, 0);
- if (u != 0) {
- trace_afs_call(call, afs_call_trace_wake, u + 1,
+ if (__refcount_inc_not_zero(&call->ref, &r)) {
+ trace_afs_call(call->debug_id, afs_call_trace_wake, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 6e5b9a19b234..4981baf97835 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -228,7 +228,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
if (!server)
goto enomem;
- atomic_set(&server->ref, 1);
+ refcount_set(&server->ref, 1);
atomic_set(&server->active, 1);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
RCU_INIT_POINTER(server->addresses, alist);
@@ -243,7 +243,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
server->rtt = UINT_MAX;
afs_inc_servers_outstanding(net);
- trace_afs_server(server, 1, 1, afs_server_trace_alloc);
+ trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
_leave(" = %p", server);
return server;
@@ -352,9 +352,12 @@ void afs_servers_timer(struct timer_list *timer)
struct afs_server *afs_get_server(struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int u = atomic_inc_return(&server->ref);
+ unsigned int a;
+ int r;
- trace_afs_server(server, u, atomic_read(&server->active), reason);
+ __refcount_inc(&server->ref, &r);
+ a = atomic_read(&server->active);
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -364,14 +367,14 @@ struct afs_server *afs_get_server(struct afs_server *server,
static struct afs_server *afs_maybe_use_server(struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int r = atomic_fetch_add_unless(&server->ref, 1, 0);
unsigned int a;
+ int r;
- if (r == 0)
+ if (!__refcount_inc_not_zero(&server->ref, &r))
return NULL;
a = atomic_inc_return(&server->active);
- trace_afs_server(server, r, a, reason);
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -380,10 +383,13 @@ static struct afs_server *afs_maybe_use_server(struct afs_server *server,
*/
struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason)
{
- unsigned int r = atomic_inc_return(&server->ref);
- unsigned int a = atomic_inc_return(&server->active);
+ unsigned int a;
+ int r;
- trace_afs_server(server, r, a, reason);
+ __refcount_inc(&server->ref, &r);
+ a = atomic_inc_return(&server->active);
+
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -393,14 +399,17 @@ struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_tra
void afs_put_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int usage;
+ unsigned int a, debug_id = server->debug_id;
+ bool zero;
+ int r;
if (!server)
return;
- usage = atomic_dec_return(&server->ref);
- trace_afs_server(server, usage, atomic_read(&server->active), reason);
- if (unlikely(usage == 0))
+ a = atomic_inc_return(&server->active);
+ zero = __refcount_dec_and_test(&server->ref, &r);
+ trace_afs_server(debug_id, r - 1, a, reason);
+ if (unlikely(zero))
__afs_put_server(net, server);
}
@@ -436,7 +445,7 @@ static void afs_server_rcu(struct rcu_head *rcu)
{
struct afs_server *server = container_of(rcu, struct afs_server, rcu);
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
atomic_read(&server->active), afs_server_trace_free);
afs_put_addrlist(rcu_access_pointer(server->addresses));
kfree(server);
@@ -487,7 +496,7 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
active = atomic_read(&server->active);
if (active == 0) {
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
active, afs_server_trace_gc);
next = rcu_dereference_protected(
server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
@@ -553,7 +562,7 @@ void afs_manage_servers(struct work_struct *work)
_debug("manage %pU %u", &server->uuid, active);
if (purging) {
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
active, afs_server_trace_purging);
if (active != 0)
pr_notice("Can't purge s=%08x\n", server->debug_id);
@@ -633,7 +642,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
_enter("");
- trace_afs_server(server, atomic_read(&server->ref), atomic_read(&server->active),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
+ atomic_read(&server->active),
afs_server_trace_update);
alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 38b2ba1d9ec0..acc48216136a 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -17,7 +17,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
GFP_KERNEL);
if (vlserver) {
- atomic_set(&vlserver->usage, 1);
+ refcount_set(&vlserver->ref, 1);
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
@@ -39,13 +39,9 @@ static void afs_vlserver_rcu(struct rcu_head *rcu)
void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver)
{
- if (vlserver) {
- unsigned int u = atomic_dec_return(&vlserver->usage);
- //_debug("VL PUT %p{%u}", vlserver, u);
-
- if (u == 0)
- call_rcu(&vlserver->rcu, afs_vlserver_rcu);
- }
+ if (vlserver &&
+ refcount_dec_and_test(&vlserver->ref))
+ call_rcu(&vlserver->rcu, afs_vlserver_rcu);
}
struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
@@ -54,7 +50,7 @@ struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL);
if (vllist) {
- atomic_set(&vllist->usage, 1);
+ refcount_set(&vllist->ref, 1);
rwlock_init(&vllist->lock);
}
@@ -64,10 +60,7 @@ struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist)
{
if (vllist) {
- unsigned int u = atomic_dec_return(&vllist->usage);
-
- //_debug("VLLS PUT %p{%u}", vllist, u);
- if (u == 0) {
+ if (refcount_dec_and_test(&vllist->ref)) {
int i;
for (i = 0; i < vllist->nr_servers; i++) {
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index cc665cef0abe..f4937029dcd7 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -52,7 +52,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
struct afs_cell *cell = volume->cell;
if (!hlist_unhashed(&volume->proc_link)) {
- trace_afs_volume(volume->vid, atomic_read(&volume->usage),
+ trace_afs_volume(volume->vid, refcount_read(&cell->ref),
afs_volume_trace_remove);
write_seqlock(&cell->volume_lock);
hlist_del_rcu(&volume->proc_link);
@@ -87,7 +87,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->type_force = params->force;
volume->name_len = vldb->name_len;
- atomic_set(&volume->usage, 1);
+ refcount_set(&volume->ref, 1);
INIT_HLIST_NODE(&volume->proc_link);
rwlock_init(&volume->servers_lock);
rwlock_init(&volume->cb_v_break_lock);
@@ -228,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
afs_remove_volume_from_cell(volume);
afs_put_serverlist(net, rcu_access_pointer(volume->servers));
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
- trace_afs_volume(volume->vid, atomic_read(&volume->usage),
+ trace_afs_volume(volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
kfree_rcu(volume, rcu);
@@ -242,8 +242,10 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
enum afs_volume_trace reason)
{
if (volume) {
- int u = atomic_inc_return(&volume->usage);
- trace_afs_volume(volume->vid, u, reason);
+ int r;
+
+ __refcount_inc(&volume->ref, &r);
+ trace_afs_volume(volume->vid, r + 1, reason);
}
return volume;
}
@@ -257,9 +259,12 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
{
if (volume) {
afs_volid_t vid = volume->vid;
- int u = atomic_dec_return(&volume->usage);
- trace_afs_volume(vid, u, reason);
- if (u == 0)
+ bool zero;
+ int r;
+
+ zero = __refcount_dec_and_test(&volume->ref, &r);
+ trace_afs_volume(vid, r - 1, reason);
+ if (zero)
afs_destroy_volume(net, volume);
}
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2c885b22de34..9ebdd36eaf2f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -91,7 +91,7 @@ try_again:
goto flush_conflicting_write;
}
- *_page = &folio->page;
+ *_page = folio_file_page(folio, pos / PAGE_SIZE);
_leave(" = 0");
return 0;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index fdc7d675b4b0..11571cca86c1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -232,8 +232,7 @@ static void xdr_decode_YFSCallBack(const __be32 **_bp,
struct afs_callback *cb = &scb->callback;
ktime_t cb_expiry;
- cb_expiry = call->reply_time;
- cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100);
+ cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100);
cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC);
scb->have_cb = true;
*_bp += xdr_size(x);
diff --git a/fs/attr.c b/fs/attr.c
index b5b8835ddf15..1552a5f23d6b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -183,6 +183,8 @@ EXPORT_SYMBOL(setattr_prepare);
*/
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
+ if (offset < 0)
+ return -EINVAL;
if (inode->i_size < offset) {
unsigned long limit;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 918826eaceea..d5a44fa88acf 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -51,8 +51,6 @@ extern struct file_system_type autofs_fs_type;
*/
struct autofs_info {
struct dentry *dentry;
- struct inode *inode;
-
int flags;
struct completion expire_complete;
@@ -148,6 +146,11 @@ static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
task_pgrp(current) == sbi->oz_pgrp);
}
+static inline bool autofs_empty(struct autofs_info *ino)
+{
+ return ino->count < 2;
+}
+
struct inode *autofs_get_inode(struct super_block *, umode_t);
void autofs_free_ino(struct autofs_info *);
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index b3fefd6237c3..038b3d2d9f57 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -371,7 +371,7 @@ static struct dentry *should_expire(struct dentry *dentry,
return NULL;
}
- if (simple_empty(dentry))
+ if (autofs_empty(ino))
return NULL;
/* Case 2: tree mount, expire iff entire tree is not busy */
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 9edf243713eb..affa70360b1f 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,6 +20,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
INIT_LIST_HEAD(&ino->expiring);
ino->last_used = jiffies;
ino->sbi = sbi;
+ ino->count = 1;
}
return ino;
}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 91fe4548c256..ca03c1cae2be 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -10,6 +10,7 @@
#include "autofs_i.h"
+static int autofs_dir_permission(struct user_namespace *, struct inode *, int);
static int autofs_dir_symlink(struct user_namespace *, struct inode *,
struct dentry *, const char *);
static int autofs_dir_unlink(struct inode *, struct dentry *);
@@ -50,6 +51,7 @@ const struct file_operations autofs_dir_operations = {
const struct inode_operations autofs_dir_inode_operations = {
.lookup = autofs_lookup,
+ .permission = autofs_dir_permission,
.unlink = autofs_dir_unlink,
.symlink = autofs_dir_symlink,
.mkdir = autofs_dir_mkdir,
@@ -77,6 +79,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
@@ -93,7 +96,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
* it.
*/
spin_lock(&sbi->lookup_lock);
- if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) {
+ if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) {
spin_unlock(&sbi->lookup_lock);
return -ENOENT;
}
@@ -288,9 +291,26 @@ static struct dentry *autofs_mountpoint_changed(struct path *path)
struct dentry *dentry = path->dentry;
struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
- /*
- * If this is an indirect mount the dentry could have gone away
- * as a result of an expire and a new one created.
+ /* If this is an indirect mount the dentry could have gone away
+ * and a new one created.
+ *
+ * This is unusual and I can't remember the case for which it
+ * was originally added now. But an example of how this can
+ * happen is an autofs indirect mount that has the "browse"
+ * option set and also has the "symlink" option in the autofs
+ * map entry. In this case the daemon will remove the browse
+ * directory and create a symlink as the mount leaving the
+ * struct path stale.
+ *
+ * Another not so obvious case is when a mount in an autofs
+ * indirect mount that uses the "nobrowse" option is being
+ * expired at the same time as a path walk. If the mount has
+ * been umounted but the mount point directory seen before
+ * becoming unhashed (during a lockless path walk) when a stat
+ * family system call is made the mount won't be re-mounted as
+ * it should. In this case the mount point that's been removed
+ * (by the daemon) will be stale and the a new mount point
+ * dentry created.
*/
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
@@ -362,7 +382,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
* the mount never trigger mounts themselves (they have an
* autofs trigger mount mounted on them). But v4 pseudo direct
* mounts do need the leaves to trigger mounts. In this case
- * we have no choice but to use the list_empty() check and
+ * we have no choice but to use the autofs_empty() check and
* require user space behave.
*/
if (sbi->version > 4) {
@@ -371,7 +391,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
goto done;
}
} else {
- if (!simple_empty(dentry)) {
+ if (!autofs_empty(ino)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
@@ -426,9 +446,8 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
if (rcu_walk) {
/* We don't need fs_lock in rcu_walk mode,
- * just testing 'AUTOFS_INFO_NO_RCU' is enough.
- * simple_empty() takes a spinlock, so leave it
- * to last.
+ * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough.
+ *
* We only return -EISDIR when certain this isn't
* a mount-trap.
*/
@@ -441,9 +460,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
inode = d_inode_rcu(dentry);
if (inode && S_ISLNK(inode->i_mode))
return -EISDIR;
- if (list_empty(&dentry->d_subdirs))
- return 0;
- if (!simple_empty(dentry))
+ if (!autofs_empty(ino))
return -EISDIR;
return 0;
}
@@ -463,7 +480,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
* we can avoid needless calls ->d_automount() and avoid
* an incorrect ELOOP error return.
*/
- if ((!path_is_mountpoint(path) && !simple_empty(dentry)) ||
+ if ((!path_is_mountpoint(path) && !autofs_empty(ino)) ||
(d_really_is_positive(dentry) && d_is_symlink(dentry)))
status = -EISDIR;
}
@@ -526,11 +543,30 @@ static struct dentry *autofs_lookup(struct inode *dir,
return NULL;
}
+static int autofs_dir_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ if (mask & MAY_WRITE) {
+ struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
+
+ if (!autofs_oz_mode(sbi))
+ return -EACCES;
+
+ /* autofs_oz_mode() needs to allow path walks when the
+ * autofs mount is catatonic but the state of an autofs
+ * file system needs to be preserved over restarts.
+ */
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
+ return -EACCES;
+ }
+
+ return generic_permission(mnt_userns, inode, mask);
+}
+
static int autofs_dir_symlink(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
struct inode *inode;
@@ -539,16 +575,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
pr_debug("%s <- %pd\n", symname, dentry);
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
BUG_ON(!ino);
autofs_clean_ino(ino);
@@ -571,7 +597,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
d_add(dentry, inode);
dget(dentry);
- ino->count++;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
@@ -601,17 +626,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
- ino->count--;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count--;
dput(ino->dentry);
@@ -683,16 +697,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
pr_debug("dentry %p, removing %pd\n", dentry, dentry);
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
if (ino->count != 1)
return -ENOTEMPTY;
@@ -704,7 +708,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
if (sbi->version < 5)
autofs_clear_leaf_automount_flags(dentry);
- ino->count--;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count--;
dput(ino->dentry);
@@ -726,16 +729,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
struct autofs_info *p_ino;
struct inode *inode;
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
@@ -753,7 +746,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
autofs_set_leaf_automount_flags(dentry);
dget(dentry);
- ino->count++;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c3aecfb0a71d..e0375ba9d0fe 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -440,39 +440,26 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
+}
+
+static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
- int ret = 0;
+ int ret;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
- wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
- if (cache->cached == BTRFS_CACHE_ERROR)
- ret = -EIO;
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
btrfs_put_caching_control(caching_ctl);
return ret;
}
-static bool space_cache_v1_done(struct btrfs_block_group *cache)
-{
- bool ret;
-
- spin_lock(&cache->lock);
- ret = cache->cached != BTRFS_CACHE_FAST;
- spin_unlock(&cache->lock);
-
- return ret;
-}
-
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl)
-{
- wait_event(caching_ctl->wait, space_cache_v1_done(cache));
-}
-
#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
@@ -750,9 +737,8 @@ done:
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
- DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
@@ -785,10 +771,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- if (btrfs_test_opt(fs_info, SPACE_CACHE))
- cache->cached = BTRFS_CACHE_FAST;
- else
- cache->cached = BTRFS_CACHE_STARTED;
+ cache->cached = BTRFS_CACHE_STARTED;
cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
@@ -801,8 +784,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
- if (load_cache_only && caching_ctl)
- btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
+ if (wait && caching_ctl)
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
if (caching_ctl)
btrfs_put_caching_control(caching_ctl);
@@ -1640,9 +1623,11 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
- if (ret)
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
+ }
next:
btrfs_put_block_group(bg);
@@ -3310,7 +3295,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, 1);
+ btrfs_cache_block_group(cache, true);
byte_in_group = bytenr - cache->start;
WARN_ON(byte_in_group > cache->length);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 35e0e860cc0b..6b3cdc4cbc41 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -263,9 +263,7 @@ void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
-int btrfs_cache_block_group(struct btrfs_block_group *cache,
- int load_cache_only);
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6e556031a8f3..ebfa35fe1c38 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2075,6 +2075,9 @@ cow_done:
if (!p->skip_locking) {
level = btrfs_header_level(b);
+
+ btrfs_maybe_reset_lockdep_class(root, b);
+
if (level <= write_lock_level) {
btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4db85b9dc7ed..df8c99c99df9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -505,7 +505,6 @@ struct btrfs_free_cluster {
enum btrfs_caching_type {
BTRFS_CACHE_NO,
BTRFS_CACHE_STARTED,
- BTRFS_CACHE_FAST,
BTRFS_CACHE_FINISHED,
BTRFS_CACHE_ERROR,
};
@@ -1089,8 +1088,6 @@ struct btrfs_fs_info {
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
- /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */
- wait_queue_head_t zone_finish_wait;
/* Updates are not protected by any lock */
struct btrfs_commit_stats commit_stats;
@@ -1173,6 +1170,8 @@ enum {
BTRFS_ROOT_ORPHAN_CLEANUP,
/* This root has a drop operation that was started previously. */
BTRFS_ROOT_UNFINISHED_DROP,
+ /* This reloc root needs to have its buffers lockdep class reset. */
+ BTRFS_ROOT_RESET_LOCKDEP_CLASS,
};
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f43196a893ca..41cddd3ff059 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -165,7 +165,7 @@ no_valid_dev_replace_entry_found:
*/
if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
- "replace devid present without an active replace item");
+"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
@@ -1129,8 +1129,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
- ret = btrfs_scrub_cancel(fs_info);
- ASSERT(ret != -ENOTCONN);
+ btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4c3166f3c725..1af28b066b42 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -87,88 +87,6 @@ struct async_submit_bio {
};
/*
- * Lockdep class keys for extent_buffer->lock's in this root. For a given
- * eb, the lockdep key is determined by the btrfs_root it belongs to and
- * the level the eb occupies in the tree.
- *
- * Different roots are used for different purposes and may nest inside each
- * other and they require separate keysets. As lockdep keys should be
- * static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->root_key.objectid. This ensures that all special purpose
- * roots have separate keysets.
- *
- * Lock-nesting across peer nodes is always done with the immediate parent
- * node locked thus preventing deadlock. As lockdep doesn't know this, use
- * subclass to avoid triggering lockdep warning in such cases.
- *
- * The key is set by the readpage_end_io_hook after the buffer has passed
- * csum validation but before the pages are unlocked. It is also set by
- * btrfs_init_new_buffer on freshly allocated blocks.
- *
- * We also add a check to make sure the highest level of the tree is the
- * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
- * needs update as well.
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# if BTRFS_MAX_LEVEL != 8
-# error
-# endif
-
-#define DEFINE_LEVEL(stem, level) \
- .names[level] = "btrfs-" stem "-0" #level,
-
-#define DEFINE_NAME(stem) \
- DEFINE_LEVEL(stem, 0) \
- DEFINE_LEVEL(stem, 1) \
- DEFINE_LEVEL(stem, 2) \
- DEFINE_LEVEL(stem, 3) \
- DEFINE_LEVEL(stem, 4) \
- DEFINE_LEVEL(stem, 5) \
- DEFINE_LEVEL(stem, 6) \
- DEFINE_LEVEL(stem, 7)
-
-static struct btrfs_lockdep_keyset {
- u64 id; /* root objectid */
- /* Longest entry: btrfs-free-space-00 */
- char names[BTRFS_MAX_LEVEL][20];
- struct lock_class_key keys[BTRFS_MAX_LEVEL];
-} btrfs_lockdep_keysets[] = {
- { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
- { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
- { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
- { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
- { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
- { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
- { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
- { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
- { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
- { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
- { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
- { .id = 0, DEFINE_NAME("tree") },
-};
-
-#undef DEFINE_LEVEL
-#undef DEFINE_NAME
-
-void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
- int level)
-{
- struct btrfs_lockdep_keyset *ks;
-
- BUG_ON(level >= ARRAY_SIZE(ks->keys));
-
- /* find the matching keyset, id 0 is the default entry */
- for (ks = btrfs_lockdep_keysets; ks->id; ks++)
- if (ks->id == objectid)
- break;
-
- lockdep_set_class_and_name(&eb->lock,
- &ks->keys[level], ks->names[level]);
-}
-
-#endif
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -3150,7 +3068,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
init_waitqueue_head(&fs_info->delayed_iputs_wait);
- init_waitqueue_head(&fs_info->zone_finish_wait);
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8993b428e09c..47ad8e0a2d33 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -137,14 +137,4 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(u64 objectid,
- struct extent_buffer *eb, int level);
-#else
-static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
- struct extent_buffer *eb, int level)
-{
-}
-#endif
-
#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ea3ec1e761e8..6914cd8024ba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2551,17 +2551,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
return -EINVAL;
/*
- * pull in the free space cache (if any) so that our pin
- * removes the free space from the cache. We have load_only set
- * to one because the slow code to read in the free extents does check
- * the pinned extents.
+ * Fully cache the free space first so that our pin removes the free space
+ * from the cache.
*/
- btrfs_cache_block_group(cache, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
- */
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret)
goto out;
@@ -2584,12 +2577,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
- */
- ret = btrfs_wait_block_group_cache_done(block_group);
+ ret = btrfs_cache_block_group(block_group, true);
if (ret)
goto out;
@@ -4399,7 +4387,7 @@ have_block_group:
ffe_ctl->cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl->cached)) {
ffe_ctl->have_caching_bg = true;
- ret = btrfs_cache_block_group(block_group, 0);
+ ret = btrfs_cache_block_group(block_group, false);
/*
* If we get ENOMEM here or something else we want to
@@ -4867,6 +4855,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
+ u64 lockdep_owner = owner;
buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
if (IS_ERR(buf))
@@ -4886,11 +4875,26 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ *
+ * The exception however is in replace_path() in relocation, where we
+ * hold the lock on the original fs root and then search for the reloc
+ * root. At that point we need to make sure any reloc root buffers are
+ * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
+ * lockdep happy.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ /*
* This needs to stay, because we could allocate a freed block from an
* old tree into a new tree, so we need to make sure this new block is
* set to the appropriate level and owner.
*/
- btrfs_set_buffer_lockdep_class(owner, buf, level);
+ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
+
__btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -6153,13 +6157,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!btrfs_block_group_done(cache)) {
- ret = btrfs_cache_block_group(cache, 0);
- if (ret) {
- bg_failed++;
- bg_ret = ret;
- continue;
- }
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret) {
bg_failed++;
bg_ret = ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bfae67c593c5..cf4f19e80e2f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3233,7 +3233,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
u32 bio_size = bio->bi_iter.bi_size;
u32 real_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
- bool contig;
+ bool contig = false;
int ret;
ASSERT(bio);
@@ -3242,10 +3242,35 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
if (bio_ctrl->compress_type != compress_type)
return 0;
- if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
+
+ if (bio->bi_iter.bi_size == 0) {
+ /* We can always add a page into an empty bio. */
+ contig = true;
+ } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
+
+ /*
+ * The contig check requires the following conditions to be met:
+ * 1) The pages are belonging to the same inode
+ * This is implied by the call chain.
+ *
+ * 2) The range has adjacent logical bytenr
+ *
+ * 3) The range has adjacent file offset
+ * This is required for the usage of btrfs_bio->file_offset.
+ */
+ if (bio_end_sector(bio) == sector &&
+ page_offset(bvec->bv_page) + bvec->bv_offset +
+ bvec->bv_len == page_offset(page) + pg_offset)
+ contig = true;
+ } else {
+ /*
+ * For compression, all IO should have its logical bytenr
+ * set to the starting bytenr of the compressed extent.
+ */
contig = bio->bi_iter.bi_sector == sector;
- else
- contig = bio_end_sector(bio) == sector;
+ }
+
if (!contig)
return 0;
@@ -6140,6 +6165,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *exists = NULL;
struct page *p;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ u64 lockdep_owner = owner_root;
int uptodate = 1;
int ret;
@@ -6164,7 +6190,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return ERR_PTR(-ENOMEM);
- btrfs_set_buffer_lockdep_class(owner_root, eb, level);
+
+ /*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66c822182ecc..5a3f6e0d9688 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2482,6 +2482,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
@@ -2498,6 +2499,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f0c97d25b4a0..1372210869b1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1644,10 +1644,9 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
done_offset = end;
if (done_offset == start) {
- struct btrfs_fs_info *info = inode->root->fs_info;
-
- wait_var_event(&info->zone_finish_wait,
- !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags));
+ wait_on_bit_io(&inode->root->fs_info->flags,
+ BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
continue;
}
@@ -7694,6 +7693,20 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
bool unlock_extents = false;
/*
+ * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+ * we're NOWAIT we may submit a bio for a partial range and return
+ * EIOCBQUEUED, which would result in an errant short read.
+ *
+ * The best way to handle this would be to allow for partial completions
+ * of iocb's, so we could submit the partial bio, return and fault in
+ * the rest of the pages, and then submit the io for the rest of the
+ * range. However we don't have that currently, so simply return
+ * -EAGAIN at this point so that the normal path is used.
+ */
+ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+ return -EAGAIN;
+
+ /*
* Cap the size of reads to that usually seen in buffered I/O as we need
* to allocate a contiguous array for the checksums.
*/
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 33461b4f9c8b..9063072b399b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -14,6 +14,93 @@
#include "locking.h"
/*
+ * Lockdep class keys for extent_buffer->lock's in this root. For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets. As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
+ *
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock. As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
+ *
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked. It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
+ *
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#if BTRFS_MAX_LEVEL != 8
+#error
+#endif
+
+#define DEFINE_LEVEL(stem, level) \
+ .names[level] = "btrfs-" stem "-0" #level,
+
+#define DEFINE_NAME(stem) \
+ DEFINE_LEVEL(stem, 0) \
+ DEFINE_LEVEL(stem, 1) \
+ DEFINE_LEVEL(stem, 2) \
+ DEFINE_LEVEL(stem, 3) \
+ DEFINE_LEVEL(stem, 4) \
+ DEFINE_LEVEL(stem, 5) \
+ DEFINE_LEVEL(stem, 6) \
+ DEFINE_LEVEL(stem, 7)
+
+static struct btrfs_lockdep_keyset {
+ u64 id; /* root objectid */
+ /* Longest entry: btrfs-free-space-00 */
+ char names[BTRFS_MAX_LEVEL][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL];
+} btrfs_lockdep_keysets[] = {
+ { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
+ { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
+ { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
+ { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
+ { .id = 0, DEFINE_NAME("tree") },
+};
+
+#undef DEFINE_LEVEL
+#undef DEFINE_NAME
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
+{
+ struct btrfs_lockdep_keyset *ks;
+
+ BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+ /* Find the matching keyset, id 0 is the default entry */
+ for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+ if (ks->id == objectid)
+ break;
+
+ lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
+}
+
+void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
+{
+ if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid,
+ eb, btrfs_header_level(eb));
+}
+
+#endif
+
+/*
* Extent buffer locking
* =====================
*
@@ -164,6 +251,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
while (1) {
eb = btrfs_root_node(root);
+
+ btrfs_maybe_reset_lockdep_class(root, eb);
btrfs_tree_lock(eb);
if (eb == root->node)
break;
@@ -185,6 +274,8 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
while (1) {
eb = btrfs_root_node(root);
+
+ btrfs_maybe_reset_lockdep_class(root, eb);
btrfs_tree_read_lock(eb);
if (eb == root->node)
break;
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bbc45534ae9a..ab268be09bb5 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -131,4 +131,18 @@ void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level);
+void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb);
+#else
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level)
+{
+}
+static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+}
+#endif
+
#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a6dc827e75af..45c02aba2492 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1326,7 +1326,9 @@ again:
btrfs_release_path(path);
path->lowest_level = level;
+ set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+ clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
path->lowest_level = 0;
if (ret) {
if (ret > 0)
@@ -3573,7 +3575,12 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
- return btrfs_commit_transaction(trans);
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ unset_reloc_control(rc);
+
+ return ret;
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a64b26b16904..d647cb2938c0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -349,9 +349,10 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
+ err = ret;
goto out;
- if (ret == 0) {
+ } else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d0cbeb7ae81c..435559ba94fa 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -199,7 +199,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
if (flags & BTRFS_BLOCK_GROUP_DATA)
- return SZ_1G;
+ return BTRFS_MAX_DATA_CHUNK_SIZE;
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return SZ_32M;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4c7089b1681b..f89beac3c665 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1816,6 +1816,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+ s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
if (!strstr(crc32c_impl(), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 9e0e0ae2288c..43f905ab0a18 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1233,7 +1233,8 @@ static void extent_err(const struct extent_buffer *eb, int slot,
}
static int check_extent_item(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_extent_item *ei;
@@ -1453,6 +1454,26 @@ static int check_extent_item(struct extent_buffer *leaf,
total_refs, inline_refs);
return -EUCLEAN;
}
+
+ if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) ||
+ (prev_key->type == BTRFS_METADATA_ITEM_KEY)) {
+ u64 prev_end = prev_key->objectid;
+
+ if (prev_key->type == BTRFS_METADATA_ITEM_KEY)
+ prev_end += fs_info->nodesize;
+ else
+ prev_end += prev_key->offset;
+
+ if (unlikely(prev_end > key->objectid)) {
+ extent_err(leaf, slot,
+ "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
+ prev_key->objectid, prev_key->type,
+ prev_key->offset, key->objectid, key->type,
+ key->offset);
+ return -EUCLEAN;
+ }
+ }
+
return 0;
}
@@ -1621,7 +1642,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
break;
case BTRFS_EXTENT_ITEM_KEY:
case BTRFS_METADATA_ITEM_KEY:
- ret = check_extent_item(leaf, key, slot);
+ ret = check_extent_item(leaf, key, slot, prev_key);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
case BTRFS_SHARED_DATA_REF_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index dcf75a8daa20..9205c4a5ca81 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1146,7 +1146,9 @@ again:
extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
inode_objectid, parent_objectid, 0,
0);
- if (!IS_ERR_OR_NULL(extref)) {
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
@@ -1457,7 +1459,7 @@ static int add_link(struct btrfs_trans_handle *trans,
* on the inode will not free it. We will fixup the link count later.
*/
if (other_inode->i_nlink == 0)
- inc_nlink(other_inode);
+ set_nlink(other_inode, 1);
add_link:
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
name, namelen, 0, ref_index);
@@ -1600,7 +1602,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* free it. We will fixup the link count later.
*/
if (!ret && inode->i_nlink == 0)
- inc_nlink(inode);
+ set_nlink(inode, 1);
}
if (ret < 0)
goto out;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 272901514b0c..f63ff91e2883 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2345,8 +2345,11 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
&bdev, &disk_super);
- if (ret)
+ if (ret) {
+ btrfs_put_dev_args_from_path(args);
return ret;
+ }
+
args->devid = btrfs_stack_device_id(&disk_super->dev_item);
memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
@@ -5264,6 +5267,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
ctl->stripe_size);
}
+ /* Stripe size should not go beyond 1G. */
+ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
+
/* Align to BTRFS_STRIPE_LEN */
ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
ctl->chunk_size = ctl->stripe_size * data_stripes;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7421abcf325a..5bb8d8c86311 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -371,6 +371,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
const char *name, const void *buffer,
size_t size, int flags)
{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
name = xattr_full_name(handler, name);
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b150b07ba1a7..62e7007a7e46 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -421,10 +421,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
* since btrfs adds the pages one by one to a bio, and btrfs cannot
* increase the metadata reservation even if it increases the number of
* extents, it is safe to stick with the limit.
+ *
+ * With the zoned emulation, we can have non-zoned device on the zoned
+ * mode. In this case, we don't have a valid max zone append size. So,
+ * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size.
*/
- zone_info->max_zone_append_size =
- min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
- (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+ if (bdev_is_zoned(bdev)) {
+ zone_info->max_zone_append_size = min_t(u64,
+ (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+ } else {
+ zone_info->max_zone_append_size =
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT;
+ }
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
@@ -1178,7 +1187,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
* offset.
*/
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
- u64 *offset_ret)
+ u64 *offset_ret, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_root *root;
@@ -1188,6 +1197,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
int ret;
u64 length;
+ /*
+ * Avoid tree lookups for a new block group, there's no use for it.
+ * It must always be 0.
+ *
+ * Also, we have a lock chain of extent buffer lock -> chunk mutex.
+ * For new a block group, this function is called from
+ * btrfs_make_block_group() which is already taking the chunk mutex.
+ * Thus, we cannot call calculate_alloc_pointer() which takes extent
+ * buffer locks to avoid deadlock.
+ */
+ if (new) {
+ *offset_ret = 0;
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1323,6 +1347,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
else
num_conventional++;
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
+
if (!is_sequential) {
alloc_offsets[i] = WP_CONVENTIONAL;
continue;
@@ -1389,45 +1420,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
__set_bit(i, active);
break;
}
-
- /*
- * Consider a zone as active if we can allow any number of
- * active zones.
- */
- if (!device->zone_info->max_active_zones)
- __set_bit(i, active);
}
if (num_sequential > 0)
cache->seq_zone = true;
if (num_conventional > 0) {
- /*
- * Avoid calling calculate_alloc_pointer() for new BG. It
- * is no use for new BG. It must be always 0.
- *
- * Also, we have a lock chain of extent buffer lock ->
- * chunk mutex. For new BG, this function is called from
- * btrfs_make_block_group() which is already taking the
- * chunk mutex. Thus, we cannot call
- * calculate_alloc_pointer() which takes extent buffer
- * locks to avoid deadlock.
- */
-
/* Zone capacity is always zone size in emulation */
cache->zone_capacity = cache->length;
- if (new) {
- cache->alloc_offset = 0;
- goto out;
- }
- ret = calculate_alloc_pointer(cache, &last_alloc);
- if (ret || map->num_stripes == num_conventional) {
- if (!ret)
- cache->alloc_offset = last_alloc;
- else
- btrfs_err(fs_info,
+ ret = calculate_alloc_pointer(cache, &last_alloc, new);
+ if (ret) {
+ btrfs_err(fs_info,
"zoned: failed to determine allocation offset of bg %llu",
- cache->start);
+ cache->start);
+ goto out;
+ } else if (map->num_stripes == num_conventional) {
+ cache->alloc_offset = last_alloc;
+ cache->zone_is_active = 1;
goto out;
}
}
@@ -1495,13 +1504,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
- if (cache->zone_is_active) {
- btrfs_get_block_group(cache);
- spin_lock(&fs_info->zone_active_bgs_lock);
- list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
- spin_unlock(&fs_info->zone_active_bgs_lock);
- }
-
out:
if (cache->alloc_offset > fs_info->zone_size) {
btrfs_err(fs_info,
@@ -1526,10 +1528,16 @@ out:
ret = -EIO;
}
- if (!ret)
+ if (!ret) {
cache->meta_write_pointer = cache->alloc_offset + cache->start;
-
- if (ret) {
+ if (cache->zone_is_active) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list,
+ &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+ } else {
kfree(cache->physical_map);
cache->physical_map = NULL;
}
@@ -2007,8 +2015,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
/* For active_bg_list */
btrfs_put_block_group(block_group);
- clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
- wake_up_all(&fs_info->zone_finish_wait);
+ clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
return 0;
}
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 6cba2c6de2f9..2ad58c465208 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -111,6 +111,7 @@ struct cachefiles_cache {
char *tag; /* cache binding tag */
refcount_t unbind_pincount;/* refcount to do daemon unbind */
struct xarray reqs; /* xarray of pending on-demand requests */
+ unsigned long req_id_next;
struct xarray ondemand_ids; /* xarray for ondemand_id allocation */
u32 ondemand_id_next;
};
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 1fee702d5529..0254ed39f68c 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -158,9 +158,13 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
/* fail OPEN request if daemon reports an error */
if (size < 0) {
- if (!IS_ERR_VALUE(size))
- size = -EINVAL;
- req->error = size;
+ if (!IS_ERR_VALUE(size)) {
+ req->error = -EINVAL;
+ ret = -EINVAL;
+ } else {
+ req->error = size;
+ ret = 0;
+ }
goto out;
}
@@ -238,14 +242,19 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
unsigned long id = 0;
size_t n;
int ret = 0;
- XA_STATE(xas, &cache->reqs, 0);
+ XA_STATE(xas, &cache->reqs, cache->req_id_next);
/*
- * Search for a request that has not ever been processed, to prevent
- * requests from being processed repeatedly.
+ * Cyclically search for a request that has not ever been processed,
+ * to prevent requests from being processed repeatedly, and make
+ * request distribution fair.
*/
xa_lock(&cache->reqs);
req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW);
+ if (!req && cache->req_id_next > 0) {
+ xas_set(&xas, 0);
+ req = xas_find_marked(&xas, cache->req_id_next - 1, CACHEFILES_REQ_NEW);
+ }
if (!req) {
xa_unlock(&cache->reqs);
return 0;
@@ -260,6 +269,7 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
}
xas_clear_mark(&xas, CACHEFILES_REQ_NEW);
+ cache->req_id_next = xas.xa_index + 1;
xa_unlock(&cache->reqs);
id = xas.xa_index;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d6e5916138e4..dcf701b05cc1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
* Reference snap context in folio->private. Also set
* PagePrivate so that we get invalidate_folio callback.
*/
- VM_BUG_ON_FOLIO(folio_test_private(folio), folio);
+ VM_WARN_ON_FOLIO(folio->private, folio);
folio_attach_private(folio, snapc);
return ceph_fscache_dirty_folio(mapping, folio);
@@ -237,7 +237,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
if (err >= 0 && err < subreq->len)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- netfs_subreq_terminated(subreq, err, true);
+ netfs_subreq_terminated(subreq, err, false);
num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
ceph_put_page_vector(osd_data->pages, num_pages, false);
@@ -313,8 +313,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
int err = 0;
u64 len = subreq->len;
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
- ceph_netfs_issue_op_inline(subreq))
+ if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
@@ -329,7 +328,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
- err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
+ err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
goto out;
@@ -338,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
/* should always give us a page-aligned read */
WARN_ON_ONCE(page_off);
len = err;
+ err = 0;
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
req->r_callback = finish_netfs_read;
@@ -345,9 +345,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req->r_inode = inode;
ihold(inode);
- err = ceph_osdc_start_request(req->r_osdc, req, false);
- if (err)
- iput(inode);
+ ceph_osdc_start_request(req->r_osdc, req);
out:
ceph_osdc_put_request(req);
if (err)
@@ -621,9 +619,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(osdc, req, true);
- if (!err)
- err = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_start_request(osdc, req);
+ err = ceph_osdc_wait_request(osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
@@ -1151,8 +1148,7 @@ new_request:
}
req->r_mtime = inode->i_mtime;
- rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
- BUG_ON(rc);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
req = NULL;
wbc->nr_to_write -= i;
@@ -1327,16 +1323,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
int r;
r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
- if (r == 0)
- folio_wait_fscache(folio);
- if (r < 0) {
- if (folio)
- folio_put(folio);
- } else {
- WARN_ON_ONCE(!folio_test_locked(folio));
- *pagep = &folio->page;
- }
- return r;
+ if (r < 0)
+ return r;
+
+ folio_wait_fscache(folio);
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ *pagep = &folio->page;
+ return 0;
}
/*
@@ -1439,7 +1432,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
inode, off, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
- ci->i_inline_version == CEPH_INLINE_NONE) {
+ !ceph_has_inline_data(ci)) {
CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
ceph_add_rw_context(fi, &rw_ctx);
ret = filemap_fault(vmf);
@@ -1696,9 +1689,8 @@ int ceph_uninline_data(struct file *file)
}
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
goto out_unlock;
@@ -1739,9 +1731,8 @@ int ceph_uninline_data(struct file *file)
}
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
@@ -1912,15 +1903,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
0, false, true);
- err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+ ceph_osdc_start_request(&fsc->client->osdc, rd_req);
wr_req->r_mtime = ci->netfs.inode.i_mtime;
- err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+ ceph_osdc_start_request(&fsc->client->osdc, wr_req);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
- if (!err2)
- err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+ err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ac8fd5e7f540..53cfe026b3ea 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -602,8 +602,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* @ci: inode to be moved
* @session: new auth caps session
*/
-static void change_auth_cap_ses(struct ceph_inode_info *ci,
- struct ceph_mds_session *session)
+void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
{
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1978,14 +1978,15 @@ retry:
}
dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode),
+ " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
+ (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
/*
* If we no longer need to hold onto old our caps, and we may
@@ -3005,7 +3006,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
}
if (S_ISREG(ci->netfs.inode.i_mode) &&
- ci->i_inline_version != CEPH_INLINE_NONE &&
+ ceph_has_inline_data(ci) &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
struct page *page =
@@ -3578,24 +3579,23 @@ static void handle_cap_grant(struct inode *inode,
fill_inline = true;
}
- if (ci->i_auth_cap == cap &&
- le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- if (newcaps & ~extra_info->issued)
- wake = true;
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ if (ci->i_auth_cap == cap) {
+ if (newcaps & ~extra_info->issued)
+ wake = true;
- if (ci->i_requested_max_size > max_size ||
- !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
- /* re-request max_size if necessary */
- ci->i_requested_max_size = 0;
- wake = true;
- }
+ if (ci->i_requested_max_size > max_size ||
+ !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
+ /* re-request max_size if necessary */
+ ci->i_requested_max_size = 0;
+ wake = true;
+ }
- ceph_kick_flushing_inode_caps(session, ci);
- spin_unlock(&ci->i_ceph_lock);
+ ceph_kick_flushing_inode_caps(session, ci);
+ }
up_read(&session->s_mdsc->snap_rwsem);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
+ spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index eae417d71136..e7e2ebac330d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_quota_is_max_files_exceeded(dir)) {
err = -EDQUOT;
goto out;
@@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_quota_is_max_files_exceeded(dir)) {
err = -EDQUOT;
goto out;
@@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
- int err = -EROFS;
+ int err;
int op;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
@@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
+ err = -EROFS;
goto out;
}
@@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct ceph_mds_request *req;
int err;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
@@ -1071,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct dentry *dentry = req->r_dentry;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
+ __func__, dentry, dentry);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
+ spin_unlock(&dentry->d_lock);
+
+ synchronize_rcu();
+
if (result == -EJUKEBOX)
goto out;
@@ -1081,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(dentry, &pathlen,
&base, 0);
/* mark error on parent + clear complete */
@@ -1089,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
ceph_dir_clear_complete(req->r_parent);
/* drop the dentry -- we don't know its status */
- if (!d_unhashed(req->r_dentry))
- d_drop(req->r_dentry);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
/* mark inode itself for an error (since metadata is bogus) */
mapping_set_error(req->r_old_inode->i_mapping, result);
- pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
+ pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
}
@@ -1180,6 +1215,8 @@ retry:
if (try_async && op == CEPH_MDS_OP_UNLINK &&
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
dentry->d_name.len, dentry->d_name.name,
ceph_cap_string(req->r_dir_caps));
@@ -1187,6 +1224,16 @@ retry:
req->r_callback = ceph_async_unlink_cb;
req->r_old_inode = d_inode(dentry);
ihold(req->r_old_inode);
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
+ dentry->d_name.hash);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
/*
@@ -1195,10 +1242,20 @@ retry:
*/
drop_nlink(inode);
d_delete(dentry);
- } else if (err == -EJUKEBOX) {
- try_async = false;
- ceph_mdsc_put_request(req);
- goto retry;
+ } else {
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ if (err == -EJUKEBOX) {
+ try_async = false;
+ ceph_mdsc_put_request(req);
+ goto retry;
+ }
}
} else {
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
@@ -1237,6 +1294,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
(!ceph_quota_is_same_realm(old_dir, new_dir)))
return -EXDEV;
+ err = ceph_wait_on_conflict_unlink(new_dentry);
+ if (err)
+ return err;
+
dout("rename dir %p dentry %p to dir %p dentry %p\n",
old_dir, old_dentry, new_dir, new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index da59e836a06e..04fd34557de8 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -95,12 +95,11 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
size_t start;
int idx = 0;
- bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+ bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
ITER_GET_BVECS_PAGES, &start);
if (bytes < 0)
return size ?: bytes;
- iov_iter_advance(iter, bytes);
size += bytes;
for ( ; bytes; idx++, bvec_idx++) {
@@ -241,8 +240,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
INIT_LIST_HEAD(&fi->rw_contexts);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
- if ((file->f_mode & FMODE_WRITE) &&
- ci->i_inline_version != CEPH_INLINE_NONE) {
+ if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
ret = ceph_uninline_data(file);
if (ret < 0)
goto error;
@@ -569,7 +567,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
- pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+ pr_warn("async create failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
@@ -612,6 +610,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
struct ceph_mds_reply_inode in = { };
struct ceph_mds_reply_info_in iinfo = { .in = &in };
struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
struct inode *inode;
struct timespec64 now;
struct ceph_string *pool_ns;
@@ -657,10 +656,6 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
/* Directories always inherit the setgid bit. */
if (S_ISDIR(mode))
mode |= S_ISGID;
- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
- !in_group_p(dir->i_gid) &&
- !capable_wrt_inode_uidgid(&init_user_ns, dir, CAP_FSETID))
- mode &= ~S_ISGID;
} else {
in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
}
@@ -714,6 +709,12 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
file->f_mode |= FMODE_CREATED;
ret = finish_open(file, dentry, ceph_open);
}
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_CREATE;
+ wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT);
+ spin_unlock(&dentry->d_lock);
+
return ret;
}
@@ -740,6 +741,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+ /*
+ * Do not truncate the file, since atomic_open is called before the
+ * permission check. The caller will do the truncation afterward.
+ */
+ flags &= ~O_TRUNC;
+
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
@@ -786,9 +796,16 @@ retry:
(req->r_dir_caps =
try_prep_async_create(dir, dentry, &lo,
&req->r_deleg_ino))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
req->r_callback = ceph_async_create_cb;
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_CREATE;
+ spin_unlock(&dentry->d_lock);
+
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
err = ceph_finish_async_create(dir, dentry,
@@ -807,9 +824,7 @@ retry:
}
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
- err = ceph_mdsc_do_request(mdsc,
- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
- req);
+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
if (err == -ENOENT) {
dentry = ceph_handle_snapdir(req, dentry);
if (IS_ERR(dentry)) {
@@ -965,9 +980,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
false, false);
- ret = ceph_osdc_start_request(osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
ceph_update_read_metrics(&fsc->mdsc->metric,
req->r_start_latency,
@@ -1230,7 +1244,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_inode = inode;
req->r_priv = aio_req;
- ret = ceph_osdc_start_request(req->r_osdc, req, false);
+ ceph_osdc_start_request(req->r_osdc, req);
out:
if (ret < 0) {
req->r_result = ret;
@@ -1262,7 +1276,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
- bool should_dirty = !write && iter_is_iovec(iter);
+ bool should_dirty = !write && user_backed_iter(iter);
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -1367,9 +1381,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
continue;
}
- ret = ceph_osdc_start_request(req->r_osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(req->r_osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (write)
ceph_update_write_metrics(metric, req->r_start_latency,
@@ -1432,8 +1445,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
r_private_item);
list_del_init(&req->r_private_item);
if (ret >= 0)
- ret = ceph_osdc_start_request(req->r_osdc,
- req, false);
+ ceph_osdc_start_request(req->r_osdc, req);
if (ret < 0) {
req->r_result = ret;
ceph_aio_complete_req(req);
@@ -1546,9 +1558,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
false, true);
req->r_mtime = mtime;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, ret);
@@ -1632,7 +1643,7 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
- if (ci->i_inline_version == CEPH_INLINE_NONE) {
+ if (!ceph_has_inline_data(ci)) {
if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
ret = ceph_direct_read_write(iocb, to,
NULL, NULL);
@@ -1895,7 +1906,7 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1935,57 +1946,15 @@ out_unlocked:
*/
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *inode = file->f_mapping->host;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- loff_t i_size;
- loff_t ret;
-
- inode_lock(inode);
-
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
+ struct inode *inode = file_inode(file);
+ int ret;
+
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
if (ret < 0)
- goto out;
- }
-
- i_size = i_size_read(inode);
- switch (whence) {
- case SEEK_END:
- offset += i_size;
- break;
- case SEEK_CUR:
- /*
- * Here we special-case the lseek(fd, 0, SEEK_CUR)
- * position-querying operation. Avoid rewriting the "same"
- * f_pos value back to the file because a concurrent read(),
- * write() or lseek() might have altered it
- */
- if (offset == 0) {
- ret = file->f_pos;
- goto out;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset < 0 || offset >= i_size) {
- ret = -ENXIO;
- goto out;
- }
- break;
- case SEEK_HOLE:
- if (offset < 0 || offset >= i_size) {
- ret = -ENXIO;
- goto out;
- }
- offset = i_size;
- break;
+ return ret;
}
-
- ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size));
-
-out:
- inode_unlock(inode);
- return ret;
+ return generic_file_llseek(file, offset, whence);
}
static inline void ceph_zero_partial_page(
@@ -2054,12 +2023,10 @@ static int ceph_zero_partial_object(struct inode *inode,
}
req->r_mtime = inode->i_mtime;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret) {
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
- if (ret == -ENOENT)
- ret = 0;
- }
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
ceph_osdc_put_request(req);
out:
@@ -2361,7 +2328,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
if (IS_ERR(req))
ret = PTR_ERR(req);
else {
- ceph_osdc_start_request(osdc, req, false);
+ ceph_osdc_start_request(osdc, req);
ret = ceph_osdc_wait_request(osdc, req);
ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
req->r_start_latency,
@@ -2554,7 +2521,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Let the MDS know about dst file size change */
if (ceph_inode_set_size(dst_inode, dst_off) ||
ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
- ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH,
+ NULL);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 56c53ab3618e..42351d7a0dd6 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1049,7 +1049,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
iinfo->inline_version >= ci->i_inline_version) {
int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
ci->i_inline_version = iinfo->inline_version;
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (ceph_has_inline_data(ci) &&
(locked_page || (info_caps & cache_caps)))
fill_inline = true;
}
@@ -2275,9 +2275,15 @@ int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
*
* This cost much when doing the Locker state transition and
* usually will need to revoke caps from clients.
+ *
+ * And for the 'Xs' caps for getxattr we will also choose the
+ * auth MDS, because the MDS side code is buggy due to setxattr
+ * won't notify the replica MDSes when the values changed and
+ * the replica MDS will return the old values. Though we will
+ * fix it in MDS code, but this still makes sense for old ceph.
*/
if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
- || (mask & CEPH_STAT_RSTAT))
+ || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
return USE_AUTH_MDS;
else
return USE_ANY_MDS;
@@ -2321,7 +2327,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (inline_version == 0) {
/* the reply is supposed to contain inline data */
err = -EINVAL;
- } else if (inline_version == CEPH_INLINE_NONE) {
+ } else if (inline_version == CEPH_INLINE_NONE ||
+ inline_version == 1) {
err = -ENODATA;
} else {
err = req->r_reply_info.targeti.inline_len;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 33f517d549ce..80f8b9ec1a31 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -456,7 +456,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
dout("added delegated inode 0x%llx\n",
start - 1);
} else if (err == -EBUSY) {
- pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
+ pr_warn("MDS delegated inode 0x%llx more than once.\n",
start - 1);
} else {
return err;
@@ -655,6 +655,79 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
}
+/*
+ * In async unlink case the kclient won't wait for the first reply
+ * from MDS and just drop all the links and unhash the dentry and then
+ * succeeds immediately.
+ *
+ * For any new create/link/rename,etc requests followed by using the
+ * same file names we must wait for the first reply of the inflight
+ * unlink request, or the MDS possibly will fail these following
+ * requests with -EEXIST if the inflight async unlink request was
+ * delayed for some reasons.
+ *
+ * And the worst case is that for the none async openc request it will
+ * successfully open the file if the CDentry hasn't been unlinked yet,
+ * but later the previous delayed async unlink request will remove the
+ * CDenty. That means the just created file is possiblly deleted later
+ * by accident.
+ *
+ * We need to wait for the inflight async unlink requests to finish
+ * when creating new files/directories by using the same file names.
+ */
+int ceph_wait_on_conflict_unlink(struct dentry *dentry)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct dentry *pdentry = dentry->d_parent;
+ struct dentry *udentry, *found = NULL;
+ struct ceph_dentry_info *di;
+ struct qstr dname;
+ u32 hash = dentry->d_name.hash;
+ int err;
+
+ dname.name = dentry->d_name.name;
+ dname.len = dentry->d_name.len;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
+ hnode, hash) {
+ udentry = di->dentry;
+
+ spin_lock(&udentry->d_lock);
+ if (udentry->d_name.hash != hash)
+ goto next;
+ if (unlikely(udentry->d_parent != pdentry))
+ goto next;
+ if (!hash_hashed(&di->hnode))
+ goto next;
+
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
+ __func__, dentry, dentry);
+
+ if (!d_same_name(udentry, pdentry, &dname))
+ goto next;
+
+ spin_unlock(&udentry->d_lock);
+ found = dget(udentry);
+ break;
+next:
+ spin_unlock(&udentry->d_lock);
+ }
+ rcu_read_unlock();
+
+ if (likely(!found))
+ return 0;
+
+ dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
+ dentry, dentry, found, found);
+
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
+ TASK_KILLABLE);
+ dput(found);
+ return err;
+}
+
/*
* sessions
@@ -1220,14 +1293,17 @@ static int encode_supported_features(void **p, void *end)
if (count > 0) {
size_t i;
size_t size = FEATURE_BYTES(count);
+ unsigned long bit;
if (WARN_ON_ONCE(*p + 4 + size > end))
return -ERANGE;
ceph_encode_32(p, size);
memset(*p, 0, size);
- for (i = 0; i < count; i++)
- ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
+ for (i = 0; i < count; i++) {
+ bit = feature_bits[i];
+ ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
+ }
*p += size;
} else {
if (WARN_ON_ONCE(*p + 4 > end))
@@ -2884,6 +2960,64 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_request_started == 0) /* note request start time */
req->r_request_started = jiffies;
+ /*
+ * For async create we will choose the auth MDS of frag in parent
+ * directory to send the request and ususally this works fine, but
+ * if the migrated the dirtory to another MDS before it could handle
+ * it the request will be forwarded.
+ *
+ * And then the auth cap will be changed.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {
+ struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+
+ /*
+ * The request maybe handled very fast and the new inode
+ * hasn't been linked to the dentry yet. We need to wait
+ * for the ceph_finish_async_create(), which shouldn't be
+ * stuck too long or fail in thoery, to finish when forwarding
+ * the request.
+ */
+ if (!d_inode(req->r_dentry)) {
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,
+ TASK_KILLABLE);
+ if (err) {
+ mutex_lock(&req->r_fill_mutex);
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ goto out_session;
+ }
+ }
+
+ ci = ceph_inode(d_inode(req->r_dentry));
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
+ dout("do_request session changed for auth cap %d -> %d\n",
+ cap->session->s_mds, session->s_mds);
+
+ /* Remove the auth cap from old session */
+ spin_lock(&cap->session->s_cap_lock);
+ cap->session->s_nr_caps--;
+ list_del_init(&cap->session_caps);
+ spin_unlock(&cap->session->s_cap_lock);
+
+ /* Add the auth cap to the new session */
+ cap->mds = mds;
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ session->s_nr_caps++;
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
+ change_auth_cap_ses(ci, session);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ }
+
err = __send_request(session, req, false);
out_session:
@@ -3464,11 +3598,26 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_OPEN:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect success\n", session->s_mds);
- session->s_state = CEPH_MDS_SESSION_OPEN;
- session->s_features = features;
- renewed_caps(mdsc, session, 0);
- if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features))
- metric_schedule_delayed(&mdsc->metric);
+
+ if (session->s_state == CEPH_MDS_SESSION_OPEN) {
+ pr_notice("mds%d is already opened\n", session->s_mds);
+ } else {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ session->s_features = features;
+ renewed_caps(mdsc, session, 0);
+ if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
+ &session->s_features))
+ metric_schedule_delayed(&mdsc->metric);
+ }
+
+ /*
+ * The connection maybe broken and the session in client
+ * side has been reinitialized, need to update the seq
+ * anyway.
+ */
+ if (!session->s_seq && seq)
+ session->s_seq = seq;
+
wake = 1;
if (mdsc->stopping)
__close_session(mdsc, session);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 1140aecd82ce..256e3eada6c1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -29,14 +29,12 @@ enum ceph_feature_type {
CEPHFS_FEATURE_MULTI_RECONNECT,
CEPHFS_FEATURE_DELEG_INO,
CEPHFS_FEATURE_METRIC_COLLECT,
+ CEPHFS_FEATURE_ALTERNATE_NAME,
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
};
-/*
- * This will always have the highest feature bit value
- * as the last element of the array.
- */
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
@@ -45,10 +43,8 @@ enum ceph_feature_type {
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
CEPHFS_FEATURE_METRIC_COLLECT, \
- \
- CEPHFS_FEATURE_MAX, \
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
}
-#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
/*
* Some lock dependencies:
@@ -582,6 +578,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
TASK_KILLABLE);
}
+extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 30387733765d..8d0a6d2c2da4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -352,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
__decode_and_drop_type(p, end, u8, bad_ext);
}
if (mdsmap_ev >= 8) {
- u32 name_len;
/* enabled */
ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
- ceph_decode_32_safe(p, end, name_len, bad_ext);
- ceph_decode_need(p, end, name_len, bad_ext);
- *p += name_len;
+ /* fs_name */
+ ceph_decode_skip_string(p, end, bad_ext);
}
/* damaged */
if (mdsmap_ev >= 9) {
@@ -370,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
} else {
m->m_damaged = false;
}
+ if (mdsmap_ev >= 17) {
+ /* balancer */
+ ceph_decode_skip_string(p, end, bad_ext);
+ /* standby_count_wanted */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* old_max_mds */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* min_compat_client */
+ ceph_decode_skip_8(p, end, bad_ext);
+ /* required_client_features */
+ ceph_decode_skip_set(p, end, 64, bad_ext);
+ ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
+ } else {
+ /* This forces the usage of the (sync) SETXATTR Op */
+ m->m_max_xattr_size = 0;
+ }
bad_ext:
dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
!!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 40140805bdcf..3fc48b43cab0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -72,15 +72,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
/*
- * express utilization in terms of large blocks to avoid
+ * Express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
- *
- * NOTE: for the time being, we make bsize == frsize to humor
- * not-yet-ancient versions of glibc that are broken.
- * Someday, we will probably want to report a real block
- * size... whatever that may mean for a network file system!
*/
- buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
/*
@@ -95,6 +89,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
}
+ /*
+ * NOTE: for the time being, we make bsize == frsize to humor
+ * not-yet-ancient versions of glibc that are broken.
+ * Someday, we will probably want to report a real block
+ * size... whatever that may mean for a network file system!
+ */
+ buf->f_bsize = buf->f_frsize;
+
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
@@ -816,6 +818,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->cap_wq)
goto fail_inode_wq;
+ hash_init(fsc->async_unlink_conflict);
+ spin_lock_init(&fsc->async_unlink_conflict_lock);
+
spin_lock(&ceph_fsc_lock);
list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
spin_unlock(&ceph_fsc_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f59dac66955b..40630e6f691c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -19,6 +19,7 @@
#include <linux/security.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
+#include <linux/hashtable.h>
#include <linux/ceph/libceph.h>
@@ -99,6 +100,8 @@ struct ceph_mount_options {
char *mon_addr;
};
+#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8
+
struct ceph_fs_client {
struct super_block *sb;
@@ -124,6 +127,9 @@ struct ceph_fs_client {
struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;
+ DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
+ spinlock_t async_unlink_conflict_lock;
+
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
@@ -280,7 +286,8 @@ struct ceph_dentry_info {
struct dentry *dentry;
struct ceph_mds_session *lease_session;
struct list_head lease_list;
- unsigned flags;
+ struct hlist_node hnode;
+ unsigned long flags;
int lease_shared_gen;
u32 lease_gen;
u32 lease_seq;
@@ -289,10 +296,14 @@ struct ceph_dentry_info {
u64 offset;
};
-#define CEPH_DENTRY_REFERENCED 1
-#define CEPH_DENTRY_LEASE_LIST 2
-#define CEPH_DENTRY_SHRINK_LIST 4
-#define CEPH_DENTRY_PRIMARY_LINK 8
+#define CEPH_DENTRY_REFERENCED (1 << 0)
+#define CEPH_DENTRY_LEASE_LIST (1 << 1)
+#define CEPH_DENTRY_SHRINK_LIST (1 << 2)
+#define CEPH_DENTRY_PRIMARY_LINK (1 << 3)
+#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4)
+#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
+#define CEPH_DENTRY_ASYNC_CREATE_BIT (5)
+#define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT)
struct ceph_inode_xattrs_info {
/*
@@ -758,6 +769,8 @@ extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
extern void ceph_reservation_status(struct ceph_fs_client *client,
int *total, int *avail, int *used,
int *reserved, int *min);
+extern void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session);
@@ -1218,6 +1231,14 @@ extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
+static inline bool ceph_has_inline_data(struct ceph_inode_info *ci)
+{
+ if (ci->i_inline_version == CEPH_INLINE_NONE ||
+ ci->i_inline_version == 1) /* initial version, no data */
+ return false;
+ return true;
+}
+
/* file.c */
extern const struct file_operations ceph_file_fops;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index f141f5246163..f31350cda960 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1086,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
flags |= CEPH_XATTR_REMOVE;
}
- dout("setxattr value=%.*s\n", (int)size, value);
+ dout("setxattr value size: %zu\n", size);
/* do request */
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1184,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name,
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
- if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+ if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
+ (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
+ dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
+ __func__, ci->i_xattrs.version, required_blob_size,
+ mdsc->mdsmap->m_max_xattr_size);
goto do_sync;
+ }
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
@@ -1201,8 +1207,6 @@ retry:
ceph_cap_string(issued));
__build_xattrs(inode);
- required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
if (!ci->i_xattrs.prealloc_blob ||
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
struct ceph_buffer *blob;
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 8c9f2c00be72..7c9785973f49 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,9 +5,9 @@
ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_CIFS) += cifs.o
-cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
+cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \
inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \
- cifs_unicode.o nterr.o cifsencrypt.o \
+ cached_dir.o cifs_unicode.o nterr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o unc.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
@@ -31,4 +31,4 @@ cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o
-cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o
+cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o cifssmb.o
diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c
new file mode 100644
index 000000000000..b401339f6e73
--- /dev/null
+++ b/fs/cifs/cached_dir.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions to handle the cached directory entries
+ *
+ * Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com>
+ */
+
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "smb2proto.h"
+#include "cached_dir.h"
+
+/*
+ * Open the and cache a directory handle.
+ * If error then *cfid is not initialized.
+ */
+int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
+ const char *path,
+ struct cifs_sb_info *cifs_sb,
+ bool lookup_only, struct cached_fid **ret_cfid)
+{
+ struct cifs_ses *ses;
+ struct TCP_Server_Info *server;
+ struct cifs_open_parms oparms;
+ struct smb2_create_rsp *o_rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ int resp_buftype[2];
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov[1];
+ int rc, flags = 0;
+ __le16 utf16_path = 0; /* Null - since an open of top of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_II;
+ struct cifs_fid *pfid;
+ struct dentry *dentry;
+ struct cached_fid *cfid;
+
+ if (tcon == NULL || tcon->nohandlecache ||
+ is_smb1_server(tcon->ses->server))
+ return -EOPNOTSUPP;
+
+ ses = tcon->ses;
+ server = ses->server;
+
+ if (cifs_sb->root == NULL)
+ return -ENOENT;
+
+ if (strlen(path))
+ return -ENOENT;
+
+ dentry = cifs_sb->root;
+
+ cfid = tcon->cfid;
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->is_valid) {
+ cifs_dbg(FYI, "found a cached root file handle\n");
+ *ret_cfid = cfid;
+ kref_get(&cfid->refcount);
+ mutex_unlock(&cfid->fid_mutex);
+ return 0;
+ }
+
+ /*
+ * We do not hold the lock for the open because in case
+ * SMB2_open needs to reconnect, it will end up calling
+ * cifs_mark_open_files_invalid() which takes the lock again
+ * thus causing a deadlock
+ */
+ mutex_unlock(&cfid->fid_mutex);
+
+ if (lookup_only)
+ return -ENOENT;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ if (!server->ops->new_lease_key)
+ return -EIO;
+
+ pfid = &cfid->fid;
+ server->ops->new_lease_key(pfid);
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ oparms.tcon = tcon;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE);
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = pfid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, server,
+ &rqst[0], &oplock, &oparms, &utf16_path);
+ if (rc)
+ goto oshr_free;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ if (rc)
+ goto oshr_free;
+
+ smb2_set_related(&rqst[1]);
+
+ rc = compound_send_recv(xid, ses, server,
+ flags, 2, rqst,
+ resp_buftype, rsp_iov);
+ mutex_lock(&cfid->fid_mutex);
+
+ /*
+ * Now we need to check again as the cached root might have
+ * been successfully re-opened from a concurrent process
+ */
+
+ if (cfid->is_valid) {
+ /* work was already done */
+
+ /* stash fids for close() later */
+ struct cifs_fid fid = {
+ .persistent_fid = pfid->persistent_fid,
+ .volatile_fid = pfid->volatile_fid,
+ };
+
+ /*
+ * caller expects this func to set the fid in cfid to valid
+ * cached root, so increment the refcount.
+ */
+ kref_get(&cfid->refcount);
+
+ mutex_unlock(&cfid->fid_mutex);
+
+ if (rc == 0) {
+ /* close extra handle outside of crit sec */
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ }
+ rc = 0;
+ goto oshr_free;
+ }
+
+ /* Cached root is still invalid, continue normaly */
+
+ if (rc) {
+ if (rc == -EREMCHG) {
+ tcon->need_reconnect = true;
+ pr_warn_once("server share %s deleted\n",
+ tcon->treeName);
+ }
+ goto oshr_exit;
+ }
+
+ atomic_inc(&tcon->num_remote_opens);
+
+ o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
+ oparms.fid->persistent_fid = o_rsp->PersistentFileId;
+ oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+ oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
+
+ cfid->tcon = tcon;
+ cfid->is_valid = true;
+ cfid->dentry = dentry;
+ dget(dentry);
+ kref_init(&cfid->refcount);
+
+ /* BB TBD check to see if oplock level check can be removed below */
+ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
+ /*
+ * See commit 2f94a3125b87. Increment the refcount when we
+ * get a lease for root, release it if lease break occurs
+ */
+ kref_get(&cfid->refcount);
+ cfid->has_lease = true;
+ smb2_parse_contexts(server, o_rsp,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key, &oplock,
+ NULL, NULL);
+ } else
+ goto oshr_exit;
+
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ goto oshr_exit;
+ if (!smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ sizeof(struct smb2_file_all_info),
+ &rsp_iov[1], sizeof(struct smb2_file_all_info),
+ (char *)&cfid->file_all_info))
+ cfid->file_all_info_is_valid = true;
+
+ cfid->time = jiffies;
+
+oshr_exit:
+ mutex_unlock(&cfid->fid_mutex);
+oshr_free:
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ if (rc == 0)
+ *ret_cfid = cfid;
+
+ return rc;
+}
+
+int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry,
+ struct cached_fid **ret_cfid)
+{
+ struct cached_fid *cfid;
+
+ cfid = tcon->cfid;
+
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->dentry == dentry) {
+ cifs_dbg(FYI, "found a cached root file handle by dentry\n");
+ *ret_cfid = cfid;
+ kref_get(&cfid->refcount);
+ mutex_unlock(&cfid->fid_mutex);
+ return 0;
+ }
+ mutex_unlock(&cfid->fid_mutex);
+ return -ENOENT;
+}
+
+static void
+smb2_close_cached_fid(struct kref *ref)
+{
+ struct cached_fid *cfid = container_of(ref, struct cached_fid,
+ refcount);
+ struct cached_dirent *dirent, *q;
+
+ if (cfid->is_valid) {
+ cifs_dbg(FYI, "clear cached root file handle\n");
+ SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid);
+ }
+
+ /*
+ * We only check validity above to send SMB2_close,
+ * but we still need to invalidate these entries
+ * when this function is called
+ */
+ cfid->is_valid = false;
+ cfid->file_all_info_is_valid = false;
+ cfid->has_lease = false;
+ if (cfid->dentry) {
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+ }
+ /*
+ * Delete all cached dirent names
+ */
+ mutex_lock(&cfid->dirents.de_mutex);
+ list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) {
+ list_del(&dirent->entry);
+ kfree(dirent->name);
+ kfree(dirent);
+ }
+ cfid->dirents.is_valid = 0;
+ cfid->dirents.is_failed = 0;
+ cfid->dirents.ctx = NULL;
+ cfid->dirents.pos = 0;
+ mutex_unlock(&cfid->dirents.de_mutex);
+
+}
+
+void close_cached_dir(struct cached_fid *cfid)
+{
+ mutex_lock(&cfid->fid_mutex);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ mutex_unlock(&cfid->fid_mutex);
+}
+
+void close_cached_dir_lease_locked(struct cached_fid *cfid)
+{
+ if (cfid->has_lease) {
+ cfid->has_lease = false;
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ }
+}
+
+void close_cached_dir_lease(struct cached_fid *cfid)
+{
+ mutex_lock(&cfid->fid_mutex);
+ close_cached_dir_lease_locked(cfid);
+ mutex_unlock(&cfid->fid_mutex);
+}
+
+/*
+ * Called from cifs_kill_sb when we unmount a share
+ */
+void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
+{
+ struct rb_root *root = &cifs_sb->tlink_tree;
+ struct rb_node *node;
+ struct cached_fid *cfid;
+ struct cifs_tcon *tcon;
+ struct tcon_link *tlink;
+
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+ tcon = tlink_tcon(tlink);
+ if (IS_ERR(tcon))
+ continue;
+ cfid = tcon->cfid;
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->dentry) {
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+ }
+ mutex_unlock(&cfid->fid_mutex);
+ }
+}
+
+/*
+ * Invalidate and close all cached dirs when a TCON has been reset
+ * due to a session loss.
+ */
+void invalidate_all_cached_dirs(struct cifs_tcon *tcon)
+{
+ mutex_lock(&tcon->cfid->fid_mutex);
+ tcon->cfid->is_valid = false;
+ /* cached handle is not valid, so SMB2_CLOSE won't be sent below */
+ close_cached_dir_lease_locked(tcon->cfid);
+ memset(&tcon->cfid->fid, 0, sizeof(struct cifs_fid));
+ mutex_unlock(&tcon->cfid->fid_mutex);
+}
+
+static void
+smb2_cached_lease_break(struct work_struct *work)
+{
+ struct cached_fid *cfid = container_of(work,
+ struct cached_fid, lease_break);
+
+ close_cached_dir_lease(cfid);
+}
+
+int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16])
+{
+ if (tcon->cfid->is_valid &&
+ !memcmp(lease_key,
+ tcon->cfid->fid.lease_key,
+ SMB2_LEASE_KEY_SIZE)) {
+ tcon->cfid->time = 0;
+ INIT_WORK(&tcon->cfid->lease_break,
+ smb2_cached_lease_break);
+ queue_work(cifsiod_wq,
+ &tcon->cfid->lease_break);
+ return true;
+ }
+ return false;
+}
+
+struct cached_fid *init_cached_dir(void)
+{
+ struct cached_fid *cfid;
+
+ cfid = kzalloc(sizeof(*cfid), GFP_KERNEL);
+ if (!cfid)
+ return NULL;
+ INIT_LIST_HEAD(&cfid->dirents.entries);
+ mutex_init(&cfid->dirents.de_mutex);
+ mutex_init(&cfid->fid_mutex);
+ return cfid;
+}
+
+void free_cached_dir(struct cifs_tcon *tcon)
+{
+ kfree(tcon->cfid);
+}
diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h
new file mode 100644
index 000000000000..bd262dc8b179
--- /dev/null
+++ b/fs/cifs/cached_dir.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Functions to handle the cached directory entries
+ *
+ * Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com>
+ */
+
+#ifndef _CACHED_DIR_H
+#define _CACHED_DIR_H
+
+
+struct cached_dirent {
+ struct list_head entry;
+ char *name;
+ int namelen;
+ loff_t pos;
+
+ struct cifs_fattr fattr;
+};
+
+struct cached_dirents {
+ bool is_valid:1;
+ bool is_failed:1;
+ struct dir_context *ctx; /*
+ * Only used to make sure we only take entries
+ * from a single context. Never dereferenced.
+ */
+ struct mutex de_mutex;
+ int pos; /* Expected ctx->pos */
+ struct list_head entries;
+};
+
+struct cached_fid {
+ bool is_valid:1; /* Do we have a useable root fid */
+ bool file_all_info_is_valid:1;
+ bool has_lease:1;
+ unsigned long time; /* jiffies of when lease was taken */
+ struct kref refcount;
+ struct cifs_fid fid;
+ struct mutex fid_mutex;
+ struct cifs_tcon *tcon;
+ struct dentry *dentry;
+ struct work_struct lease_break;
+ struct smb2_file_all_info file_all_info;
+ struct cached_dirents dirents;
+};
+
+extern struct cached_fid *init_cached_dir(void);
+extern void free_cached_dir(struct cifs_tcon *tcon);
+extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
+ const char *path,
+ struct cifs_sb_info *cifs_sb,
+ bool lookup_only, struct cached_fid **cfid);
+extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry,
+ struct cached_fid **cfid);
+extern void close_cached_dir(struct cached_fid *cfid);
+extern void close_cached_dir_lease(struct cached_fid *cfid);
+extern void close_cached_dir_lease_locked(struct cached_fid *cfid);
+extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb);
+extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon);
+extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]);
+
+#endif /* _CACHED_DIR_H */
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 2cfbac8bb965..c05477e28cff 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -36,13 +36,13 @@ cifs_dump_mem(char *label, void *data, int length)
void cifs_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
- struct smb_hdr *smb = (struct smb_hdr *)buf;
+ struct smb_hdr *smb = buf;
cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
smb->Command, smb->Status.CifsError,
smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
cifs_dbg(VFS, "smb buf %p len %u\n", smb,
- server->ops->calc_smb_size(smb, server));
+ server->ops->calc_smb_size(smb));
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -55,7 +55,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
return;
cifs_dbg(VFS, "Dump pending requests:\n");
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n",
mid_entry->mid_state,
@@ -78,7 +78,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
mid_entry->resp_buf, 62);
}
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -168,7 +168,6 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
{
- struct list_head *tmp, *tmp1, *tmp2;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -184,14 +183,10 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
#endif /* CIFS_DEBUG2 */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- list_for_each(tmp1, &ses->tcon_list) {
- tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp2, &tcon->openFileList) {
- cfile = list_entry(tmp2, struct cifsFileInfo,
- tlist);
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
seq_printf(m,
"0x%x 0x%llx 0x%x %d %d %d %pd",
tcon->tid,
@@ -218,7 +213,6 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
- struct list_head *tmp2, *tmp3;
struct mid_q_entry *mid_entry;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
@@ -381,9 +375,7 @@ skip_rdma:
seq_printf(m, "\n\n\tSessions: ");
i = 0;
- list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifs_ses,
- smb_ses_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
i++;
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
@@ -447,9 +439,7 @@ skip_rdma:
else
seq_puts(m, "none\n");
- list_for_each(tmp3, &ses->tcon_list) {
- tcon = list_entry(tmp3, struct cifs_tcon,
- tcon_list);
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
++j;
seq_printf(m, "\n\t%d) ", j);
cifs_debug_tcon(m, tcon);
@@ -473,10 +463,8 @@ skip_rdma:
seq_printf(m, "\n\t\t[NONE]");
seq_puts(m, "\n\n\tMIDs: ");
- spin_lock(&GlobalMid_Lock);
- list_for_each(tmp3, &server->pending_mid_q) {
- mid_entry = list_entry(tmp3, struct mid_q_entry,
- qhead);
+ spin_lock(&server->mid_lock);
+ list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
seq_printf(m, "\n\tState: %d com: %d pid:"
" %d cbdata: %p mid %llu\n",
mid_entry->mid_state,
@@ -485,7 +473,7 @@ skip_rdma:
mid_entry->callback_data,
mid_entry->mid);
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
seq_printf(m, "\n--\n");
}
if (c == 0)
@@ -504,7 +492,6 @@ static ssize_t cifs_stats_proc_write(struct file *file,
{
bool bv;
int rc;
- struct list_head *tmp1, *tmp2, *tmp3;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -514,8 +501,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
#ifdef CONFIG_CIFS_STATS2
int i;
- atomic_set(&totBufAllocCount, 0);
- atomic_set(&totSmBufAllocCount, 0);
+ atomic_set(&total_buf_alloc_count, 0);
+ atomic_set(&total_small_buf_alloc_count, 0);
#endif /* CONFIG_CIFS_STATS2 */
atomic_set(&tcpSesReconnectCount, 0);
atomic_set(&tconInfoReconnectCount, 0);
@@ -525,9 +512,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
GlobalCurrentXid = 0;
spin_unlock(&GlobalMid_Lock);
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp1, &cifs_tcp_ses_list) {
- server = list_entry(tmp1, struct TCP_Server_Info,
- tcp_ses_list);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
server->max_in_flight = 0;
#ifdef CONFIG_CIFS_STATS2
for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) {
@@ -538,13 +523,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
server->fastest_cmd[0] = 0;
}
#endif /* CONFIG_CIFS_STATS2 */
- list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifs_ses,
- smb_ses_list);
- list_for_each(tmp3, &ses->tcon_list) {
- tcon = list_entry(tmp3,
- struct cifs_tcon,
- tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
atomic_set(&tcon->num_smbs_sent, 0);
spin_lock(&tcon->stat_lock);
tcon->bytes_read = 0;
@@ -569,7 +549,6 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CIFS_STATS2
int j;
#endif /* STATS2 */
- struct list_head *tmp2, *tmp3;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -579,17 +558,17 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
seq_printf(m, "Share (unique mount targets): %d\n",
tconInfoAllocCount.counter);
seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n",
- bufAllocCount.counter,
+ buf_alloc_count.counter,
cifs_min_rcv + tcpSesAllocCount.counter);
seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n",
- smBufAllocCount.counter, cifs_min_small);
+ small_buf_alloc_count.counter, cifs_min_small);
#ifdef CONFIG_CIFS_STATS2
seq_printf(m, "Total Large %d Small %d Allocations\n",
- atomic_read(&totBufAllocCount),
- atomic_read(&totSmBufAllocCount));
+ atomic_read(&total_buf_alloc_count),
+ atomic_read(&total_small_buf_alloc_count));
#endif /* CONFIG_CIFS_STATS2 */
- seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
+ seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&mid_count));
seq_printf(m,
"\n%d session %d share reconnects\n",
tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
@@ -619,13 +598,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
atomic_read(&server->smb2slowcmd[j]),
server->hostname, j);
#endif /* STATS2 */
- list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifs_ses,
- smb_ses_list);
- list_for_each(tmp3, &ses->tcon_list) {
- tcon = list_entry(tmp3,
- struct cifs_tcon,
- tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
i++;
seq_printf(m, "\n%d) %s", i, tcon->treeName);
if (tcon->need_reconnect)
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index bf861fef2f0c..fa480d62f313 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1379,6 +1379,7 @@ chown_chgrp_exit:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen,
u32 __maybe_unused unused)
@@ -1512,6 +1513,7 @@ out:
cifs_put_tlink(tlink);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */
int
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 663cb9db4908..46f5718754f9 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -32,10 +32,9 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
int rc;
struct kvec *iov = rqst->rq_iov;
int n_vec = rqst->rq_nvec;
- int is_smb2 = server->vals->header_preamble_size == 0;
/* iov[0] is actual data and not the rfc1002 length for SMB2+ */
- if (is_smb2) {
+ if (!is_smb1(server)) {
if (iov[0].iov_len <= 4)
return -EIO;
i = 0;
@@ -141,13 +140,13 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
if ((cifs_pdu == NULL) || (server == NULL))
return -EINVAL;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
server->tcpStatus == CifsNeedNegotiate) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return rc;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (!server->session_estab) {
memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8f2e003e0590..8042d7280dec 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -46,6 +46,7 @@
#include "netlink.h"
#endif
#include "fs_context.h"
+#include "cached_dir.h"
/*
* DOS dates from 1980/1/1 through 2107/12/31
@@ -68,6 +69,34 @@ bool enable_negotiate_signing; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
+
+/*
+ * Global transaction id (XID) information
+ */
+unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
+unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
+unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
+spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */
+
+/*
+ * Global counters, updated atomically
+ */
+atomic_t sesInfoAllocCount;
+atomic_t tconInfoAllocCount;
+atomic_t tcpSesNextId;
+atomic_t tcpSesAllocCount;
+atomic_t tcpSesReconnectCount;
+atomic_t tconInfoReconnectCount;
+
+atomic_t mid_count;
+atomic_t buf_alloc_count;
+atomic_t small_buf_alloc_count;
+#ifdef CONFIG_CIFS_STATS2
+atomic_t total_buf_alloc_count;
+atomic_t total_small_buf_alloc_count;
+#endif/* STATS2 */
+struct list_head cifs_tcp_ses_list;
+spinlock_t cifs_tcp_ses_lock;
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, uint, 0444);
@@ -255,30 +284,13 @@ out_no_root:
static void cifs_kill_sb(struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifs_tcon *tcon;
- struct cached_fid *cfid;
- struct rb_root *root = &cifs_sb->tlink_tree;
- struct rb_node *node;
- struct tcon_link *tlink;
/*
* We ned to release all dentries for the cached directories
* before we kill the sb.
*/
if (cifs_sb->root) {
- for (node = rb_first(root); node; node = rb_next(node)) {
- tlink = rb_entry(node, struct tcon_link, tl_rbnode);
- tcon = tlink_tcon(tlink);
- if (IS_ERR(tcon))
- continue;
- cfid = &tcon->crfid;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->dentry) {
- dput(cfid->dentry);
- cfid->dentry = NULL;
- }
- mutex_unlock(&cfid->fid_mutex);
- }
+ close_all_cached_dirs(cifs_sb);
/* finally release root dentry */
dput(cifs_sb->root);
@@ -681,6 +693,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",acdirmax=%lu", cifs_sb->ctx->acdirmax / HZ);
seq_printf(s, ",acregmax=%lu", cifs_sb->ctx->acregmax / HZ);
}
+ seq_printf(s, ",closetimeo=%lu", cifs_sb->ctx->closetimeo / HZ);
if (tcon->ses->chan_max > 1)
seq_printf(s, ",multichannel,max_channels=%zu",
@@ -703,14 +716,17 @@ static void cifs_umount_begin(struct super_block *sb)
tcon = cifs_sb_master_tcon(cifs_sb);
spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) {
/* we have other mounts to same share or we have
already tried to force umount this and woken up
all waiting network requests, nothing to do */
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
return;
} else if (tcon->tc_count == 1)
tcon->status = TID_EXITING;
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
@@ -1232,6 +1248,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
lock_two_nondirectories(target_inode, src_inode);
cifs_dbg(FYI, "about to flush pages\n");
+
+ rc = filemap_write_and_wait_range(src_inode->i_mapping, off,
+ off + len - 1);
+ if (rc)
+ goto out;
+
/* should we flush first and last page first */
truncate_inode_pages(&target_inode->i_data, 0);
@@ -1537,8 +1559,7 @@ cifs_destroy_request_bufs(void)
kmem_cache_destroy(cifs_sm_req_cachep);
}
-static int
-cifs_init_mids(void)
+static int init_mids(void)
{
cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids",
sizeof(struct mid_q_entry), 0,
@@ -1556,8 +1577,7 @@ cifs_init_mids(void)
return 0;
}
-static void
-cifs_destroy_mids(void)
+static void destroy_mids(void)
{
mempool_destroy(cifs_mid_poolp);
kmem_cache_destroy(cifs_mid_cachep);
@@ -1579,11 +1599,11 @@ init_cifs(void)
atomic_set(&tcpSesReconnectCount, 0);
atomic_set(&tconInfoReconnectCount, 0);
- atomic_set(&bufAllocCount, 0);
- atomic_set(&smBufAllocCount, 0);
+ atomic_set(&buf_alloc_count, 0);
+ atomic_set(&small_buf_alloc_count, 0);
#ifdef CONFIG_CIFS_STATS2
- atomic_set(&totBufAllocCount, 0);
- atomic_set(&totSmBufAllocCount, 0);
+ atomic_set(&total_buf_alloc_count, 0);
+ atomic_set(&total_small_buf_alloc_count, 0);
if (slow_rsp_threshold < 1)
cifs_dbg(FYI, "slow_response_threshold msgs disabled\n");
else if (slow_rsp_threshold > 32767)
@@ -1591,7 +1611,7 @@ init_cifs(void)
"slow response threshold set higher than recommended (0 to 32767)\n");
#endif /* CONFIG_CIFS_STATS2 */
- atomic_set(&midCount, 0);
+ atomic_set(&mid_count, 0);
GlobalCurrentXid = 0;
GlobalTotalActiveXid = 0;
GlobalMaxActiveXid = 0;
@@ -1654,7 +1674,7 @@ init_cifs(void)
if (rc)
goto out_destroy_deferredclose_wq;
- rc = cifs_init_mids();
+ rc = init_mids();
if (rc)
goto out_destroy_inodecache;
@@ -1711,7 +1731,7 @@ out_destroy_request_bufs:
#endif
cifs_destroy_request_bufs();
out_destroy_mids:
- cifs_destroy_mids();
+ destroy_mids();
out_destroy_inodecache:
cifs_destroy_inodecache();
out_destroy_deferredclose_wq:
@@ -1747,7 +1767,7 @@ exit_cifs(void)
dfs_cache_destroy();
#endif
cifs_destroy_request_bufs();
- cifs_destroy_mids();
+ destroy_mids();
cifs_destroy_inodecache();
destroy_workqueue(deferredclose_wq);
destroy_workqueue(cifsoplockd_wq);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b17be47a8e59..81f4c15936d0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 37
-#define CIFS_VERSION "2.37"
+#define SMB3_PRODUCT_BUILD 38
+#define CIFS_VERSION "2.38"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a643c84ff1e9..ae7f571a7dba 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -417,7 +417,7 @@ struct smb_version_operations {
int (*close_dir)(const unsigned int, struct cifs_tcon *,
struct cifs_fid *);
/* calculate a size of SMB message */
- unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
+ unsigned int (*calc_smb_size)(void *buf);
/* check for STATUS_PENDING and process the response if yes */
bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server);
/* check for STATUS_NETWORK_SESSION_EXPIRED */
@@ -557,6 +557,8 @@ struct smb_version_values {
#define HEADER_SIZE(server) (server->vals->header_size)
#define MAX_HEADER_SIZE(server) (server->vals->max_header_size)
+#define HEADER_PREAMBLE_SIZE(server) (server->vals->header_preamble_size)
+#define MID_HEADER_SIZE(server) (HEADER_SIZE(server) - 1 - HEADER_PREAMBLE_SIZE(server))
/**
* CIFS superblock mount flags (mnt_cifs_flags) to consider when
@@ -605,6 +607,7 @@ inc_rfc1001_len(void *buf, int count)
struct TCP_Server_Info {
struct list_head tcp_ses_list;
struct list_head smb_ses_list;
+ spinlock_t srv_lock; /* protect anything here that is not protected */
__u64 conn_id; /* connection identifier (useful for debugging) */
int srv_count; /* reference counter */
/* 15 character server name + 0x20 16th byte indicating type = srv */
@@ -622,6 +625,7 @@ struct TCP_Server_Info {
#endif
wait_queue_head_t response_q;
wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
+ spinlock_t mid_lock; /* protect mid queue and it's entries */
struct list_head pending_mid_q;
bool noblocksnd; /* use blocking sendmsg */
bool noautotune; /* do not autotune send buf sizes */
@@ -748,6 +752,11 @@ struct TCP_Server_Info {
#endif
};
+static inline bool is_smb1(struct TCP_Server_Info *server)
+{
+ return HEADER_PREAMBLE_SIZE(server) != 0;
+}
+
static inline void cifs_server_lock(struct TCP_Server_Info *server)
{
unsigned int nofs_flag = memalloc_nofs_save();
@@ -1008,6 +1017,7 @@ struct cifs_ses {
struct list_head rlist; /* reconnect list */
struct list_head tcon_list;
struct cifs_tcon *tcon_ipc;
+ spinlock_t ses_lock; /* protect anything here that is not protected */
struct mutex session_mutex;
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
@@ -1125,42 +1135,6 @@ struct cifs_fattr {
u32 cf_cifstag;
};
-struct cached_dirent {
- struct list_head entry;
- char *name;
- int namelen;
- loff_t pos;
-
- struct cifs_fattr fattr;
-};
-
-struct cached_dirents {
- bool is_valid:1;
- bool is_failed:1;
- struct dir_context *ctx; /*
- * Only used to make sure we only take entries
- * from a single context. Never dereferenced.
- */
- struct mutex de_mutex;
- int pos; /* Expected ctx->pos */
- struct list_head entries;
-};
-
-struct cached_fid {
- bool is_valid:1; /* Do we have a useable root fid */
- bool file_all_info_is_valid:1;
- bool has_lease:1;
- unsigned long time; /* jiffies of when lease was taken */
- struct kref refcount;
- struct cifs_fid *fid;
- struct mutex fid_mutex;
- struct cifs_tcon *tcon;
- struct dentry *dentry;
- struct work_struct lease_break;
- struct smb2_file_all_info file_all_info;
- struct cached_dirents dirents;
-};
-
/*
* there is one of these for each connection to a resource on a particular
* session
@@ -1169,6 +1143,7 @@ struct cifs_tcon {
struct list_head tcon_list;
int tc_count;
struct list_head rlist; /* reconnect list */
+ spinlock_t tc_lock; /* protect anything here that is not protected */
atomic_t num_local_opens; /* num of all opens including disconnected */
atomic_t num_remote_opens; /* num of all network opens on server */
struct list_head openFileList;
@@ -1253,7 +1228,7 @@ struct cifs_tcon {
struct fscache_volume *fscache; /* cookie for share */
#endif
struct list_head pending_opens; /* list of incomplete opens */
- struct cached_fid crfid; /* Cached root fid */
+ struct cached_fid *cfid; /* Cached root fid */
/* BB add field for back pointer to sb struct(s)? */
#ifdef CONFIG_CIFS_DFS_UPCALL
struct list_head ulist; /* cache update list */
@@ -1899,33 +1874,78 @@ require use of the stronger protocol */
*/
/****************************************************************************
- * Locking notes. All updates to global variables and lists should be
- * protected by spinlocks or semaphores.
+ * Here are all the locks (spinlock, mutex, semaphore) in cifs.ko, arranged according
+ * to the locking order. i.e. if two locks are to be held together, the lock that
+ * appears higher in this list needs to be taken before the other.
+ *
+ * If you hold a lock that is lower in this list, and you need to take a higher lock
+ * (or if you think that one of the functions that you're calling may need to), first
+ * drop the lock you hold, pick up the higher lock, then the lower one. This will
+ * ensure that locks are picked up only in one direction in the below table
+ * (top to bottom).
*
- * Spinlocks
- * ---------
- * GlobalMid_Lock protects:
- * list operations on pending_mid_q and oplockQ
- * updates to XID counters, multiplex id and SMB sequence numbers
- * list operations on global DnotifyReqList
- * updates to ses->status and TCP_Server_Info->tcpStatus
- * updates to server->CurrentMid
- * tcp_ses_lock protects:
- * list operations on tcp and SMB session lists
- * tcon->open_file_lock protects the list of open files hanging off the tcon
- * inode->open_file_lock protects the openFileList hanging off the inode
- * cfile->file_info_lock protects counters and fields in cifs file struct
- * f_owner.lock protects certain per file struct operations
- * mapping->page_lock protects certain per page operations
+ * Also, if you expect a function to be called with a lock held, explicitly document
+ * this in the comments on top of your function definition.
*
- * Note that the cifs_tcon.open_file_lock should be taken before
- * not after the cifsInodeInfo.open_file_lock
+ * And also, try to keep the critical sections (lock hold time) to be as minimal as
+ * possible. Blocking / calling other functions with a lock held always increase
+ * the risk of a possible deadlock.
*
- * Semaphores
- * ----------
- * cifsInodeInfo->lock_sem protects:
- * the list of locks held by the inode
+ * Following this rule will avoid unnecessary deadlocks, which can get really hard to
+ * debug. Also, any new lock that you introduce, please add to this list in the correct
+ * order.
*
+ * Please populate this list whenever you introduce new locks in your changes. Or in
+ * case I've missed some existing locks. Please ensure that it's added in the list
+ * based on the locking order expected.
+ *
+ * =====================================================================================
+ * Lock Protects Initialization fn
+ * =====================================================================================
+ * vol_list_lock
+ * vol_info->ctx_lock vol_info->ctx
+ * cifs_sb_info->tlink_tree_lock cifs_sb_info->tlink_tree cifs_setup_cifs_sb
+ * TCP_Server_Info-> TCP_Server_Info cifs_get_tcp_session
+ * reconnect_mutex
+ * TCP_Server_Info->srv_mutex TCP_Server_Info cifs_get_tcp_session
+ * cifs_ses->session_mutex cifs_ses sesInfoAlloc
+ * cifs_tcon
+ * cifs_tcon->open_file_lock cifs_tcon->openFileList tconInfoAlloc
+ * cifs_tcon->pending_opens
+ * cifs_tcon->stat_lock cifs_tcon->bytes_read tconInfoAlloc
+ * cifs_tcon->bytes_written
+ * cifs_tcp_ses_lock cifs_tcp_ses_list sesInfoAlloc
+ * GlobalMid_Lock GlobalMaxActiveXid init_cifs
+ * GlobalCurrentXid
+ * GlobalTotalActiveXid
+ * TCP_Server_Info->srv_lock (anything in struct not protected by another lock and can change)
+ * TCP_Server_Info->mid_lock TCP_Server_Info->pending_mid_q cifs_get_tcp_session
+ * ->CurrentMid
+ * (any changes in mid_q_entry fields)
+ * TCP_Server_Info->req_lock TCP_Server_Info->in_flight cifs_get_tcp_session
+ * ->credits
+ * ->echo_credits
+ * ->oplock_credits
+ * ->reconnect_instance
+ * cifs_ses->ses_lock (anything that is not protected by another lock and can change)
+ * cifs_ses->iface_lock cifs_ses->iface_list sesInfoAlloc
+ * ->iface_count
+ * ->iface_last_update
+ * cifs_ses->chan_lock cifs_ses->chans
+ * ->chans_need_reconnect
+ * ->chans_in_reconnect
+ * cifs_tcon->tc_lock (anything that is not protected by another lock and can change)
+ * cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode
+ * cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc
+ * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once
+ * ->can_cache_brlcks
+ * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc
+ * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc
+ * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
+ * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
+ * ->invalidHandle initiate_cifs_search
+ * ->oplock_break_cancelled
+ * cifs_aio_ctx->aio_mutex cifs_aio_ctx cifs_aio_ctx_alloc
****************************************************************************/
#ifdef DECLARE_GLOBALS_HERE
@@ -1941,47 +1961,44 @@ require use of the stronger protocol */
* sessions (and from that the tree connections) can be found
* by iterating over cifs_tcp_ses_list
*/
-GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
+extern struct list_head cifs_tcp_ses_list;
/*
* This lock protects the cifs_tcp_ses_list, the list of smb sessions per
* tcp session, and the list of tcon's per smb session. It also protects
- * the reference counters for the server, smb session, and tcon. It also
- * protects some fields in the TCP_Server_Info struct such as dstaddr. Finally,
- * changes to the tcon->tidStatus should be done while holding this lock.
+ * the reference counters for the server, smb session, and tcon.
* generally the locks should be taken in order tcp_ses_lock before
* tcon->open_file_lock and that before file->file_info_lock since the
* structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file
*/
-GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
+extern spinlock_t cifs_tcp_ses_lock;
/*
* Global transaction id (XID) information
*/
-GLOBAL_EXTERN unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
-GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
-GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
-GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
- /* on midQ entries */
+extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
+extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
+extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
+extern spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */
+
/*
* Global counters, updated atomically
*/
-GLOBAL_EXTERN atomic_t sesInfoAllocCount;
-GLOBAL_EXTERN atomic_t tconInfoAllocCount;
-GLOBAL_EXTERN atomic_t tcpSesNextId;
-GLOBAL_EXTERN atomic_t tcpSesAllocCount;
-GLOBAL_EXTERN atomic_t tcpSesReconnectCount;
-GLOBAL_EXTERN atomic_t tconInfoReconnectCount;
+extern atomic_t sesInfoAllocCount;
+extern atomic_t tconInfoAllocCount;
+extern atomic_t tcpSesNextId;
+extern atomic_t tcpSesAllocCount;
+extern atomic_t tcpSesReconnectCount;
+extern atomic_t tconInfoReconnectCount;
/* Various Debug counters */
-GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */
+extern atomic_t buf_alloc_count; /* current number allocated */
+extern atomic_t small_buf_alloc_count;
#ifdef CONFIG_CIFS_STATS2
-GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
-GLOBAL_EXTERN atomic_t totSmBufAllocCount;
+extern atomic_t total_buf_alloc_count; /* total allocated over all time */
+extern atomic_t total_small_buf_alloc_count;
extern unsigned int slow_rsp_threshold; /* number of secs before logging */
#endif
-GLOBAL_EXTERN atomic_t smBufAllocCount;
-GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
extern bool enable_oplocks; /* enable or disable oplocks */
@@ -1998,6 +2015,7 @@ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
extern unsigned int cifs_min_small; /* min size of small buf pool */
extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */
+extern atomic_t mid_count;
void cifs_oplock_break(struct work_struct *work);
void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
@@ -2085,9 +2103,9 @@ static inline bool cifs_is_referral_server(struct cifs_tcon *tcon,
return is_tcon_dfs(tcon) || (ref && (ref->flags & DFSREF_REFERRAL_SERVER));
}
-static inline u64 cifs_flock_len(struct file_lock *fl)
+static inline u64 cifs_flock_len(const struct file_lock *fl)
{
- return fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1;
+ return (u64)fl->fl_end - fl->fl_start + 1;
}
static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d59aebefa71c..3bc94bcc7177 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,12 +78,8 @@ extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
extern char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath, const struct dfs_info3_param *ref,
char **devname);
-/* extern void renew_parental_timestamps(struct dentry *direntry);*/
-extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
- struct TCP_Server_Info *server);
-extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
-extern void cifs_delete_mid(struct mid_q_entry *mid);
-extern void cifs_mid_q_entry_release(struct mid_q_entry *midEntry);
+extern void delete_mid(struct mid_q_entry *mid);
+extern void release_mid(struct mid_q_entry *mid);
extern void cifs_wake_up_task(struct mid_q_entry *mid);
extern int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
@@ -155,7 +151,7 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
struct cifsFileInfo **ret_file);
-extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
+extern unsigned int smbCalcSize(void *buf);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
@@ -521,6 +517,7 @@ extern int generate_smb30signingkey(struct cifs_ses *ses,
extern int generate_smb311signingkey(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
extern int CIFSSMBCopy(unsigned int xid,
struct cifs_tcon *source_tcon,
const char *fromName,
@@ -551,6 +548,7 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage, int remap_special_chars);
extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr);
extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
@@ -599,7 +597,6 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
void cifs_aio_ctx_release(struct kref *refcount);
int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
-void smb2_cached_lease_break(struct work_struct *work);
int cifs_alloc_hash(const char *name, struct crypto_shash **shash,
struct sdesc **sdesc);
diff --git a/fs/cifs/cifsroot.c b/fs/cifs/cifsroot.c
index 9e91a5a40aae..56ec1b233f52 100644
--- a/fs/cifs/cifsroot.c
+++ b/fs/cifs/cifsroot.c
@@ -59,7 +59,7 @@ static int __init cifs_root_setup(char *line)
pr_err("Root-CIFS: UNC path too long\n");
return 1;
}
- strlcpy(root_dev, line, len);
+ strscpy(root_dev, line, len);
srvaddr = parse_srvaddr(&line[2], s);
if (*s) {
int n = snprintf(root_opts,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6371b9eebdad..7aa91e272027 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -29,7 +29,6 @@
#include "cifsproto.h"
#include "cifs_unicode.h"
#include "cifs_debug.h"
-#include "smb2proto.h"
#include "fscache.h"
#include "smbdirect.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -62,52 +61,6 @@ static struct {
#define CIFS_NUM_PROT 1
#endif /* CIFS_POSIX */
-/*
- * Mark as invalid, all open files on tree connections since they
- * were closed when session to server was lost.
- */
-void
-cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
-{
- struct cifsFileInfo *open_file = NULL;
- struct list_head *tmp;
- struct list_head *tmp1;
-
- /* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
- if ((tcon->ses->ses_status != SES_GOOD) || (tcon->status != TID_NEED_RECON)) {
- spin_unlock(&cifs_tcp_ses_lock);
- return;
- }
- tcon->status = TID_IN_FILES_INVALIDATE;
- spin_unlock(&cifs_tcp_ses_lock);
-
- /* list all files open on tree connection and mark them invalid */
- spin_lock(&tcon->open_file_lock);
- list_for_each_safe(tmp, tmp1, &tcon->openFileList) {
- open_file = list_entry(tmp, struct cifsFileInfo, tlist);
- open_file->invalidHandle = true;
- open_file->oplock_break_cancelled = true;
- }
- spin_unlock(&tcon->open_file_lock);
-
- mutex_lock(&tcon->crfid.fid_mutex);
- tcon->crfid.is_valid = false;
- /* cached handle is not valid, so SMB2_CLOSE won't be sent below */
- close_cached_dir_lease_locked(&tcon->crfid);
- memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid));
- mutex_unlock(&tcon->crfid.fid_mutex);
-
- spin_lock(&cifs_tcp_ses_lock);
- if (tcon->status == TID_IN_FILES_INVALIDATE)
- tcon->status = TID_NEED_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
-
- /*
- * BB Add call to invalidate_inodes(sb) for all superblocks mounted
- * to this tcon.
- */
-}
/* reconnect the socket, tcon, and smb session if needed */
static int
@@ -134,18 +87,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
* only tree disconnect, open, and write, (and ulogoff which does not
* have tcon) are allowed as we start force umount
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_EXITING) {
if (smb_command != SMB_COM_WRITE_ANDX &&
smb_command != SMB_COM_OPEN_ANDX &&
smb_command != SMB_COM_TREE_DISCONNECT) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
cifs_dbg(FYI, "can not send cmd %d while umounting\n",
smb_command);
return -ENODEV;
}
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
retries = server->nr_targets;
@@ -165,12 +118,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
}
/* are we still trying to reconnect? */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
break;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (retries && --retries)
continue;
@@ -201,13 +154,13 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = -EHOSTDOWN;
goto out;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/*
* need to prevent multiple threads trying to simultaneously
@@ -457,52 +410,6 @@ decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
return 0;
}
-int
-cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
-{
- bool srv_sign_required = server->sec_mode & server->vals->signing_required;
- bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
- bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
-
- /*
- * Is signing required by mnt options? If not then check
- * global_secflags to see if it is there.
- */
- if (!mnt_sign_required)
- mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
- CIFSSEC_MUST_SIGN);
-
- /*
- * If signing is required then it's automatically enabled too,
- * otherwise, check to see if the secflags allow it.
- */
- mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
- (global_secflags & CIFSSEC_MAY_SIGN);
-
- /* If server requires signing, does client allow it? */
- if (srv_sign_required) {
- if (!mnt_sign_enabled) {
- cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!\n");
- return -ENOTSUPP;
- }
- server->sign = true;
- }
-
- /* If client requires signing, does server allow it? */
- if (mnt_sign_required) {
- if (!srv_sign_enabled) {
- cifs_dbg(VFS, "Server does not support signing!\n");
- return -ENOTSUPP;
- }
- server->sign = true;
- }
-
- if (cifs_rdma_enabled(server) && server->sign)
- cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled\n");
-
- return 0;
-}
-
static bool
should_set_ext_sec_flag(enum securityEnum sectype)
{
@@ -684,7 +591,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = mid->callback_data;
struct cifs_credits credits = { .value = 1, .instance = 0 };
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, CIFS_ECHO_OP);
}
@@ -1379,184 +1286,6 @@ openRetry:
return rc;
}
-/*
- * Discard any remaining data in the current SMB. To do this, we borrow the
- * current bigbuf.
- */
-int
-cifs_discard_remaining_data(struct TCP_Server_Info *server)
-{
- unsigned int rfclen = server->pdu_size;
- int remaining = rfclen + server->vals->header_preamble_size -
- server->total_read;
-
- while (remaining > 0) {
- int length;
-
- length = cifs_discard_from_socket(server,
- min_t(size_t, remaining,
- CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
- if (length < 0)
- return length;
- server->total_read += length;
- remaining -= length;
- }
-
- return 0;
-}
-
-static int
-__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
- bool malformed)
-{
- int length;
-
- length = cifs_discard_remaining_data(server);
- dequeue_mid(mid, malformed);
- mid->resp_buf = server->smallbuf;
- server->smallbuf = NULL;
- return length;
-}
-
-static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
-{
- struct cifs_readdata *rdata = mid->callback_data;
-
- return __cifs_readv_discard(server, mid, rdata->result);
-}
-
-int
-cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
-{
- int length, len;
- unsigned int data_offset, data_len;
- struct cifs_readdata *rdata = mid->callback_data;
- char *buf = server->smallbuf;
- unsigned int buflen = server->pdu_size +
- server->vals->header_preamble_size;
- bool use_rdma_mr = false;
-
- cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
- __func__, mid->mid, rdata->offset, rdata->bytes);
-
- /*
- * read the rest of READ_RSP header (sans Data array), or whatever we
- * can if there's not enough data. At this point, we've read down to
- * the Mid.
- */
- len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
- HEADER_SIZE(server) + 1;
-
- length = cifs_read_from_socket(server,
- buf + HEADER_SIZE(server) - 1, len);
- if (length < 0)
- return length;
- server->total_read += length;
-
- if (server->ops->is_session_expired &&
- server->ops->is_session_expired(buf)) {
- cifs_reconnect(server, true);
- return -1;
- }
-
- if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server)) {
- cifs_discard_remaining_data(server);
- return -1;
- }
-
- /* set up first two iov for signature check and to get credits */
- rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = server->vals->header_preamble_size;
- rdata->iov[1].iov_base = buf + server->vals->header_preamble_size;
- rdata->iov[1].iov_len =
- server->total_read - server->vals->header_preamble_size;
- cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
- rdata->iov[0].iov_base, rdata->iov[0].iov_len);
- cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
- rdata->iov[1].iov_base, rdata->iov[1].iov_len);
-
- /* Was the SMB read successful? */
- rdata->result = server->ops->map_error(buf, false);
- if (rdata->result != 0) {
- cifs_dbg(FYI, "%s: server returned error %d\n",
- __func__, rdata->result);
- /* normal error on read response */
- return __cifs_readv_discard(server, mid, false);
- }
-
- /* Is there enough to get to the rest of the READ_RSP header? */
- if (server->total_read < server->vals->read_rsp_size) {
- cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
- __func__, server->total_read,
- server->vals->read_rsp_size);
- rdata->result = -EIO;
- return cifs_readv_discard(server, mid);
- }
-
- data_offset = server->ops->read_data_offset(buf) +
- server->vals->header_preamble_size;
- if (data_offset < server->total_read) {
- /*
- * win2k8 sometimes sends an offset of 0 when the read
- * is beyond the EOF. Treat it as if the data starts just after
- * the header.
- */
- cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
- __func__, data_offset);
- data_offset = server->total_read;
- } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
- /* data_offset is beyond the end of smallbuf */
- cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
- __func__, data_offset);
- rdata->result = -EIO;
- return cifs_readv_discard(server, mid);
- }
-
- cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
- __func__, server->total_read, data_offset);
-
- len = data_offset - server->total_read;
- if (len > 0) {
- /* read any junk before data into the rest of smallbuf */
- length = cifs_read_from_socket(server,
- buf + server->total_read, len);
- if (length < 0)
- return length;
- server->total_read += length;
- }
-
- /* how much data is in the response? */
-#ifdef CONFIG_CIFS_SMB_DIRECT
- use_rdma_mr = rdata->mr;
-#endif
- data_len = server->ops->read_data_length(buf, use_rdma_mr);
- if (!use_rdma_mr && (data_offset + data_len > buflen)) {
- /* data_len is corrupt -- discard frame */
- rdata->result = -EIO;
- return cifs_readv_discard(server, mid);
- }
-
- length = rdata->read_into_pages(server, rdata, data_len);
- if (length < 0)
- return length;
-
- server->total_read += length;
-
- cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
- server->total_read, buflen, data_len);
-
- /* discard anything left over */
- if (server->total_read < buflen)
- return cifs_readv_discard(server, mid);
-
- dequeue_mid(mid, false);
- mid->resp_buf = server->smallbuf;
- server->smallbuf = NULL;
- return length;
-}
-
static void
cifs_readv_callback(struct mid_q_entry *mid)
{
@@ -1607,7 +1336,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &rdata->work);
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, 0);
}
@@ -1909,183 +1638,6 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
return rc;
}
-void
-cifs_writedata_release(struct kref *refcount)
-{
- struct cifs_writedata *wdata = container_of(refcount,
- struct cifs_writedata, refcount);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (wdata->mr) {
- smbd_deregister_mr(wdata->mr);
- wdata->mr = NULL;
- }
-#endif
-
- if (wdata->cfile)
- cifsFileInfo_put(wdata->cfile);
-
- kvfree(wdata->pages);
- kfree(wdata);
-}
-
-/*
- * Write failed with a retryable error. Resend the write request. It's also
- * possible that the page was redirtied so re-clean the page.
- */
-static void
-cifs_writev_requeue(struct cifs_writedata *wdata)
-{
- int i, rc = 0;
- struct inode *inode = d_inode(wdata->cfile->dentry);
- struct TCP_Server_Info *server;
- unsigned int rest_len;
-
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- i = 0;
- rest_len = wdata->bytes;
- do {
- struct cifs_writedata *wdata2;
- unsigned int j, nr_pages, wsize, tailsz, cur_len;
-
- wsize = server->ops->wp_retry_size(inode);
- if (wsize < rest_len) {
- nr_pages = wsize / PAGE_SIZE;
- if (!nr_pages) {
- rc = -ENOTSUPP;
- break;
- }
- cur_len = nr_pages * PAGE_SIZE;
- tailsz = PAGE_SIZE;
- } else {
- nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE);
- cur_len = rest_len;
- tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE;
- }
-
- wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
- if (!wdata2) {
- rc = -ENOMEM;
- break;
- }
-
- for (j = 0; j < nr_pages; j++) {
- wdata2->pages[j] = wdata->pages[i + j];
- lock_page(wdata2->pages[j]);
- clear_page_dirty_for_io(wdata2->pages[j]);
- }
-
- wdata2->sync_mode = wdata->sync_mode;
- wdata2->nr_pages = nr_pages;
- wdata2->offset = page_offset(wdata2->pages[0]);
- wdata2->pagesz = PAGE_SIZE;
- wdata2->tailsz = tailsz;
- wdata2->bytes = cur_len;
-
- rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
- &wdata2->cfile);
- if (!wdata2->cfile) {
- cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
- rc);
- if (!is_retryable_error(rc))
- rc = -EBADF;
- } else {
- wdata2->pid = wdata2->cfile->pid;
- rc = server->ops->async_writev(wdata2,
- cifs_writedata_release);
- }
-
- for (j = 0; j < nr_pages; j++) {
- unlock_page(wdata2->pages[j]);
- if (rc != 0 && !is_retryable_error(rc)) {
- SetPageError(wdata2->pages[j]);
- end_page_writeback(wdata2->pages[j]);
- put_page(wdata2->pages[j]);
- }
- }
-
- kref_put(&wdata2->refcount, cifs_writedata_release);
- if (rc) {
- if (is_retryable_error(rc))
- continue;
- i += nr_pages;
- break;
- }
-
- rest_len -= cur_len;
- i += nr_pages;
- } while (i < wdata->nr_pages);
-
- /* cleanup remaining pages from the original wdata */
- for (; i < wdata->nr_pages; i++) {
- SetPageError(wdata->pages[i]);
- end_page_writeback(wdata->pages[i]);
- put_page(wdata->pages[i]);
- }
-
- if (rc != 0 && !is_retryable_error(rc))
- mapping_set_error(inode->i_mapping, rc);
- kref_put(&wdata->refcount, cifs_writedata_release);
-}
-
-void
-cifs_writev_complete(struct work_struct *work)
-{
- struct cifs_writedata *wdata = container_of(work,
- struct cifs_writedata, work);
- struct inode *inode = d_inode(wdata->cfile->dentry);
- int i = 0;
-
- if (wdata->result == 0) {
- spin_lock(&inode->i_lock);
- cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
- spin_unlock(&inode->i_lock);
- cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
- wdata->bytes);
- } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
- return cifs_writev_requeue(wdata);
-
- for (i = 0; i < wdata->nr_pages; i++) {
- struct page *page = wdata->pages[i];
- if (wdata->result == -EAGAIN)
- __set_page_dirty_nobuffers(page);
- else if (wdata->result < 0)
- SetPageError(page);
- end_page_writeback(page);
- cifs_readpage_to_fscache(inode, page);
- put_page(page);
- }
- if (wdata->result != -EAGAIN)
- mapping_set_error(inode->i_mapping, wdata->result);
- kref_put(&wdata->refcount, cifs_writedata_release);
-}
-
-struct cifs_writedata *
-cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
-{
- struct page **pages =
- kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (pages)
- return cifs_writedata_direct_alloc(pages, complete);
-
- return NULL;
-}
-
-struct cifs_writedata *
-cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
-{
- struct cifs_writedata *wdata;
-
- wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
- if (wdata != NULL) {
- wdata->pages = pages;
- kref_init(&wdata->refcount);
- INIT_LIST_HEAD(&wdata->list);
- init_completion(&wdata->done);
- INIT_WORK(&wdata->work, complete);
- }
- return wdata;
-}
-
/*
* Check the mid_state and signature on received buffer (if any), and queue the
* workqueue completion task.
@@ -2132,7 +1684,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &wdata->work);
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(tcon->ses->server, &credits, 0);
}
@@ -3660,7 +3212,6 @@ setACLerrorExit:
return rc;
}
-/* BB fix tabs in this function FIXME BB */
int
CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask)
@@ -3677,7 +3228,7 @@ CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
GetExtAttrRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ (void **) &pSMBr);
if (rc)
return rc;
@@ -3723,7 +3274,7 @@ GetExtAttrRetry:
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
__u16 count = le16_to_cpu(pSMBr->t2.DataCount);
struct file_chattr_info *pfinfo;
- /* BB Do we need a cast or hash here ? */
+
if (count != 16) {
cifs_dbg(FYI, "Invalid size ret in GetExtAttr\n");
rc = -EIO;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 386bb523c69e..a0a06b6f252b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -119,10 +119,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
goto requeue_resolve;
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
strlen(ipaddr));
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
kfree(ipaddr);
/* rc == 1 means success here */
@@ -205,17 +205,22 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
/* If server is a channel, select the primary channel */
pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&pserver->srv_lock);
if (!all_channels) {
pserver->tcpStatus = CifsNeedReconnect;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&pserver->srv_lock);
return;
}
+ spin_unlock(&pserver->srv_lock);
+ spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
spin_lock(&ses->chan_lock);
- for (i = 0; i < ses->chan_count; i++)
+ for (i = 0; i < ses->chan_count; i++) {
+ spin_lock(&ses->chans[i].server->srv_lock);
ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&ses->chans[i].server->srv_lock);
+ }
spin_unlock(&ses->chan_lock);
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -252,17 +257,8 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
/* check if iface is still active */
- if (!cifs_chan_is_iface_active(ses, server)) {
- /*
- * HACK: drop the lock before calling
- * cifs_chan_update_iface to avoid deadlock
- */
- ses->ses_count++;
- spin_unlock(&cifs_tcp_ses_lock);
+ if (!cifs_chan_is_iface_active(ses, server))
cifs_chan_update_iface(ses, server);
- spin_lock(&cifs_tcp_ses_lock);
- ses->ses_count--;
- }
spin_lock(&ses->chan_lock);
if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server))
@@ -323,7 +319,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
/* mark submitted MIDs for retry and issue callback */
INIT_LIST_HEAD(&retry_list);
cifs_dbg(FYI, "%s: moving mids to private list\n", __func__);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) {
kref_get(&mid->refcount);
if (mid->mid_state == MID_REQUEST_SUBMITTED)
@@ -331,14 +327,14 @@ cifs_abort_connection(struct TCP_Server_Info *server)
list_move(&mid->qhead, &retry_list);
mid->mid_flags |= MID_DELETED;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
cifs_server_unlock(server);
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_entry_safe(mid, nmid, &retry_list, qhead) {
list_del_init(&mid->qhead);
mid->callback(mid);
- cifs_mid_q_entry_release(mid);
+ release_mid(mid);
}
if (cifs_rdma_enabled(server)) {
@@ -350,11 +346,11 @@ cifs_abort_connection(struct TCP_Server_Info *server)
static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets)
{
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
server->nr_targets = num_targets;
if (server->tcpStatus == CifsExiting) {
/* the demux thread will exit normally next time through the loop */
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
wake_up(&server->response_q);
return false;
}
@@ -364,7 +360,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num
server->hostname);
server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return true;
}
@@ -414,20 +410,20 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
} else {
atomic_inc(&tcpSesReconnectCount);
set_credits(server, 1);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_swn_reset_server_dstaddr(server);
cifs_server_unlock(server);
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
}
} while (server->tcpStatus == CifsNeedReconnect);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
wake_up(&server->response_q);
return rc;
@@ -541,10 +537,10 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
*/
atomic_inc(&tcpSesReconnectCount);
set_credits(server, 1);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_swn_reset_server_dstaddr(server);
cifs_server_unlock(server);
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -556,11 +552,10 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
dfs_cache_free_tgts(&tl);
/* Need to set up echo worker again once connection has been established */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
-
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
wake_up(&server->response_q);
return rc;
@@ -569,12 +564,12 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
{
/* If tcp session is not an dfs connection, then reconnect to last target server */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (!server->is_dfs_conn) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return __cifs_reconnect(server, mark_smb_session);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
mutex_lock(&server->refpath_lock);
if (!server->origin_fullpath || !server->leaf_fullpath) {
@@ -670,18 +665,18 @@ server_unresponsive(struct TCP_Server_Info *server)
* 65s kernel_recvmsg times out, and we see that we haven't gotten
* a response in >60s.
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if ((server->tcpStatus == CifsGood ||
server->tcpStatus == CifsNeedNegotiate) &&
(!server->ops->can_echo || server->ops->can_echo(server)) &&
time_after(jiffies, server->lstrp + 3 * server->echo_interval)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n",
(3 * server->echo_interval) / HZ);
cifs_reconnect(server, false);
return true;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return false;
}
@@ -726,18 +721,18 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
else
length = sock_recvmsg(server->ssocket, smb_msg, 0);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ESHUTDOWN;
}
if (server->tcpStatus == CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_reconnect(server, false);
return -ECONNABORTED;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (length == -ERESTARTSYS ||
length == -EAGAIN ||
@@ -849,7 +844,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
#ifdef CONFIG_CIFS_STATS2
mid->when_received = jiffies;
#endif
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&mid->server->mid_lock);
if (!malformed)
mid->mid_state = MID_RESPONSE_RECEIVED;
else
@@ -859,12 +854,12 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
* function has finished processing it is a bug.
*/
if (mid->mid_flags & MID_DELETED) {
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&mid->server->mid_lock);
pr_warn_once("trying to dequeue a deleted mid\n");
} else {
list_del_init(&mid->qhead);
mid->mid_flags |= MID_DELETED;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&mid->server->mid_lock);
}
}
@@ -876,7 +871,7 @@ smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
/*
* SMB1 does not use credits.
*/
- if (server->vals->header_preamble_size)
+ if (is_smb1(server))
return 0;
return le16_to_cpu(shdr->CreditRequest);
@@ -903,21 +898,68 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
dequeue_mid(mid, malformed);
}
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+ bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+ bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+ bool mnt_sign_enabled;
+
+ /*
+ * Is signing required by mnt options? If not then check
+ * global_secflags to see if it is there.
+ */
+ if (!mnt_sign_required)
+ mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+ CIFSSEC_MUST_SIGN);
+
+ /*
+ * If signing is required then it's automatically enabled too,
+ * otherwise, check to see if the secflags allow it.
+ */
+ mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
+ (global_secflags & CIFSSEC_MAY_SIGN);
+
+ /* If server requires signing, does client allow it? */
+ if (srv_sign_required) {
+ if (!mnt_sign_enabled) {
+ cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!\n");
+ return -EOPNOTSUPP;
+ }
+ server->sign = true;
+ }
+
+ /* If client requires signing, does server allow it? */
+ if (mnt_sign_required) {
+ if (!srv_sign_enabled) {
+ cifs_dbg(VFS, "Server does not support signing!\n");
+ return -EOPNOTSUPP;
+ }
+ server->sign = true;
+ }
+
+ if (cifs_rdma_enabled(server) && server->sign)
+ cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled\n");
+
+ return 0;
+}
+
+
static void clean_demultiplex_info(struct TCP_Server_Info *server)
{
int length;
/* take it off the list, if it's not already */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
list_del_init(&server->tcp_ses_list);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cancel_delayed_work_sync(&server->echo);
cancel_delayed_work_sync(&server->resolve);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
server->tcpStatus = CifsExiting;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
wake_up_all(&server->response_q);
/* check if we have blocked requests that need to free */
@@ -948,7 +990,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
struct list_head *tmp, *tmp2;
INIT_LIST_HEAD(&dispose_list);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Clearing mid %llu\n", mid_entry->mid);
@@ -957,7 +999,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
list_move(&mid_entry->qhead, &dispose_list);
mid_entry->mid_flags |= MID_DELETED;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
/* now walk dispose list and issue callbacks */
list_for_each_safe(tmp, tmp2, &dispose_list) {
@@ -965,7 +1007,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
- cifs_mid_q_entry_release(mid_entry);
+ release_mid(mid_entry);
}
/* 1/8th of sec is more than enough time for them to exit */
msleep(125);
@@ -1008,7 +1050,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* make sure this will fit in a large buffer */
if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) -
- server->vals->header_preamble_size) {
+ HEADER_PREAMBLE_SIZE(server)) {
cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
cifs_reconnect(server, true);
return -ECONNABORTED;
@@ -1023,8 +1065,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* now read the rest */
length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
- pdu_length - HEADER_SIZE(server) + 1
- + server->vals->header_preamble_size);
+ pdu_length - MID_HEADER_SIZE(server));
if (length < 0)
return length;
@@ -1039,19 +1080,18 @@ int
cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
- int length;
+ int rc;
/*
* We know that we received enough to get to the MID as we
* checked the pdu_length earlier. Now check to see
- * if the rest of the header is OK. We borrow the length
- * var for the rest of the loop to avoid a new stack var.
+ * if the rest of the header is OK.
*
* 48 bytes is enough to display the header and a little bit
* into the payload for debugging purposes.
*/
- length = server->ops->check_message(buf, server->total_read, server);
- if (length != 0)
+ rc = server->ops->check_message(buf, server->total_read, server);
+ if (rc)
cifs_dump_mem("Bad SMB: ", buf,
min_t(unsigned int, server->total_read, 48));
@@ -1066,9 +1106,9 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return -1;
if (!mid)
- return length;
+ return rc;
- handle_mid(mid, server, buf, length);
+ handle_mid(mid, server, buf, rc);
return 0;
}
@@ -1081,7 +1121,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
/*
* SMB1 does not use credits.
*/
- if (server->vals->header_preamble_size)
+ if (is_smb1(server))
return;
if (shdr->CreditRequest) {
@@ -1139,10 +1179,10 @@ cifs_demultiplex_thread(void *p)
if (length < 0)
continue;
- if (server->vals->header_preamble_size == 0)
- server->total_read = 0;
- else
+ if (is_smb1(server))
server->total_read = length;
+ else
+ server->total_read = 0;
/*
* The right amount was read from socket - 4 bytes,
@@ -1157,8 +1197,7 @@ next_pdu:
server->pdu_size = pdu_length;
/* make sure we have enough to get to the MID */
- if (server->pdu_size < HEADER_SIZE(server) - 1 -
- server->vals->header_preamble_size) {
+ if (server->pdu_size < MID_HEADER_SIZE(server)) {
cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n",
server->pdu_size);
cifs_reconnect(server, true);
@@ -1167,9 +1206,8 @@ next_pdu:
/* read down to the MID */
length = cifs_read_from_socket(server,
- buf + server->vals->header_preamble_size,
- HEADER_SIZE(server) - 1
- - server->vals->header_preamble_size);
+ buf + HEADER_PREAMBLE_SIZE(server),
+ MID_HEADER_SIZE(server));
if (length < 0)
continue;
server->total_read += length;
@@ -1205,7 +1243,7 @@ next_pdu:
if (length < 0) {
for (i = 0; i < num_mids; i++)
if (mids[i])
- cifs_mid_q_entry_release(mids[i]);
+ release_mid(mids[i]);
continue;
}
@@ -1232,7 +1270,7 @@ next_pdu:
if (!mids[i]->multiRsp || mids[i]->multiEnd)
mids[i]->callback(mids[i]);
- cifs_mid_q_entry_release(mids[i]);
+ release_mid(mids[i]);
} else if (server->ops->is_oplock_break &&
server->ops->is_oplock_break(bufs[i],
server)) {
@@ -1240,7 +1278,7 @@ next_pdu:
cifs_dbg(FYI, "Received oplock break\n");
} else {
cifs_server_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n",
- atomic_read(&midCount));
+ atomic_read(&mid_count));
cifs_dump_mem("Received Data is: ", bufs[i],
HEADER_SIZE(server));
smb2_add_credits_from_hdr(bufs[i], server);
@@ -1411,6 +1449,7 @@ match_security(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
return true;
}
+/* this function must be called with srv_lock held */
static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
{
struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
@@ -1471,6 +1510,7 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ spin_lock(&server->srv_lock);
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
* DFS failover implementation in cifs_reconnect() requires unique tcp sessions for
@@ -1478,15 +1518,20 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
* shares or even links that may connect to same server but having completely
* different failover targets.
*/
- if (server->is_dfs_conn)
+ if (server->is_dfs_conn) {
+ spin_unlock(&server->srv_lock);
continue;
+ }
#endif
/*
* Skip ses channels since they're only handled in lower layers
* (e.g. cifs_send_recv).
*/
- if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx))
+ if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx)) {
+ spin_unlock(&server->srv_lock);
continue;
+ }
+ spin_unlock(&server->srv_lock);
++server->srv_count;
spin_unlock(&cifs_tcp_ses_lock);
@@ -1534,9 +1579,9 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
else
cancel_delayed_work_sync(&server->reconnect);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
server->tcpStatus = CifsExiting;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_crypto_secmech_release(server);
@@ -1595,8 +1640,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
if (primary_server) {
spin_lock(&cifs_tcp_ses_lock);
++primary_server->srv_count;
- tcp_ses->primary_server = primary_server;
spin_unlock(&cifs_tcp_ses_lock);
+ tcp_ses->primary_server = primary_server;
}
init_waitqueue_head(&tcp_ses->response_q);
init_waitqueue_head(&tcp_ses->request_q);
@@ -1612,6 +1657,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
tcp_ses->lstrp = jiffies;
tcp_ses->compress_algorithm = cpu_to_le16(ctx->compression);
spin_lock_init(&tcp_ses->req_lock);
+ spin_lock_init(&tcp_ses->srv_lock);
+ spin_lock_init(&tcp_ses->mid_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
@@ -1685,9 +1732,9 @@ smbd_connected:
* to the struct since the kernel thread not created yet
* no need to spinlock this update of tcpStatus
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcp_ses->srv_lock);
tcp_ses->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcp_ses->srv_lock);
if ((ctx->max_credits < 20) || (ctx->max_credits > 60000))
tcp_ses->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
@@ -1729,6 +1776,7 @@ out_err:
return ERR_PTR(rc);
}
+/* this function must be called with ses_lock held */
static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
{
if (ctx->sectype != Unspecified &&
@@ -1864,10 +1912,17 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- if (ses->ses_status == SES_EXITING)
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
continue;
- if (!match_session(ses, ctx))
+ }
+ if (!match_session(ses, ctx)) {
+ spin_unlock(&ses->ses_lock);
continue;
+ }
+ spin_unlock(&ses->ses_lock);
+
++ses->ses_count;
spin_unlock(&cifs_tcp_ses_lock);
return ses;
@@ -1882,26 +1937,28 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
unsigned int chan_count;
struct TCP_Server_Info *server = ses->server;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_EXITING) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return;
}
+ spin_unlock(&ses->ses_lock);
cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE");
+ spin_lock(&cifs_tcp_ses_lock);
if (--ses->ses_count > 0) {
spin_unlock(&cifs_tcp_ses_lock);
return;
}
+ spin_unlock(&cifs_tcp_ses_lock);
/* ses_count can never go negative */
WARN_ON(ses->ses_count < 0);
if (ses->ses_status == SES_GOOD)
ses->ses_status = SES_EXITING;
- spin_unlock(&cifs_tcp_ses_lock);
cifs_free_ipc(ses);
@@ -2236,6 +2293,7 @@ get_ses_fail:
return ERR_PTR(rc);
}
+/* this function must be called with tc_lock held */
static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
if (tcon->status == TID_EXITING)
@@ -2258,16 +2316,17 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
static struct cifs_tcon *
cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
{
- struct list_head *tmp;
struct cifs_tcon *tcon;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &ses->tcon_list) {
- tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
-
- if (!match_tcon(tcon, ctx))
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ spin_lock(&tcon->tc_lock);
+ if (!match_tcon(tcon, ctx)) {
+ spin_unlock(&tcon->tc_lock);
continue;
+ }
++tcon->tc_count;
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
return tcon;
}
@@ -2619,6 +2678,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
return 0;
if (old->ctx->acdirmax != new->ctx->acdirmax)
return 0;
+ if (old->ctx->closetimeo != new->ctx->closetimeo)
+ return 0;
return 1;
}
@@ -2644,7 +2705,7 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
int
cifs_match_super(struct super_block *sb, void *data)
{
- struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data;
+ struct cifs_mnt_data *mnt_data = data;
struct smb3_fs_context *ctx;
struct cifs_sb_info *cifs_sb;
struct TCP_Server_Info *tcp_srv;
@@ -2667,6 +2728,9 @@ cifs_match_super(struct super_block *sb, void *data)
ctx = mnt_data->ctx;
+ spin_lock(&tcp_srv->srv_lock);
+ spin_lock(&ses->ses_lock);
+ spin_lock(&tcon->tc_lock);
if (!match_server(tcp_srv, ctx) ||
!match_session(ses, ctx) ||
!match_tcon(tcon, ctx) ||
@@ -2677,6 +2741,10 @@ cifs_match_super(struct super_block *sb, void *data)
rc = compare_mount_options(sb, mnt_data);
out:
+ spin_unlock(&tcon->tc_lock);
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&tcp_srv->srv_lock);
+
spin_unlock(&cifs_tcp_ses_lock);
cifs_put_tlink(tlink);
return rc;
@@ -2954,6 +3022,7 @@ ip_connect(struct TCP_Server_Info *server)
return generic_ip_connect(server);
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
@@ -3059,6 +3128,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
}
}
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
{
@@ -3175,6 +3245,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
if (tcon->posix_extensions)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/* tell server which Unix caps we support */
if (cap_unix(tcon->ses)) {
/*
@@ -3182,16 +3253,17 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
* for just this mount.
*/
reset_cifs_unix_caps(xid, tcon, cifs_sb, ctx);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->ses->server->srv_lock);
if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
(le64_to_cpu(tcon->fsUnixInfo.Capability) &
CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->ses->server->srv_lock);
rc = -EACCES;
goto out;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->ses->server->srv_lock);
} else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
tcon->unix_ext = 0; /* server does not support them */
/* do not care if a following call succeed - informational */
@@ -3273,9 +3345,9 @@ static int mount_get_dfs_conns(struct mount_ctx *mnt_ctx)
rc = mount_get_conns(mnt_ctx);
if (mnt_ctx->server) {
cifs_dbg(FYI, "%s: marking tcp session as a dfs connection\n", __func__);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&mnt_ctx->server->srv_lock);
mnt_ctx->server->is_dfs_conn = true;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&mnt_ctx->server->srv_lock);
}
return rc;
}
@@ -3919,7 +3991,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
}
bcc_ptr += length + 1;
bytes_left -= (length + 1);
- strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
+ strscpy(tcon->treeName, tree, sizeof(tcon->treeName));
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
@@ -3990,28 +4062,28 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
return -ENOSYS;
/* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (!server->ops->need_neg(server) ||
server->tcpStatus != CifsNeedNegotiate) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return 0;
}
server->tcpStatus = CifsInNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = server->ops->negotiate(xid, ses, server);
if (rc == 0) {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
server->tcpStatus = CifsGood;
else
rc = -EHOSTDOWN;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
} else {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
server->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
}
return rc;
@@ -4027,7 +4099,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
bool is_binding = false;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
if (server->dstaddr.ss_family == AF_INET6)
scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
else
@@ -4036,7 +4108,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (ses->ses_status != SES_GOOD &&
ses->ses_status != SES_NEW &&
ses->ses_status != SES_NEED_RECON) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return 0;
}
@@ -4045,7 +4117,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (CIFS_ALL_CHANS_GOOD(ses) ||
cifs_chan_in_reconnect(ses, server)) {
spin_unlock(&ses->chan_lock);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return 0;
}
is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
@@ -4054,7 +4126,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (!is_binding)
ses->ses_status = SES_IN_SETUP;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
if (!is_binding) {
ses->capabilities = server->capabilities;
@@ -4078,22 +4150,22 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_IN_SETUP)
ses->ses_status = SES_NEED_RECON;
spin_lock(&ses->chan_lock);
cifs_chan_clear_in_reconnect(ses, server);
spin_unlock(&ses->chan_lock);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
} else {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_IN_SETUP)
ses->ses_status = SES_GOOD;
spin_lock(&ses->chan_lock);
cifs_chan_clear_in_reconnect(ses, server);
cifs_chan_clear_need_reconnect(ses, server);
spin_unlock(&ses->chan_lock);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
}
return rc;
@@ -4167,8 +4239,10 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
goto out;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, ctx);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
out:
kfree(ctx->username);
@@ -4557,15 +4631,15 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
struct dfs_info3_param ref = {0};
/* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->ses->ses_status != SES_GOOD ||
(tcon->status != TID_NEW &&
tcon->status != TID_NEED_TCON)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
return 0;
}
tcon->status = TID_IN_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
if (!tree) {
@@ -4604,15 +4678,15 @@ out:
cifs_put_tcp_super(sb);
if (rc) {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_NEED_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
} else {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_GOOD;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
tcon->need_reconnect = false;
}
@@ -4625,28 +4699,28 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
const struct smb_version_operations *ops = tcon->ses->server->ops;
/* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->ses->ses_status != SES_GOOD ||
(tcon->status != TID_NEW &&
tcon->status != TID_NEED_TCON)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
return 0;
}
tcon->status = TID_IN_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc);
if (rc) {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_NEED_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
} else {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_GOOD;
- spin_unlock(&cifs_tcp_ses_lock);
tcon->need_reconnect = false;
+ spin_unlock(&tcon->tc_lock);
}
return rc;
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 34a8f3baed5e..a9b6c3eba6de 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1526,15 +1526,21 @@ static void refresh_mounts(struct cifs_ses **sessions)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- if (!server->is_dfs_conn)
+ spin_lock(&server->srv_lock);
+ if (!server->is_dfs_conn) {
+ spin_unlock(&server->srv_lock);
continue;
+ }
+ spin_unlock(&server->srv_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ spin_lock(&tcon->tc_lock);
if (!tcon->ipc && !tcon->need_reconnect) {
tcon->tc_count++;
list_add_tail(&tcon->ulist, &tcons);
}
+ spin_unlock(&tcon->tc_lock);
}
}
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ce9b22aecfba..08f7392716e2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -193,6 +193,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
return PTR_ERR(full_path);
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -261,6 +262,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
* rare for path not covered on files)
*/
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
desired_access = 0;
if (OPEN_FMODE(oflags) & FMODE_READ)
@@ -316,6 +318,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
goto out;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* If Open reported that we actually created a file then we now have to
* set the mode if possible.
@@ -357,6 +360,9 @@ cifs_create_get_file_info:
rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb,
xid);
else {
+#else
+ {
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* TODO: Add support for calling POSIX query info here, but passing in fid */
rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
xid, fid);
@@ -377,7 +383,9 @@ cifs_create_get_file_info:
}
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
cifs_create_set_dentry:
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (rc != 0) {
cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n",
rc);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e64cda7a7610..fa738adc031f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -26,6 +26,7 @@
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
+#include "smb2proto.h"
#include "cifs_unicode.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
@@ -33,6 +34,48 @@
#include "smbdirect.h"
#include "fs_context.h"
#include "cifs_ioctl.h"
+#include "cached_dir.h"
+
+/*
+ * Mark as invalid, all open files on tree connections since they
+ * were closed when session to server was lost.
+ */
+void
+cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
+{
+ struct cifsFileInfo *open_file = NULL;
+ struct list_head *tmp;
+ struct list_head *tmp1;
+
+ /* only send once per connect */
+ spin_lock(&tcon->ses->ses_lock);
+ if ((tcon->ses->ses_status != SES_GOOD) || (tcon->status != TID_NEED_RECON)) {
+ spin_unlock(&tcon->ses->ses_lock);
+ return;
+ }
+ tcon->status = TID_IN_FILES_INVALIDATE;
+ spin_unlock(&tcon->ses->ses_lock);
+
+ /* list all files open on tree connection and mark them invalid */
+ spin_lock(&tcon->open_file_lock);
+ list_for_each_safe(tmp, tmp1, &tcon->openFileList) {
+ open_file = list_entry(tmp, struct cifsFileInfo, tlist);
+ open_file->invalidHandle = true;
+ open_file->oplock_break_cancelled = true;
+ }
+ spin_unlock(&tcon->open_file_lock);
+
+ invalidate_all_cached_dirs(tcon);
+ spin_lock(&tcon->tc_lock);
+ if (tcon->status == TID_IN_FILES_INVALIDATE)
+ tcon->status = TID_NEED_TCON;
+ spin_unlock(&tcon->tc_lock);
+
+ /*
+ * BB Add call to invalidate_inodes(sb) for all superblocks mounted
+ * to this tcon.
+ */
+}
static inline int cifs_convert_flags(unsigned int flags)
{
@@ -52,6 +95,7 @@ static inline int cifs_convert_flags(unsigned int flags)
FILE_READ_DATA);
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static u32 cifs_posix_convert_flags(unsigned int flags)
{
u32 posix_flags = 0;
@@ -85,6 +129,7 @@ static u32 cifs_posix_convert_flags(unsigned int flags)
return posix_flags;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static inline int cifs_get_disposition(unsigned int flags)
{
@@ -100,6 +145,7 @@ static inline int cifs_get_disposition(unsigned int flags)
return FILE_OPEN;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
int cifs_posix_open(const char *full_path, struct inode **pinode,
struct super_block *sb, int mode, unsigned int f_flags,
__u32 *poplock, __u16 *pnetfid, unsigned int xid)
@@ -161,6 +207,7 @@ posix_open_ret:
kfree(presp_data);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
@@ -579,6 +626,7 @@ int cifs_open(struct inode *inode, struct file *file)
else
oplock = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (!tcon->broken_posix_open && tcon->unix_ext &&
cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -603,6 +651,7 @@ int cifs_open(struct inode *inode, struct file *file)
* or DFS errors.
*/
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &fid);
@@ -630,6 +679,7 @@ int cifs_open(struct inode *inode, struct file *file)
goto out;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
/*
* Time to set mode which we can not set earlier due to
@@ -647,6 +697,7 @@ int cifs_open(struct inode *inode, struct file *file)
CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid.netfid,
cfile->pid);
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
use_cache:
fscache_use_cookie(cifs_inode_cookie(file_inode(file)),
@@ -664,7 +715,9 @@ out:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/*
* Try to reacquire byte range locks that were released when session
@@ -673,10 +726,12 @@ static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
static int
cifs_relock_file(struct cifsFileInfo *cfile)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
down_read_nested(&cinode->lock_sem, SINGLE_DEPTH_NESTING);
if (cinode->can_cache_brlcks) {
@@ -685,11 +740,13 @@ cifs_relock_file(struct cifsFileInfo *cfile)
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
rc = cifs_push_posix_locks(cfile);
else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = tcon->ses->server->ops->push_mand_locks(cfile);
up_read(&cinode->lock_sem);
@@ -750,6 +807,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
else
oplock = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext && cap_unix(tcon->ses) &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -773,6 +831,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
* in the reconnect path it is important to retry hard
*/
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
desired_access = cifs_convert_flags(cfile->f_flags);
@@ -817,7 +876,9 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
goto reopen_error_exit;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
reopen_success:
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
cfile->invalidHandle = false;
mutex_unlock(&cfile->fh_mutex);
cinode = CIFS_I(inode);
@@ -903,12 +964,12 @@ int cifs_close(struct inode *inode, struct file *file)
* So, Increase the ref count to avoid use-after-free.
*/
if (!mod_delayed_work(deferredclose_wq,
- &cfile->deferred, cifs_sb->ctx->acregmax))
+ &cfile->deferred, cifs_sb->ctx->closetimeo))
cifsFileInfo_get(cfile);
} else {
/* Deferred close for files */
queue_delayed_work(deferredclose_wq,
- &cfile->deferred, cifs_sb->ctx->acregmax);
+ &cfile->deferred, cifs_sb->ctx->closetimeo);
cfile->deferred_close_scheduled = true;
spin_unlock(&cinode->deferred_lock);
return 0;
@@ -928,9 +989,7 @@ int cifs_close(struct inode *inode, struct file *file)
void
cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
{
- struct cifsFileInfo *open_file;
- struct list_head *tmp;
- struct list_head *tmp1;
+ struct cifsFileInfo *open_file, *tmp;
struct list_head tmp_list;
if (!tcon->use_persistent || !tcon->need_reopen_files)
@@ -943,8 +1002,7 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
/* list all files open on tree connection, reopen resilient handles */
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp, &tcon->openFileList) {
- open_file = list_entry(tmp, struct cifsFileInfo, tlist);
+ list_for_each_entry(open_file, &tcon->openFileList, tlist) {
if (!open_file->invalidHandle)
continue;
cifsFileInfo_get(open_file);
@@ -952,8 +1010,7 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
}
spin_unlock(&tcon->open_file_lock);
- list_for_each_safe(tmp, tmp1, &tmp_list) {
- open_file = list_entry(tmp, struct cifsFileInfo, rlist);
+ list_for_each_entry_safe(open_file, tmp, &tmp_list, rlist) {
if (cifs_reopen_file(open_file, false /* do not flush */))
tcon->need_reopen_files = true;
list_del_init(&open_file->rlist);
@@ -1196,6 +1253,7 @@ try_again:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* Check if there is another lock that prevents us to set the lock (posix
* style). If such a lock exists, update the flock structure with its
@@ -1334,6 +1392,7 @@ hash_lockowner(fl_owner_t owner)
{
return cifs_lock_secret ^ hash32_ptr((const void *)owner);
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
struct lock_to_push {
struct list_head llist;
@@ -1344,6 +1403,7 @@ struct lock_to_push {
__u8 type;
};
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
cifs_push_posix_locks(struct cifsFileInfo *cfile)
{
@@ -1431,14 +1491,17 @@ err_out:
}
goto out;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_push_locks(struct cifsFileInfo *cfile)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* we are going to update can_cache_brlcks here - need a write access */
cifs_down_write(&cinode->lock_sem);
@@ -1447,11 +1510,13 @@ cifs_push_locks(struct cifsFileInfo *cfile)
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
rc = cifs_push_posix_locks(cfile);
else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = tcon->ses->server->ops->push_mand_locks(cfile);
cinode->can_cache_brlcks = false;
@@ -1515,6 +1580,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
__u16 netfid = cfile->fid.netfid;
if (posix_lck) {
@@ -1534,6 +1600,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
posix_lock_type, wait_flag);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = cifs_lock_test(cfile, flock->fl_start, length, type, flock);
if (!rc)
@@ -1594,6 +1661,7 @@ cifs_free_llist(struct list_head *llist)
}
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
int
cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
unsigned int xid)
@@ -1706,6 +1774,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
kfree(buf);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
@@ -1719,6 +1788,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
struct TCP_Server_Info *server = tcon->ses->server;
struct inode *inode = d_inode(cfile->dentry);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (posix_lck) {
int posix_lock_type;
@@ -1740,7 +1810,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
NULL, posix_lock_type, wait_flag);
goto out;
}
-
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (lock) {
struct cifsLockInfo *lock;
@@ -1861,9 +1931,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
rc = -EACCES;
xid = get_xid();
- cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n",
- cmd, flock->fl_flags, flock->fl_type,
- flock->fl_start, flock->fl_end);
+ cifs_dbg(FYI, "%s: %pD2 cmd=0x%x type=0x%x flags=0x%x r=%lld:%lld\n", __func__, file, cmd,
+ flock->fl_flags, flock->fl_type, (long long)flock->fl_start,
+ (long long)flock->fl_end);
cfile = (struct cifsFileInfo *)file->private_data;
tcon = tlink_tcon(cfile->tlink);
@@ -2204,6 +2274,185 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
return -ENOENT;
}
+void
+cifs_writedata_release(struct kref *refcount)
+{
+ struct cifs_writedata *wdata = container_of(refcount,
+ struct cifs_writedata, refcount);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (wdata->mr) {
+ smbd_deregister_mr(wdata->mr);
+ wdata->mr = NULL;
+ }
+#endif
+
+ if (wdata->cfile)
+ cifsFileInfo_put(wdata->cfile);
+
+ kvfree(wdata->pages);
+ kfree(wdata);
+}
+
+/*
+ * Write failed with a retryable error. Resend the write request. It's also
+ * possible that the page was redirtied so re-clean the page.
+ */
+static void
+cifs_writev_requeue(struct cifs_writedata *wdata)
+{
+ int i, rc = 0;
+ struct inode *inode = d_inode(wdata->cfile->dentry);
+ struct TCP_Server_Info *server;
+ unsigned int rest_len;
+
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ i = 0;
+ rest_len = wdata->bytes;
+ do {
+ struct cifs_writedata *wdata2;
+ unsigned int j, nr_pages, wsize, tailsz, cur_len;
+
+ wsize = server->ops->wp_retry_size(inode);
+ if (wsize < rest_len) {
+ nr_pages = wsize / PAGE_SIZE;
+ if (!nr_pages) {
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ cur_len = nr_pages * PAGE_SIZE;
+ tailsz = PAGE_SIZE;
+ } else {
+ nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE);
+ cur_len = rest_len;
+ tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE;
+ }
+
+ wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
+ if (!wdata2) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ for (j = 0; j < nr_pages; j++) {
+ wdata2->pages[j] = wdata->pages[i + j];
+ lock_page(wdata2->pages[j]);
+ clear_page_dirty_for_io(wdata2->pages[j]);
+ }
+
+ wdata2->sync_mode = wdata->sync_mode;
+ wdata2->nr_pages = nr_pages;
+ wdata2->offset = page_offset(wdata2->pages[0]);
+ wdata2->pagesz = PAGE_SIZE;
+ wdata2->tailsz = tailsz;
+ wdata2->bytes = cur_len;
+
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
+ &wdata2->cfile);
+ if (!wdata2->cfile) {
+ cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
+ rc);
+ if (!is_retryable_error(rc))
+ rc = -EBADF;
+ } else {
+ wdata2->pid = wdata2->cfile->pid;
+ rc = server->ops->async_writev(wdata2,
+ cifs_writedata_release);
+ }
+
+ for (j = 0; j < nr_pages; j++) {
+ unlock_page(wdata2->pages[j]);
+ if (rc != 0 && !is_retryable_error(rc)) {
+ SetPageError(wdata2->pages[j]);
+ end_page_writeback(wdata2->pages[j]);
+ put_page(wdata2->pages[j]);
+ }
+ }
+
+ kref_put(&wdata2->refcount, cifs_writedata_release);
+ if (rc) {
+ if (is_retryable_error(rc))
+ continue;
+ i += nr_pages;
+ break;
+ }
+
+ rest_len -= cur_len;
+ i += nr_pages;
+ } while (i < wdata->nr_pages);
+
+ /* cleanup remaining pages from the original wdata */
+ for (; i < wdata->nr_pages; i++) {
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ put_page(wdata->pages[i]);
+ }
+
+ if (rc != 0 && !is_retryable_error(rc))
+ mapping_set_error(inode->i_mapping, rc);
+ kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+void
+cifs_writev_complete(struct work_struct *work)
+{
+ struct cifs_writedata *wdata = container_of(work,
+ struct cifs_writedata, work);
+ struct inode *inode = d_inode(wdata->cfile->dentry);
+ int i = 0;
+
+ if (wdata->result == 0) {
+ spin_lock(&inode->i_lock);
+ cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
+ spin_unlock(&inode->i_lock);
+ cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
+ wdata->bytes);
+ } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
+ return cifs_writev_requeue(wdata);
+
+ for (i = 0; i < wdata->nr_pages; i++) {
+ struct page *page = wdata->pages[i];
+
+ if (wdata->result == -EAGAIN)
+ __set_page_dirty_nobuffers(page);
+ else if (wdata->result < 0)
+ SetPageError(page);
+ end_page_writeback(page);
+ cifs_readpage_to_fscache(inode, page);
+ put_page(page);
+ }
+ if (wdata->result != -EAGAIN)
+ mapping_set_error(inode->i_mapping, wdata->result);
+ kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+struct cifs_writedata *
+cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+ struct page **pages =
+ kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (pages)
+ return cifs_writedata_direct_alloc(pages, complete);
+
+ return NULL;
+}
+
+struct cifs_writedata *
+cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
+{
+ struct cifs_writedata *wdata;
+
+ wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
+ if (wdata != NULL) {
+ wdata->pages = pages;
+ kref_init(&wdata->refcount);
+ INIT_LIST_HEAD(&wdata->list);
+ init_completion(&wdata->done);
+ INIT_WORK(&wdata->work, complete);
+ }
+ return wdata;
+}
+
+
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
{
struct address_space *mapping = page->mapping;
@@ -3022,7 +3271,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
if (ctx->direct_io) {
ssize_t result;
- result = iov_iter_get_pages_alloc(
+ result = iov_iter_get_pages_alloc2(
from, &pagevec, cur_len, &start);
if (result < 0) {
cifs_dbg(VFS,
@@ -3036,7 +3285,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
break;
}
cur_len = (size_t)result;
- iov_iter_advance(from, cur_len);
nr_pages =
(cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
@@ -3758,7 +4006,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
if (ctx->direct_io) {
ssize_t result;
- result = iov_iter_get_pages_alloc(
+ result = iov_iter_get_pages_alloc2(
&direct_iov, &pagevec,
cur_len, &start);
if (result < 0) {
@@ -3774,7 +4022,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
break;
}
cur_len = (size_t)result;
- iov_iter_advance(&direct_iov, cur_len);
rdata = cifs_readdata_direct_alloc(
pagevec, cifs_uncached_readv_complete);
@@ -4004,7 +4251,7 @@ static ssize_t __cifs_readv(
if (!is_sync_kiocb(iocb))
ctx->iocb = iocb;
- if (iter_is_iovec(to))
+ if (user_backed_iter(to))
ctx->should_dirty = true;
if (direct) {
@@ -4459,10 +4706,11 @@ static void cifs_readahead(struct readahead_control *ractl)
* TODO: Send a whole batch of pages to be read
* by the cache.
*/
- page = readahead_page(ractl);
- last_batch_size = 1 << thp_order(page);
+ struct folio *folio = readahead_folio(ractl);
+
+ last_batch_size = folio_nr_pages(folio);
if (cifs_readpage_from_fscache(ractl->mapping->host,
- page) < 0) {
+ &folio->page) < 0) {
/*
* TODO: Deal with cache read failure
* here, but for the moment, delegate
@@ -4470,7 +4718,7 @@ static void cifs_readahead(struct readahead_control *ractl)
*/
caching = false;
}
- unlock_page(page);
+ folio_unlock(folio);
next_cached++;
cache_nr_pages--;
if (cache_nr_pages == 0)
@@ -4811,8 +5059,6 @@ void cifs_oplock_break(struct work_struct *work)
struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
bool purge_cache = false;
- bool is_deferred = false;
- struct cifs_deferred_close *dclose;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
@@ -4849,22 +5095,6 @@ void cifs_oplock_break(struct work_struct *work)
oplock_break_ack:
/*
- * When oplock break is received and there are no active
- * file handles but cached, then schedule deferred close immediately.
- * So, new open will not use cached handle.
- */
- spin_lock(&CIFS_I(inode)->deferred_lock);
- is_deferred = cifs_is_deferred_close(cfile, &dclose);
- spin_unlock(&CIFS_I(inode)->deferred_lock);
- if (is_deferred &&
- cfile->deferred_close_scheduled &&
- delayed_work_pending(&cfile->deferred)) {
- if (cancel_delayed_work(&cfile->deferred)) {
- _cifsFileInfo_put(cfile, false, false);
- goto oplock_break_done;
- }
- }
- /*
* releasing stale oplock after recent reconnect of smb session using
* a now incorrect file handle is not a data integrity issue but do
* not bother sending an oplock release if session to server still is
@@ -4875,7 +5105,7 @@ oplock_break_ack:
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
-oplock_break_done:
+
_cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
cifs_done_oplock_break(cinode);
}
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 8dc0d923ef6a..0e13dec86b25 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -147,6 +147,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("actimeo", Opt_actimeo),
fsparam_u32("acdirmax", Opt_acdirmax),
fsparam_u32("acregmax", Opt_acregmax),
+ fsparam_u32("closetimeo", Opt_closetimeo),
fsparam_u32("echo_interval", Opt_echo_interval),
fsparam_u32("max_credits", Opt_max_credits),
fsparam_u32("handletimeout", Opt_handletimeout),
@@ -1074,6 +1075,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
ctx->acdirmax = ctx->acregmax = HZ * result.uint_32;
break;
+ case Opt_closetimeo:
+ ctx->closetimeo = HZ * result.uint_32;
+ if (ctx->closetimeo > SMB3_MAX_DCLOSETIMEO) {
+ cifs_errorf(fc, "closetimeo too large\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
case Opt_echo_interval:
ctx->echo_interval = result.uint_32;
break;
@@ -1521,6 +1529,7 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->acregmax = CIFS_DEF_ACTIMEO;
ctx->acdirmax = CIFS_DEF_ACTIMEO;
+ ctx->closetimeo = SMB3_DEF_DCLOSETIMEO;
/* Most clients set timeout to 0, allows server to use its default */
ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index 5f093cb7e9b9..bbaee4c2281f 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -125,6 +125,7 @@ enum cifs_param {
Opt_actimeo,
Opt_acdirmax,
Opt_acregmax,
+ Opt_closetimeo,
Opt_echo_interval,
Opt_max_credits,
Opt_snapshot,
@@ -247,6 +248,8 @@ struct smb3_fs_context {
/* attribute cache timemout for files and directories in jiffies */
unsigned long acregmax;
unsigned long acdirmax;
+ /* timeout for deferred close of files in jiffies */
+ unsigned long closetimeo;
struct smb_version_operations *ops;
struct smb_version_values *vals;
char *prepath;
@@ -279,4 +282,9 @@ static inline struct smb3_fs_context *smb3_fc2context(const struct fs_context *f
extern int smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx);
extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb);
+/*
+ * max deferred close timeout (jiffies) - 2^30
+ */
+#define SMB3_MAX_DCLOSETIMEO (1 << 30)
+#define SMB3_DEF_DCLOSETIMEO (5 * HZ) /* Can increase later, other clients use larger */
#endif
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index aa3b941a5555..67b601041f0a 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -108,17 +108,6 @@ static inline void cifs_readpage_to_fscache(struct inode *inode,
__cifs_readpage_to_fscache(inode, page);
}
-static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
-{
- if (PageFsCache(page)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- wait_on_page_fscache(page);
- fscache_note_page_release(cifs_inode_cookie(page->mapping->host));
- }
- return true;
-}
-
#else /* CONFIG_CIFS_FSCACHE */
static inline
void cifs_fscache_fill_coherency(struct inode *inode,
@@ -154,11 +143,6 @@ cifs_readpage_from_fscache(struct inode *inode, struct page *page)
static inline
void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {}
-static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
-{
- return true; /* May release page */
-}
-
#endif /* CONFIG_CIFS_FSCACHE */
#endif /* _CIFS_FSCACHE_H */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 81da81e18553..bac08c20f559 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -25,6 +25,7 @@
#include "fscache.h"
#include "fs_context.h"
#include "cifs_ioctl.h"
+#include "cached_dir.h"
static void cifs_set_ops(struct inode *inode)
{
@@ -339,6 +340,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
cifs_get_file_info_unix(struct file *filp)
{
@@ -432,6 +434,14 @@ int cifs_get_inode_info_unix(struct inode **pinode,
cgiiu_exit:
return rc;
}
+#else
+int cifs_get_inode_info_unix(struct inode **pinode,
+ const unsigned char *full_path,
+ struct super_block *sb, unsigned int xid)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
@@ -795,6 +805,7 @@ static __u64 simple_hashstr(const char *str)
return hash;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/**
* cifs_backup_query_path_info - SMB1 fallback code to get ino
*
@@ -847,6 +858,7 @@ cifs_backup_query_path_info(int xid,
*data = (FILE_ALL_INFO *)info.srch_entries_start;
return 0;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static void
cifs_set_fattr_ino(int xid,
@@ -991,6 +1003,7 @@ cifs_get_inode_info(struct inode **inode,
rc = 0;
break;
case -EACCES:
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* perm errors, try again with backup flags if possible
*
@@ -1022,6 +1035,9 @@ cifs_get_inode_info(struct inode **inode,
/* nothing we can do, bail out */
goto out;
}
+#else
+ goto out;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
break;
default:
cifs_dbg(FYI, "%s: unhandled err rc %d\n", __func__, rc);
@@ -1037,8 +1053,9 @@ cifs_get_inode_info(struct inode **inode,
/*
* 4. Tweak fattr based on mount options
*/
-
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
handle_mnt_opt:
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* query for SFU type info if supported and needed */
if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
@@ -1223,7 +1240,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
static int
cifs_find_inode(struct inode *inode, void *opaque)
{
- struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+ struct cifs_fattr *fattr = opaque;
/* don't match inode with different uniqueid */
if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
@@ -1247,7 +1264,7 @@ cifs_find_inode(struct inode *inode, void *opaque)
static int
cifs_init_inode(struct inode *inode, void *opaque)
{
- struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+ struct cifs_fattr *fattr = opaque;
CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
CIFS_I(inode)->createtime = fattr->cf_createtime;
@@ -1435,6 +1452,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
return server->ops->set_file_info(inode, full_path, &info_buf, xid);
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* Open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
* and rename it to a random name that hopefully won't conflict with
@@ -1565,6 +1583,7 @@ undo_setattr:
goto out_close;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* copied from fs/nfs/dir.c with small changes */
static void
@@ -1627,6 +1646,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
}
cifs_close_deferred_file_under_dentry(tcon, full_path);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
@@ -1636,6 +1656,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
if ((rc == 0) || (rc == -ENOENT))
goto psx_del_no_retry;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
retry_std_delete:
if (!server->ops->unlink) {
@@ -1714,9 +1735,11 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
if (tcon->posix_extensions)
rc = smb311_posix_get_inode_info(&inode, full_path, parent->i_sb, xid);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
else if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb,
xid);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
else
rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb,
xid, NULL);
@@ -1746,6 +1769,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
if (parent->i_mode & S_ISGID)
mode |= S_ISGID;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext) {
struct cifs_unix_set_info_args args = {
.mode = mode,
@@ -1768,6 +1792,9 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
} else {
+#else
+ {
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
struct TCP_Server_Info *server = tcon->ses->server;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
(mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo)
@@ -1788,6 +1815,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
return 0;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
const char *full_path, struct cifs_sb_info *cifs_sb,
@@ -1850,6 +1878,7 @@ posix_mkdir_get_info:
xid);
goto posix_mkdir_out;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
struct dentry *direntry, umode_t mode)
@@ -1892,6 +1921,7 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
goto mkdir_out;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb,
@@ -1899,6 +1929,7 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
if (rc != -EOPNOTSUPP)
goto mkdir_out;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (!server->ops->mkdir) {
rc = -ENOSYS;
@@ -2015,9 +2046,12 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct cifs_fid fid;
struct cifs_open_parms oparms;
- int oplock, rc;
+ int oplock;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
+ int rc;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -2043,6 +2077,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
if (server->vals->protocol_id != 0)
goto do_rename_exit;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/* open-file renames don't work across directories */
if (to_dentry->d_parent != from_dentry->d_parent)
goto do_rename_exit;
@@ -2064,6 +2099,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
cifs_sb->local_nls, cifs_remap(cifs_sb));
CIFSSMBClose(xid, tcon, fid.netfid);
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
if (rc == 0)
d_move(from_dentry, to_dentry);
@@ -2081,11 +2117,13 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
- FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
- FILE_UNIX_BASIC_INFO *info_buf_target;
unsigned int xid;
int rc, tmprc;
int retry_count = 0;
+ FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ FILE_UNIX_BASIC_INFO *info_buf_target;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -2139,6 +2177,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
if (flags & RENAME_NOREPLACE)
goto cifs_rename_exit;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (rc == -EEXIST && tcon->unix_ext) {
/*
* Are src and dst hardlinks of same inode? We can only tell
@@ -2178,6 +2217,8 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
*/
unlink_target:
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
+
/* Try unlinking the target dentry if it's not negative */
if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
if (d_is_dir(target_dentry))
@@ -2337,14 +2378,18 @@ int cifs_revalidate_file_attr(struct file *filp)
{
int rc = 0;
struct dentry *dentry = file_dentry(filp);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (!cifs_dentry_needs_reval(dentry))
return rc;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tlink_tcon(cfile->tlink)->unix_ext)
rc = cifs_get_file_info_unix(filp);
else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = cifs_get_file_info(filp);
return rc;
@@ -2653,6 +2698,7 @@ set_size_out:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
{
@@ -2800,6 +2846,7 @@ out:
free_xid(xid);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
@@ -2995,16 +3042,20 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry,
struct iattr *attrs)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
- struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
int rc, retries = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return -EIO;
do {
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (pTcon->unix_ext)
rc = cifs_setattr_unix(direntry, attrs);
else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = cifs_setattr_nounix(direntry, attrs);
retries++;
} while (is_retryable_error(rc) && retries < 2);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 0359b604bdbc..b6e6e5d6c8dd 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -333,6 +333,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
tcon = tlink_tcon(pSMBFile->tlink);
caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
#ifdef CONFIG_CIFS_POSIX
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (CIFS_UNIX_EXTATTR_CAP & caps) {
__u64 ExtAttrMask = 0;
rc = CIFSGetExtAttr(xid, tcon,
@@ -345,6 +346,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
if (rc != EOPNOTSUPP)
break;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
#endif /* CONFIG_CIFS_POSIX */
rc = 0;
if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index bbdf3281559c..6803cb27eecc 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -286,6 +286,7 @@ out:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* SMB 1.0 Protocol specific functions
*/
@@ -368,6 +369,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
CIFSSMBClose(xid, tcon, fid.netfid);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/*
* SMB 2.1/SMB3 Protocol specific functions
@@ -532,11 +534,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
goto cifs_hl_exit;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext)
rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
else {
+#else
+ {
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
server = tcon->ses->server;
if (!server->ops->create_hardlink) {
rc = -ENOSYS;
@@ -704,10 +710,12 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
/* BB what if DFS and this volume is on different share? BB */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
else if (pTcon->unix_ext)
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* else
rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName,
cifs_sb_target->local_nls); */
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 0e84e6fcf8ab..87f60f736731 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -23,6 +23,7 @@
#include "dns_resolve.h"
#endif
#include "fs_context.h"
+#include "cached_dir.h"
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
@@ -69,6 +70,7 @@ sesInfoAlloc(void)
ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL);
if (ret_buf) {
atomic_inc(&sesInfoAllocCount);
+ spin_lock_init(&ret_buf->ses_lock);
ret_buf->ses_status = SES_NEW;
++ret_buf->ses_count;
INIT_LIST_HEAD(&ret_buf->smb_ses_list);
@@ -115,21 +117,19 @@ tconInfoAlloc(void)
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
return NULL;
- ret_buf->crfid.fid = kzalloc(sizeof(*ret_buf->crfid.fid), GFP_KERNEL);
- if (!ret_buf->crfid.fid) {
+ ret_buf->cfid = init_cached_dir();
+ if (!ret_buf->cfid) {
kfree(ret_buf);
return NULL;
}
- INIT_LIST_HEAD(&ret_buf->crfid.dirents.entries);
- mutex_init(&ret_buf->crfid.dirents.de_mutex);
atomic_inc(&tconInfoAllocCount);
ret_buf->status = TID_NEW;
++ret_buf->tc_count;
+ spin_lock_init(&ret_buf->tc_lock);
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
spin_lock_init(&ret_buf->open_file_lock);
- mutex_init(&ret_buf->crfid.fid_mutex);
spin_lock_init(&ret_buf->stat_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
@@ -138,17 +138,17 @@ tconInfoAlloc(void)
}
void
-tconInfoFree(struct cifs_tcon *buf_to_free)
+tconInfoFree(struct cifs_tcon *tcon)
{
- if (buf_to_free == NULL) {
+ if (tcon == NULL) {
cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
return;
}
+ free_cached_dir(tcon);
atomic_dec(&tconInfoAllocCount);
- kfree(buf_to_free->nativeFileSystem);
- kfree_sensitive(buf_to_free->password);
- kfree(buf_to_free->crfid.fid);
- kfree(buf_to_free);
+ kfree(tcon->nativeFileSystem);
+ kfree_sensitive(tcon->password);
+ kfree(tcon);
}
struct smb_hdr *
@@ -172,9 +172,9 @@ cifs_buf_get(void)
/* clear the first few header bytes */
/* for most paths, more is cleared in header_assemble */
memset(ret_buf, 0, buf_size + 3);
- atomic_inc(&bufAllocCount);
+ atomic_inc(&buf_alloc_count);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&totBufAllocCount);
+ atomic_inc(&total_buf_alloc_count);
#endif /* CONFIG_CIFS_STATS2 */
return ret_buf;
@@ -189,7 +189,7 @@ cifs_buf_release(void *buf_to_free)
}
mempool_free(buf_to_free, cifs_req_poolp);
- atomic_dec(&bufAllocCount);
+ atomic_dec(&buf_alloc_count);
return;
}
@@ -205,9 +205,9 @@ cifs_small_buf_get(void)
ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
/* No need to clear memory here, cleared in header assemble */
/* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
- atomic_inc(&smBufAllocCount);
+ atomic_inc(&small_buf_alloc_count);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&totSmBufAllocCount);
+ atomic_inc(&total_small_buf_alloc_count);
#endif /* CONFIG_CIFS_STATS2 */
return ret_buf;
@@ -223,7 +223,7 @@ cifs_small_buf_release(void *buf_to_free)
}
mempool_free(buf_to_free, cifs_sm_req_poolp);
- atomic_dec(&smBufAllocCount);
+ atomic_dec(&small_buf_alloc_count);
return;
}
@@ -354,7 +354,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
/* otherwise, there is enough to get to the BCC */
if (check_smb_hdr(smb))
return -EIO;
- clc_len = smbCalcSize(smb, server);
+ clc_len = smbCalcSize(smb);
if (4 + rfclen != total_read) {
cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
@@ -400,7 +400,6 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
{
struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
- struct list_head *tmp, *tmp1, *tmp2;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *pCifsInode;
@@ -467,18 +466,14 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &srv->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- list_for_each(tmp1, &ses->tcon_list) {
- tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ list_for_each_entry(ses, &srv->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid != buf->Tid)
continue;
cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp2, &tcon->openFileList) {
- netfile = list_entry(tmp2, struct cifsFileInfo,
- tlist);
+ list_for_each_entry(netfile, &tcon->openFileList, tlist) {
if (pSMB->Fid != netfile->fid.netfid)
continue;
@@ -742,6 +737,8 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
list_for_each_entry(cfile, &cifs_inode->openFileList, flist) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ cifs_del_deferred_close(cfile);
+
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
@@ -763,16 +760,16 @@ void
cifs_close_all_deferred_files(struct cifs_tcon *tcon)
{
struct cifsFileInfo *cfile;
- struct list_head *tmp;
struct file_list *tmp_list, *tmp_next_list;
struct list_head file_head;
INIT_LIST_HEAD(&file_head);
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp, &tcon->openFileList) {
- cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ cifs_del_deferred_close(cfile);
+
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
@@ -793,7 +790,6 @@ void
cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
{
struct cifsFileInfo *cfile;
- struct list_head *tmp;
struct file_list *tmp_list, *tmp_next_list;
struct list_head file_head;
void *page;
@@ -802,12 +798,13 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
INIT_LIST_HEAD(&file_head);
page = alloc_dentry_path();
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp, &tcon->openFileList) {
- cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
full_path = build_path_from_dentry(cfile->dentry, page);
if (strstr(full_path, path)) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ cifs_del_deferred_close(cfile);
+
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
@@ -1029,7 +1026,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
saved_len = count;
while (count && npages < max_pages) {
- rc = iov_iter_get_pages(iter, pages, count, max_pages, &start);
+ rc = iov_iter_get_pages2(iter, pages, count, max_pages, &start);
if (rc < 0) {
cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc);
break;
@@ -1041,7 +1038,6 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
break;
}
- iov_iter_advance(iter, rc);
count -= rc;
rc += start;
cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 235aa1b395eb..1b52e6ac431c 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -909,9 +909,9 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr)
* portion, the number of word parameters and the data portion of the message
*/
unsigned int
-smbCalcSize(void *buf, struct TCP_Server_Info *server)
+smbCalcSize(void *buf)
{
- struct smb_hdr *ptr = (struct smb_hdr *)buf;
+ struct smb_hdr *ptr = buf;
return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
2 /* size of the bcc field */ + get_bcc(ptr));
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 384cabdf47ca..8e060c00c969 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -21,6 +21,7 @@
#include "cifsfs.h"
#include "smb2proto.h"
#include "fs_context.h"
+#include "cached_dir.h"
/*
* To be safe - for UCS to UTF-8 with strings loaded with the rare long
@@ -805,8 +806,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
end_of_smb = cfile->srch_inf.ntwrk_buf_start +
server->ops->calc_smb_size(
- cfile->srch_inf.ntwrk_buf_start,
- server);
+ cfile->srch_inf.ntwrk_buf_start);
cur_ent = cfile->srch_inf.srch_entries_start;
first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
@@ -1071,7 +1071,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
tcon = tlink_tcon(cifsFile->tlink);
}
- rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
cifs_put_tlink(tlink);
if (rc)
goto cache_not_found;
@@ -1142,7 +1142,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
tcon = tlink_tcon(cifsFile->tlink);
rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path,
&current_entry, &num_to_fill);
- open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
+ open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
if (rc) {
cifs_dbg(FYI, "fce error %d\n", rc);
goto rddir2_exit;
@@ -1160,8 +1160,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
max_len = tcon->ses->server->ops->calc_smb_size(
- cifsFile->srch_inf.ntwrk_buf_start,
- tcon->ses->server);
+ cifsFile->srch_inf.ntwrk_buf_start);
end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 02c8b2906196..3af3b05b6c74 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -499,6 +499,7 @@ out:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
struct TCP_Server_Info *server,
SESSION_SETUP_ANDX *pSMB)
@@ -591,7 +592,6 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
*pbcc_area = bcc_ptr;
}
-
static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
@@ -753,6 +753,7 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
for it later, but it is not very important */
cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
struct cifs_ses *ses)
@@ -1170,6 +1171,7 @@ struct sess_data {
struct kvec iov[3];
};
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
sess_alloc_buffer(struct sess_data *sess_data, int wct)
{
@@ -1846,3 +1848,4 @@ out:
kfree(sess_data);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 2e20ee4dab7b..f36b2d2d40ca 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -92,17 +92,17 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct mid_q_entry *mid;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
if (compare_mid(mid->mid, buf) &&
mid->mid_state == MID_REQUEST_SUBMITTED &&
le16_to_cpu(mid->command) == buf->Command) {
kref_get(&mid->refcount);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return mid;
}
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return NULL;
}
@@ -166,7 +166,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
__u16 last_mid, cur_mid;
bool collision, reconnect = false;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
/* mid is 16 bit only for CIFS/SMB */
cur_mid = (__u16)((server->CurrentMid) & 0xffff);
@@ -225,7 +225,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
}
cur_mid++;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
if (reconnect) {
cifs_signal_cifsd_for_reconnect(server, false);
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index f5dcc4940b6d..9dfd2dd612c2 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -61,7 +61,6 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
nr_ioctl_req.Reserved = 0;
rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY,
- true /* is_fsctl */,
(char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
CIFSMaxBufSize, NULL, NULL /* no return info */);
if (rc == -EOPNOTSUPP) {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 8571a459c710..b83f59051b26 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -23,6 +23,7 @@
#include "smb2glob.h"
#include "smb2pdu.h"
#include "smb2proto.h"
+#include "cached_dir.h"
static void
free_set_inf_compound(struct smb_rqst *rqst)
@@ -515,16 +516,16 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if (strcmp(full_path, ""))
rc = -ENOENT;
else
- rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
/* If it is a root and its handle is cached then use it */
if (!rc) {
- if (tcon->crfid.file_all_info_is_valid) {
+ if (cfid->file_all_info_is_valid) {
move_smb2_info_to_cifs(data,
- &tcon->crfid.file_all_info);
+ &cfid->file_all_info);
} else {
rc = SMB2_query_info(xid, tcon,
- cfid->fid->persistent_fid,
- cfid->fid->volatile_fid, smb2_data);
+ cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid, smb2_data);
if (!rc)
move_smb2_info_to_cifs(data, smb2_data);
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 17813c3d0c6e..d73e5672aac4 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -16,6 +16,7 @@
#include "smb2status.h"
#include "smb2glob.h"
#include "nterr.h"
+#include "cached_dir.h"
static int
check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid)
@@ -132,15 +133,15 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
}
int
-smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
+smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
{
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct smb2_pdu *pdu = (struct smb2_pdu *)shdr;
- __u64 mid;
- __u32 clc_len; /* calculated length */
- int command;
- int pdu_size = sizeof(struct smb2_pdu);
int hdr_size = sizeof(struct smb2_hdr);
+ int pdu_size = sizeof(struct smb2_pdu);
+ int command;
+ __u32 calc_len; /* calculated length */
+ __u64 mid;
/*
* Add function to do table lookup of StructureSize by command
@@ -154,7 +155,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(iter, &srvr->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) {
if (iter->Suid == le64_to_cpu(thdr->SessionId)) {
ses = iter;
break;
@@ -221,30 +222,33 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
}
}
- clc_len = smb2_calc_size(buf, srvr);
+ calc_len = smb2_calc_size(buf);
+
+ /* For SMB2_IOCTL, OutputOffset and OutputLength are optional, so might
+ * be 0, and not a real miscalculation */
+ if (command == SMB2_IOCTL_HE && calc_len == 0)
+ return 0;
- if (shdr->Command == SMB2_NEGOTIATE)
- clc_len += get_neg_ctxt_len(shdr, len, clc_len);
+ if (command == SMB2_NEGOTIATE_HE)
+ calc_len += get_neg_ctxt_len(shdr, len, calc_len);
- if (len != clc_len) {
- cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
- clc_len, len, mid);
+ if (len != calc_len) {
/* create failed on symlink */
if (command == SMB2_CREATE_HE &&
shdr->Status == STATUS_STOPPED_ON_SYMLINK)
return 0;
/* Windows 7 server returns 24 bytes more */
- if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
+ if (calc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
return 0;
/* server can return one byte more due to implied bcc[0] */
- if (clc_len == len + 1)
+ if (calc_len == len + 1)
return 0;
/*
* Some windows servers (win2016) will pad also the final
* PDU in a compound to 8 bytes.
*/
- if (((clc_len + 7) & ~7) == len)
+ if (((calc_len + 7) & ~7) == len)
return 0;
/*
@@ -253,12 +257,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
* SMB2/SMB3 frame length (header + smb2 response specific data)
* Some windows servers also pad up to 8 bytes when compounding.
*/
- if (clc_len < len)
+ if (calc_len < len)
return 0;
- pr_warn_once(
- "srv rsp too short, len %d not %d. cmd:%d mid:%llu\n",
- len, clc_len, command, mid);
+ /* Only log a message if len was really miscalculated */
+ if (unlikely(cifsFYI))
+ cifs_dbg(FYI, "Server response too short: calculated "
+ "length %u doesn't match read length %u (cmd=%d, mid=%llu)\n",
+ calc_len, len, command, mid);
+ else
+ pr_warn("Server response too short: calculated length "
+ "%u doesn't match read length %u (cmd=%d, mid=%llu)\n",
+ calc_len, len, command, mid);
return 1;
}
@@ -400,9 +410,9 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
* portion, the number of word parameters and the data portion of the message.
*/
unsigned int
-smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
+smb2_calc_size(void *buf)
{
- struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+ struct smb2_pdu *pdu = buf;
struct smb2_hdr *shdr = &pdu->hdr;
int offset; /* the offset from the beginning of SMB to data area */
int data_length; /* the length of the variable length data area */
@@ -639,15 +649,7 @@ smb2_is_valid_lease_break(char *buffer)
}
spin_unlock(&tcon->open_file_lock);
- if (tcon->crfid.is_valid &&
- !memcmp(rsp->LeaseKey,
- tcon->crfid.fid->lease_key,
- SMB2_LEASE_KEY_SIZE)) {
- tcon->crfid.time = 0;
- INIT_WORK(&tcon->crfid.lease_break,
- smb2_cached_lease_break);
- queue_work(cifsiod_wq,
- &tcon->crfid.lease_break);
+ if (cached_dir_lease_break(tcon, rsp->LeaseKey)) {
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 8802995b2d3d..421be43af425 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -27,6 +27,7 @@
#include "smbdirect.h"
#include "fscache.h"
#include "fs_context.h"
+#include "cached_dir.h"
/* Change credits for different ops and return the total number of credits */
static int
@@ -126,13 +127,13 @@ smb2_add_credits(struct TCP_Server_Info *server,
optype, scredits, add);
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect
|| server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
switch (rc) {
case -1:
@@ -218,12 +219,12 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
spin_lock(&server->req_lock);
} else {
spin_unlock(&server->req_lock);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
spin_lock(&server->req_lock);
scredits = server->credits;
@@ -319,19 +320,19 @@ smb2_get_next_mid(struct TCP_Server_Info *server)
{
__u64 mid;
/* for SMB2 we need the current value */
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
mid = server->CurrentMid++;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return mid;
}
static void
smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
{
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
if (server->CurrentMid >= val)
server->CurrentMid -= val;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
}
static struct mid_q_entry *
@@ -346,7 +347,7 @@ __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
return NULL;
}
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
if ((mid->mid == wire_mid) &&
(mid->mid_state == MID_REQUEST_SUBMITTED) &&
@@ -356,11 +357,11 @@ __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
list_del_init(&mid->qhead);
mid->mid_flags |= MID_DELETED;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return mid;
}
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return NULL;
}
@@ -386,7 +387,7 @@ smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
shdr->Id.SyncId.ProcessId);
cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
- server->ops->calc_smb_size(buf, server));
+ server->ops->calc_smb_size(buf));
#endif
}
@@ -403,9 +404,9 @@ smb2_negotiate(const unsigned int xid,
{
int rc;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
server->CurrentMid = 0;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
rc = SMB2_negotiate(xid, ses, server);
/* BB we probably don't need to retry with modern servers */
if (rc == -EAGAIN)
@@ -680,7 +681,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
struct cifs_ses *ses = tcon->ses;
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
- FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
+ FSCTL_QUERY_NETWORK_INTERFACE_INFO,
NULL /* no data input */, 0 /* no data input */,
CIFSMaxBufSize, (char **)&out_buf, &ret_data_len);
if (rc == -EOPNOTSUPP) {
@@ -702,300 +703,6 @@ out:
}
static void
-smb2_close_cached_fid(struct kref *ref)
-{
- struct cached_fid *cfid = container_of(ref, struct cached_fid,
- refcount);
- struct cached_dirent *dirent, *q;
-
- if (cfid->is_valid) {
- cifs_dbg(FYI, "clear cached root file handle\n");
- SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid,
- cfid->fid->volatile_fid);
- }
-
- /*
- * We only check validity above to send SMB2_close,
- * but we still need to invalidate these entries
- * when this function is called
- */
- cfid->is_valid = false;
- cfid->file_all_info_is_valid = false;
- cfid->has_lease = false;
- if (cfid->dentry) {
- dput(cfid->dentry);
- cfid->dentry = NULL;
- }
- /*
- * Delete all cached dirent names
- */
- mutex_lock(&cfid->dirents.de_mutex);
- list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) {
- list_del(&dirent->entry);
- kfree(dirent->name);
- kfree(dirent);
- }
- cfid->dirents.is_valid = 0;
- cfid->dirents.is_failed = 0;
- cfid->dirents.ctx = NULL;
- cfid->dirents.pos = 0;
- mutex_unlock(&cfid->dirents.de_mutex);
-
-}
-
-void close_cached_dir(struct cached_fid *cfid)
-{
- mutex_lock(&cfid->fid_mutex);
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- mutex_unlock(&cfid->fid_mutex);
-}
-
-void close_cached_dir_lease_locked(struct cached_fid *cfid)
-{
- if (cfid->has_lease) {
- cfid->has_lease = false;
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- }
-}
-
-void close_cached_dir_lease(struct cached_fid *cfid)
-{
- mutex_lock(&cfid->fid_mutex);
- close_cached_dir_lease_locked(cfid);
- mutex_unlock(&cfid->fid_mutex);
-}
-
-void
-smb2_cached_lease_break(struct work_struct *work)
-{
- struct cached_fid *cfid = container_of(work,
- struct cached_fid, lease_break);
-
- close_cached_dir_lease(cfid);
-}
-
-/*
- * Open the and cache a directory handle.
- * Only supported for the root handle.
- * If error then *cfid is not initialized.
- */
-int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
- const char *path,
- struct cifs_sb_info *cifs_sb,
- struct cached_fid **cfid)
-{
- struct cifs_ses *ses;
- struct TCP_Server_Info *server;
- struct cifs_open_parms oparms;
- struct smb2_create_rsp *o_rsp = NULL;
- struct smb2_query_info_rsp *qi_rsp = NULL;
- int resp_buftype[2];
- struct smb_rqst rqst[2];
- struct kvec rsp_iov[2];
- struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
- struct kvec qi_iov[1];
- int rc, flags = 0;
- __le16 utf16_path = 0; /* Null - since an open of top of share */
- u8 oplock = SMB2_OPLOCK_LEVEL_II;
- struct cifs_fid *pfid;
- struct dentry *dentry;
-
- if (tcon == NULL || tcon->nohandlecache ||
- is_smb1_server(tcon->ses->server))
- return -ENOTSUPP;
-
- ses = tcon->ses;
- server = ses->server;
-
- if (cifs_sb->root == NULL)
- return -ENOENT;
-
- if (strlen(path))
- return -ENOENT;
-
- dentry = cifs_sb->root;
-
- mutex_lock(&tcon->crfid.fid_mutex);
- if (tcon->crfid.is_valid) {
- cifs_dbg(FYI, "found a cached root file handle\n");
- *cfid = &tcon->crfid;
- kref_get(&tcon->crfid.refcount);
- mutex_unlock(&tcon->crfid.fid_mutex);
- return 0;
- }
-
- /*
- * We do not hold the lock for the open because in case
- * SMB2_open needs to reconnect, it will end up calling
- * cifs_mark_open_files_invalid() which takes the lock again
- * thus causing a deadlock
- */
-
- mutex_unlock(&tcon->crfid.fid_mutex);
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
- if (!server->ops->new_lease_key)
- return -EIO;
-
- pfid = tcon->crfid.fid;
- server->ops->new_lease_key(pfid);
-
- memset(rqst, 0, sizeof(rqst));
- resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
- memset(rsp_iov, 0, sizeof(rsp_iov));
-
- /* Open */
- memset(&open_iov, 0, sizeof(open_iov));
- rqst[0].rq_iov = open_iov;
- rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
-
- oparms.tcon = tcon;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE);
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.fid = pfid;
- oparms.reconnect = false;
-
- rc = SMB2_open_init(tcon, server,
- &rqst[0], &oplock, &oparms, &utf16_path);
- if (rc)
- goto oshr_free;
- smb2_set_next_command(tcon, &rqst[0]);
-
- memset(&qi_iov, 0, sizeof(qi_iov));
- rqst[1].rq_iov = qi_iov;
- rqst[1].rq_nvec = 1;
-
- rc = SMB2_query_info_init(tcon, server,
- &rqst[1], COMPOUND_FID,
- COMPOUND_FID, FILE_ALL_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- sizeof(struct smb2_file_all_info) +
- PATH_MAX * 2, 0, NULL);
- if (rc)
- goto oshr_free;
-
- smb2_set_related(&rqst[1]);
-
- rc = compound_send_recv(xid, ses, server,
- flags, 2, rqst,
- resp_buftype, rsp_iov);
- mutex_lock(&tcon->crfid.fid_mutex);
-
- /*
- * Now we need to check again as the cached root might have
- * been successfully re-opened from a concurrent process
- */
-
- if (tcon->crfid.is_valid) {
- /* work was already done */
-
- /* stash fids for close() later */
- struct cifs_fid fid = {
- .persistent_fid = pfid->persistent_fid,
- .volatile_fid = pfid->volatile_fid,
- };
-
- /*
- * caller expects this func to set the fid in crfid to valid
- * cached root, so increment the refcount.
- */
- kref_get(&tcon->crfid.refcount);
-
- mutex_unlock(&tcon->crfid.fid_mutex);
-
- if (rc == 0) {
- /* close extra handle outside of crit sec */
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
- }
- rc = 0;
- goto oshr_free;
- }
-
- /* Cached root is still invalid, continue normaly */
-
- if (rc) {
- if (rc == -EREMCHG) {
- tcon->need_reconnect = true;
- pr_warn_once("server share %s deleted\n",
- tcon->treeName);
- }
- goto oshr_exit;
- }
-
- atomic_inc(&tcon->num_remote_opens);
-
- o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
- oparms.fid->persistent_fid = o_rsp->PersistentFileId;
- oparms.fid->volatile_fid = o_rsp->VolatileFileId;
-#ifdef CONFIG_CIFS_DEBUG2
- oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
-#endif /* CIFS_DEBUG2 */
-
- tcon->crfid.tcon = tcon;
- tcon->crfid.is_valid = true;
- tcon->crfid.dentry = dentry;
- dget(dentry);
- kref_init(&tcon->crfid.refcount);
-
- /* BB TBD check to see if oplock level check can be removed below */
- if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
- /*
- * See commit 2f94a3125b87. Increment the refcount when we
- * get a lease for root, release it if lease break occurs
- */
- kref_get(&tcon->crfid.refcount);
- tcon->crfid.has_lease = true;
- smb2_parse_contexts(server, o_rsp,
- &oparms.fid->epoch,
- oparms.fid->lease_key, &oplock,
- NULL, NULL);
- } else
- goto oshr_exit;
-
- qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
- goto oshr_exit;
- if (!smb2_validate_and_copy_iov(
- le16_to_cpu(qi_rsp->OutputBufferOffset),
- sizeof(struct smb2_file_all_info),
- &rsp_iov[1], sizeof(struct smb2_file_all_info),
- (char *)&tcon->crfid.file_all_info))
- tcon->crfid.file_all_info_is_valid = true;
- tcon->crfid.time = jiffies;
-
-
-oshr_exit:
- mutex_unlock(&tcon->crfid.fid_mutex);
-oshr_free:
- SMB2_open_free(&rqst[0]);
- SMB2_query_info_free(&rqst[1]);
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- if (rc == 0)
- *cfid = &tcon->crfid;
- return rc;
-}
-
-int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
- struct dentry *dentry,
- struct cached_fid **cfid)
-{
- mutex_lock(&tcon->crfid.fid_mutex);
- if (tcon->crfid.dentry == dentry) {
- cifs_dbg(FYI, "found a cached root file handle by dentry\n");
- *cfid = &tcon->crfid;
- kref_get(&tcon->crfid.refcount);
- mutex_unlock(&tcon->crfid.fid_mutex);
- return 0;
- }
- mutex_unlock(&tcon->crfid.fid_mutex);
- return -ENOENT;
-}
-
-static void
smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb)
{
@@ -1013,9 +720,9 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = open_cached_dir(xid, tcon, "", cifs_sb, &cfid);
+ rc = open_cached_dir(xid, tcon, "", cifs_sb, false, &cfid);
if (rc == 0)
- memcpy(&fid, cfid->fid, sizeof(struct cifs_fid));
+ memcpy(&fid, &cfid->fid, sizeof(struct cifs_fid));
else
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
NULL, NULL);
@@ -1076,9 +783,16 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
+ struct cached_fid *cfid;
- if ((*full_path == 0) && tcon->crfid.is_valid)
- return 0;
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid);
+ if (!rc) {
+ if (cfid->is_valid) {
+ close_cached_dir(cfid);
+ return 0;
+ }
+ close_cached_dir(cfid);
+ }
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
@@ -1145,9 +859,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
size_t name_len, value_len, user_name_len;
while (src_size > 0) {
- name = &src->ea_data[0];
name_len = (size_t)src->ea_name_length;
- value = &src->ea_data[src->ea_name_length + 1];
value_len = (size_t)le16_to_cpu(src->ea_value_length);
if (name_len == 0)
@@ -1159,6 +871,9 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
goto out;
}
+ name = &src->ea_data[0];
+ value = &src->ea_data[src->ea_name_length + 1];
+
if (ea_name) {
if (ea_name_len == name_len &&
memcmp(ea_name, name, name_len) == 0) {
@@ -1608,9 +1323,8 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
struct resume_key_req *res_key;
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
- FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
- NULL, 0 /* no input */, CIFSMaxBufSize,
- (char **)&res_key, &ret_data_len);
+ FSCTL_SRV_REQUEST_RESUME_KEY, NULL, 0 /* no input */,
+ CIFSMaxBufSize, (char **)&res_key, &ret_data_len);
if (rc == -EOPNOTSUPP) {
pr_warn_once("Server share %s does not support copy range\n", tcon->treeName);
@@ -1752,7 +1466,7 @@ smb2_ioctl_query_info(const unsigned int xid,
rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID,
- qi.info_type, true, buffer, qi.output_buffer_length,
+ qi.info_type, buffer, qi.output_buffer_length,
CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE);
free_req1_func = SMB2_ioctl_free;
@@ -1886,17 +1600,8 @@ smb2_copychunk_range(const unsigned int xid,
int chunks_copied = 0;
bool chunk_sizes_updated = false;
ssize_t bytes_written, total_bytes_written = 0;
- struct inode *inode;
pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
-
- /*
- * We need to flush all unwritten data before we can send the
- * copychunk ioctl to the server.
- */
- inode = d_inode(trgtfile->dentry);
- filemap_write_and_wait(inode->i_mapping);
-
if (pcchunk == NULL)
return -ENOMEM;
@@ -1928,9 +1633,8 @@ smb2_copychunk_range(const unsigned int xid,
retbuf = NULL;
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
- true /* is_fsctl */, (char *)pcchunk,
- sizeof(struct copychunk_ioctl), CIFSMaxBufSize,
- (char **)&retbuf, &ret_data_len);
+ (char *)pcchunk, sizeof(struct copychunk_ioctl),
+ CIFSMaxBufSize, (char **)&retbuf, &ret_data_len);
if (rc == 0) {
if (ret_data_len !=
sizeof(struct copychunk_ioctl_rsp)) {
@@ -2090,7 +1794,6 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
- true /* is_fctl */,
&setsparse, 1, CIFSMaxBufSize, NULL, NULL);
if (rc) {
tcon->broken_sparse_sup = true;
@@ -2173,7 +1876,6 @@ smb2_duplicate_extents(const unsigned int xid,
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid,
FSCTL_DUPLICATE_EXTENTS_TO_FILE,
- true /* is_fsctl */,
(char *)&dup_ext_buf,
sizeof(struct duplicate_extents_to_file),
CIFSMaxBufSize, NULL,
@@ -2208,7 +1910,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SET_INTEGRITY_INFORMATION,
- true /* is_fsctl */,
(char *)&integr_info,
sizeof(struct fsctl_set_integrity_information_req),
CIFSMaxBufSize, NULL,
@@ -2261,7 +1962,6 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SRV_ENUMERATE_SNAPSHOTS,
- true /* is_fsctl */,
NULL, 0 /* no input data */, max_response_size,
(char **)&retbuf,
&ret_data_len);
@@ -2574,7 +2274,6 @@ static void
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
- struct list_head *tmp, *tmp1;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -2582,12 +2281,12 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
return;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- list_for_each(tmp1, &ses->tcon_list) {
- tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
+ spin_lock(&tcon->tc_lock);
tcon->need_reconnect = true;
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
tcon->treeName);
@@ -2723,8 +2422,12 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
+ /*
+ * We can only call this for things we know are directories.
+ */
if (!strcmp(path, ""))
- open_cached_dir(xid, tcon, path, cifs_sb, &cfid); /* cfid null if open dir failed */
+ open_cached_dir(xid, tcon, path, cifs_sb, false,
+ &cfid); /* cfid null if open dir failed */
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
@@ -2750,8 +2453,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
if (cfid) {
rc = SMB2_query_info_init(tcon, server,
&rqst[1],
- cfid->fid->persistent_fid,
- cfid->fid->volatile_fid,
+ cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid,
class, type, 0,
output_len, 0,
NULL);
@@ -2981,7 +2684,6 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
do {
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_DFS_GET_REFERRALS,
- true /* is_fsctl */,
(char *)dfs_req, dfs_req_size, CIFSMaxBufSize,
(char **)&dfs_rsp, &dfs_rsp_size);
if (!is_retryable_error(rc))
@@ -3188,8 +2890,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl_init(tcon, server,
&rqst[1], fid.persistent_fid,
- fid.volatile_fid, FSCTL_GET_REPARSE_POINT,
- true /* is_fctl */, NULL, 0,
+ fid.volatile_fid, FSCTL_GET_REPARSE_POINT, NULL, 0,
CIFSMaxBufSize -
MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE);
@@ -3369,8 +3070,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl_init(tcon, server,
&rqst[1], COMPOUND_FID,
- COMPOUND_FID, FSCTL_GET_REPARSE_POINT,
- true /* is_fctl */, NULL, 0,
+ COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0,
CIFSMaxBufSize -
MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE);
@@ -3598,26 +3298,43 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb,
return pntsd;
}
+static long smb3_zero_data(struct file *file, struct cifs_tcon *tcon,
+ loff_t offset, loff_t len, unsigned int xid)
+{
+ struct cifsFileInfo *cfile = file->private_data;
+ struct file_zero_data_information fsctl_buf;
+
+ cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
+
+ fsctl_buf.FileOffset = cpu_to_le64(offset);
+ fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
+
+ return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+ (char *)&fsctl_buf,
+ sizeof(struct file_zero_data_information),
+ 0, NULL, NULL);
+}
+
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
{
struct cifs_ses *ses = tcon->ses;
- struct inode *inode;
- struct cifsInodeInfo *cifsi;
+ struct inode *inode = file_inode(file);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifsFileInfo *cfile = file->private_data;
- struct file_zero_data_information fsctl_buf;
long rc;
unsigned int xid;
__le64 eof;
xid = get_xid();
- inode = d_inode(cfile->dentry);
- cifsi = CIFS_I(inode);
-
trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
ses->Suid, offset, len);
+ inode_lock(inode);
+ filemap_invalidate_lock(inode->i_mapping);
+
/*
* We zero the range through ioctl, so we need remove the page caches
* first, otherwise the data may be inconsistent with the server.
@@ -3625,26 +3342,12 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
truncate_pagecache_range(inode, offset, offset + len - 1);
/* if file not oplocked can't be sure whether asking to extend size */
- if (!CIFS_CACHE_READ(cifsi))
- if (keep_size == false) {
- rc = -EOPNOTSUPP;
- trace_smb3_zero_err(xid, cfile->fid.persistent_fid,
- tcon->tid, ses->Suid, offset, len, rc);
- free_xid(xid);
- return rc;
- }
-
- cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
-
- fsctl_buf.FileOffset = cpu_to_le64(offset);
- fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
+ rc = -EOPNOTSUPP;
+ if (keep_size == false && !CIFS_CACHE_READ(cifsi))
+ goto zero_range_exit;
- rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true,
- (char *)&fsctl_buf,
- sizeof(struct file_zero_data_information),
- 0, NULL, NULL);
- if (rc)
+ rc = smb3_zero_data(file, tcon, offset, len, xid);
+ if (rc < 0)
goto zero_range_exit;
/*
@@ -3657,6 +3360,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
}
zero_range_exit:
+ filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
free_xid(xid);
if (rc)
trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
@@ -3670,7 +3375,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len)
{
- struct inode *inode;
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
long rc;
@@ -3679,14 +3384,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
xid = get_xid();
- inode = d_inode(cfile->dentry);
-
+ inode_lock(inode);
/* Need to make file sparse, if not already, before freeing range. */
/* Consider adding equivalent for compressed since it could also work */
if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) {
rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
+ goto out;
}
filemap_invalidate_lock(inode->i_mapping);
@@ -3703,11 +3406,13 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, (char *)&fsctl_buf,
+ (char *)&fsctl_buf,
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
- free_xid(xid);
filemap_invalidate_unlock(inode->i_mapping);
+out:
+ inode_unlock(inode);
+ free_xid(xid);
return rc;
}
@@ -3763,7 +3468,7 @@ static int smb3_simple_fallocate_range(unsigned int xid,
in_data.length = cpu_to_le64(len);
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
1024 * sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -3964,39 +3669,50 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
{
int rc;
unsigned int xid;
- struct inode *inode;
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
- struct cifsInodeInfo *cifsi;
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
__le64 eof;
+ loff_t old_eof;
xid = get_xid();
- inode = d_inode(cfile->dentry);
- cifsi = CIFS_I(inode);
+ inode_lock(inode);
- if (off >= i_size_read(inode) ||
- off + len >= i_size_read(inode)) {
+ old_eof = i_size_read(inode);
+ if ((off >= old_eof) ||
+ off + len >= old_eof) {
rc = -EINVAL;
goto out;
}
+ filemap_invalidate_lock(inode->i_mapping);
+ rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof - 1);
+ if (rc < 0)
+ goto out_2;
+
+ truncate_pagecache_range(inode, off, old_eof);
+
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
- i_size_read(inode) - off - len, off);
+ old_eof - off - len, off);
if (rc < 0)
- goto out;
+ goto out_2;
- eof = cpu_to_le64(i_size_read(inode) - len);
+ eof = cpu_to_le64(old_eof - len);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
if (rc < 0)
- goto out;
+ goto out_2;
rc = 0;
cifsi->server_eof = i_size_read(inode) - len;
truncate_setsize(inode, cifsi->server_eof);
fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
+out_2:
+ filemap_invalidate_unlock(inode->i_mapping);
out:
+ inode_unlock(inode);
free_xid(xid);
return rc;
}
@@ -4007,34 +3723,47 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
int rc;
unsigned int xid;
struct cifsFileInfo *cfile = file->private_data;
+ struct inode *inode = file_inode(file);
__le64 eof;
- __u64 count;
+ __u64 count, old_eof;
xid = get_xid();
- if (off >= i_size_read(file->f_inode)) {
+ inode_lock(inode);
+
+ old_eof = i_size_read(inode);
+ if (off >= old_eof) {
rc = -EINVAL;
goto out;
}
- count = i_size_read(file->f_inode) - off;
- eof = cpu_to_le64(i_size_read(file->f_inode) + len);
+ count = old_eof - off;
+ eof = cpu_to_le64(old_eof + len);
+
+ filemap_invalidate_lock(inode->i_mapping);
+ rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof + len - 1);
+ if (rc < 0)
+ goto out_2;
+ truncate_pagecache_range(inode, off, old_eof);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
if (rc < 0)
- goto out;
+ goto out_2;
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
if (rc < 0)
- goto out;
+ goto out_2;
- rc = smb3_zero_range(file, tcon, off, len, 1);
+ rc = smb3_zero_data(file, tcon, off, len, xid);
if (rc < 0)
- goto out;
+ goto out_2;
rc = 0;
+out_2:
+ filemap_invalidate_unlock(inode->i_mapping);
out:
+ inode_unlock(inode);
free_xid(xid);
return rc;
}
@@ -4084,7 +3813,7 @@ static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -4144,7 +3873,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
1024 * sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -4563,9 +4292,11 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
if (ses->Suid == ses_id) {
+ spin_lock(&ses->ses_lock);
ses_enc_key = enc ? ses->smb3encryptionkey :
ses->smb3decryptionkey;
memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+ spin_unlock(&ses->ses_lock);
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
@@ -5080,23 +4811,24 @@ static void smb2_decrypt_offload(struct work_struct *work)
mid->callback(mid);
} else {
- spin_lock(&cifs_tcp_ses_lock);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&dw->server->srv_lock);
if (dw->server->tcpStatus == CifsNeedReconnect) {
+ spin_lock(&dw->server->mid_lock);
mid->mid_state = MID_RETRY_NEEDED;
- spin_unlock(&GlobalMid_Lock);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&dw->server->mid_lock);
+ spin_unlock(&dw->server->srv_lock);
mid->callback(mid);
} else {
+ spin_lock(&dw->server->mid_lock);
mid->mid_state = MID_REQUEST_SUBMITTED;
mid->mid_flags &= ~(MID_DELETED);
list_add_tail(&mid->qhead,
&dw->server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&dw->server->mid_lock);
+ spin_unlock(&dw->server->srv_lock);
}
}
- cifs_mid_q_entry_release(mid);
+ release_mid(mid);
}
free_pages:
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index c705de32e225..6352ab32c7e7 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -39,6 +39,7 @@
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
+#include "cached_dir.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -162,7 +163,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL)
return 0;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_EXITING) {
/*
* only tree disconnect, open, and write,
@@ -172,13 +173,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
if ((smb2_command != SMB2_WRITE) &&
(smb2_command != SMB2_CREATE) &&
(smb2_command != SMB2_TREE_DISCONNECT)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
cifs_dbg(FYI, "can not send cmd %d while umounting\n",
smb2_command);
return -ENODEV;
}
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
if ((!tcon->ses) || (tcon->ses->ses_status == SES_EXITING) ||
(!tcon->ses->server) || !server)
return -EIO;
@@ -217,12 +218,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
}
/* are we still trying to reconnect? */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
break;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (retries && --retries)
continue;
@@ -256,13 +257,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = -EHOSTDOWN;
goto out;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/*
* need to prevent multiple threads trying to simultaneously
@@ -354,7 +355,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
void *buf,
unsigned int *total_len)
{
- struct smb2_pdu *spdu = (struct smb2_pdu *)buf;
+ struct smb2_pdu *spdu = buf;
/* lookup word count ie StructureSize from table */
__u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)];
@@ -964,16 +965,17 @@ SMB2_negotiate(const unsigned int xid,
} else if (rc != 0)
goto neg_exit;
+ rc = -EIO;
if (strcmp(server->vals->version_string,
SMB3ANY_VERSION_STRING) == 0) {
if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) {
cifs_server_dbg(VFS,
"SMB2 dialect returned but not requested\n");
- return -EIO;
+ goto neg_exit;
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
cifs_server_dbg(VFS,
"SMB2.1 dialect returned but not requested\n");
- return -EIO;
+ goto neg_exit;
} else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
server->ops = &smb311_operations;
@@ -984,7 +986,7 @@ SMB2_negotiate(const unsigned int xid,
if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) {
cifs_server_dbg(VFS,
"SMB2 dialect returned but not requested\n");
- return -EIO;
+ goto neg_exit;
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
server->ops = &smb21_operations;
@@ -998,7 +1000,7 @@ SMB2_negotiate(const unsigned int xid,
/* if requested single dialect ensure returned dialect matched */
cifs_server_dbg(VFS, "Invalid 0x%x dialect returned: not requested\n",
le16_to_cpu(rsp->DialectRevision));
- return -EIO;
+ goto neg_exit;
}
cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode);
@@ -1016,9 +1018,10 @@ SMB2_negotiate(const unsigned int xid,
else {
cifs_server_dbg(VFS, "Invalid dialect returned by server 0x%x\n",
le16_to_cpu(rsp->DialectRevision));
- rc = -EIO;
goto neg_exit;
}
+
+ rc = 0;
server->dialect = le16_to_cpu(rsp->DialectRevision);
/*
@@ -1172,7 +1175,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
}
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
- FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
+ FSCTL_VALIDATE_NEGOTIATE_INFO,
(char *)pneg_inbuf, inbuflen, CIFSMaxBufSize,
(char **)&pneg_rsp, &rsplen);
if (rc == -EOPNOTSUPP) {
@@ -1927,7 +1930,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);
- strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
+ strscpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
@@ -1978,7 +1981,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
}
spin_unlock(&ses->chan_lock);
- close_cached_dir_lease(&tcon->crfid);
+ invalidate_all_cached_dirs(tcon);
rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server,
(void **) &req,
@@ -2571,19 +2574,15 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
path_len = UniStrnlen((wchar_t *)path, PATH_MAX);
- /*
- * make room for one path separator between the treename and
- * path
- */
- *out_len = treename_len + 1 + path_len;
+ /* make room for one path separator only if @path isn't empty */
+ *out_len = treename_len + (path[0] ? 1 : 0) + path_len;
/*
- * final path needs to be null-terminated UTF16 with a
- * size aligned to 8
+ * final path needs to be 8-byte aligned as specified in
+ * MS-SMB2 2.2.13 SMB2 CREATE Request.
*/
-
- *out_size = roundup((*out_len+1)*2, 8);
- *out_path = kzalloc(*out_size, GFP_KERNEL);
+ *out_size = roundup(*out_len * sizeof(__le16), 8);
+ *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL);
if (!*out_path)
return -ENOMEM;
@@ -3055,7 +3054,7 @@ int
SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen,
+ char *in_data, u32 indatalen,
__u32 max_response_size)
{
struct smb2_ioctl_req *req;
@@ -3130,10 +3129,8 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
req->hdr.CreditCharge =
cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
SMB2_MAX_BUFFER_SIZE));
- if (is_fsctl)
- req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
- else
- req->Flags = 0;
+ /* always an FSCTL (for now) */
+ req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
@@ -3160,9 +3157,9 @@ SMB2_ioctl_free(struct smb_rqst *rqst)
*/
int
SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 opcode, bool is_fsctl,
- char *in_data, u32 indatalen, u32 max_out_data_len,
- char **out_data, u32 *plen /* returned data len */)
+ u64 volatile_fid, u32 opcode, char *in_data, u32 indatalen,
+ u32 max_out_data_len, char **out_data,
+ u32 *plen /* returned data len */)
{
struct smb_rqst rqst;
struct smb2_ioctl_rsp *rsp = NULL;
@@ -3204,7 +3201,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
rc = SMB2_ioctl_init(tcon, server,
&rqst, persistent_fid, volatile_fid, opcode,
- is_fsctl, in_data, indatalen, max_out_data_len);
+ in_data, indatalen, max_out_data_len);
if (rc)
goto ioctl_exit;
@@ -3296,7 +3293,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
- FSCTL_SET_COMPRESSION, true /* is_fsctl */,
+ FSCTL_SET_COMPRESSION,
(char *)&fsctl_input /* data input */,
2 /* in data len */, CIFSMaxBufSize /* max out data */,
&ret_data /* out data */, NULL);
@@ -3776,7 +3773,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
credits.instance = server->reconnect_instance;
}
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, CIFS_ECHO_OP);
}
@@ -3911,15 +3908,15 @@ SMB2_echo(struct TCP_Server_Info *server)
cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->ops->need_neg &&
server->ops->need_neg(server)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/* No need to send echo on newly established connections */
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
return rc;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = smb2_plain_req_init(SMB2_ECHO, NULL, server,
(void **)&req, &total_len);
@@ -4201,7 +4198,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
rdata->offset, rdata->got_bytes);
queue_work(cifsiod_wq, &rdata->work);
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, 0);
}
@@ -4440,7 +4437,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->offset, wdata->bytes);
queue_work(cifsiod_wq, &wdata->work);
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, 0);
}
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index a69f1eed1cfe..3f740f24b96a 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -23,7 +23,7 @@ struct smb_rqst;
extern int map_smb2_to_linux_error(char *buf, bool log_err);
extern int smb2_check_message(char *buf, unsigned int length,
struct TCP_Server_Info *server);
-extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
+extern unsigned int smb2_calc_size(void *buf);
extern char *smb2_get_data_area_len(int *off, int *len,
struct smb2_hdr *shdr);
extern __le16 *cifs_convert_path_to_utf16(const char *from,
@@ -54,16 +54,6 @@ extern bool smb2_is_valid_oplock_break(char *buffer,
extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
-extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
- const char *path,
- struct cifs_sb_info *cifs_sb,
- struct cached_fid **cfid);
-extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
- struct dentry *dentry,
- struct cached_fid **cfid);
-extern void close_cached_dir(struct cached_fid *cfid);
-extern void close_cached_dir_lease(struct cached_fid *cfid);
-extern void close_cached_dir_lease_locked(struct cached_fid *cfid);
extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
struct smb2_file_all_info *src);
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
@@ -147,13 +137,13 @@ extern int SMB2_open_init(struct cifs_tcon *tcon,
extern void SMB2_open_free(struct smb_rqst *rqst);
extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen,
+ char *in_data, u32 indatalen, u32 maxoutlen,
char **out_data, u32 *plen /* returned data len */);
extern int SMB2_ioctl_init(struct cifs_tcon *tcon,
struct TCP_Server_Info *server,
struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen,
+ char *in_data, u32 indatalen,
__u32 max_response_size);
extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 55e79f6ee78d..1a5fc3314dbf 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -640,13 +640,13 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!is_signed)
return 0;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->ops->need_neg &&
server->ops->need_neg(server)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return 0;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (!is_binding && !server->session_estab) {
strncpy(shdr->Signature, "BSRSPYL", 8);
return 0;
@@ -750,7 +750,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
temp->callback = cifs_wake_up_task;
temp->callback_data = current;
- atomic_inc(&midCount);
+ atomic_inc(&mid_count);
temp->mid_state = MID_REQUEST_ALLOCATED;
trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId),
le64_to_cpu(shdr->SessionId),
@@ -762,28 +762,30 @@ static int
smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
struct smb2_hdr *shdr, struct mid_q_entry **mid)
{
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
if (server->tcpStatus == CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
return -EAGAIN;
}
if (server->tcpStatus == CifsNeedNegotiate &&
shdr->Command != SMB2_NEGOTIATE) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -EAGAIN;
}
+ spin_unlock(&server->srv_lock);
+ spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_NEW) {
if ((shdr->Command != SMB2_SESSION_SETUP) &&
(shdr->Command != SMB2_NEGOTIATE)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return -EAGAIN;
}
/* else ok - we are setting up session */
@@ -791,19 +793,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
if (ses->ses_status == SES_EXITING) {
if (shdr->Command != SMB2_LOGOFF) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return -EAGAIN;
}
/* else ok - we are shutting down the session */
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
*mid = smb2_mid_entry_alloc(shdr, server);
if (*mid == NULL)
return -ENOMEM;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_add_tail(&(*mid)->qhead, &server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return 0;
}
@@ -854,7 +856,7 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server,
rc = smb2_sign_rqst(rqst, server);
if (rc) {
revert_current_mid_from_hdr(server, shdr);
- cifs_delete_mid(mid);
+ delete_mid(mid);
return ERR_PTR(rc);
}
@@ -869,13 +871,13 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
(struct smb2_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedNegotiate &&
shdr->Command != SMB2_NEGOTIATE) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return ERR_PTR(-EAGAIN);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
smb2_seq_num_into_buf(server, shdr);
@@ -888,7 +890,7 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
rc = smb2_sign_rqst(rqst, server);
if (rc) {
revert_current_mid_from_hdr(server, shdr);
- DeleteMidQEntry(mid);
+ release_mid(mid);
return ERR_PTR(rc);
}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bfc9bd55870a..c2fe035e573b 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -21,6 +21,7 @@
#include <asm/processor.h>
#include <linux/mempool.h>
#include <linux/sched/signal.h>
+#include <linux/task_io_accounting_ops.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -37,13 +38,13 @@ cifs_wake_up_task(struct mid_q_entry *mid)
wake_up_process(mid->callback_data);
}
-struct mid_q_entry *
-AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
+static struct mid_q_entry *
+alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
if (server == NULL) {
- cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n");
+ cifs_dbg(VFS, "%s: null TCP session\n", __func__);
return NULL;
}
@@ -68,12 +69,12 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
temp->callback = cifs_wake_up_task;
temp->callback_data = current;
- atomic_inc(&midCount);
+ atomic_inc(&mid_count);
temp->mid_state = MID_REQUEST_ALLOCATED;
return temp;
}
-static void _cifs_mid_q_entry_release(struct kref *refcount)
+static void __release_mid(struct kref *refcount)
{
struct mid_q_entry *midEntry =
container_of(refcount, struct mid_q_entry, refcount);
@@ -91,7 +92,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount)
server->ops->handle_cancelled_mid(midEntry, server);
midEntry->mid_state = MID_FREE;
- atomic_dec(&midCount);
+ atomic_dec(&mid_count);
if (midEntry->large_buf)
cifs_buf_release(midEntry->resp_buf);
else
@@ -152,29 +153,26 @@ static void _cifs_mid_q_entry_release(struct kref *refcount)
mempool_free(midEntry, cifs_mid_poolp);
}
-void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
+void release_mid(struct mid_q_entry *mid)
{
- spin_lock(&GlobalMid_Lock);
- kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
- spin_unlock(&GlobalMid_Lock);
-}
+ struct TCP_Server_Info *server = mid->server;
-void DeleteMidQEntry(struct mid_q_entry *midEntry)
-{
- cifs_mid_q_entry_release(midEntry);
+ spin_lock(&server->mid_lock);
+ kref_put(&mid->refcount, __release_mid);
+ spin_unlock(&server->mid_lock);
}
void
-cifs_delete_mid(struct mid_q_entry *mid)
+delete_mid(struct mid_q_entry *mid)
{
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&mid->server->mid_lock);
if (!(mid->mid_flags & MID_DELETED)) {
list_del_init(&mid->qhead);
mid->mid_flags |= MID_DELETED;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&mid->server->mid_lock);
- DeleteMidQEntry(mid);
+ release_mid(mid);
}
/*
@@ -263,8 +261,8 @@ smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst)
int nvec;
unsigned long buflen = 0;
- if (server->vals->header_preamble_size == 0 &&
- rqst->rq_nvec >= 2 && rqst->rq_iov[0].iov_len == 4) {
+ if (!is_smb1(server) && rqst->rq_nvec >= 2 &&
+ rqst->rq_iov[0].iov_len == 4) {
iov = &rqst->rq_iov[1];
nvec = rqst->rq_nvec - 1;
} else {
@@ -348,7 +346,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
sigprocmask(SIG_BLOCK, &mask, &oldmask);
/* Generate a rfc1002 marker for SMB2+ */
- if (server->vals->header_preamble_size == 0) {
+ if (!is_smb1(server)) {
struct kvec hiov = {
.iov_base = &rfc1002_marker,
.iov_len = 4
@@ -577,12 +575,12 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
} else {
spin_unlock(&server->req_lock);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/*
* For normal commands, reserve the last MAX_COMPOUND
@@ -725,11 +723,11 @@ cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
struct mid_q_entry **ppmidQ)
{
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
if (ses->ses_status == SES_NEW) {
if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
(in_buf->Command != SMB_COM_NEGOTIATE)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return -EAGAIN;
}
/* else ok - we are setting up session */
@@ -738,19 +736,19 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
if (ses->ses_status == SES_EXITING) {
/* check if SMB session is bad because we are setting it up */
if (in_buf->Command != SMB_COM_LOGOFF_ANDX) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return -EAGAIN;
}
/* else ok - we are shutting down session */
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
- *ppmidQ = AllocMidQEntry(in_buf, ses->server);
+ *ppmidQ = alloc_mid(in_buf, ses->server);
if (*ppmidQ == NULL)
return -ENOMEM;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&ses->server->mid_lock);
list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&ses->server->mid_lock);
return 0;
}
@@ -782,13 +780,13 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
if (server->sign)
hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
- mid = AllocMidQEntry(hdr, server);
+ mid = alloc_mid(hdr, server);
if (mid == NULL)
return ERR_PTR(-ENOMEM);
rc = cifs_sign_rqst(rqst, server, &mid->sequence_number);
if (rc) {
- DeleteMidQEntry(mid);
+ release_mid(mid);
return ERR_PTR(rc);
}
@@ -849,9 +847,9 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid->mid_state = MID_REQUEST_SUBMITTED;
/* put it on the pending_mid_q */
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_add_tail(&mid->qhead, &server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
/*
* Need to store the time in mid before calling I/O. For call_async,
@@ -865,7 +863,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
if (rc < 0) {
revert_current_mid(server, mid->credits);
server->sequence_number -= 2;
- cifs_delete_mid(mid);
+ delete_mid(mid);
}
cifs_server_unlock(server);
@@ -912,10 +910,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n",
__func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return rc;
case MID_RETRY_NEEDED:
rc = -EAGAIN;
@@ -935,9 +933,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
__func__, mid->mid, mid->mid_state);
rc = -EIO;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
- DeleteMidQEntry(mid);
+ release_mid(mid);
return rc;
}
@@ -997,7 +995,7 @@ cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored,
return ERR_PTR(rc);
rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number);
if (rc) {
- cifs_delete_mid(mid);
+ delete_mid(mid);
return ERR_PTR(rc);
}
return mid;
@@ -1026,7 +1024,7 @@ static void
cifs_cancelled_callback(struct mid_q_entry *mid)
{
cifs_compound_callback(mid);
- DeleteMidQEntry(mid);
+ release_mid(mid);
}
/*
@@ -1078,12 +1076,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/*
* Wait for all the requests to become available.
@@ -1130,7 +1128,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
if (IS_ERR(midQ[i])) {
revert_current_mid(server, i);
for (j = 0; j < i; j++)
- cifs_delete_mid(midQ[j]);
+ delete_mid(midQ[j]);
cifs_server_unlock(server);
/* Update # of requests on wire to server */
@@ -1186,17 +1184,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
cifs_server_lock(server);
smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec);
cifs_server_unlock(server);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
for (i = 0; i < num_rqst; i++) {
rc = wait_for_response(server, midQ[i]);
@@ -1208,14 +1206,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
}
}
@@ -1240,7 +1238,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
buf = (char *)midQ[i]->resp_buf;
resp_iov[i].iov_base = buf;
resp_iov[i].iov_len = midQ[i]->resp_buf_size +
- server->vals->header_preamble_size;
+ HEADER_PREAMBLE_SIZE(server);
if (midQ[i]->large_buf)
resp_buf_type[i] = CIFS_LARGE_BUFFER;
@@ -1250,7 +1248,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
rc = server->ops->check_receive(midQ[i], server,
flags & CIFS_LOG_ERROR);
- /* mark it so buf will not be freed by cifs_delete_mid */
+ /* mark it so buf will not be freed by delete_mid */
if ((flags & CIFS_NO_RSP_BUF) == 0)
midQ[i]->resp_buf = NULL;
@@ -1259,19 +1257,19 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
struct kvec iov = {
.iov_base = resp_iov[0].iov_base,
.iov_len = resp_iov[0].iov_len
};
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
cifs_server_lock(server);
smb311_update_preauth_hash(ses, server, &iov, 1);
cifs_server_unlock(server);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
out:
/*
@@ -1282,7 +1280,7 @@ out:
*/
for (i = 0; i < num_rqst; i++) {
if (!cancelled_mid[i])
- cifs_delete_mid(midQ[i]);
+ delete_mid(midQ[i]);
}
return rc;
@@ -1360,12 +1358,12 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/* Ensure that we do not send more than 50 overlapping requests
to the same server. We may make this configurable later or
@@ -1419,15 +1417,15 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = wait_for_response(server, midQ);
if (rc != 0) {
send_cancel(server, &rqst, midQ);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
- midQ->callback = DeleteMidQEntry;
- spin_unlock(&GlobalMid_Lock);
+ midQ->callback = release_mid;
+ spin_unlock(&server->mid_lock);
add_credits(server, &credits, 0);
return rc;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
}
rc = cifs_sync_mid_result(midQ, server);
@@ -1447,7 +1445,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
rc = cifs_check_receive(midQ, server, 0);
out:
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
add_credits(server, &credits, 0);
return rc;
@@ -1505,12 +1503,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/* Ensure that we do not send more than 50 overlapping requests
to the same server. We may make this configurable later or
@@ -1540,7 +1538,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
if (rc) {
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
cifs_server_unlock(server);
return rc;
}
@@ -1557,7 +1555,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
cifs_server_unlock(server);
if (rc < 0) {
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
return rc;
}
@@ -1568,19 +1566,19 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
(server->tcpStatus != CifsNew)));
/* Were we interrupted by a signal ? */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if ((rc == -ERESTARTSYS) &&
(midQ->mid_state == MID_REQUEST_SUBMITTED) &&
((server->tcpStatus == CifsGood) ||
(server->tcpStatus == CifsNew))) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (in_buf->Command == SMB_COM_TRANSACTION2) {
/* POSIX lock. We send a NT_CANCEL SMB to cause the
blocking lock to return. */
rc = send_cancel(server, &rqst, midQ);
if (rc) {
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
return rc;
}
} else {
@@ -1592,7 +1590,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
/* If we get -ENOLCK back the lock may have
already been removed. Don't exit in this case. */
if (rc && rc != -ENOLCK) {
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
return rc;
}
}
@@ -1600,21 +1598,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
rc = wait_for_response(server, midQ);
if (rc) {
send_cancel(server, &rqst, midQ);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
- midQ->callback = DeleteMidQEntry;
- spin_unlock(&GlobalMid_Lock);
+ midQ->callback = release_mid;
+ spin_unlock(&server->mid_lock);
return rc;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
}
/* We got the response - restart system call. */
rstart = 1;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = cifs_sync_mid_result(midQ, server);
if (rc != 0)
@@ -1631,8 +1629,185 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
rc = cifs_check_receive(midQ, server, 0);
out:
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
if (rstart && rc == -EACCES)
return -ERESTARTSYS;
return rc;
}
+
+/*
+ * Discard any remaining data in the current SMB. To do this, we borrow the
+ * current bigbuf.
+ */
+int
+cifs_discard_remaining_data(struct TCP_Server_Info *server)
+{
+ unsigned int rfclen = server->pdu_size;
+ int remaining = rfclen + HEADER_PREAMBLE_SIZE(server) -
+ server->total_read;
+
+ while (remaining > 0) {
+ int length;
+
+ length = cifs_discard_from_socket(server,
+ min_t(size_t, remaining,
+ CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ remaining -= length;
+ }
+
+ return 0;
+}
+
+static int
+__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
+ bool malformed)
+{
+ int length;
+
+ length = cifs_discard_remaining_data(server);
+ dequeue_mid(mid, malformed);
+ mid->resp_buf = server->smallbuf;
+ server->smallbuf = NULL;
+ return length;
+}
+
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ return __cifs_readv_discard(server, mid, rdata->result);
+}
+
+int
+cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length, len;
+ unsigned int data_offset, data_len;
+ struct cifs_readdata *rdata = mid->callback_data;
+ char *buf = server->smallbuf;
+ unsigned int buflen = server->pdu_size + HEADER_PREAMBLE_SIZE(server);
+ bool use_rdma_mr = false;
+
+ cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
+ __func__, mid->mid, rdata->offset, rdata->bytes);
+
+ /*
+ * read the rest of READ_RSP header (sans Data array), or whatever we
+ * can if there's not enough data. At this point, we've read down to
+ * the Mid.
+ */
+ len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
+ HEADER_SIZE(server) + 1;
+
+ length = cifs_read_from_socket(server,
+ buf + HEADER_SIZE(server) - 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+
+ if (server->ops->is_session_expired &&
+ server->ops->is_session_expired(buf)) {
+ cifs_reconnect(server, true);
+ return -1;
+ }
+
+ if (server->ops->is_status_pending &&
+ server->ops->is_status_pending(buf, server)) {
+ cifs_discard_remaining_data(server);
+ return -1;
+ }
+
+ /* set up first two iov for signature check and to get credits */
+ rdata->iov[0].iov_base = buf;
+ rdata->iov[0].iov_len = HEADER_PREAMBLE_SIZE(server);
+ rdata->iov[1].iov_base = buf + HEADER_PREAMBLE_SIZE(server);
+ rdata->iov[1].iov_len =
+ server->total_read - HEADER_PREAMBLE_SIZE(server);
+ cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+ rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+ cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+ rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
+ /* Was the SMB read successful? */
+ rdata->result = server->ops->map_error(buf, false);
+ if (rdata->result != 0) {
+ cifs_dbg(FYI, "%s: server returned error %d\n",
+ __func__, rdata->result);
+ /* normal error on read response */
+ return __cifs_readv_discard(server, mid, false);
+ }
+
+ /* Is there enough to get to the rest of the READ_RSP header? */
+ if (server->total_read < server->vals->read_rsp_size) {
+ cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
+ __func__, server->total_read,
+ server->vals->read_rsp_size);
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ data_offset = server->ops->read_data_offset(buf) +
+ HEADER_PREAMBLE_SIZE(server);
+ if (data_offset < server->total_read) {
+ /*
+ * win2k8 sometimes sends an offset of 0 when the read
+ * is beyond the EOF. Treat it as if the data starts just after
+ * the header.
+ */
+ cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
+ __func__, data_offset);
+ data_offset = server->total_read;
+ } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
+ /* data_offset is beyond the end of smallbuf */
+ cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
+ __func__, data_offset);
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
+ __func__, server->total_read, data_offset);
+
+ len = data_offset - server->total_read;
+ if (len > 0) {
+ /* read any junk before data into the rest of smallbuf */
+ length = cifs_read_from_socket(server,
+ buf + server->total_read, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ }
+
+ /* how much data is in the response? */
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ use_rdma_mr = rdata->mr;
+#endif
+ data_len = server->ops->read_data_length(buf, use_rdma_mr);
+ if (!use_rdma_mr && (data_offset + data_len > buflen)) {
+ /* data_len is corrupt -- discard frame */
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ length = rdata->read_into_pages(server, rdata, data_len);
+ if (length < 0)
+ return length;
+
+ server->total_read += length;
+
+ cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
+ server->total_read, buflen, data_len);
+
+ /* discard anything left over */
+ if (server->total_read < buflen)
+ return cifs_readv_discard(server, mid);
+
+ dequeue_mid(mid, false);
+ mid->resp_buf = server->smallbuf;
+ server->smallbuf = NULL;
+ return length;
+}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 9d486fbbfbbd..998fa51f9b68 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -201,6 +201,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
break;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
case XATTR_ACL_ACCESS:
#ifdef CONFIG_CIFS_POSIX
if (!value)
@@ -224,6 +225,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
cifs_remap(cifs_sb));
#endif /* CONFIG_CIFS_POSIX */
break;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
out:
@@ -364,7 +366,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
}
break;
}
-
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
case XATTR_ACL_ACCESS:
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & SB_POSIXACL)
@@ -384,6 +386,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
cifs_remap(cifs_sb));
#endif /* CONFIG_CIFS_POSIX */
break;
+#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
/* We could add an additional check for streams ie
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 14e0ef5e9a20..12bd61d20f69 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -86,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
/**
* fscrypt_fname_encrypt() - encrypt a filename
* @inode: inode of the parent directory (for regular filenames)
- * or of the symlink (for symlink targets)
+ * or of the symlink (for symlink targets). Key must already be
+ * set up.
* @iname: the filename to encrypt
* @out: (output) the encrypted filename
* @olen: size of the encrypted filename. It must be at least @iname->len.
@@ -137,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
return 0;
}
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);
/**
* fname_decrypt() - decrypt a filename
@@ -264,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
return bp - dst;
}
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
- u32 orig_len, u32 max_len,
- u32 *encrypted_len_ret)
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret)
{
int padding = 4 << (fscrypt_policy_flags(policy) &
FSCRYPT_POLICY_FLAGS_PAD_MASK);
@@ -281,6 +283,29 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
}
/**
+ * fscrypt_fname_encrypted_size() - calculate length of encrypted filename
+ * @inode: parent inode of dentry name being encrypted. Key must
+ * already be set up.
+ * @orig_len: length of the original filename
+ * @max_len: maximum length to return
+ * @encrypted_len_ret: where calculated length should be returned (on success)
+ *
+ * Filenames that are shorter than the maximum length may have their lengths
+ * increased slightly by encryption, due to padding that is applied.
+ *
+ * Return: false if the orig_len is greater than max_len. Otherwise, true and
+ * fill out encrypted_len_ret with the length (up to max_len).
+ */
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+ u32 max_len, u32 *encrypted_len_ret)
+{
+ return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
+ orig_len, max_len,
+ encrypted_len_ret);
+}
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
+
+/**
* fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
* @max_encrypted_len: maximum length of encrypted filenames the buffer will be
* used to present
@@ -435,8 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return ret;
if (fscrypt_has_encryption_key(dir)) {
- if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
- iname->len, NAME_MAX,
+ if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index f5be777d8279..3afdaa084773 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -297,14 +297,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci);
/* fname.c */
-int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
- u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
- u32 orig_len, u32 max_len,
- u32 *encrypted_len_ret);
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret);
/* hkdf.c */
-
struct fscrypt_hkdf {
struct crypto_shash *hmac_tfm;
};
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index af74599ae1cf..7c01025879b3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -228,9 +228,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target,
* counting it (even though it is meaningless for ciphertext) is simpler
* for now since filesystems will assume it is there and subtract it.
*/
- if (!fscrypt_fname_encrypted_size(policy, len,
- max_len - sizeof(struct fscrypt_symlink_data),
- &disk_link->len))
+ if (!__fscrypt_fname_encrypted_size(policy, len,
+ max_len - sizeof(struct fscrypt_symlink_data),
+ &disk_link->len))
return -ENAMETOOLONG;
disk_link->len += sizeof(struct fscrypt_symlink_data);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 8a054e6d1e68..80b8ca0f340b 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -694,6 +694,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
}
/**
+ * fscrypt_context_for_new_inode() - create an encryption context for a new inode
+ * @ctx: where context should be written
+ * @inode: inode from which to fetch policy and nonce
+ *
+ * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode,
+ * generate a new context and write it to ctx. ctx _must_ be at least
+ * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes.
+ *
+ * Return: size of the resulting context or a negative error code.
+ */
+int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
+{
+ struct fscrypt_info *ci = inode->i_crypt_info;
+
+ BUILD_BUG_ON(sizeof(union fscrypt_context) !=
+ FSCRYPT_SET_CONTEXT_MAX_SIZE);
+
+ /* fscrypt_prepare_new_inode() should have set up the key already. */
+ if (WARN_ON_ONCE(!ci))
+ return -ENOKEY;
+
+ return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce);
+}
+EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
+
+/**
* fscrypt_set_context() - Set the fscrypt context of a new inode
* @inode: a new inode
* @fs_data: private data given by FS and passed to ->set_context()
@@ -709,12 +735,9 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
union fscrypt_context ctx;
int ctxsize;
- /* fscrypt_prepare_new_inode() should have set up the key already. */
- if (WARN_ON_ONCE(!ci))
- return -ENOKEY;
-
- BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
- ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+ ctxsize = fscrypt_context_for_new_inode(&ctx, inode);
+ if (ctxsize < 0)
+ return ctxsize;
/*
* This may be the first time the inode number is available, so do any
diff --git a/fs/dax.c b/fs/dax.c
index 649ff51c9a26..c440dcef4b1b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
+static inline bool dax_mapping_is_cow(struct address_space *mapping)
+{
+ return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+}
+
/*
- * TODO: for reflink+dax we need a way to associate a single page with
- * multiple address_space instances at different linear_page_index()
- * offsets.
+ * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ */
+static inline void dax_mapping_set_cow(struct page *page)
+{
+ if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+ /*
+ * Reset the index if the page was already mapped
+ * regularly before.
+ */
+ if (page->mapping)
+ page->index = 1;
+ page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+ }
+ page->index++;
+}
+
+/*
+ * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * whether this entry is shared by multiple files. If so, set the page->mapping
+ * FS_DAX_MAPPING_COW, and use page->index as refcount.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool cow)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
+ if (cow) {
+ dax_mapping_set_cow(page);
+ } else {
+ WARN_ON_ONCE(page->mapping);
+ page->mapping = mapping;
+ page->index = index + i++;
+ }
}
}
@@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+ if (dax_mapping_is_cow(page->mapping)) {
+ /* keep the CoW flag if this page is still shared */
+ if (page->index-- > 0)
+ continue;
+ } else
+ WARN_ON_ONCE(page->mapping && page->mapping != mapping);
page->mapping = NULL;
page->index = 0;
}
@@ -456,6 +487,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
}
/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
+ * @mapping: the file's mapping whose entry we want to lock
+ * @index: the offset within this file
+ * @page: output the dax page corresponding to this dax entry
+ *
+ * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
+ * could not be locked.
+ */
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
+ struct page **page)
+{
+ XA_STATE(xas, NULL, 0);
+ void *entry;
+
+ rcu_read_lock();
+ for (;;) {
+ entry = NULL;
+ if (!dax_mapping(mapping))
+ break;
+
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
+ xas_set(&xas, index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ rcu_read_unlock();
+ wait_entry_unlocked(&xas, entry);
+ rcu_read_lock();
+ continue;
+ }
+ if (!entry ||
+ dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ /*
+ * Because we are looking for entry from file's mapping
+ * and index, so the entry may not be inserted for now,
+ * or even a zero/empty entry. We don't think this is
+ * an error case. So, return a special value and do
+ * not output @page.
+ */
+ entry = (void *)~0UL;
+ } else {
+ *page = pfn_to_page(dax_to_pfn(entry));
+ dax_lock_entry(&xas, entry);
+ }
+ xas_unlock_irq(&xas);
+ break;
+ }
+ rcu_read_unlock();
+ return (dax_entry_t)entry;
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
+ dax_entry_t cookie)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+
+ if (cookie == ~0UL)
+ return;
+
+ dax_unlock_entry(&xas, (void *)cookie);
+}
+
+/*
* Find page cache entry at given index. If it is a DAX entry, return it
* with the entry locked. If the page cache doesn't contain an entry at
* that index, add a locked empty entry.
@@ -736,22 +830,42 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
}
/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
+ struct vm_area_struct *vma)
+{
+ return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
+ (iter->iomap.flags & IOMAP_F_DIRTY);
+}
+
+static bool dax_fault_is_cow(const struct iomap_iter *iter)
+{
+ return (iter->flags & IOMAP_WRITE) &&
+ (iter->iomap.flags & IOMAP_F_SHARED);
+}
+
+/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
* PTEs. If we happen to be trying to insert a PTE and there is a PMD
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_entry(struct xa_state *xas,
- struct address_space *mapping, struct vm_fault *vmf,
- void *entry, pfn_t pfn, unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap_iter *iter, void *entry, pfn_t pfn,
+ unsigned long flags)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
+ bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
+ bool cow = dax_fault_is_cow(iter);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -763,11 +877,12 @@ static void *dax_insert_entry(struct xa_state *xas,
xas_reset(xas);
xas_lock_irq(xas);
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
+ dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
+ cow);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -787,6 +902,9 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
+ if (cow)
+ xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
+
xas_unlock_irq(xas);
return entry;
}
@@ -931,20 +1049,22 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
- pfn_t *pfnp)
+static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+ size_t size, void **kaddr, pfn_t *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- int id, rc;
+ int id, rc = 0;
long length;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
- DAX_ACCESS, NULL, pfnp);
+ DAX_ACCESS, kaddr, pfnp);
if (length < 0) {
rc = length;
goto out;
}
+ if (!pfnp)
+ goto out_check_addr;
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
@@ -954,11 +1074,71 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;
+
+out_check_addr:
+ if (!kaddr)
+ goto out;
+ if (!*kaddr)
+ rc = -EFAULT;
out:
dax_read_unlock(id);
return rc;
}
+/**
+ * dax_iomap_cow_copy - Copy the data from source to destination before write
+ * @pos: address to do copy from.
+ * @length: size of copy operation.
+ * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
+ * @srcmap: iomap srcmap
+ * @daddr: destination address to copy to.
+ *
+ * This can be called from two places. Either during DAX write fault (page
+ * aligned), to copy the length size data to daddr. Or, while doing normal DAX
+ * write operation, dax_iomap_actor() might call this to do the copy of either
+ * start or end unaligned address. In the latter case the rest of the copy of
+ * aligned ranges is taken care by dax_iomap_actor() itself.
+ */
+static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
+ const struct iomap *srcmap, void *daddr)
+{
+ loff_t head_off = pos & (align_size - 1);
+ size_t size = ALIGN(head_off + length, align_size);
+ loff_t end = pos + length;
+ loff_t pg_end = round_up(end, align_size);
+ bool copy_all = head_off == 0 && end == pg_end;
+ void *saddr = 0;
+ int ret = 0;
+
+ ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
+ if (ret)
+ return ret;
+
+ if (copy_all) {
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ return ret ? -EIO : 0;
+ }
+
+ /* Copy the head part of the range */
+ if (head_off) {
+ ret = copy_mc_to_kernel(daddr, saddr, head_off);
+ if (ret)
+ return -EIO;
+ }
+
+ /* Copy the tail part of the range */
+ if (end < pg_end) {
+ loff_t tail_off = head_off + length;
+ loff_t tail_len = pg_end - end;
+
+ ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
+ tail_len);
+ if (ret)
+ return -EIO;
+ }
+ return 0;
+}
+
/*
* The user has performed a load from a hole in the file. Allocating a new
* page in the file would cause excessive storage usage for workloads with
@@ -966,17 +1146,15 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static vm_fault_t dax_load_hole(struct xa_state *xas,
- struct address_space *mapping, void **entry,
- struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap_iter *iter, void **entry)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = iter->inode;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@@ -985,7 +1163,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap *iomap, void **entry)
+ const struct iomap_iter *iter, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1003,8 +1181,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE);
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@@ -1037,23 +1215,34 @@ fallback:
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap *iomap, void **entry)
+ const struct iomap_iter *iter, void **entry)
{
return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */
-static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
- unsigned int offset, size_t size)
+static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
+ const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ unsigned offset = offset_in_page(pos);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
void *kaddr;
long ret;
- ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL);
- if (ret > 0) {
- memset(kaddr + offset, 0, size);
- dax_flush(dax_dev, kaddr + offset, size);
- }
+ ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
+ NULL);
+ if (ret < 0)
+ return ret;
+ memset(kaddr + offset, 0, size);
+ if (srcmap->addr != iomap->addr) {
+ ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
+ kaddr);
+ if (ret < 0)
+ return ret;
+ dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
+ } else
+ dax_flush(iomap->dax_dev, kaddr + offset, size);
return ret;
}
@@ -1080,7 +1269,7 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
- rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
+ rc = dax_memzero(iter, pos, size);
dax_read_unlock(id);
if (rc < 0)
@@ -1129,15 +1318,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
+ const struct iomap *srcmap = &iomi->srcmap;
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
+ bool write = iov_iter_rw(iter) == WRITE;
ssize_t ret = 0;
size_t xfer;
int id;
- if (iov_iter_rw(iter) == READ) {
+ if (!write) {
end = min(end, i_size_read(iomi->inode));
if (pos >= end)
return 0;
@@ -1146,7 +1337,12 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
return iov_iter_zero(min(length, end - pos), iter);
}
- if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+ /*
+ * In DAX mode, enforce either pure overwrites of written extents, or
+ * writes to unwritten extents as part of a copy-on-write operation.
+ */
+ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
+ !(iomap->flags & IOMAP_F_SHARED)))
return -EIO;
/*
@@ -1188,6 +1384,14 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}
+ if (write &&
+ srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
+ ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
+ kaddr);
+ if (ret)
+ break;
+ }
+
map_len = PFN_PHYS(map_len);
kaddr += offset;
map_len -= offset;
@@ -1197,7 +1401,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (recovery)
xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
map_len, iter);
- else if (iov_iter_rw(iter) == WRITE)
+ else if (write)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
@@ -1268,17 +1472,6 @@ static vm_fault_t dax_fault_return(int error)
}
/*
- * MAP_SYNC on a dax mapping guarantees dirty metadata is
- * flushed on write-faults (non-cow), but not read-faults.
- */
-static bool dax_fault_is_synchronous(unsigned long flags,
- struct vm_area_struct *vma, const struct iomap *iomap)
-{
- return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
- && (iomap->flags & IOMAP_F_DIRTY);
-}
-
-/*
* When handling a synchronous page fault and the inode need a fsync, we can
* insert the PTE/PMD into page tables only after that fsync happened. Skip
* insertion for now and return the pfn so that caller can insert it after the
@@ -1335,15 +1528,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
const struct iomap_iter *iter, pfn_t *pfnp,
struct xa_state *xas, void **entry, bool pmd)
{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = &iter->srcmap;
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
+ bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
int err = 0;
pfn_t pfn;
+ void *kaddr;
if (!pmd && vmf->cow_page)
return dax_fault_cow_page(vmf, iter);
@@ -1352,23 +1545,29 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
if (!write &&
(iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
if (!pmd)
- return dax_load_hole(xas, mapping, entry, vmf);
- return dax_pmd_load_hole(xas, vmf, iomap, entry);
+ return dax_load_hole(xas, vmf, iter, entry);
+ return dax_pmd_load_hole(xas, vmf, iter, entry);
}
- if (iomap->type != IOMAP_MAPPED) {
+ if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
WARN_ON_ONCE(1);
return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
}
- err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
+ err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
if (err)
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
- write && !sync);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
+
+ if (write &&
+ srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
+ err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
+ if (err)
+ return dax_fault_return(err);
+ }
- if (sync)
+ if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);
/* insert PMD pfn */
@@ -1674,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+ struct iomap_iter *it_dest, u64 len, bool *same)
+{
+ const struct iomap *smap = &it_src->iomap;
+ const struct iomap *dmap = &it_dest->iomap;
+ loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+ void *saddr, *daddr;
+ int id, ret;
+
+ len = min(len, min(smap->length, dmap->length));
+
+ if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
+ *same = true;
+ return len;
+ }
+
+ if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
+ *same = false;
+ return 0;
+ }
+
+ id = dax_read_lock();
+ ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
+ &saddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
+ &daddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ *same = !memcmp(saddr, daddr, len);
+ if (!*same)
+ len = 0;
+ dax_read_unlock(id);
+ return len;
+
+out_unlock:
+ dax_read_unlock(id);
+ return -EIO;
+}
+
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dst, loff_t dstoff, loff_t len, bool *same,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter src_iter = {
+ .inode = src,
+ .pos = srcoff,
+ .len = len,
+ .flags = IOMAP_DAX,
+ };
+ struct iomap_iter dst_iter = {
+ .inode = dst,
+ .pos = dstoff,
+ .len = len,
+ .flags = IOMAP_DAX,
+ };
+ int ret;
+
+ while ((ret = iomap_iter(&src_iter, ops)) > 0) {
+ while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
+ dst_iter.processed = dax_range_compare_iter(&src_iter,
+ &dst_iter, len, same);
+ }
+ if (ret <= 0)
+ src_iter.processed = ret;
+ }
+ return ret;
+}
+
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, ops);
+}
+EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
diff --git a/fs/dcache.c b/fs/dcache.c
index ea5cdec24ea7..bb0c4d0038db 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2248,10 +2248,16 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
}
EXPORT_SYMBOL(d_add_ci);
-
-static inline bool d_same_name(const struct dentry *dentry,
- const struct dentry *parent,
- const struct qstr *name)
+/**
+ * d_same_name - compare dentry name with case-exact name
+ * @parent: parent dentry
+ * @dentry: the negative dentry that was passed to the parent's lookup func
+ * @name: the case-exact name to be associated with the returned dentry
+ *
+ * Return: true if names are same, or false
+ */
+bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
+ const struct qstr *name)
{
if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
if (dentry->d_name.len != name->len)
@@ -2262,6 +2268,49 @@ static inline bool d_same_name(const struct dentry *dentry,
dentry->d_name.len, dentry->d_name.name,
name) == 0;
}
+EXPORT_SYMBOL_GPL(d_same_name);
+
+/*
+ * This is __d_lookup_rcu() when the parent dentry has
+ * DCACHE_OP_COMPARE, which makes things much nastier.
+ */
+static noinline struct dentry *__d_lookup_rcu_op_compare(
+ const struct dentry *parent,
+ const struct qstr *name,
+ unsigned *seqp)
+{
+ u64 hashlen = name->hash_len;
+ struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+ struct hlist_bl_node *node;
+ struct dentry *dentry;
+
+ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+ int tlen;
+ const char *tname;
+ unsigned seq;
+
+seqretry:
+ seq = raw_seqcount_begin(&dentry->d_seq);
+ if (dentry->d_parent != parent)
+ continue;
+ if (d_unhashed(dentry))
+ continue;
+ if (dentry->d_name.hash != hashlen_hash(hashlen))
+ continue;
+ tlen = dentry->d_name.len;
+ tname = dentry->d_name.name;
+ /* we want a consistent (name,len) pair */
+ if (read_seqcount_retry(&dentry->d_seq, seq)) {
+ cpu_relax();
+ goto seqretry;
+ }
+ if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0)
+ continue;
+ *seqp = seq;
+ return dentry;
+ }
+ return NULL;
+}
/**
* __d_lookup_rcu - search for a dentry (racy, store-free)
@@ -2309,6 +2358,9 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
* Keep the two functions in sync.
*/
+ if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
+ return __d_lookup_rcu_op_compare(parent, name, seqp);
+
/*
* The hash list is protected using RCU.
*
@@ -2325,7 +2377,6 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
unsigned seq;
-seqretry:
/*
* The dentry sequence count protects us from concurrent
* renames, and thus protects parent and name fields.
@@ -2348,28 +2399,10 @@ seqretry:
continue;
if (d_unhashed(dentry))
continue;
-
- if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
- int tlen;
- const char *tname;
- if (dentry->d_name.hash != hashlen_hash(hashlen))
- continue;
- tlen = dentry->d_name.len;
- tname = dentry->d_name.name;
- /* we want a consistent (name,len) pair */
- if (read_seqcount_retry(&dentry->d_seq, seq)) {
- cpu_relax();
- goto seqretry;
- }
- if (parent->d_op->d_compare(dentry,
- tlen, tname, name) != 0)
- continue;
- } else {
- if (dentry->d_name.hash_len != hashlen)
- continue;
- if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
- continue;
- }
+ if (dentry->d_name.hash_len != hashlen)
+ continue;
+ if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+ continue;
*seqp = seq;
return dentry;
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index df5e2d048799..f669163d5860 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -169,7 +169,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
const enum req_op dio_op = dio->opf & REQ_OP_MASK;
ssize_t ret;
- ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
+ ret = iov_iter_get_pages2(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
&sdio->from);
if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
@@ -191,7 +191,6 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
}
if (ret >= 0) {
- iov_iter_advance(sdio->iter, ret);
ret += sdio->from;
sdio->head = 0;
sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
@@ -1251,7 +1250,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
- dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
+ dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ;
sdio.iter = iter;
sdio.final_block_in_request = end >> blkbits;
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 8e01d89c3319..b5fd9d71e67f 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -222,8 +222,10 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
rreq = erofs_fscache_alloc_request(folio_mapping(folio),
folio_pos(folio), folio_size(folio));
- if (IS_ERR(rreq))
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
goto out;
+ }
return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
rreq, mdev.m_pa);
@@ -301,8 +303,10 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
rreq = erofs_fscache_alloc_request(folio_mapping(folio),
folio_pos(folio), folio_size(folio));
- if (IS_ERR(rreq))
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
goto out_unlock;
+ }
pstart = mdev.m_pa + (pos - map.m_la);
return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index cfee49d33b95..a01cc82795a2 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -195,7 +195,6 @@ struct erofs_workgroup {
atomic_t refcount;
};
-#if defined(CONFIG_SMP)
static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
int val)
{
@@ -224,34 +223,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
return atomic_cond_read_relaxed(&grp->refcount,
VAL != EROFS_LOCKED_MAGIC);
}
-#else
-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
- int val)
-{
- preempt_disable();
- /* no need to spin on UP platforms, let's just disable preemption. */
- if (val != atomic_read(&grp->refcount)) {
- preempt_enable();
- return false;
- }
- return true;
-}
-
-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
- int orig_val)
-{
- preempt_enable();
-}
-
-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
-{
- int v = atomic_read(&grp->refcount);
-
- /* workgroup is never freezed on uniprocessor systems */
- DBG_BUGON(v == EROFS_LOCKED_MAGIC);
- return v;
-}
-#endif /* !CONFIG_SMP */
#endif /* !CONFIG_EROFS_FS_ZIP */
/* we strictly follow PAGE_SIZE and no buffer head yet */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 95addc5c9d34..3173debeaa5a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
+ NULL, NULL);
}
dif->blocks = le32_to_cpu(dis->blocks);
@@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
}
sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
- &sbi->dax_part_off);
+ &sbi->dax_part_off,
+ NULL, NULL);
}
err = erofs_read_superblock(sb);
@@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
{
struct erofs_device_info *dif = ptr;
- fs_put_dax(dif->dax_dev);
+ fs_put_dax(dif->dax_dev, NULL);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
erofs_fscache_unregister_cookie(&dif->fscache);
@@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb)
return;
erofs_free_dev_context(sbi->devs);
- fs_put_dax(sbi->dax_dev);
+ fs_put_dax(sbi->dax_dev, NULL);
erofs_fscache_unregister_cookie(&sbi->s_fscache);
erofs_fscache_unregister_fs(sb);
kfree(sbi->opt.fsid);
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ec9a1d780dc1..46627cb69abe 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = {
int __init erofs_init_shrinker(void)
{
- return register_shrinker(&erofs_shrinker_info);
+ return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
}
void erofs_exit_shrinker(void)
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 572f0b8151ba..d58549ca1df9 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -141,7 +141,7 @@ struct z_erofs_maprecorder {
u8 type, headtype;
u16 clusterofs;
u16 delta[2];
- erofs_blk_t pblk, compressedlcs;
+ erofs_blk_t pblk, compressedblks;
erofs_off_t nextpackoff;
};
@@ -192,7 +192,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedlcs = m->delta[0] &
+ m->compressedblks = m->delta[0] &
~Z_EROFS_VLE_DI_D0_CBLKCNT;
m->delta[0] = 1;
}
@@ -293,7 +293,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ m->compressedblks = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
m->delta[0] = 1;
return 0;
} else if (i + 1 != (int)vcnt) {
@@ -497,7 +497,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return 0;
}
lcn = m->lcn + 1;
- if (m->compressedlcs)
+ if (m->compressedblks)
goto out;
err = z_erofs_load_cluster_from_disk(m, lcn, false);
@@ -506,7 +506,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
/*
* If the 1st NONHEAD lcluster has already been handled initially w/o
- * valid compressedlcs, which means at least it mustn't be CBLKCNT, or
+ * valid compressedblks, which means at least it mustn't be CBLKCNT, or
* an internal implemenatation error is detected.
*
* The following code can also handle it properly anyway, but let's
@@ -523,12 +523,12 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
* rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
*/
- m->compressedlcs = 1;
+ m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
if (m->delta[0] != 1)
goto err_bonus_cblkcnt;
- if (m->compressedlcs)
+ if (m->compressedblks)
break;
fallthrough;
default:
@@ -539,7 +539,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return -EFSCORRUPTED;
}
out:
- map->m_plen = (u64)m->compressedlcs << lclusterbits;
+ map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE;
return 0;
err_bonus_cblkcnt:
erofs_err(m->inode->i_sb,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e2daa940ebce..8b56b94e2f56 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1747,6 +1747,21 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
return to;
}
+/*
+ * autoremove_wake_function, but remove even on failure to wake up, because we
+ * know that default_wake_function/ttwu will only fail if the thread is already
+ * woken, and in that case the ep_poll loop will remove the entry anyways, not
+ * try to reuse it.
+ */
+static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int sync, void *key)
+{
+ int ret = default_wake_function(wq_entry, mode, sync, key);
+
+ list_del_init(&wq_entry->entry);
+ return ret;
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -1828,8 +1843,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
+ *
+ * In fact, we now use an even more aggressive function that
+ * unconditionally removes, because we don't reuse the wait
+ * entry between loop iterations. This lets us also avoid the
+ * performance issue if a process is killed, causing all of its
+ * threads to wake up without being removed normally.
*/
init_wait(&wait);
+ wait.func = ep_autoremove_wake_function;
write_lock_irq(&ep->lock);
/*
diff --git a/fs/exec.c b/fs/exec.c
index 5fd73915c62c..9a5ca7b82bfc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -584,11 +584,11 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
if (kmapped_page) {
flush_dcache_page(kmapped_page);
- kunmap(kmapped_page);
+ kunmap_local(kaddr);
put_arg_page(kmapped_page);
}
kmapped_page = page;
- kaddr = kmap(kmapped_page);
+ kaddr = kmap_local_page(kmapped_page);
kpos = pos & PAGE_MASK;
flush_arg_page(bprm, kpos, kmapped_page);
}
@@ -602,7 +602,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
out:
if (kmapped_page) {
flush_dcache_page(kmapped_page);
- kunmap(kmapped_page);
+ kunmap_local(kaddr);
put_arg_page(kmapped_page);
}
return ret;
@@ -880,11 +880,11 @@ int transfer_args_to_stack(struct linux_binprm *bprm,
for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
- char *src = kmap(bprm->page[index]) + offset;
+ char *src = kmap_local_page(bprm->page[index]) + offset;
sp -= PAGE_SIZE - offset;
if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
ret = -EFAULT;
- kunmap(bprm->page[index]);
+ kunmap_local(src);
if (ret)
goto out;
}
@@ -1304,6 +1304,9 @@ int begin_new_exec(struct linux_binprm * bprm)
bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS
+ spin_lock_irq(&me->sighand->siglock);
+ posix_cpu_timers_exit(me);
+ spin_unlock_irq(&me->sighand->siglock);
exit_itimers(me);
flush_itimer_signals();
#endif
@@ -1683,13 +1686,13 @@ int remove_arg_zero(struct linux_binprm *bprm)
ret = -EFAULT;
goto out;
}
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
for (; offset < PAGE_SIZE && kaddr[offset];
offset++, bprm->p++)
;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
put_arg_page(page);
} while (offset == PAGE_SIZE);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 4a7a2308eb72..a8f8eee4937c 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -27,9 +27,9 @@ enum exfat_error_mode {
* exfat nls lossy flag
*/
enum {
- NLS_NAME_NO_LOSSY, /* no lossy */
- NLS_NAME_LOSSY, /* just detected incorrect filename(s) */
- NLS_NAME_OVERLEN, /* the length is over than its limit */
+ NLS_NAME_NO_LOSSY = 0, /* no lossy */
+ NLS_NAME_LOSSY = 1 << 0, /* just detected incorrect filename(s) */
+ NLS_NAME_OVERLEN = 1 << 1, /* the length is over than its limit */
};
#define EXFAT_HASH_BITS 8
@@ -483,6 +483,7 @@ struct inode *exfat_build_inode(struct super_block *sb,
void exfat_hash_inode(struct inode *inode, loff_t i_pos);
void exfat_unhash_inode(struct inode *inode);
struct inode *exfat_iget(struct super_block *sb, loff_t i_pos);
+int __exfat_write_inode(struct inode *inode, int sync);
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc);
void exfat_evict_inode(struct inode *inode);
int exfat_block_truncate_page(struct inode *inode, loff_t from);
@@ -508,14 +509,16 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
#define exfat_fs_error_ratelimit(sb, fmt, args...) \
__exfat_fs_error(sb, __ratelimit(&EXFAT_SB(sb)->ratelimit), \
fmt, ## args)
-void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...)
- __printf(3, 4) __cold;
+
+/* expand to pr_*() with prefix */
#define exfat_err(sb, fmt, ...) \
- exfat_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+ pr_err("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
#define exfat_warn(sb, fmt, ...) \
- exfat_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+ pr_warn("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
#define exfat_info(sb, fmt, ...) \
- exfat_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__)
+ pr_info("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
+#define exfat_debug(sb, fmt, ...) \
+ pr_debug("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 tz, __le16 time, __le16 date, u8 time_cs);
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 9de6a6b844c9..ee0b7cf51157 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -331,7 +331,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
/* find new cluster */
if (hint_clu == EXFAT_EOF_CLUSTER) {
if (sbi->clu_srch_ptr < EXFAT_FIRST_CLUSTER) {
- exfat_err(sb, "sbi->clu_srch_ptr is invalid (%u)\n",
+ exfat_err(sb, "sbi->clu_srch_ptr is invalid (%u)",
sbi->clu_srch_ptr);
sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
}
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 20d4e47f57ab..4e0793f35e8f 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -101,7 +101,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- int evict = (ei->dir.dir == DIR_DELETED) ? 1 : 0;
/* check if the given file ID is opened */
if (ei->type != TYPE_FILE && ei->type != TYPE_DIR)
@@ -149,50 +148,19 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
if (ei->type == TYPE_FILE)
ei->attr |= ATTR_ARCHIVE;
- /* update the directory entry */
- if (!evict) {
- struct timespec64 ts;
- struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es;
- int err;
-
- es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
- ES_ALL_ENTRIES);
- if (!es)
- return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
-
- ts = current_time(inode);
- exfat_set_entry_time(sbi, &ts,
- &ep->dentry.file.modify_tz,
- &ep->dentry.file.modify_time,
- &ep->dentry.file.modify_date,
- &ep->dentry.file.modify_time_cs);
- ep->dentry.file.attr = cpu_to_le16(ei->attr);
-
- /* File size should be zero if there is no cluster allocated */
- if (ei->start_clu == EXFAT_EOF_CLUSTER) {
- ep2->dentry.stream.valid_size = 0;
- ep2->dentry.stream.size = 0;
- } else {
- ep2->dentry.stream.valid_size = cpu_to_le64(new_size);
- ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
- }
-
- if (new_size == 0) {
- /* Any directory can not be truncated to zero */
- WARN_ON(ei->type != TYPE_FILE);
-
- ep2->dentry.stream.flags = ALLOC_FAT_CHAIN;
- ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
- }
-
- exfat_update_dir_chksum_with_entry_set(es);
- err = exfat_free_dentry_set(es, inode_needs_sync(inode));
- if (err)
- return err;
- }
+ /*
+ * update the directory entry
+ *
+ * If the directory entry is updated by mark_inode_dirty(), the
+ * directory entry will be written after a writeback cycle of
+ * updating the bitmap/FAT, which may result in clusters being
+ * freed but referenced by the directory entry in the event of a
+ * sudden power failure.
+ * __exfat_write_inode() is called for directory entry, bitmap
+ * and FAT to be written in a same writeback.
+ */
+ if (__exfat_write_inode(inode, inode_needs_sync(inode)))
+ return -EIO;
/* cut off from the FAT chain */
if (ei->flags == ALLOC_FAT_CHAIN && last_clu != EXFAT_FREE_CLUSTER &&
@@ -243,12 +211,6 @@ void exfat_truncate(struct inode *inode, loff_t size)
if (err)
goto write_size;
- inode->i_ctime = inode->i_mtime = current_time(inode);
- if (IS_DIRSYNC(inode))
- exfat_sync_inode(inode);
- else
- mark_inode_dirty(inode);
-
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
inode->i_blkbits;
write_size:
@@ -330,6 +292,12 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
attr->ia_valid &= ~ATTR_MODE;
}
+ if (attr->ia_valid & ATTR_SIZE)
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+
+ setattr_copy(&init_user_ns, inode, attr);
+ exfat_truncate_atime(&inode->i_atime);
+
if (attr->ia_valid & ATTR_SIZE) {
error = exfat_block_truncate_page(inode, attr->ia_size);
if (error)
@@ -337,13 +305,15 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
down_write(&EXFAT_I(inode)->truncate_lock);
truncate_setsize(inode, attr->ia_size);
+
+ /*
+ * __exfat_write_inode() is called from exfat_truncate(), inode
+ * is already written by it, so mark_inode_dirty() is unneeded.
+ */
exfat_truncate(inode, attr->ia_size);
up_write(&EXFAT_I(inode)->truncate_lock);
- }
-
- setattr_copy(&init_user_ns, inode, attr);
- exfat_truncate_atime(&inode->i_atime);
- mark_inode_dirty(inode);
+ } else
+ mark_inode_dirty(inode);
out:
return error;
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 0133d385d8e8..a795437b86d0 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -17,7 +17,7 @@
#include "exfat_raw.h"
#include "exfat_fs.h"
-static int __exfat_write_inode(struct inode *inode, int sync)
+int __exfat_write_inode(struct inode *inode, int sync)
{
unsigned long long on_disk_size;
struct exfat_dentry *ep, *ep2;
@@ -75,6 +75,13 @@ static int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size);
ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
+ if (on_disk_size) {
+ ep2->dentry.stream.flags = ei->flags;
+ ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu);
+ } else {
+ ep2->dentry.stream.flags = ALLOC_FAT_CHAIN;
+ ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
+ }
exfat_update_dir_chksum_with_entry_set(es);
return exfat_free_dentry_set(es, sync);
@@ -105,7 +112,7 @@ void exfat_sync_inode(struct inode *inode)
static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
unsigned int *clu, int create)
{
- int ret, modified = false;
+ int ret;
unsigned int last_clu;
struct exfat_chain new_clu;
struct super_block *sb = inode->i_sb;
@@ -196,7 +203,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
if (new_clu.flags == ALLOC_FAT_CHAIN)
ei->flags = ALLOC_FAT_CHAIN;
ei->start_clu = new_clu.dir;
- modified = true;
} else {
if (new_clu.flags != ei->flags) {
/* no-fat-chain bit is disabled,
@@ -206,7 +212,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
exfat_chain_cont_cluster(sb, ei->start_clu,
num_clusters);
ei->flags = ALLOC_FAT_CHAIN;
- modified = true;
}
if (new_clu.flags == ALLOC_FAT_CHAIN)
if (exfat_ent_set(sb, last_clu, new_clu.dir))
@@ -216,33 +221,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
num_clusters += num_to_be_allocated;
*clu = new_clu.dir;
- if (ei->dir.dir != DIR_DELETED && modified) {
- struct exfat_dentry *ep;
- struct exfat_entry_set_cache *es;
- int err;
-
- es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
- ES_ALL_ENTRIES);
- if (!es)
- return -EIO;
- /* get stream entry */
- ep = exfat_get_dentry_cached(es, 1);
-
- /* update directory entry */
- ep->dentry.stream.flags = ei->flags;
- ep->dentry.stream.start_clu =
- cpu_to_le32(ei->start_clu);
- ep->dentry.stream.valid_size =
- cpu_to_le64(i_size_read(inode));
- ep->dentry.stream.size =
- ep->dentry.stream.valid_size;
-
- exfat_update_dir_chksum_with_entry_set(es);
- err = exfat_free_dentry_set(es, inode_needs_sync(inode));
- if (err)
- return err;
- } /* end of if != DIR_DELETED */
-
inode->i_blocks +=
num_to_be_allocated << sbi->sect_per_clus_bits;
@@ -384,6 +362,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = current_time(inode);
exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned);
}
}
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 9380e0188b55..2e1a1a6b1021 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -46,23 +46,6 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
}
}
-/*
- * exfat_msg() - print preformated EXFAT specific messages.
- * All logs except what uses exfat_fs_error() should be written by exfat_msg()
- */
-void exfat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- /* level means KERN_ pacility level */
- printk("%sexFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
- va_end(args);
-}
-
#define SECS_PER_MIN (60)
#define TIMEZONE_SEC(x) ((x) * 15 * SECS_PER_MIN)
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index c6eaf7e9ea74..b617bebc3d0f 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -318,7 +318,6 @@ static int exfat_find_empty_entry(struct inode *inode,
unsigned int ret, last_clu;
loff_t size = 0;
struct exfat_chain clu;
- struct exfat_dentry *ep = NULL;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
@@ -383,25 +382,6 @@ static int exfat_find_empty_entry(struct inode *inode,
p_dir->size++;
size = EXFAT_CLU_TO_B(p_dir->size, sbi);
- /* update the directory entry */
- if (p_dir->dir != sbi->root_dir) {
- struct buffer_head *bh;
-
- ep = exfat_get_dentry(sb,
- &(ei->dir), ei->entry + 1, &bh);
- if (!ep)
- return -EIO;
-
- ep->dentry.stream.valid_size = cpu_to_le64(size);
- ep->dentry.stream.size = ep->dentry.stream.valid_size;
- ep->dentry.stream.flags = p_dir->flags;
- exfat_update_bh(bh, IS_DIRSYNC(inode));
- brelse(bh);
- if (exfat_update_dir_chksum(inode, &(ei->dir),
- ei->entry))
- return -EIO;
- }
-
/* directory inode should be updated in here */
i_size_write(inode, size);
ei->i_size_ondisk += sbi->cluster_size;
@@ -462,7 +442,7 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path,
return namelen; /* return error value */
if ((lossy && !lookup) || !namelen)
- return -EINVAL;
+ return (lossy & NLS_NAME_OVERLEN) ? -ENAMETOOLONG : -EINVAL;
exfat_chain_set(p_dir, ei->start_clu,
EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags);
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index ef115e673406..705710f93e2d 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -509,7 +509,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
}
if (unilen > MAX_NAME_LENGTH) {
- exfat_err(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d",
+ exfat_debug(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d",
__func__, len, unilen, MAX_NAME_LENGTH);
return -ENAMETOOLONG;
}
@@ -671,7 +671,7 @@ static int exfat_load_upcase_table(struct super_block *sb,
bh = sb_bread(sb, sector);
if (!bh) {
- exfat_err(sb, "failed to read sector(0x%llx)\n",
+ exfat_err(sb, "failed to read sector(0x%llx)",
(unsigned long long)sector);
ret = -EIO;
goto free_table;
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 6a4dfe9f31ee..35f0305cd493 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -464,7 +464,7 @@ static int exfat_read_boot_sector(struct super_block *sb)
*/
if (p_boot->sect_size_bits < EXFAT_MIN_SECT_SIZE_BITS ||
p_boot->sect_size_bits > EXFAT_MAX_SECT_SIZE_BITS) {
- exfat_err(sb, "bogus sector size bits : %u\n",
+ exfat_err(sb, "bogus sector size bits : %u",
p_boot->sect_size_bits);
return -EINVAL;
}
@@ -473,7 +473,7 @@ static int exfat_read_boot_sector(struct super_block *sb)
* sect_per_clus_bits could be at least 0 and at most 25 - sect_size_bits.
*/
if (p_boot->sect_per_clus_bits > EXFAT_MAX_SECT_PER_CLUS_BITS(p_boot)) {
- exfat_err(sb, "bogus sectors bits per cluster : %u\n",
+ exfat_err(sb, "bogus sectors bits per cluster : %u",
p_boot->sect_per_clus_bits);
return -EINVAL;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 27a0a8c74f7a..252c742379cf 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb)
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}
@@ -833,7 +833,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);
spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@@ -1210,7 +1211,7 @@ failed_mount_group_desc:
failed_mount:
brelse(bh);
failed_sbi:
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 841fa6d9d744..641abfa4b718 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -517,36 +517,36 @@ bad_block:
/* Here we know that we can set the new attribute. */
if (header) {
- /* assert(header == HDR(bh)); */
+ int offset;
+
lock_buffer(bh);
if (header->h_refcount == cpu_to_le32(1)) {
__u32 hash = le32_to_cpu(header->h_hash);
+ struct mb_cache_entry *oe;
- ea_bdebug(bh, "modifying in-place");
+ oe = mb_cache_entry_delete_or_get(EA_BLOCK_CACHE(inode),
+ hash, bh->b_blocknr);
+ if (!oe) {
+ ea_bdebug(bh, "modifying in-place");
+ goto update_block;
+ }
/*
- * This must happen under buffer lock for
- * ext2_xattr_set2() to reliably detect modified block
+ * Someone is trying to reuse the block, leave it alone
*/
- mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
- bh->b_blocknr);
-
- /* keep the buffer locked while modifying it. */
- } else {
- int offset;
-
- unlock_buffer(bh);
- ea_bdebug(bh, "cloning");
- header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL);
- error = -ENOMEM;
- if (header == NULL)
- goto cleanup;
- header->h_refcount = cpu_to_le32(1);
-
- offset = (char *)here - bh->b_data;
- here = ENTRY((char *)header + offset);
- offset = (char *)last - bh->b_data;
- last = ENTRY((char *)header + offset);
+ mb_cache_entry_put(EA_BLOCK_CACHE(inode), oe);
}
+ unlock_buffer(bh);
+ ea_bdebug(bh, "cloning");
+ header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL);
+ error = -ENOMEM;
+ if (header == NULL)
+ goto cleanup;
+ header->h_refcount = cpu_to_le32(1);
+
+ offset = (char *)here - bh->b_data;
+ here = ENTRY((char *)header + offset);
+ offset = (char *)last - bh->b_data;
+ last = ENTRY((char *)header + offset);
} else {
/* Allocate a buffer where we construct the new block. */
header = kzalloc(sb->s_blocksize, GFP_KERNEL);
@@ -559,6 +559,7 @@ bad_block:
last = here = ENTRY(header+1);
}
+update_block:
/* Iff we are modifying the block in-place, bh is locked here. */
if (not_found) {
@@ -651,6 +652,55 @@ cleanup:
return error;
}
+static void ext2_xattr_release_block(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
+
+retry_ref:
+ lock_buffer(bh);
+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
+ struct mb_cache_entry *oe;
+
+ /*
+ * This must happen under buffer lock to properly
+ * serialize with ext2_xattr_set() reusing the block.
+ */
+ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+ bh->b_blocknr);
+ if (oe) {
+ /*
+ * Someone is trying to reuse the block. Wait
+ * and retry.
+ */
+ unlock_buffer(bh);
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto retry_ref;
+ }
+
+ /* Free the old block. */
+ ea_bdebug(bh, "freeing");
+ ext2_free_blocks(inode, bh->b_blocknr, 1);
+ /* We let our caller release bh, so we
+ * need to duplicate the buffer before. */
+ get_bh(bh);
+ bforget(bh);
+ unlock_buffer(bh);
+ } else {
+ /* Decrement the refcount only. */
+ le32_add_cpu(&HDR(bh)->h_refcount, -1);
+ dquot_free_block(inode, 1);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+ ea_bdebug(bh, "refcount now=%d",
+ le32_to_cpu(HDR(bh)->h_refcount));
+ if (IS_SYNC(inode))
+ sync_dirty_buffer(bh);
+ }
+}
+
/*
* Second half of ext2_xattr_set(): Update the file system.
*/
@@ -747,34 +797,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
* If there was an old block and we are no longer using it,
* release the old block.
*/
- lock_buffer(old_bh);
- if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
- __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
-
- /*
- * This must happen under buffer lock for
- * ext2_xattr_set2() to reliably detect freed block
- */
- mb_cache_entry_delete(ea_block_cache, hash,
- old_bh->b_blocknr);
- /* Free the old block. */
- ea_bdebug(old_bh, "freeing");
- ext2_free_blocks(inode, old_bh->b_blocknr, 1);
- mark_inode_dirty(inode);
- /* We let our caller release old_bh, so we
- * need to duplicate the buffer before. */
- get_bh(old_bh);
- bforget(old_bh);
- } else {
- /* Decrement the refcount only. */
- le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
- dquot_free_block_nodirty(inode, 1);
- mark_inode_dirty(inode);
- mark_buffer_dirty(old_bh);
- ea_bdebug(old_bh, "refcount now=%d",
- le32_to_cpu(HDR(old_bh)->h_refcount));
- }
- unlock_buffer(old_bh);
+ ext2_xattr_release_block(inode, old_bh);
}
cleanup:
@@ -828,30 +851,7 @@ ext2_xattr_delete_inode(struct inode *inode)
EXT2_I(inode)->i_file_acl);
goto cleanup;
}
- lock_buffer(bh);
- if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
- __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
-
- /*
- * This must happen under buffer lock for ext2_xattr_set2() to
- * reliably detect freed block
- */
- mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
- bh->b_blocknr);
- ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
- get_bh(bh);
- bforget(bh);
- unlock_buffer(bh);
- } else {
- le32_add_cpu(&HDR(bh)->h_refcount, -1);
- ea_bdebug(bh, "refcount now=%d",
- le32_to_cpu(HDR(bh)->h_refcount));
- unlock_buffer(bh);
- mark_buffer_dirty(bh);
- if (IS_SYNC(inode))
- sync_dirty_buffer(bh);
- dquot_free_block_nodirty(inode, 1);
- }
+ ext2_xattr_release_block(inode, bh);
EXT2_I(inode)->i_file_acl = 0;
cleanup:
@@ -943,7 +943,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
+
ce = mb_cache_entry_find_first(ea_block_cache, hash);
while (ce) {
struct buffer_head *bh;
@@ -955,22 +955,8 @@ again:
inode->i_ino, (unsigned long) ce->e_value);
} else {
lock_buffer(bh);
- /*
- * We have to be careful about races with freeing or
- * rehashing of xattr block. Once we hold buffer lock
- * xattr block's state is stable so we can check
- * whether the block got freed / rehashed or not.
- * Since we unhash mbcache entry under buffer lock when
- * freeing / rehashing xattr block, checking whether
- * entry is still hashed is reliable.
- */
- if (hlist_bl_unhashed(&ce->e_hash_list)) {
- mb_cache_entry_put(ea_block_cache, ce);
- unlock_buffer(bh);
- brelse(bh);
- goto again;
- } else if (le32_to_cpu(HDR(bh)->h_refcount) >
- EXT2_XATTR_REFCOUNT_MAX) {
+ if (le32_to_cpu(HDR(bh)->h_refcount) >
+ EXT2_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %ld refcount %d>%d",
(unsigned long) ce->e_value,
le32_to_cpu(HDR(bh)->h_refcount),
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 78ee3ef795ae..8ff4b9192a9f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -666,7 +666,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* it's possible we've just missed a transaction commit here,
* so ignore the returned status
*/
- jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id);
(void) jbd2_journal_force_commit_nested(sbi->s_journal);
return 1;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 29fc575a4eb6..9bca5565547b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -724,6 +724,8 @@ enum {
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32)
+#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid)
+#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid)
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
@@ -753,6 +755,15 @@ enum {
EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+/*
+ * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID
+ */
+struct fsuuid {
+ __u32 fsu_len;
+ __u32 fsu_flags;
+ __u8 fsu_uuid[];
+};
+
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
@@ -3016,7 +3027,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
-int ext4_update_overhead(struct super_block *sb);
+int ext4_update_overhead(struct super_block *sb, bool force);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
@@ -3583,6 +3594,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
extern int ext4_inline_data_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
int *has_inline, __u64 start, __u64 len);
+extern void *ext4_read_inline_link(struct inode *inode);
struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);
@@ -3799,7 +3811,7 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
extern int ext4_resize_begin(struct super_block *sb);
-extern void ext4_resize_end(struct super_block *sb);
+extern int ext4_resize_end(struct super_block *sb, bool update_backups);
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3477a16d08ae..8e1fb18f465e 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -267,8 +267,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
trace_ext4_forget(inode, is_metadata, blocknr);
BUFFER_TRACE(bh, "enter");
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %x\n",
+ ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n",
bh, is_metadata, inode->i_mode,
test_opt(inode->i_sb, DATA_FLAGS));
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9a3a8996aacf..23167efda95e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1654,7 +1654,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker);
+ err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
+ sbi->s_sb->s_id);
if (err)
goto err4;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index eb4c8ad1bb61..2af962cbb835 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -917,8 +917,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
mutex_unlock(&ei->i_fc_lock);
cur_lblk_off = old_blk_size;
- jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
- __func__, cur_lblk_off, new_blk_size, inode->i_ino);
+ ext4_debug("will try writing %d to %d for inode %ld\n",
+ cur_lblk_off, new_blk_size, inode->i_ino);
while (cur_lblk_off <= new_blk_size) {
map.m_lblk = cur_lblk_off;
@@ -1168,7 +1168,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status,
{
struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
- jbd_debug(1, "Fast commit ended with status = %d for tid %u",
+ ext4_debug("Fast commit ended with status = %d for tid %u",
status, commit_tid);
if (status == EXT4_FC_STATUS_OK) {
stats->fc_num_commits++;
@@ -1375,14 +1375,14 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode %d not found", darg.ino);
+ ext4_debug("Inode %d not found", darg.ino);
return 0;
}
old_parent = ext4_iget(sb, darg.parent_ino,
EXT4_IGET_NORMAL);
if (IS_ERR(old_parent)) {
- jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
+ ext4_debug("Dir with inode %d not found", darg.parent_ino);
iput(inode);
return 0;
}
@@ -1407,21 +1407,21 @@ static int ext4_fc_replay_link_internal(struct super_block *sb,
dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
if (IS_ERR(dir)) {
- jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
+ ext4_debug("Dir with inode %d not found.", darg->parent_ino);
dir = NULL;
goto out;
}
dentry_dir = d_obtain_alias(dir);
if (IS_ERR(dentry_dir)) {
- jbd_debug(1, "Failed to obtain dentry");
+ ext4_debug("Failed to obtain dentry");
dentry_dir = NULL;
goto out;
}
dentry_inode = d_alloc(dentry_dir, &qstr_dname);
if (!dentry_inode) {
- jbd_debug(1, "Inode dentry not created.");
+ ext4_debug("Inode dentry not created.");
ret = -ENOMEM;
goto out;
}
@@ -1434,7 +1434,7 @@ static int ext4_fc_replay_link_internal(struct super_block *sb,
* could complete.
*/
if (ret && ret != -EEXIST) {
- jbd_debug(1, "Failed to link\n");
+ ext4_debug("Failed to link\n");
goto out;
}
@@ -1468,7 +1468,7 @@ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode not found.");
+ ext4_debug("Inode not found.");
return 0;
}
@@ -1576,7 +1576,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
/* Given that we just wrote the inode on disk, this SHOULD succeed. */
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode not found.");
+ ext4_debug("Inode not found.");
return -EFSCORRUPTED;
}
@@ -1630,7 +1630,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "inode %d not found.", darg.ino);
+ ext4_debug("inode %d not found.", darg.ino);
inode = NULL;
ret = -EINVAL;
goto out;
@@ -1643,7 +1643,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
*/
dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
if (IS_ERR(dir)) {
- jbd_debug(1, "Dir %d not found.", darg.ino);
+ ext4_debug("Dir %d not found.", darg.ino);
goto out;
}
ret = ext4_init_new_dir(NULL, dir, inode);
@@ -1727,7 +1727,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode not found.");
+ ext4_debug("Inode not found.");
return 0;
}
@@ -1741,7 +1741,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
cur = start;
remaining = len;
- jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+ ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
start, start_pblk, len, ext4_ext_is_unwritten(ex),
inode->i_ino);
@@ -1802,7 +1802,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
}
/* Range is mapped and needs a state change */
- jbd_debug(1, "Converting from %ld to %d %lld",
+ ext4_debug("Converting from %ld to %d %lld",
map.m_flags & EXT4_MAP_UNWRITTEN,
ext4_ext_is_unwritten(ex), map.m_pblk);
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
@@ -1845,7 +1845,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
+ ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
return 0;
}
@@ -1853,7 +1853,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
if (ret)
goto out;
- jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
+ ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
inode->i_ino, le32_to_cpu(lrange.fc_lblk),
le32_to_cpu(lrange.fc_len));
while (remaining > 0) {
@@ -1902,7 +1902,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
inode = ext4_iget(sb, state->fc_modified_inodes[i],
EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode %d not found.",
+ ext4_debug("Inode %d not found.",
state->fc_modified_inodes[i]);
continue;
}
@@ -2031,7 +2031,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
memcpy(&tl, cur, sizeof(tl));
val = cur + sizeof(tl);
- jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
+ ext4_debug("Scan phase, tag:%s, blk %lld\n",
tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
switch (le16_to_cpu(tl.fc_tag)) {
case EXT4_FC_TAG_ADD_RANGE:
@@ -2126,7 +2126,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
sbi->s_mount_state |= EXT4_FC_REPLAY;
}
if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
- jbd_debug(1, "Replay stops\n");
+ ext4_debug("Replay stops\n");
ext4_fc_set_bitmaps_and_counters(sb);
return 0;
}
@@ -2150,7 +2150,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
ext4_fc_set_bitmaps_and_counters(sb);
break;
}
- jbd_debug(3, "Replay phase, tag:%s\n",
+ ext4_debug("Replay phase, tag:%s\n",
tag2str(le16_to_cpu(tl.fc_tag)));
state->fc_replay_num_tags--;
switch (le16_to_cpu(tl.fc_tag)) {
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 07a8c75b65ed..860fc5119009 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -460,7 +460,7 @@ static int ext4_splice_branch(handle_t *handle,
* the new i_size. But that is not done here - it is done in
* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
*/
- jbd_debug(5, "splicing indirect only\n");
+ ext4_debug("splicing indirect only\n");
BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
if (err)
@@ -472,7 +472,7 @@ static int ext4_splice_branch(handle_t *handle,
err = ext4_mark_inode_dirty(handle, ar->inode);
if (unlikely(err))
goto err_out;
- jbd_debug(5, "splicing direct\n");
+ ext4_debug("splicing direct\n");
}
return err;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index cff52ff6549d..a4fbe825694b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -6,6 +6,7 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
+#include <linux/namei.h>
#include <linux/iversion.h>
#include <linux/sched/mm.h>
@@ -35,6 +36,9 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
struct ext4_inode *raw_inode;
int free, min_offs;
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+ return 0;
+
min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
EXT4_GOOD_OLD_INODE_SIZE -
EXT4_I(inode)->i_extra_isize -
@@ -1588,6 +1592,35 @@ out:
return ret;
}
+void *ext4_read_inline_link(struct inode *inode)
+{
+ struct ext4_iloc iloc;
+ int ret, inline_size;
+ void *link;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = -ENOMEM;
+ inline_size = ext4_get_inline_size(inode);
+ link = kmalloc(inline_size + 1, GFP_NOFS);
+ if (!link)
+ goto out;
+
+ ret = ext4_read_inline_data(inode, link, inline_size, &iloc);
+ if (ret < 0) {
+ kfree(link);
+ goto out;
+ }
+ nd_terminate_link(link, inode->i_size, ret);
+out:
+ if (ret < 0)
+ link = ERR_PTR(ret);
+ brelse(iloc.bh);
+ return link;
+}
+
struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9fd60fc8ba4c..601214453c3a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -177,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
trace_ext4_evict_inode(inode);
+ if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+ ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
/*
* When journalling data dirty buffers are tracked only in the
@@ -1571,7 +1573,14 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
ext4_lblk_t start, last;
start = index << (PAGE_SHIFT - inode->i_blkbits);
last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+ /*
+ * avoid racing with extent status tree scans made by
+ * ext4_insert_delayed_block()
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_es_remove_extent(inode, start, last - start + 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
}
folio_batch_init(&fbatch);
@@ -3142,13 +3151,15 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
journal_t *journal;
+ sector_t ret = 0;
int err;
+ inode_lock_shared(inode);
/*
* We can get here for an inline file via the FIBMAP ioctl
*/
if (ext4_has_inline_data(inode))
- return 0;
+ goto out;
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
@@ -3187,10 +3198,14 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
jbd2_journal_unlock_updates(journal);
if (err)
- return 0;
+ goto out;
}
- return iomap_bmap(mapping, block, &ext4_iomap_ops);
+ ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
+
+out:
+ inode_unlock_shared(inode);
+ return ret;
}
static int ext4_read_folio(struct file *file, struct folio *folio)
@@ -4687,8 +4702,7 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
- EXT4_INODE_SIZE(inode->i_sb) &&
+ if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
return ext4_find_inline_data_nolock(inode);
@@ -5215,7 +5229,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ ext4_debug("called recursively, non-PF_MEMALLOC!\n");
dump_stack();
return -EIO;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index cb01c1da0f9d..3cf3ec4b1c21 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -20,6 +20,7 @@
#include <linux/delay.h>
#include <linux/iversion.h>
#include <linux/fileattr.h>
+#include <linux/uuid.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include <linux/fsmap.h>
@@ -41,6 +42,15 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX);
}
+/*
+ * Superblock modification callback function for changing file system
+ * UUID.
+ */
+static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg)
+{
+ memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE);
+}
+
static
int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
ext4_update_sb_callback func,
@@ -944,7 +954,9 @@ static long ext4_ioctl_group_add(struct file *file,
test_opt(sb, INIT_INODE_TABLE))
err = ext4_register_li_request(sb, input->group);
group_add_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
return err;
}
@@ -1131,6 +1143,73 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
return 0;
}
+static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
+ struct fsuuid __user *ufsuuid)
+{
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len == 0) {
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len)))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ lock_buffer(sbi->s_sbh);
+ memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
+ unlock_buffer(sbi->s_sbh);
+
+ if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_ioctl_setuuid(struct file *filp,
+ const struct fsuuid __user *ufsuuid)
+{
+ int ret = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * If any checksums (group descriptors or metadata) are being used
+ * then the checksum seed feature is required to change the UUID.
+ */
+ if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb))
+ && !ext4_has_feature_csum_seed(sb))
+ || ext4_has_feature_stable_inodes(sb))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE))
+ return -EFAULT;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid);
+ mnt_drop_write_file(filp);
+
+ return ret;
+}
+
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1223,7 +1302,9 @@ setversion_out:
err = err2;
mnt_drop_write_file(filp);
group_extend_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
return err;
}
@@ -1371,7 +1452,9 @@ mext_out:
err = ext4_register_li_request(sb, o_group);
resizefs_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, true);
+ if (err == 0)
+ err = err2;
return err;
}
@@ -1509,6 +1592,10 @@ resizefs_out:
return ext4_ioctl_setlabel(filp,
(const void __user *)arg);
+ case EXT4_IOC_GETFSUUID:
+ return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg);
+ case EXT4_IOC_SETFSUUID:
+ return ext4_ioctl_setuuid(filp, (const void __user *)arg);
default:
return -ENOTTY;
}
@@ -1586,6 +1673,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_CHECKPOINT:
case FS_IOC_GETFSLABEL:
case FS_IOC_SETFSLABEL:
+ case EXT4_IOC_GETFSUUID:
+ case EXT4_IOC_SETFSUUID:
break;
default:
return -ENOIOCTLCMD;
@@ -1599,13 +1688,15 @@ static void set_overhead(struct ext4_super_block *es, const void *arg)
es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
}
-int ext4_update_overhead(struct super_block *sb)
+int ext4_update_overhead(struct super_block *sb, bool force)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (sb_rdonly(sb) || sbi->s_overhead == 0 ||
- sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))
+ if (sb_rdonly(sb))
+ return 0;
+ if (!force &&
+ (sbi->s_overhead == 0 ||
+ sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)))
return 0;
-
return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9e06334771a3..bd8f8b5c3d30 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1933,6 +1933,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
unsigned ret = 0;
int len0 = len;
void *buddy;
+ bool split = false;
BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
BUG_ON(e4b->bd_group != ex->fe_group);
@@ -1957,12 +1958,16 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain buddy itself */
while (len) {
- ord = mb_find_order_for_block(e4b, start);
+ if (!split)
+ ord = mb_find_order_for_block(e4b, start);
if (((start >> ord) << ord) == start && len >= (1 << ord)) {
/* the whole chunk may be allocated at once! */
mlen = 1 << ord;
- buddy = mb_find_buddy(e4b, ord, &max);
+ if (!split)
+ buddy = mb_find_buddy(e4b, ord, &max);
+ else
+ split = false;
BUG_ON((start >> ord) >= max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
@@ -1989,6 +1994,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
mb_clear_bit(cur + 1, buddy);
e4b->bd_info->bb_counters[ord]++;
e4b->bd_info->bb_counters[ord]++;
+ split = true;
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
@@ -5928,6 +5934,15 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks in system zone - "
+ "Block = %llu, count = %lu", block, count);
+ /* err = 0. ext4_std_error should be a no op */
+ goto error_return;
+ }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -5944,6 +5959,8 @@ do_more:
overflow = EXT4_C2B(sbi, bit) + count -
EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
@@ -5958,7 +5975,8 @@ do_more:
goto error_return;
}
- if (!ext4_inode_block_valid(inode, block, count)) {
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
@@ -6081,6 +6099,8 @@ do_more:
block += count;
count = overflow;
put_bh(bitmap_bh);
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
goto do_more;
}
error_return:
@@ -6127,6 +6147,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
"block = %llu, count = %lu", block, count);
return;
}
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
ext4_debug("freeing block %llu\n", block);
trace_ext4_free_blocks(inode, block, count, flags);
@@ -6158,6 +6179,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
block -= overflow;
count += overflow;
}
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
overflow = EXT4_LBLK_COFF(sbi, count);
if (overflow) {
@@ -6168,6 +6191,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
return;
} else
count += sbi->s_cluster_ratio - overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 42f590518b4c..54e7d3c95fd7 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -417,7 +417,7 @@ int ext4_ext_migrate(struct inode *inode)
struct inode *tmp_inode = NULL;
struct migrate_struct lb;
unsigned long max_entries;
- __u32 goal;
+ __u32 goal, tmp_csum_seed;
uid_t owner[2];
/*
@@ -465,6 +465,7 @@ int ext4_ext_migrate(struct inode *inode)
* the migration.
*/
ei = EXT4_I(inode);
+ tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed;
EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
@@ -575,6 +576,7 @@ err_out:
* the inode is not visible to user space.
*/
tmp_inode->i_blocks = 0;
+ EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index db4ba99d1ceb..3a31b662f661 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
ext4_lblk_t *block)
{
+ struct ext4_map_blocks map;
struct buffer_head *bh;
int err;
@@ -63,6 +64,21 @@ static struct buffer_head *ext4_append(handle_t *handle,
return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ map.m_lblk = *block;
+ map.m_len = 1;
+
+ /*
+ * We're appending new directory block. Make sure the block is not
+ * allocated yet, otherwise we will end up corrupting the
+ * directory.
+ */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "Logical block already allocated");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
if (IS_ERR(bh))
@@ -110,6 +126,13 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
+ if (block >= inode->i_size) {
+ ext4_error_inode(inode, func, line, block,
+ "Attempting to read directory block (%u) that is past i_size (%llu)",
+ block, inode->i_size);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
bh = ERR_PTR(-EIO);
else
@@ -3067,11 +3090,8 @@ bool ext4_empty_dir(struct inode *inode)
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
- bh->b_data, bh->b_size, offset)) {
- offset = (offset | (sb->s_blocksize - 1)) + 1;
- continue;
- }
- if (le32_to_cpu(de->inode)) {
+ bh->b_data, bh->b_size, offset) ||
+ le32_to_cpu(de->inode)) {
brelse(bh);
return false;
}
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 7de0612eb42d..69a9cf9137a6 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -181,8 +181,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
} else
brelse(iloc.bh);
- jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
- jbd_debug(4, "orphan inode %lu will point to %d\n",
+ ext4_debug("superblock will point to %lu\n", inode->i_ino);
+ ext4_debug("orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
out:
ext4_std_error(sb, err);
@@ -251,7 +251,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
}
mutex_lock(&sbi->s_orphan_lock);
- jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+ ext4_debug("remove inode %lu from orphan list\n", inode->i_ino);
prev = ei->i_orphan.prev;
list_del_init(&ei->i_orphan);
@@ -267,7 +267,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
ino_next = NEXT_ORPHAN(inode);
if (prev == &sbi->s_orphan) {
- jbd_debug(4, "superblock will point to %u\n", ino_next);
+ ext4_debug("superblock will point to %u\n", ino_next);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode->i_sb,
sbi->s_sbh, EXT4_JTR_NONE);
@@ -286,7 +286,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct inode *i_prev =
&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
- jbd_debug(4, "orphan inode %lu will point to %u\n",
+ ext4_debug("orphan inode %lu will point to %u\n",
i_prev->i_ino, ino_next);
err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
if (err) {
@@ -332,8 +332,8 @@ static void ext4_process_orphan(struct inode *inode,
ext4_msg(sb, KERN_DEBUG,
"%s: truncating inode %lu to %lld bytes",
__func__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %lu to %lld bytes\n",
- inode->i_ino, inode->i_size);
+ ext4_debug("truncating inode %lu to %lld bytes\n",
+ inode->i_ino, inode->i_size);
inode_lock(inode);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ret = ext4_truncate(inode);
@@ -353,8 +353,8 @@ static void ext4_process_orphan(struct inode *inode,
ext4_msg(sb, KERN_DEBUG,
"%s: deleting unreferenced inode %lu",
__func__, inode->i_ino);
- jbd_debug(2, "deleting unreferenced inode %lu\n",
- inode->i_ino);
+ ext4_debug("deleting unreferenced inode %lu\n",
+ inode->i_ino);
(*nr_orphans)++;
}
iput(inode); /* The delete magic happens here! */
@@ -391,7 +391,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
if (!es->s_last_orphan && !oi->of_blocks) {
- jbd_debug(4, "no orphan inodes to clean up\n");
+ ext4_debug("no orphan inodes to clean up\n");
return;
}
@@ -415,7 +415,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
"clearing orphan list.\n");
es->s_last_orphan = 0;
}
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
return;
}
@@ -459,7 +459,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
* so, skip the rest.
*/
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
es->s_last_orphan = 0;
break;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 8b70a4701293..fea2a68d067b 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -97,10 +97,13 @@ int ext4_resize_begin(struct super_block *sb)
return ret;
}
-void ext4_resize_end(struct super_block *sb)
+int ext4_resize_end(struct super_block *sb, bool update_backups)
{
clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags);
smp_mb__after_atomic();
+ if (update_backups)
+ return ext4_update_overhead(sb, true);
+ return 0;
}
static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
@@ -1380,6 +1383,17 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
return err;
}
+static void ext4_add_overhead(struct super_block *sb,
+ const ext4_fsblk_t overhead)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sbi->s_overhead += overhead;
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+ smp_wmb();
+}
+
/*
* ext4_update_super() updates the super block so that the newly added
* groups can be seen by the filesystem.
@@ -1481,9 +1495,18 @@ static void ext4_update_super(struct super_block *sb,
}
/*
- * Update the fs overhead information
+ * Update the fs overhead information.
+ *
+ * For bigalloc, if the superblock already has a properly calculated
+ * overhead, update it with a value based on numbers already computed
+ * above for the newly allocated capacity.
*/
- ext4_calculate_overhead(sb);
+ if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0))
+ ext4_add_overhead(sb,
+ EXT4_NUM_B2C(sbi, blocks_count - free_blocks));
+ else
+ ext4_calculate_overhead(sb);
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
@@ -1988,6 +2011,16 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
}
brelse(bh);
+ /*
+ * For bigalloc, trim the requested size to the nearest cluster
+ * boundary to avoid creating an unusable filesystem. We do this
+ * silently, instead of returning an error, to avoid breaking
+ * callers that blindly resize the filesystem to the full size of
+ * the underlying block device.
+ */
+ if (ext4_has_feature_bigalloc(sb))
+ n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1);
+
retry:
o_blocks_count = ext4_blocks_count(es);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2c68dec63e54..9a66abcca1a8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
@@ -3011,6 +3011,15 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
} else if (test_opt2(sb, DAX_INODE)) {
SEQ_OPTS_PUTS("dax=inode");
}
+
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+ !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+ SEQ_OPTS_PUTS("mb_optimize_scan=0");
+ } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+ test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+ SEQ_OPTS_PUTS("mb_optimize_scan=1");
+ }
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -4272,7 +4281,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi)
return;
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}
@@ -4284,7 +4293,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
if (!sbi)
return NULL;
- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);
sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
@@ -4296,7 +4306,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
sbi->s_sb = sb;
return sbi;
err_out:
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
return NULL;
}
@@ -5523,7 +5533,7 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
"Quota mode: %s.", descr, ext4_quota_mode(sb));
/* Update the s_overhead_clusters if necessary */
- ext4_update_overhead(sb);
+ ext4_update_overhead(sb, false);
return 0;
free_sbi:
@@ -5585,7 +5595,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
return NULL;
}
- jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
+ ext4_debug("Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index d281f5bcc526..3d3ed3c38f56 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -74,6 +74,21 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
struct buffer_head *bh;
+ char *inline_link;
+
+ /*
+ * Create a new inlined symlink is not supported, just provide a
+ * method to read the leftovers.
+ */
+ if (ext4_has_inline_data(inode)) {
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ inline_link = ext4_read_inline_link(inode);
+ if (!IS_ERR(inline_link))
+ set_delayed_call(callback, kfree_link, inline_link);
+ return inline_link;
+ }
if (!dentry) {
bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 564e28a1aa94..533216e80fa2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -436,6 +436,21 @@ error:
return err;
}
+/* Remove entry from mbcache when EA inode is getting evicted */
+void ext4_evict_ea_inode(struct inode *inode)
+{
+ struct mb_cache_entry *oe;
+
+ if (!EA_INODE_CACHE(inode))
+ return;
+ /* Wait for entry to get unused so that we can remove it */
+ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode),
+ ext4_xattr_inode_get_hash(inode), inode->i_ino))) {
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(EA_INODE_CACHE(inode), oe);
+ }
+}
+
static int
ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
struct ext4_xattr_entry *entry, void *buffer,
@@ -976,10 +991,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
- struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
struct ext4_iloc iloc;
s64 ref_count;
- u32 hash;
int ret;
inode_lock(ea_inode);
@@ -1002,14 +1015,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
set_nlink(ea_inode, 1);
ext4_orphan_del(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_create(ea_inode_cache,
- GFP_NOFS, hash,
- ea_inode->i_ino,
- true /* reusable */);
- }
}
} else {
WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
@@ -1022,12 +1027,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
clear_nlink(ea_inode);
ext4_orphan_add(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_delete(ea_inode_cache, hash,
- ea_inode->i_ino);
- }
}
}
@@ -1237,6 +1236,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (error)
goto out;
+retry_ref:
lock_buffer(bh);
hash = le32_to_cpu(BHDR(bh)->h_hash);
ref = le32_to_cpu(BHDR(bh)->h_refcount);
@@ -1246,9 +1246,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
* This must happen under buffer lock for
* ext4_xattr_block_set() to reliably detect freed block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+ bh->b_blocknr);
+ if (oe) {
+ unlock_buffer(bh);
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto retry_ref;
+ }
+ }
get_bh(bh);
unlock_buffer(bh);
@@ -1858,6 +1867,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
#define header(x) ((struct ext4_xattr_header *)(x))
if (s->base) {
+ int offset = (char *)s->here - bs->bh->b_data;
+
BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, sb, bs->bh,
EXT4_JTR_NONE);
@@ -1873,9 +1884,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
* ext4_xattr_block_set() to reliably detect modified
* block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bs->bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache,
+ hash, bs->bh->b_blocknr);
+ if (oe) {
+ /*
+ * Xattr block is getting reused. Leave
+ * it alone.
+ */
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto clone_block;
+ }
+ }
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s, handle, inode,
true /* is_block */);
@@ -1890,49 +1912,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (error)
goto cleanup;
goto inserted;
- } else {
- int offset = (char *)s->here - bs->bh->b_data;
+ }
+clone_block:
+ unlock_buffer(bs->bh);
+ ea_bdebug(bs->bh, "cloning");
+ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ s->first = ENTRY(header(s->base)+1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->here = ENTRY(s->base + offset);
+ s->end = s->base + bs->bh->b_size;
- unlock_buffer(bs->bh);
- ea_bdebug(bs->bh, "cloning");
- s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
- error = -ENOMEM;
- if (s->base == NULL)
+ /*
+ * If existing entry points to an xattr inode, we need
+ * to prevent ext4_xattr_set_entry() from decrementing
+ * ref count on it because the reference belongs to the
+ * original block. In this case, make the entry look
+ * like it has an empty value.
+ */
+ if (!s->not_found && s->here->e_value_inum) {
+ ea_ino = le32_to_cpu(s->here->e_value_inum);
+ error = ext4_xattr_inode_iget(inode, ea_ino,
+ le32_to_cpu(s->here->e_hash),
+ &tmp_inode);
+ if (error)
goto cleanup;
- s->first = ENTRY(header(s->base)+1);
- header(s->base)->h_refcount = cpu_to_le32(1);
- s->here = ENTRY(s->base + offset);
- s->end = s->base + bs->bh->b_size;
-
- /*
- * If existing entry points to an xattr inode, we need
- * to prevent ext4_xattr_set_entry() from decrementing
- * ref count on it because the reference belongs to the
- * original block. In this case, make the entry look
- * like it has an empty value.
- */
- if (!s->not_found && s->here->e_value_inum) {
- ea_ino = le32_to_cpu(s->here->e_value_inum);
- error = ext4_xattr_inode_iget(inode, ea_ino,
- le32_to_cpu(s->here->e_hash),
- &tmp_inode);
- if (error)
- goto cleanup;
-
- if (!ext4_test_inode_state(tmp_inode,
- EXT4_STATE_LUSTRE_EA_INODE)) {
- /*
- * Defer quota free call for previous
- * inode until success is guaranteed.
- */
- old_ea_inode_quota = le32_to_cpu(
- s->here->e_value_size);
- }
- iput(tmp_inode);
- s->here->e_value_inum = 0;
- s->here->e_value_size = 0;
+ if (!ext4_test_inode_state(tmp_inode,
+ EXT4_STATE_LUSTRE_EA_INODE)) {
+ /*
+ * Defer quota free call for previous
+ * inode until success is guaranteed.
+ */
+ old_ea_inode_quota = le32_to_cpu(
+ s->here->e_value_size);
}
+ iput(tmp_inode);
+
+ s->here->e_value_inum = 0;
+ s->here->e_value_size = 0;
}
} else {
/* Allocate a buffer where we construct the new block. */
@@ -1999,18 +2019,13 @@ inserted:
lock_buffer(new_bh);
/*
* We have to be careful about races with
- * freeing, rehashing or adding references to
- * xattr block. Once we hold buffer lock xattr
- * block's state is stable so we can check
- * whether the block got freed / rehashed or
- * not. Since we unhash mbcache entry under
- * buffer lock when freeing / rehashing xattr
- * block, checking whether entry is still
- * hashed is reliable. Same rules hold for
- * e_reusable handling.
+ * adding references to xattr block. Once we
+ * hold buffer lock xattr block's state is
+ * stable so we can check the additional
+ * reference fits.
*/
- if (hlist_bl_unhashed(&ce->e_hash_list) ||
- !ce->e_reusable) {
+ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ if (ref > EXT4_XATTR_REFCOUNT_MAX) {
/*
* Undo everything and check mbcache
* again.
@@ -2025,9 +2040,8 @@ inserted:
new_bh = NULL;
goto inserted;
}
- ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
- if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+ if (ref == EXT4_XATTR_REFCOUNT_MAX)
ce->e_reusable = 0;
ea_bdebug(new_bh, "reusing; refcount now=%d",
ref);
@@ -2175,8 +2189,9 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_inode *raw_inode;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return 0;
+
raw_inode = ext4_raw_inode(&is->iloc);
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
@@ -2204,8 +2219,9 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_search *s = &is->s;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return -ENOSPC;
+
error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
if (error)
return error;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 77efb9a627ad..824faf0b15a8 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -84,7 +84,7 @@ struct ext4_xattr_entry {
/*
* The minimum size of EA value when you start storing it in an external inode
* size of block - size of header - size of 1 entry - 4 null bytes
-*/
+ */
#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \
((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
@@ -95,6 +95,19 @@ struct ext4_xattr_entry {
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+/*
+ * If we want to add an xattr to the inode, we should make sure that
+ * i_extra_isize is not 0 and that the inode size is not less than
+ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad.
+ * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data
+ * |--------------------------|------------|------|---------|---|-------|
+ */
+#define EXT4_INODE_HAS_XATTR_SPACE(inode) \
+ ((EXT4_I(inode)->i_extra_isize != 0) && \
+ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \
+ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \
+ EXT4_INODE_SIZE((inode)->i_sb)))
+
struct ext4_xattr_info {
const char *name;
const void *value;
@@ -178,6 +191,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
+extern void ext4_evict_ea_inode(struct inode *inode);
extern const struct xattr_handler *ext4_xattr_handlers[];
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 009e6c519e98..70e97075e535 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -729,14 +729,19 @@ out:
return ret;
}
-void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
+static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
+ bool pre_alloc);
+static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback, bool pre_alloc);
+
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
struct f2fs_inode_info *fi = F2FS_I(dic->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
+ bool bypass_callback = false;
int ret;
- int i;
trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
dic->cluster_size, fi->i_compress_algorithm);
@@ -746,41 +751,10 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
goto out_end_io;
}
- dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
- if (!dic->tpages) {
- ret = -ENOMEM;
- goto out_end_io;
- }
-
- for (i = 0; i < dic->cluster_size; i++) {
- if (dic->rpages[i]) {
- dic->tpages[i] = dic->rpages[i];
- continue;
- }
-
- dic->tpages[i] = f2fs_compress_alloc_page();
- if (!dic->tpages[i]) {
- ret = -ENOMEM;
- goto out_end_io;
- }
- }
-
- if (cops->init_decompress_ctx) {
- ret = cops->init_decompress_ctx(dic);
- if (ret)
- goto out_end_io;
- }
-
- dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
- if (!dic->rbuf) {
- ret = -ENOMEM;
- goto out_destroy_decompress_ctx;
- }
-
- dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
- if (!dic->cbuf) {
- ret = -ENOMEM;
- goto out_vunmap_rbuf;
+ ret = f2fs_prepare_decomp_mem(dic, false);
+ if (ret) {
+ bypass_callback = true;
+ goto out_release;
}
dic->clen = le32_to_cpu(dic->cbuf->clen);
@@ -788,7 +762,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
ret = -EFSCORRUPTED;
- goto out_vunmap_cbuf;
+ goto out_release;
}
ret = cops->decompress_pages(dic);
@@ -809,17 +783,13 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
}
}
-out_vunmap_cbuf:
- vm_unmap_ram(dic->cbuf, dic->nr_cpages);
-out_vunmap_rbuf:
- vm_unmap_ram(dic->rbuf, dic->cluster_size);
-out_destroy_decompress_ctx:
- if (cops->destroy_decompress_ctx)
- cops->destroy_decompress_ctx(dic);
+out_release:
+ f2fs_release_decomp_mem(dic, bypass_callback, false);
+
out_end_io:
trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
dic->clen, ret);
- f2fs_decompress_end_io(dic, ret);
+ f2fs_decompress_end_io(dic, ret, in_task);
}
/*
@@ -829,7 +799,7 @@ out_end_io:
* (or in the case of a failure, cleans up without actually decompressing).
*/
void f2fs_end_read_compressed_page(struct page *page, bool failed,
- block_t blkaddr)
+ block_t blkaddr, bool in_task)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);
@@ -839,12 +809,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed,
if (failed)
WRITE_ONCE(dic->failed, true);
- else if (blkaddr)
+ else if (blkaddr && in_task)
f2fs_cache_compressed_page(sbi, page,
dic->inode->i_ino, blkaddr);
if (atomic_dec_and_test(&dic->remaining_pages))
- f2fs_decompress_cluster(dic);
+ f2fs_decompress_cluster(dic, in_task);
}
static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@@ -871,19 +841,26 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
return is_page_in_cluster(cc, index);
}
-bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
- int index, int nr_pages)
+bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages,
+ int index, int nr_pages, bool uptodate)
{
- unsigned long pgidx;
- int i;
+ unsigned long pgidx = pages[index]->index;
+ int i = uptodate ? 0 : 1;
- if (nr_pages - index < cc->cluster_size)
+ /*
+ * when uptodate set to true, try to check all pages in cluster is
+ * uptodate or not.
+ */
+ if (uptodate && (pgidx % cc->cluster_size))
return false;
- pgidx = pvec->pages[index]->index;
+ if (nr_pages - index < cc->cluster_size)
+ return false;
- for (i = 1; i < cc->cluster_size; i++) {
- if (pvec->pages[index + i]->index != pgidx + i)
+ for (; i < cc->cluster_size; i++) {
+ if (pages[index + i]->index != pgidx + i)
+ return false;
+ if (uptodate && !PageUptodate(pages[index + i]))
return false;
}
@@ -1552,16 +1529,85 @@ destroy_out:
return err;
}
-static void f2fs_free_dic(struct decompress_io_ctx *dic);
+static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi,
+ bool pre_alloc)
+{
+ return pre_alloc ^ f2fs_low_mem_mode(sbi);
+}
+
+static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
+ bool pre_alloc)
+{
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+ int i;
+
+ if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc))
+ return 0;
+
+ dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
+ if (!dic->tpages)
+ return -ENOMEM;
+
+ for (i = 0; i < dic->cluster_size; i++) {
+ if (dic->rpages[i]) {
+ dic->tpages[i] = dic->rpages[i];
+ continue;
+ }
+
+ dic->tpages[i] = f2fs_compress_alloc_page();
+ if (!dic->tpages[i])
+ return -ENOMEM;
+ }
+
+ dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
+ if (!dic->rbuf)
+ return -ENOMEM;
+
+ dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
+ if (!dic->cbuf)
+ return -ENOMEM;
+
+ if (cops->init_decompress_ctx) {
+ int ret = cops->init_decompress_ctx(dic);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback, bool pre_alloc)
+{
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+
+ if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc))
+ return;
+
+ if (!bypass_destroy_callback && cops->destroy_decompress_ctx)
+ cops->destroy_decompress_ctx(dic);
+
+ if (dic->cbuf)
+ vm_unmap_ram(dic->cbuf, dic->nr_cpages);
+
+ if (dic->rbuf)
+ vm_unmap_ram(dic->rbuf, dic->cluster_size);
+}
+
+static void f2fs_free_dic(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
{
struct decompress_io_ctx *dic;
pgoff_t start_idx = start_idx_of_cluster(cc);
- int i;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+ int i, ret;
- dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO,
- false, F2FS_I_SB(cc->inode));
+ dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi);
if (!dic)
return ERR_PTR(-ENOMEM);
@@ -1587,32 +1633,43 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->nr_rpages = cc->cluster_size;
dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages);
- if (!dic->cpages)
+ if (!dic->cpages) {
+ ret = -ENOMEM;
goto out_free;
+ }
for (i = 0; i < dic->nr_cpages; i++) {
struct page *page;
page = f2fs_compress_alloc_page();
- if (!page)
+ if (!page) {
+ ret = -ENOMEM;
goto out_free;
+ }
f2fs_set_compressed_page(page, cc->inode,
start_idx + i + 1, dic);
dic->cpages[i] = page;
}
+ ret = f2fs_prepare_decomp_mem(dic, true);
+ if (ret)
+ goto out_free;
+
return dic;
out_free:
- f2fs_free_dic(dic);
- return ERR_PTR(-ENOMEM);
+ f2fs_free_dic(dic, true);
+ return ERR_PTR(ret);
}
-static void f2fs_free_dic(struct decompress_io_ctx *dic)
+static void f2fs_free_dic(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback)
{
int i;
+ f2fs_release_decomp_mem(dic, bypass_destroy_callback, true);
+
if (dic->tpages) {
for (i = 0; i < dic->cluster_size; i++) {
if (dic->rpages[i])
@@ -1637,17 +1694,33 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic)
kmem_cache_free(dic_entry_slab, dic);
}
-static void f2fs_put_dic(struct decompress_io_ctx *dic)
+static void f2fs_late_free_dic(struct work_struct *work)
{
- if (refcount_dec_and_test(&dic->refcnt))
- f2fs_free_dic(dic);
+ struct decompress_io_ctx *dic =
+ container_of(work, struct decompress_io_ctx, free_work);
+
+ f2fs_free_dic(dic, false);
+}
+
+static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task)
+{
+ if (refcount_dec_and_test(&dic->refcnt)) {
+ if (in_task) {
+ f2fs_free_dic(dic, false);
+ } else {
+ INIT_WORK(&dic->free_work, f2fs_late_free_dic);
+ queue_work(F2FS_I_SB(dic->inode)->post_read_wq,
+ &dic->free_work);
+ }
+ }
}
/*
* Update and unlock the cluster's pagecache pages, and release the reference to
* the decompress_io_ctx that was being held for I/O completion.
*/
-static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task)
{
int i;
@@ -1668,7 +1741,7 @@ static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
unlock_page(rpage);
}
- f2fs_put_dic(dic);
+ f2fs_put_dic(dic, in_task);
}
static void f2fs_verify_cluster(struct work_struct *work)
@@ -1685,14 +1758,15 @@ static void f2fs_verify_cluster(struct work_struct *work)
SetPageError(rpage);
}
- __f2fs_decompress_end_io(dic, false);
+ __f2fs_decompress_end_io(dic, false, true);
}
/*
* This is called when a compressed cluster has been decompressed
* (or failed to be read and/or decompressed).
*/
-void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task)
{
if (!failed && dic->need_verity) {
/*
@@ -1704,7 +1778,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
fsverity_enqueue_verify_work(&dic->verity_work);
} else {
- __f2fs_decompress_end_io(dic, failed);
+ __f2fs_decompress_end_io(dic, failed, in_task);
}
}
@@ -1713,12 +1787,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
*
* This is called when the page is no longer needed and can be freed.
*/
-void f2fs_put_page_dic(struct page *page)
+void f2fs_put_page_dic(struct page *page, bool in_task)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);
- f2fs_put_dic(dic);
+ f2fs_put_dic(dic, in_task);
}
/*
@@ -1903,6 +1977,9 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
dev_t dev = sbi->sb->s_bdev->bd_dev;
char slab_name[32];
+ if (!f2fs_sb_has_compression(sbi))
+ return 0;
+
sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev));
sbi->page_array_slab_size = sizeof(struct page *) <<
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ed503d50f95f..aa3ccddfa037 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -119,7 +119,7 @@ struct bio_post_read_ctx {
block_t fs_blkaddr;
};
-static void f2fs_finish_read_bio(struct bio *bio)
+static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -133,8 +133,9 @@ static void f2fs_finish_read_bio(struct bio *bio)
if (f2fs_is_compressed_page(page)) {
if (bio->bi_status)
- f2fs_end_read_compressed_page(page, true, 0);
- f2fs_put_page_dic(page);
+ f2fs_end_read_compressed_page(page, true, 0,
+ in_task);
+ f2fs_put_page_dic(page, in_task);
continue;
}
@@ -191,7 +192,7 @@ static void f2fs_verify_bio(struct work_struct *work)
fsverity_verify_bio(bio);
}
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, true);
}
/*
@@ -203,7 +204,7 @@ static void f2fs_verify_bio(struct work_struct *work)
* can involve reading verity metadata pages from the file, and these verity
* metadata pages may be encrypted and/or compressed.
*/
-static void f2fs_verify_and_finish_bio(struct bio *bio)
+static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task)
{
struct bio_post_read_ctx *ctx = bio->bi_private;
@@ -211,7 +212,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio)
INIT_WORK(&ctx->work, f2fs_verify_bio);
fsverity_enqueue_verify_work(&ctx->work);
} else {
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, in_task);
}
}
@@ -224,7 +225,8 @@ static void f2fs_verify_and_finish_bio(struct bio *bio)
* that the bio includes at least one compressed page. The actual decompression
* is done on a per-cluster basis, not a per-bio basis.
*/
-static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
+static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
+ bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -237,7 +239,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
/* PG_error was set if decryption failed. */
if (f2fs_is_compressed_page(page))
f2fs_end_read_compressed_page(page, PageError(page),
- blkaddr);
+ blkaddr, in_task);
else
all_compressed = false;
@@ -262,15 +264,16 @@ static void f2fs_post_read_work(struct work_struct *work)
fscrypt_decrypt_bio(ctx->bio);
if (ctx->enabled_steps & STEP_DECOMPRESS)
- f2fs_handle_step_decompress(ctx);
+ f2fs_handle_step_decompress(ctx, true);
- f2fs_verify_and_finish_bio(ctx->bio);
+ f2fs_verify_and_finish_bio(ctx->bio, true);
}
static void f2fs_read_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
struct bio_post_read_ctx *ctx;
+ bool intask = in_task();
iostat_update_and_unbind_ctx(bio, 0);
ctx = bio->bi_private;
@@ -281,16 +284,29 @@ static void f2fs_read_end_io(struct bio *bio)
}
if (bio->bi_status) {
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, intask);
return;
}
- if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) {
- INIT_WORK(&ctx->work, f2fs_post_read_work);
- queue_work(ctx->sbi->post_read_wq, &ctx->work);
- } else {
- f2fs_verify_and_finish_bio(bio);
+ if (ctx) {
+ unsigned int enabled_steps = ctx->enabled_steps &
+ (STEP_DECRYPT | STEP_DECOMPRESS);
+
+ /*
+ * If we have only decompression step between decompression and
+ * decrypt, we don't need post processing for this.
+ */
+ if (enabled_steps == STEP_DECOMPRESS &&
+ !f2fs_low_mem_mode(sbi)) {
+ f2fs_handle_step_decompress(ctx, intask);
+ } else if (enabled_steps) {
+ INIT_WORK(&ctx->work, f2fs_post_read_work);
+ queue_work(ctx->sbi->post_read_wq, &ctx->work);
+ return;
+ }
}
+
+ f2fs_verify_and_finish_bio(bio, intask);
}
static void f2fs_write_end_io(struct bio *bio)
@@ -1682,8 +1698,6 @@ sync_out:
*/
f2fs_wait_on_block_writeback_range(inode,
map->m_pblk, map->m_len);
- invalidate_mapping_pages(META_MAPPING(sbi),
- map->m_pblk, map->m_pblk);
if (map->m_multidev_dio) {
block_t blk_addr = map->m_pblk;
@@ -2223,7 +2237,7 @@ skip_reading_dnode:
if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
if (atomic_dec_and_test(&dic->remaining_pages))
- f2fs_decompress_cluster(dic);
+ f2fs_decompress_cluster(dic, true);
continue;
}
@@ -2241,7 +2255,7 @@ submit_and_realloc:
page->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
- f2fs_decompress_end_io(dic, ret);
+ f2fs_decompress_end_io(dic, ret, true);
f2fs_put_dnode(&dn);
*bio_ret = NULL;
return ret;
@@ -2731,6 +2745,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.submitted = false,
.compr_blocks = compr_blocks,
.need_lock = LOCK_RETRY,
+ .post_read = f2fs_post_read_required(inode),
.io_type = io_type,
.io_wbc = wbc,
.bio = bio,
@@ -2902,7 +2917,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
{
int ret = 0;
int done = 0, retry = 0;
- struct pagevec pvec;
+ struct page *pages[F2FS_ONSTACK_PAGES];
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
struct bio *bio = NULL;
sector_t last_block;
@@ -2933,8 +2948,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int submitted = 0;
int i;
- pagevec_init(&pvec);
-
if (get_dirty_pages(mapping->host) <=
SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
set_inode_flag(mapping->host, FI_HOT_DATA);
@@ -2960,13 +2973,13 @@ retry:
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && !retry && (index <= end)) {
- nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
- tag);
+ nr_pages = find_get_pages_range_tag(mapping, &index, end,
+ tag, F2FS_ONSTACK_PAGES, pages);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ struct page *page = pages[i];
bool need_readd;
readd:
need_readd = false;
@@ -2997,6 +3010,10 @@ readd:
if (!f2fs_cluster_is_empty(&cc))
goto lock_page;
+ if (f2fs_all_cluster_page_ready(&cc,
+ pages, i, nr_pages, true))
+ goto lock_page;
+
ret2 = f2fs_prepare_compress_overwrite(
inode, &pagep,
page->index, &fsdata);
@@ -3007,8 +3024,8 @@ readd:
} else if (ret2 &&
(!f2fs_compress_write_end(inode,
fsdata, page->index, 1) ||
- !f2fs_all_cluster_page_loaded(&cc,
- &pvec, i, nr_pages))) {
+ !f2fs_all_cluster_page_ready(&cc,
+ pages, i, nr_pages, false))) {
retry = 1;
break;
}
@@ -3098,7 +3115,7 @@ next:
if (need_readd)
goto readd;
}
- pagevec_release(&pvec);
+ release_pages(pages, nr_pages);
cond_resched();
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -3408,12 +3425,11 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
struct inode *cow_inode = F2FS_I(inode)->cow_inode;
pgoff_t index = page->index;
int err = 0;
- block_t ori_blk_addr;
+ block_t ori_blk_addr = NULL_ADDR;
/* If pos is beyond the end of file, reserve a new block in COW inode */
if ((pos & PAGE_MASK) >= i_size_read(inode))
- return __reserve_data_block(cow_inode, index, blk_addr,
- node_changed);
+ goto reserve_block;
/* Look for the block in COW inode first */
err = __find_data_block(cow_inode, index, blk_addr);
@@ -3427,10 +3443,12 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
if (err)
return err;
+reserve_block:
/* Finally, we should reserve a new block in COW inode for the update */
err = __reserve_data_block(cow_inode, index, blk_addr, node_changed);
if (err)
return err;
+ inc_atomic_write_cnt(inode);
if (ori_blk_addr != NULL_ADDR)
*blk_addr = ori_blk_addr;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index c92625ef16d0..c01471573977 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -39,7 +39,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi)
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = BLKS_PER_SEC(sbi);
+ blks_per_sec = CAP_BLKS_PER_SEC(sbi);
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, true);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 85921a1f2db8..3c7cdb70fe2e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -159,6 +159,7 @@ struct f2fs_mount_info {
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
+ int memory_mode; /* memory mode */
int discard_unit; /*
* discard command's offset/size should
* be aligned to this unit: block,
@@ -229,7 +230,6 @@ enum {
#define CP_PAUSE 0x00000040
#define CP_RESIZE 0x00000080
-#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */
#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */
#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */
@@ -598,6 +598,8 @@ enum {
#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
#define RECOVERY_MIN_RA_BLOCKS 1
+#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */
+
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
union {
@@ -757,6 +759,7 @@ enum {
FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */
FI_COMPRESS_RELEASED, /* compressed blocks were released */
FI_ALIGNED_WRITE, /* enable aligned write */
+ FI_COW_FILE, /* indicate COW file */
FI_MAX, /* max flag, never be used */
};
@@ -812,6 +815,8 @@ struct f2fs_inode_info {
unsigned char i_compress_level; /* compress level (lz4hc,zstd) */
unsigned short i_compress_flag; /* compress flag */
unsigned int i_cluster_size; /* cluster size */
+
+ unsigned int atomic_write_cnt;
};
static inline void get_extent_info(struct extent_info *ext,
@@ -1198,6 +1203,7 @@ struct f2fs_io_info {
bool retry; /* need to reallocate block address */
int compr_blocks; /* # of compressed block addresses */
bool encrypted; /* indicate file is encrypted */
+ bool post_read; /* require post read */
enum iostat_type io_type; /* io type */
struct writeback_control *io_wbc; /* writeback control */
struct bio **bio; /* bio for ipu */
@@ -1234,7 +1240,6 @@ struct f2fs_dev_info {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_blkz; /* Total number of zones */
unsigned long *blkz_seq; /* Bitmap indicating sequential zones */
- block_t *zone_capacity_blocks; /* Array of zone capacity in blks */
#endif
};
@@ -1360,6 +1365,13 @@ enum {
DISCARD_UNIT_SECTION, /* basic discard unit is section */
};
+enum {
+ MEMORY_MODE_NORMAL, /* memory mode for normal devices */
+ MEMORY_MODE_LOW, /* memory mode for low memry devices */
+};
+
+
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1580,6 +1592,7 @@ struct decompress_io_ctx {
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
struct work_struct verity_work; /* work to verify the decompressed pages */
+ struct work_struct free_work; /* work for late free this structure itself */
};
#define NULL_CLUSTER ((unsigned int)(~0))
@@ -1664,6 +1677,7 @@ struct f2fs_sb_info {
unsigned int meta_ino_num; /* meta inode number*/
unsigned int log_blocks_per_seg; /* log2 blocks per segment */
unsigned int blocks_per_seg; /* blocks per segment */
+ unsigned int unusable_blocks_per_sec; /* unusable blocks per section */
unsigned int segs_per_sec; /* segments per section */
unsigned int secs_per_zone; /* sections per zone */
unsigned int total_sections; /* total section count */
@@ -1804,6 +1818,12 @@ struct f2fs_sb_info {
int max_fragment_chunk; /* max chunk size for block fragmentation mode */
int max_fragment_hole; /* max hole size for block fragmentation mode */
+ /* For atomic write statistics */
+ atomic64_t current_atomic_write;
+ s64 peak_atomic_write;
+ u64 committed_atomic_block;
+ u64 revoked_atomic_block;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -2418,6 +2438,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA);
}
+static inline void inc_atomic_write_cnt(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ u64 current_write;
+
+ fi->atomic_write_cnt++;
+ atomic64_inc(&sbi->current_atomic_write);
+ current_write = atomic64_read(&sbi->current_atomic_write);
+ if (current_write > sbi->peak_atomic_write)
+ sbi->peak_atomic_write = current_write;
+}
+
+static inline void release_atomic_write_cnt(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write);
+ fi->atomic_write_cnt = 0;
+}
+
static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
{
return atomic_read(&sbi->nr_pages[count_type]);
@@ -2696,16 +2738,6 @@ static inline struct page *f2fs_pagecache_get_page(
return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
}
-static inline void f2fs_copy_page(struct page *src, struct page *dst)
-{
- char *src_kaddr = kmap(src);
- char *dst_kaddr = kmap(dst);
-
- memcpy(dst_kaddr, src_kaddr, PAGE_SIZE);
- kunmap(dst);
- kunmap(src);
-}
-
static inline void f2fs_put_page(struct page *page, int unlock)
{
if (!page)
@@ -3208,6 +3240,11 @@ static inline bool f2fs_is_atomic_file(struct inode *inode)
return is_inode_flag_set(inode, FI_ATOMIC_FILE);
}
+static inline bool f2fs_is_cow_file(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_COW_FILE);
+}
+
static inline bool f2fs_is_first_block_written(struct inode *inode)
{
return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN);
@@ -4154,13 +4191,13 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
bool f2fs_is_compress_backend_ready(struct inode *inode);
int f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
-void f2fs_decompress_cluster(struct decompress_io_ctx *dic);
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task);
void f2fs_end_read_compressed_page(struct page *page, bool failed,
- block_t blkaddr);
+ block_t blkaddr, bool in_task);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
-bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
- int index, int nr_pages);
+bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages,
+ int index, int nr_pages, bool uptodate);
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
int f2fs_write_multi_pages(struct compress_ctx *cc,
@@ -4175,8 +4212,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
-void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
-void f2fs_put_page_dic(struct page *page);
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task);
+void f2fs_put_page_dic(struct page *page, bool in_task);
unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
@@ -4222,13 +4260,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
}
static inline int f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
-static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { }
+static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic,
+ bool in_task) { }
static inline void f2fs_end_read_compressed_page(struct page *page,
- bool failed, block_t blkaddr)
+ bool failed, block_t blkaddr, bool in_task)
{
WARN_ON_ONCE(1);
}
-static inline void f2fs_put_page_dic(struct page *page)
+static inline void f2fs_put_page_dic(struct page *page, bool in_task)
{
WARN_ON_ONCE(1);
}
@@ -4254,8 +4293,9 @@ static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode,
unsigned int c_len) { }
#endif
-static inline void set_compress_context(struct inode *inode)
+static inline int set_compress_context(struct inode *inode)
{
+#ifdef CONFIG_F2FS_FS_COMPRESSION
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
F2FS_I(inode)->i_compress_algorithm =
@@ -4278,6 +4318,10 @@ static inline void set_compress_context(struct inode *inode)
stat_inc_compr_inode(inode);
inc_compr_inode_stat(inode);
f2fs_mark_inode_dirty_sync(inode, true);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
}
static inline bool f2fs_disable_compressed_file(struct inode *inode)
@@ -4394,10 +4438,15 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
+static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi)
+{
+ return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW;
+}
+
static inline bool f2fs_may_compress(struct inode *inode)
{
if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
- f2fs_is_atomic_file(inode))
+ f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode))
return false;
return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
}
@@ -4459,12 +4508,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
/* disallow direct IO if any of devices has unaligned blksize */
if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
return true;
- /*
- * for blkzoned device, fallback direct IO to buffered IO, so
- * all IOs can be serialized by log-structured write.
- */
- if (f2fs_sb_has_blkzoned(sbi))
- return true;
+
if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
if (block_unaligned_IO(inode, iocb, iter))
return true;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d66e37d80a2d..ce4905a073b3 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1272,7 +1272,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
f2fs_put_page(psrc, 1);
return PTR_ERR(pdst);
}
- f2fs_copy_page(psrc, pdst);
+ memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE);
set_page_dirty(pdst);
f2fs_put_page(pdst, 1);
f2fs_put_page(psrc, 1);
@@ -1675,7 +1675,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
return 0;
if (f2fs_is_pinned_file(inode)) {
- block_t sec_blks = BLKS_PER_SEC(sbi);
+ block_t sec_blks = CAP_BLKS_PER_SEC(sbi);
block_t sec_len = roundup(map.m_len, sec_blks);
map.m_len = sec_blks;
@@ -1816,8 +1816,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
atomic_read(&inode->i_writecount) != 1)
return 0;
- if (f2fs_is_atomic_file(inode))
- f2fs_abort_atomic_write(inode, true);
+ f2fs_abort_atomic_write(inode, true);
return 0;
}
@@ -1831,8 +1830,7 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
* until all the writers close its file. Since this should be done
* before dropping file lock, it needs to do in ->flush.
*/
- if (f2fs_is_atomic_file(inode) &&
- F2FS_I(inode)->atomic_write_task == current)
+ if (F2FS_I(inode)->atomic_write_task == current)
f2fs_abort_atomic_write(inode, true);
return 0;
}
@@ -1867,22 +1865,15 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
if (masked_flags & F2FS_COMPR_FL) {
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
- }
- if (iflags & F2FS_NOCOMP_FL)
- return -EINVAL;
- if (iflags & F2FS_COMPR_FL) {
+ } else {
if (!f2fs_may_compress(inode))
return -EINVAL;
- if (S_ISREG(inode->i_mode) && inode->i_size)
+ if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
return -EINVAL;
-
- set_compress_context(inode);
+ if (set_compress_context(inode))
+ return -EOPNOTSUPP;
}
}
- if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) {
- if (masked_flags & F2FS_COMPR_FL)
- return -EINVAL;
- }
fi->i_flags = iflags | (fi->i_flags & ~mask);
f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) &&
@@ -2062,13 +2053,14 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
set_inode_flag(inode, FI_ATOMIC_FILE);
- set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE);
+ set_inode_flag(fi->cow_inode, FI_COW_FILE);
clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
f2fs_update_time(sbi, REQ_TIME);
fi->atomic_write_task = current;
stat_update_max_atomic_write(inode);
+ fi->atomic_write_cnt = 0;
out:
inode_unlock(inode);
mnt_drop_write_file(filp);
@@ -2109,6 +2101,30 @@ unlock_out:
return ret;
}
+static int f2fs_ioc_abort_atomic_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
+ int ret;
+
+ if (!inode_owner_or_capable(mnt_userns, inode))
+ return -EACCES;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ inode_lock(inode);
+
+ f2fs_abort_atomic_write(inode, true);
+
+ inode_unlock(inode);
+
+ mnt_drop_write_file(filp);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ return ret;
+}
+
static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -2426,7 +2442,7 @@ do_more:
ret = -EAGAIN;
goto out;
}
- range->start += BLKS_PER_SEC(sbi);
+ range->start += CAP_BLKS_PER_SEC(sbi);
if (range->start <= end)
goto do_more;
out:
@@ -2551,7 +2567,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
goto out;
}
- sec_num = DIV_ROUND_UP(total, BLKS_PER_SEC(sbi));
+ sec_num = DIV_ROUND_UP(total, CAP_BLKS_PER_SEC(sbi));
/*
* make sure there are enough free section for LFS allocation, this can
@@ -3897,10 +3913,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
for (i = 0; i < page_len; i++, redirty_idx++) {
page = find_lock_page(mapping, redirty_idx);
- if (!page) {
- ret = -ENOMEM;
- break;
- }
+
+ /* It will never fail, when page has pinned above */
+ f2fs_bug_on(F2FS_I_SB(inode), !page);
+
set_page_dirty(page);
f2fs_put_page(page, 1);
f2fs_put_page(page, 0);
@@ -3939,6 +3955,11 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg)
goto out;
}
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret)
goto out;
@@ -4006,6 +4027,11 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg)
goto out;
}
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret)
goto out;
@@ -4054,9 +4080,10 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_start_atomic_write(filp);
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
return f2fs_ioc_commit_atomic_write(filp);
+ case F2FS_IOC_ABORT_ATOMIC_WRITE:
+ return f2fs_ioc_abort_atomic_write(filp);
case F2FS_IOC_START_VOLATILE_WRITE:
case F2FS_IOC_RELEASE_VOLATILE_WRITE:
- case F2FS_IOC_ABORT_VOLATILE_WRITE:
return -EOPNOTSUPP;
case F2FS_IOC_SHUTDOWN:
return f2fs_ioc_shutdown(filp, arg);
@@ -4725,7 +4752,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
case F2FS_IOC_START_VOLATILE_WRITE:
case F2FS_IOC_RELEASE_VOLATILE_WRITE:
- case F2FS_IOC_ABORT_VOLATILE_WRITE:
+ case F2FS_IOC_ABORT_ATOMIC_WRITE:
case F2FS_IOC_SHUTDOWN:
case FITRIM:
case FS_IOC_SET_ENCRYPTION_POLICY:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d5fb426e0747..6da21d405ce1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -150,8 +150,11 @@ do_gc:
gc_control.nr_free_secs = foreground ? 1 : 0;
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, &gc_control))
- wait_ms = gc_th->no_gc_sleep_time;
+ if (f2fs_gc(sbi, &gc_control)) {
+ /* don't bother wait_ms by foreground gc */
+ if (!foreground)
+ wait_ms = gc_th->no_gc_sleep_time;
+ }
if (foreground)
wake_up_all(&gc_th->fggc_wq);
@@ -487,7 +490,7 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi,
unsigned long long age, u, accu;
unsigned long long max_mtime = sit_i->dirty_max_mtime;
unsigned long long min_mtime = sit_i->dirty_min_mtime;
- unsigned int sec_blocks = BLKS_PER_SEC(sbi);
+ unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi);
unsigned int vblocks;
unsigned int dirty_threshold = max(am->max_candidate_count,
am->candidate_ratio *
@@ -1487,7 +1490,7 @@ next_step:
*/
if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
(!force_migrate && get_valid_blocks(sbi, segno, true) ==
- BLKS_PER_SEC(sbi)))
+ CAP_BLKS_PER_SEC(sbi)))
return submitted;
if (check_valid_map(sbi, segno, off) == 0)
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 3fe145e8e594..19b956c2d697 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -120,15 +120,13 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
return free_blks - ovp_blks;
}
-static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
+static inline block_t limit_invalid_user_blocks(block_t user_block_count)
{
- return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100;
+ return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100;
}
-static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
+static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks)
{
- block_t reclaimable_user_blocks = sbi->user_block_count -
- written_block_count(sbi);
return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
}
@@ -163,15 +161,16 @@ static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
{
- block_t invalid_user_blocks = sbi->user_block_count -
- written_block_count(sbi);
+ block_t user_block_count = sbi->user_block_count;
+ block_t invalid_user_blocks = user_block_count -
+ written_block_count(sbi);
/*
* Background GC is triggered with the following conditions.
* 1. There are a number of invalid blocks.
* 2. There is not enough free space.
*/
- if (invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
- free_user_blocks(sbi) < limit_free_user_blocks(sbi))
- return true;
- return false;
+ return (invalid_user_blocks >
+ limit_invalid_user_blocks(user_block_count) &&
+ free_user_blocks(sbi) <
+ limit_free_user_blocks(invalid_user_blocks));
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index fc55f5bd1fcc..6d11c365d7b4 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -744,8 +744,7 @@ void f2fs_evict_inode(struct inode *inode)
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
int err = 0;
- if (f2fs_is_atomic_file(inode))
- f2fs_abort_atomic_write(inode, true);
+ f2fs_abort_atomic_write(inode, true);
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 740c72d088f3..e06a0c478b39 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1292,7 +1292,11 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
dec_valid_node_count(sbi, dn->inode, !ofs);
goto fail;
}
- f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
+ if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
+ err = -EFSCORRUPTED;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ goto fail;
+ }
#endif
new_ni.nid = dn->nid;
new_ni.ino = dn->inode->i_ino;
@@ -1945,7 +1949,6 @@ next_step:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
bool submitted = false;
- bool may_dirty = true;
/* give a priority to WB_SYNC threads */
if (atomic_read(&sbi->wb_sync_req[NODE]) &&
@@ -1998,11 +2001,8 @@ continue_unlock:
}
/* flush dirty inode */
- if (IS_INODE(page) && may_dirty) {
- may_dirty = false;
- if (flush_dirty_inode(page))
- goto lock_node;
- }
+ if (IS_INODE(page) && flush_dirty_inode(page))
+ goto lock_node;
write_node:
f2fs_wait_on_page_writeback(page, NODE, true, true);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index c7afc588cf26..0de21f82d7bc 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -190,18 +190,20 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- if (f2fs_is_atomic_file(inode)) {
- if (clean)
- truncate_inode_pages_final(inode->i_mapping);
- clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE);
- iput(fi->cow_inode);
- fi->cow_inode = NULL;
- clear_inode_flag(inode, FI_ATOMIC_FILE);
+ if (!f2fs_is_atomic_file(inode))
+ return;
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- sbi->atomic_files--;
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- }
+ if (clean)
+ truncate_inode_pages_final(inode->i_mapping);
+ clear_inode_flag(fi->cow_inode, FI_COW_FILE);
+ iput(fi->cow_inode);
+ fi->cow_inode = NULL;
+ release_atomic_write_cnt(inode);
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ sbi->atomic_files--;
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
}
static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
@@ -335,6 +337,11 @@ next:
}
out:
+ if (ret)
+ sbi->revoked_atomic_block += fi->atomic_write_cnt;
+ else
+ sbi->committed_atomic_block += fi->atomic_write_cnt;
+
__complete_revoke_list(inode, &revoke_list, ret ? true : false);
return ret;
@@ -728,7 +735,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
get_valid_blocks(sbi, segno, true);
f2fs_bug_on(sbi, unlikely(!valid_blocks ||
- valid_blocks == BLKS_PER_SEC(sbi)));
+ valid_blocks == CAP_BLKS_PER_SEC(sbi)));
if (!IS_CURSEC(sbi, secno))
set_bit(secno, dirty_i->dirty_secmap);
@@ -764,7 +771,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
if (!valid_blocks ||
- valid_blocks == BLKS_PER_SEC(sbi)) {
+ valid_blocks == CAP_BLKS_PER_SEC(sbi)) {
clear_bit(secno, dirty_i->dirty_secmap);
return;
}
@@ -3166,7 +3173,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
return CURSEG_COLD_DATA;
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
- f2fs_is_atomic_file(inode))
+ f2fs_is_cow_file(inode))
return CURSEG_HOT_DATA;
return f2fs_rw_hint_to_seg_type(inode->i_write_hint);
} else {
@@ -3433,7 +3440,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
goto drop_bio;
}
- invalidate_mapping_pages(META_MAPPING(sbi),
+ if (fio->post_read)
+ invalidate_mapping_pages(META_MAPPING(sbi),
fio->new_blkaddr, fio->new_blkaddr);
stat_inc_inplace_blocks(fio->sbi);
@@ -3616,10 +3624,16 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
block_t len)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
block_t i;
+ if (!f2fs_post_read_required(inode))
+ return;
+
for (i = 0; i < len; i++)
f2fs_wait_on_block_writeback(inode, blkaddr + i);
+
+ invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1);
}
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
@@ -4362,6 +4376,12 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
return err;
seg_info_from_raw_sit(se, &sit);
+ if (se->type >= NR_PERSISTENT_LOG) {
+ f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
+ se->type, start);
+ return -EFSCORRUPTED;
+ }
+
sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
if (f2fs_block_unit_discard(sbi)) {
@@ -4410,6 +4430,13 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
break;
seg_info_from_raw_sit(se, &sit);
+ if (se->type >= NR_PERSISTENT_LOG) {
+ f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
+ se->type, start);
+ err = -EFSCORRUPTED;
+ break;
+ }
+
sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
if (f2fs_block_unit_discard(sbi)) {
@@ -4483,7 +4510,6 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno = 0, offset = 0, secno;
block_t valid_blocks, usable_blks_in_seg;
- block_t blks_per_sec = BLKS_PER_SEC(sbi);
while (1) {
/* find dirty segment based on free segmap */
@@ -4512,7 +4538,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
valid_blocks = get_valid_blocks(sbi, segno, true);
secno = GET_SEC_FROM_SEG(sbi, segno);
- if (!valid_blocks || valid_blocks == blks_per_sec)
+ if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi))
continue;
if (IS_CURSEC(sbi, secno))
continue;
@@ -4895,7 +4921,7 @@ static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno,
static inline unsigned int f2fs_usable_zone_segs_in_sec(
struct f2fs_sb_info *sbi, unsigned int segno)
{
- unsigned int dev_idx, zone_idx, unusable_segs_in_sec;
+ unsigned int dev_idx, zone_idx;
dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno));
zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx);
@@ -4904,18 +4930,12 @@ static inline unsigned int f2fs_usable_zone_segs_in_sec(
if (is_conv_zone(sbi, zone_idx, dev_idx))
return sbi->segs_per_sec;
- /*
- * If the zone_capacity_blocks array is NULL, then zone capacity
- * is equal to the zone size for all zones
- */
- if (!FDEV(dev_idx).zone_capacity_blocks)
+ if (!sbi->unusable_blocks_per_sec)
return sbi->segs_per_sec;
/* Get the segment count beyond zone capacity block */
- unusable_segs_in_sec = (sbi->blocks_per_blkz -
- FDEV(dev_idx).zone_capacity_blocks[zone_idx]) >>
- sbi->log_blocks_per_seg;
- return sbi->segs_per_sec - unusable_segs_in_sec;
+ return sbi->segs_per_sec - (sbi->unusable_blocks_per_sec >>
+ sbi->log_blocks_per_seg);
}
/*
@@ -4944,12 +4964,11 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
if (is_conv_zone(sbi, zone_idx, dev_idx))
return sbi->blocks_per_seg;
- if (!FDEV(dev_idx).zone_capacity_blocks)
+ if (!sbi->unusable_blocks_per_sec)
return sbi->blocks_per_seg;
sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno));
- sec_cap_blkaddr = sec_start_blkaddr +
- FDEV(dev_idx).zone_capacity_blocks[zone_idx];
+ sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi);
/*
* If segment starts before zone capacity and spans beyond
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 3f277dfcb131..d1d63766f2c7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -101,6 +101,9 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
#define BLKS_PER_SEC(sbi) \
((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
+#define CAP_BLKS_PER_SEC(sbi) \
+ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \
+ (sbi)->unusable_blocks_per_sec)
#define GET_SEC_FROM_SEG(sbi, segno) \
(((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec)
#define GET_SEG_FROM_SEC(sbi, secno) \
@@ -609,10 +612,10 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
get_pages(sbi, F2FS_DIRTY_DENTS) +
get_pages(sbi, F2FS_DIRTY_IMETA);
unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
- unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi);
- unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi);
- unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi);
- unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi);
+ unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi);
+ unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi);
+ unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi);
+ unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi);
unsigned int free, need_lower, need_upper;
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 37221e94e5ef..2451623c05a7 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>
#include <linux/buffer_head.h>
@@ -159,6 +160,7 @@ enum {
Opt_gc_merge,
Opt_nogc_merge,
Opt_discard_unit,
+ Opt_memory_mode,
Opt_err,
};
@@ -235,6 +237,7 @@ static match_table_t f2fs_tokens = {
{Opt_gc_merge, "gc_merge"},
{Opt_nogc_merge, "nogc_merge"},
{Opt_discard_unit, "discard_unit=%s"},
+ {Opt_memory_mode, "memory=%s"},
{Opt_err, NULL},
};
@@ -492,9 +495,19 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
bool is_remount)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
-#ifdef CONFIG_FS_ENCRYPTION
+ struct fs_parameter param = {
+ .type = fs_value_is_string,
+ .string = arg->from ? arg->from : "",
+ };
+ struct fscrypt_dummy_policy *policy =
+ &F2FS_OPTION(sbi).dummy_enc_policy;
int err;
+ if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+ f2fs_warn(sbi, "test_dummy_encryption option not supported");
+ return -EINVAL;
+ }
+
if (!f2fs_sb_has_encrypt(sbi)) {
f2fs_err(sbi, "Encrypt feature is off");
return -EINVAL;
@@ -506,12 +519,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) {
+ if (is_remount && !fscrypt_is_dummy_policy_set(policy)) {
f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
return -EINVAL;
}
- err = fscrypt_set_test_dummy_encryption(
- sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy);
+
+ err = fscrypt_parse_test_dummy_encryption(&param, policy);
if (err) {
if (err == -EEXIST)
f2fs_warn(sbi,
@@ -524,12 +537,14 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
opt, err);
return -EINVAL;
}
+ err = fscrypt_add_test_dummy_key(sb, policy);
+ if (err) {
+ f2fs_warn(sbi, "Error adding test dummy encryption key [%d]",
+ err);
+ return err;
+ }
f2fs_warn(sbi, "Test dummy encryption mode enabled");
return 0;
-#else
- f2fs_warn(sbi, "test_dummy_encryption option not supported");
- return -EINVAL;
-#endif
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -1222,6 +1237,22 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
+ case Opt_memory_mode:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (!strcmp(name, "normal")) {
+ F2FS_OPTION(sbi).memory_mode =
+ MEMORY_MODE_NORMAL;
+ } else if (!strcmp(name, "low")) {
+ F2FS_OPTION(sbi).memory_mode =
+ MEMORY_MODE_LOW;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1380,8 +1411,7 @@ static int f2fs_drop_inode(struct inode *inode)
atomic_inc(&inode->i_count);
spin_unlock(&inode->i_lock);
- if (f2fs_is_atomic_file(inode))
- f2fs_abort_atomic_write(inode, true);
+ f2fs_abort_atomic_write(inode, true);
/* should remain fi->extent_tree for writepage */
f2fs_destroy_extent_node(inode);
@@ -1491,7 +1521,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
blkdev_put(FDEV(i).bdev, FMODE_EXCL);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
- kfree(FDEV(i).zone_capacity_blocks);
#endif
}
kvfree(sbi->devs);
@@ -1993,6 +2022,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
seq_printf(seq, ",discard_unit=%s", "section");
+ if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL)
+ seq_printf(seq, ",memory=%s", "normal");
+ else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW)
+ seq_printf(seq, ",memory=%s", "low");
+
return 0;
}
@@ -2014,6 +2048,7 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).compress_ext_cnt = 0;
F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS;
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
+ F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL;
sbi->sb->s_flags &= ~SB_INLINECRYPT;
@@ -3579,6 +3614,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
spin_lock_init(&sbi->gc_urgent_high_lock);
+ atomic64_set(&sbi->current_atomic_write, 0);
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
@@ -3636,24 +3672,29 @@ err_valid_block:
#ifdef CONFIG_BLK_DEV_ZONED
struct f2fs_report_zones_args {
+ struct f2fs_sb_info *sbi;
struct f2fs_dev_info *dev;
- bool zone_cap_mismatch;
};
static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
struct f2fs_report_zones_args *rz_args = data;
+ block_t unusable_blocks = (zone->len - zone->capacity) >>
+ F2FS_LOG_SECTORS_PER_BLOCK;
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return 0;
set_bit(idx, rz_args->dev->blkz_seq);
- rz_args->dev->zone_capacity_blocks[idx] = zone->capacity >>
- F2FS_LOG_SECTORS_PER_BLOCK;
- if (zone->len != zone->capacity && !rz_args->zone_cap_mismatch)
- rz_args->zone_cap_mismatch = true;
-
+ if (!rz_args->sbi->unusable_blocks_per_sec) {
+ rz_args->sbi->unusable_blocks_per_sec = unusable_blocks;
+ return 0;
+ }
+ if (rz_args->sbi->unusable_blocks_per_sec != unusable_blocks) {
+ f2fs_err(rz_args->sbi, "F2FS supports single zone capacity\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -3694,26 +3735,13 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
if (!FDEV(devi).blkz_seq)
return -ENOMEM;
- /* Get block zones type and zone-capacity */
- FDEV(devi).zone_capacity_blocks = f2fs_kzalloc(sbi,
- FDEV(devi).nr_blkz * sizeof(block_t),
- GFP_KERNEL);
- if (!FDEV(devi).zone_capacity_blocks)
- return -ENOMEM;
-
+ rep_zone_arg.sbi = sbi;
rep_zone_arg.dev = &FDEV(devi);
- rep_zone_arg.zone_cap_mismatch = false;
ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb,
&rep_zone_arg);
if (ret < 0)
return ret;
-
- if (!rep_zone_arg.zone_cap_mismatch) {
- kfree(FDEV(devi).zone_capacity_blocks);
- FDEV(devi).zone_capacity_blocks = NULL;
- }
-
return 0;
}
#endif
@@ -4579,7 +4607,7 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_sysfs();
if (err)
goto free_garbage_collection_cache;
- err = register_shrinker(&f2fs_shrinker_info);
+ err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
if (err)
goto free_sysfs;
err = register_filesystem(&f2fs_fs_type);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 4c50aedd5144..eba5fb1629d7 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -339,6 +339,21 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
sbi->gc_reclaimed_segs[sbi->gc_segment_mode]);
}
+ if (!strcmp(a->attr.name, "current_atomic_write")) {
+ s64 current_write = atomic64_read(&sbi->current_atomic_write);
+
+ return sysfs_emit(buf, "%lld\n", current_write);
+ }
+
+ if (!strcmp(a->attr.name, "peak_atomic_write"))
+ return sysfs_emit(buf, "%lld\n", sbi->peak_atomic_write);
+
+ if (!strcmp(a->attr.name, "committed_atomic_block"))
+ return sysfs_emit(buf, "%llu\n", sbi->committed_atomic_block);
+
+ if (!strcmp(a->attr.name, "revoked_atomic_block"))
+ return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block);
+
ui = (unsigned int *)(ptr + a->offset);
return sprintf(buf, "%u\n", *ui);
@@ -608,6 +623,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "peak_atomic_write")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->peak_atomic_write = 0;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "committed_atomic_block")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->committed_atomic_block = 0;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "revoked_atomic_block")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->revoked_atomic_block = 0;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -713,6 +749,11 @@ static struct f2fs_attr f2fs_attr_##_name = { \
.offset = _offset \
}
+#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \
+ F2FS_ATTR_OFFSET(struct_type, name, 0444, \
+ f2fs_sbi_show, NULL, \
+ offsetof(struct struct_name, elname))
+
#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \
F2FS_ATTR_OFFSET(struct_type, name, 0644, \
f2fs_sbi_show, f2fs_sbi_store, \
@@ -811,6 +852,8 @@ F2FS_FEATURE_RO_ATTR(encrypted_casefold);
#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_FEATURE_RO_ATTR(block_zoned);
+F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, unusable_blocks_per_sec,
+ unusable_blocks_per_sec);
#endif
F2FS_FEATURE_RO_ATTR(atomic_write);
F2FS_FEATURE_RO_ATTR(extra_attr);
@@ -848,6 +891,12 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole);
+/* For atomic write */
+F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, current_atomic_write, current_atomic_write);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block);
+
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_urgent_sleep_time),
@@ -919,6 +968,9 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(moved_blocks_background),
ATTR_LIST(avg_vblocks),
#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ ATTR_LIST(unusable_blocks_per_sec),
+#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compr_written_block),
ATTR_LIST(compr_saved_block),
@@ -934,6 +986,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_reclaimed_segments),
ATTR_LIST(max_fragment_chunk),
ATTR_LIST(max_fragment_hole),
+ ATTR_LIST(current_atomic_write),
+ ATTR_LIST(peak_atomic_write),
+ ATTR_LIST(committed_atomic_block),
+ ATTR_LIST(revoked_atomic_block),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c573314806cf..21620054e1c4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -889,22 +889,57 @@ out:
return err;
}
-static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
- struct dentry *old_dentry, struct inode *new_dir,
- struct dentry *new_dentry, unsigned int flags)
+static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh,
+ struct msdos_dir_entry **de)
+{
+ if (S_ISDIR(inode->i_mode)) {
+ if (fat_get_dotdot_entry(inode, bh, de))
+ return -EIO;
+ }
+ return 0;
+}
+
+static int vfat_sync_ipos(struct inode *dir, struct inode *inode)
+{
+ if (IS_DIRSYNC(dir))
+ return fat_sync_inode(inode);
+ mark_inode_dirty(inode);
+ return 0;
+}
+
+static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *dotdot_bh,
+ struct msdos_dir_entry *dotdot_de)
+{
+ fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart);
+ mark_buffer_dirty_inode(dotdot_bh, inode);
+ if (IS_DIRSYNC(dir))
+ return sync_dirty_buffer(dotdot_bh);
+ return 0;
+}
+
+static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts)
+{
+ inode_inc_iversion(dir);
+ fat_truncate_time(dir, ts, S_CTIME | S_MTIME);
+ if (IS_DIRSYNC(dir))
+ (void)fat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+}
+
+static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
{
struct buffer_head *dotdot_bh;
- struct msdos_dir_entry *dotdot_de;
+ struct msdos_dir_entry *dotdot_de = NULL;
struct inode *old_inode, *new_inode;
struct fat_slot_info old_sinfo, sinfo;
struct timespec64 ts;
loff_t new_i_pos;
- int err, is_dir, update_dotdot, corrupt = 0;
+ int err, is_dir, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = d_inode(old_dentry);
new_inode = d_inode(new_dentry);
@@ -913,15 +948,13 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
if (err)
goto out;
- is_dir = S_ISDIR(old_inode->i_mode);
- update_dotdot = (is_dir && old_dir != new_dir);
- if (update_dotdot) {
- if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
- err = -EIO;
+ if (old_dir != new_dir) {
+ err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de);
+ if (err)
goto out;
- }
}
+ is_dir = S_ISDIR(old_inode->i_mode);
ts = current_time(old_dir);
if (new_inode) {
if (is_dir) {
@@ -942,21 +975,15 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
fat_detach(old_inode);
fat_attach(old_inode, new_i_pos);
- if (IS_DIRSYNC(new_dir)) {
- err = fat_sync_inode(old_inode);
- if (err)
- goto error_inode;
- } else
- mark_inode_dirty(old_inode);
+ err = vfat_sync_ipos(new_dir, old_inode);
+ if (err)
+ goto error_inode;
- if (update_dotdot) {
- fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart);
- mark_buffer_dirty_inode(dotdot_bh, old_inode);
- if (IS_DIRSYNC(new_dir)) {
- err = sync_dirty_buffer(dotdot_bh);
- if (err)
- goto error_dotdot;
- }
+ if (dotdot_de) {
+ err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh,
+ dotdot_de);
+ if (err)
+ goto error_dotdot;
drop_nlink(old_dir);
if (!new_inode)
inc_nlink(new_dir);
@@ -966,12 +993,7 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
old_sinfo.bh = NULL;
if (err)
goto error_dotdot;
- inode_inc_iversion(old_dir);
- fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
- if (IS_DIRSYNC(old_dir))
- (void)fat_sync_inode(old_dir);
- else
- mark_inode_dirty(old_dir);
+ vfat_update_dir_metadata(old_dir, &ts);
if (new_inode) {
drop_nlink(new_inode);
@@ -991,10 +1013,9 @@ error_dotdot:
/* data cluster is shared, serious corruption */
corrupt = 1;
- if (update_dotdot) {
- fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart);
- mark_buffer_dirty_inode(dotdot_bh, old_inode);
- corrupt |= sync_dirty_buffer(dotdot_bh);
+ if (dotdot_de) {
+ corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh,
+ dotdot_de);
}
error_inode:
fat_detach(old_inode);
@@ -1021,13 +1042,145 @@ error_inode:
goto out;
}
+static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode,
+ loff_t old_i_pos, loff_t new_i_pos)
+{
+ fat_detach(old_inode);
+ fat_detach(new_inode);
+ fat_attach(old_inode, new_i_pos);
+ fat_attach(new_inode, old_i_pos);
+}
+
+static void vfat_move_nlink(struct inode *src, struct inode *dst)
+{
+ drop_nlink(src);
+ inc_nlink(dst);
+}
+
+static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL;
+ struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL;
+ struct inode *old_inode, *new_inode;
+ struct timespec64 ts = current_time(old_dir);
+ loff_t old_i_pos, new_i_pos;
+ int err, corrupt = 0;
+ struct super_block *sb = old_dir->i_sb;
+
+ old_inode = d_inode(old_dentry);
+ new_inode = d_inode(new_dentry);
+
+ /* Acquire super block lock for the operation to be atomic */
+ mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+ /* if directories are not the same, get ".." info to update */
+ if (old_dir != new_dir) {
+ err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh,
+ &old_dotdot_de);
+ if (err)
+ goto out;
+
+ err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh,
+ &new_dotdot_de);
+ if (err)
+ goto out;
+ }
+
+ old_i_pos = MSDOS_I(old_inode)->i_pos;
+ new_i_pos = MSDOS_I(new_inode)->i_pos;
+
+ vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos);
+
+ err = vfat_sync_ipos(old_dir, new_inode);
+ if (err)
+ goto error_exchange;
+ err = vfat_sync_ipos(new_dir, old_inode);
+ if (err)
+ goto error_exchange;
+
+ /* update ".." directory entry info */
+ if (old_dotdot_de) {
+ err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh,
+ old_dotdot_de);
+ if (err)
+ goto error_old_dotdot;
+ }
+ if (new_dotdot_de) {
+ err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh,
+ new_dotdot_de);
+ if (err)
+ goto error_new_dotdot;
+ }
+
+ /* if cross directory and only one is a directory, adjust nlink */
+ if (!old_dotdot_de != !new_dotdot_de) {
+ if (old_dotdot_de)
+ vfat_move_nlink(old_dir, new_dir);
+ else
+ vfat_move_nlink(new_dir, old_dir);
+ }
+
+ vfat_update_dir_metadata(old_dir, &ts);
+ /* if directories are not the same, update new_dir as well */
+ if (old_dir != new_dir)
+ vfat_update_dir_metadata(new_dir, &ts);
+
+out:
+ brelse(old_dotdot_bh);
+ brelse(new_dotdot_bh);
+ mutex_unlock(&MSDOS_SB(sb)->s_lock);
+
+ return err;
+
+error_new_dotdot:
+ if (new_dotdot_de) {
+ corrupt |= vfat_update_dotdot_de(new_dir, new_inode,
+ new_dotdot_bh, new_dotdot_de);
+ }
+
+error_old_dotdot:
+ if (old_dotdot_de) {
+ corrupt |= vfat_update_dotdot_de(old_dir, old_inode,
+ old_dotdot_bh, old_dotdot_de);
+ }
+
+error_exchange:
+ vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos);
+ corrupt |= vfat_sync_ipos(new_dir, new_inode);
+ corrupt |= vfat_sync_ipos(old_dir, old_inode);
+
+ if (corrupt < 0) {
+ fat_fs_error(new_dir->i_sb,
+ "%s: Filesystem corrupted (i_pos %lld, %lld)",
+ __func__, old_i_pos, new_i_pos);
+ }
+ goto out;
+}
+
+static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
+{
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE) {
+ return vfat_rename_exchange(old_dir, old_dentry,
+ new_dir, new_dentry);
+ }
+
+ /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */
+ return vfat_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static const struct inode_operations vfat_dir_inode_operations = {
.create = vfat_create,
.lookup = vfat_lookup,
.unlink = vfat_unlink,
.mkdir = vfat_mkdir,
.rmdir = vfat_rmdir,
- .rename = vfat_rename,
+ .rename = vfat_rename2,
.setattr = fat_setattr,
.getattr = fat_getattr,
.update_time = fat_update_time,
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 05221366a16d..08a1993ab7fd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -134,10 +134,10 @@ static bool inode_io_list_move_locked(struct inode *inode,
static void wb_wakeup(struct bdi_writeback *wb)
{
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (test_bit(WB_registered, &wb->state))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
}
static void finish_writeback_work(struct bdi_writeback *wb,
@@ -164,7 +164,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
if (work->done)
atomic_inc(&work->done->cnt);
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (test_bit(WB_registered, &wb->state)) {
list_add_tail(&work->list, &wb->work_list);
@@ -172,7 +172,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
} else
finish_writeback_work(wb, work);
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
}
/**
@@ -2082,13 +2082,13 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
struct wb_writeback_work *work = NULL;
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (!list_empty(&wb->work_list)) {
work = list_entry(wb->work_list.next,
struct wb_writeback_work, list);
list_del_init(&work->list);
}
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
return work;
}
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 74920826d8f6..451d8a077e12 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -263,6 +263,8 @@ void fscache_caching_failed(struct fscache_cookie *cookie)
{
clear_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags);
fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_FAILED);
+ trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+ fscache_cookie_failed);
}
EXPORT_SYMBOL(fscache_caching_failed);
@@ -739,6 +741,9 @@ again_locked:
fallthrough;
case FSCACHE_COOKIE_STATE_FAILED:
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end);
+
if (atomic_read(&cookie->n_accesses) != 0)
break;
if (test_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags)) {
@@ -1063,8 +1068,8 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
return;
case FSCACHE_COOKIE_STATE_LOOKING_UP:
- __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie);
- set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags);
+ if (!test_and_set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie);
fallthrough;
case FSCACHE_COOKIE_STATE_CREATING:
spin_unlock(&cookie->lock);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 7cede9a3bc96..247ef4f76761 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -258,7 +258,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
struct dentry *parent;
char name[32];
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return 0;
parent = fuse_control_sb->s_root;
@@ -296,7 +296,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
{
int i;
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return;
for (i = fc->ctl_ndents - 1; i >= 0; i--) {
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 10eb50cbf398..e23e802a8013 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -138,9 +138,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
WARN_ON(fcd->nr_free_ranges <= 0);
fcd->nr_free_ranges--;
}
+ __kick_dmap_free_worker(fcd, 0);
spin_unlock(&fcd->lock);
- kick_dmap_free_worker(fcd, 0);
return dmap;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0e537e580dc1..51897427a534 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -730,14 +730,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
}
} else {
size_t off;
- err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
+ err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
if (err < 0)
return err;
BUG_ON(!err);
cs->len = err;
cs->offset = off;
cs->pg = page;
- iov_iter_advance(cs->iter, err);
}
return lock_request(cs->req);
@@ -1356,7 +1355,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
if (!fud)
return -EPERM;
- if (!iter_is_iovec(to))
+ if (!user_backed_iter(to))
return -EINVAL;
fuse_copy_init(&cs, 1, to);
@@ -1949,7 +1948,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
if (!fud)
return -EPERM;
- if (!iter_is_iovec(from))
+ if (!user_backed_iter(from))
return -EINVAL;
fuse_copy_init(&cs, 0, from);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 74303d6e987b..b585b04e815e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -11,6 +11,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fs_context.h>
+#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
@@ -21,6 +22,11 @@
#include <linux/types.h>
#include <linux/kernel.h>
+static bool __read_mostly allow_sys_admin_access;
+module_param(allow_sys_admin_access, bool, 0644);
+MODULE_PARM_DESC(allow_sys_admin_access,
+ "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check");
+
static void fuse_advise_use_readdirplus(struct inode *dir)
{
struct fuse_inode *fi = get_fuse_inode(dir);
@@ -537,6 +543,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_file *ff;
void *security_ctx = NULL;
u32 security_ctxlen;
+ bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
@@ -561,7 +568,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.mode = mode;
inarg.umask = current_umask();
- if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
+ if (fm->fc->handle_killpriv_v2 && trunc &&
!(flags & O_EXCL) && !capable(CAP_FSETID)) {
inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
}
@@ -623,6 +630,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
+ if (fm->fc->atomic_o_trunc && trunc)
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
}
return err;
@@ -1224,6 +1235,9 @@ int fuse_allow_current_process(struct fuse_conn *fc)
{
const struct cred *cred;
+ if (allow_sys_admin_access && capable(CAP_SYS_ADMIN))
+ return 1;
+
if (fc->allow_other)
return current_in_userns(fc->user_ns);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 00fa861aeead..1a3afd469e3a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -210,13 +210,9 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
- truncate_pagecache(inode, 0);
file_update_time(file);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
- } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
- invalidate_inode_pages2(inode->i_mapping);
}
-
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
@@ -239,30 +235,38 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
- if (is_wb_truncate || dax_truncate) {
+ if (is_wb_truncate || dax_truncate)
inode_lock(inode);
- fuse_set_nowrite(inode);
- }
if (dax_truncate) {
filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
- goto out;
+ goto out_inode_unlock;
}
+ if (is_wb_truncate || dax_truncate)
+ fuse_set_nowrite(inode);
+
err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
-out:
+ if (is_wb_truncate || dax_truncate)
+ fuse_release_nowrite(inode);
+ if (!err) {
+ struct fuse_file *ff = file->private_data;
+
+ if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
+ }
if (dax_truncate)
filemap_invalidate_unlock(inode->i_mapping);
-
- if (is_wb_truncate | dax_truncate) {
- fuse_release_nowrite(inode);
+out_inode_unlock:
+ if (is_wb_truncate || dax_truncate)
inode_unlock(inode);
- }
return err;
}
@@ -338,6 +342,15 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /*
+ * Dirty pages might remain despite write_inode_now() call from
+ * fuse_flush() due to writes racing with the close.
+ */
+ if (fc->writeback_cache)
+ write_inode_now(inode, 1);
+
fuse_release_common(file, false);
/* return value is ignored by VFS */
@@ -1401,14 +1414,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
while (nbytes < *nbytesp && ap->num_pages < max_pages) {
unsigned npages;
size_t start;
- ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
+ ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
*nbytesp - nbytes,
max_pages - ap->num_pages,
&start);
if (ret < 0)
break;
- iov_iter_advance(ii, ret);
nbytes += ret;
ret += start;
@@ -1465,7 +1477,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
inode_unlock(inode);
}
- io->should_dirty = !write && iter_is_iovec(iter);
+ io->should_dirty = !write && user_backed_iter(iter);
while (count) {
ssize_t nres;
fl_owner_t owner = current->files;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 8c0665c5dff8..6b3beda16c1b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -180,6 +180,12 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_uid = make_kuid(fc->user_ns, attr->uid);
inode->i_gid = make_kgid(fc->user_ns, attr->gid);
inode->i_blocks = attr->blocks;
+
+ /* Sanitize nsecs */
+ attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1);
+ attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
+ attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
+
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
@@ -476,8 +482,14 @@ static void fuse_umount_begin(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
- if (!fc->no_force_umount)
- fuse_abort_conn(fc);
+ if (fc->no_force_umount)
+ return;
+
+ fuse_abort_conn(fc);
+
+ // Only retire block-device-based superblocks.
+ if (sb->s_bdev != NULL)
+ retire_super(sb);
}
static void fuse_send_destroy(struct fuse_mount *fm)
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 33cde4bbccdc..61d8afcb10a3 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -9,6 +9,17 @@
#include <linux/compat.h>
#include <linux/fileattr.h>
+static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args)
+{
+ ssize_t ret = fuse_simple_request(fm, args);
+
+ /* Translate ENOSYS, which shouldn't be returned from fs */
+ if (ret == -ENOSYS)
+ ret = -ENOTTY;
+
+ return ret;
+}
+
/*
* CUSE servers compiled on 32bit broke on 64bit kernels because the
* ABI was defined to be 'struct iovec' which is different on 32bit
@@ -259,7 +270,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
ap.args.out_pages = true;
ap.args.out_argvar = true;
- transferred = fuse_simple_request(fm, &ap.args);
+ transferred = fuse_send_ioctl(fm, &ap.args);
err = transferred;
if (transferred < 0)
goto out;
@@ -393,7 +404,7 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
args.out_args[1].size = inarg.out_size;
args.out_args[1].value = ptr;
- err = fuse_simple_request(fm, &args);
+ err = fuse_send_ioctl(fm, &args);
if (!err) {
if (outarg.result < 0)
err = outarg.result;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 8db53fa67359..4d8d4f16c727 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -741,8 +741,7 @@ out:
}
/* Free virtqueues (device must already be reset) */
-static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
- struct virtio_fs *fs)
+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
{
vdev->config->del_vqs(vdev);
}
@@ -757,7 +756,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
{
struct virtio_fs *fs = dax_get_private(dax_dev);
phys_addr_t offset = PFN_PHYS(pgoff);
- size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
+ size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff;
if (kaddr)
*kaddr = fs->window_kaddr + offset;
@@ -895,7 +894,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
out_vqs:
virtio_reset_device(vdev);
- virtio_fs_cleanup_vqs(vdev, fs);
+ virtio_fs_cleanup_vqs(vdev);
kfree(fs->vqs);
out:
@@ -927,7 +926,7 @@ static void virtio_fs_remove(struct virtio_device *vdev)
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
virtio_reset_device(vdev);
- virtio_fs_cleanup_vqs(vdev, fs);
+ virtio_fs_cleanup_vqs(vdev);
vdev->priv = NULL;
/* Put device reference on virtio_fs object */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 57ff883d432c..05bee80ac7de 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -82,31 +82,6 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
}
/**
- * gfs2_writepage - Write page for writeback mappings
- * @page: The page
- * @wbc: The writeback control
- */
-static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct iomap_writepage_ctx wpc = { };
-
- if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
- goto out;
- if (current->journal_info)
- goto redirty;
- return iomap_writepage(page, wbc, &wpc, &gfs2_writeback_ops);
-
-redirty:
- redirty_page_for_writepage(wbc, page);
-out:
- unlock_page(page);
- return 0;
-}
-
-/**
* gfs2_write_jdata_page - gfs2 jdata-specific version of block_write_full_page
* @page: The page to write
* @wbc: The writeback control
@@ -765,7 +740,6 @@ cannot_release:
}
static const struct address_space_operations gfs2_aops = {
- .writepage = gfs2_writepage,
.writepages = gfs2_writepages,
.read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index a0562dd1bada..54a6d17b8c25 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2016,7 +2016,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
l_blocks++;
}
- gfs2_rlist_alloc(&rlist);
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE);
for (x = 0; x < rlist.rl_rgrps; x++) {
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2cceb193dcd8..892006fbbb09 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -780,7 +780,7 @@ static inline bool should_fault_in_pages(struct iov_iter *i,
if (!count)
return false;
- if (!iter_is_iovec(i))
+ if (!user_backed_iter(i))
return false;
size = PAGE_SIZE;
@@ -1066,8 +1066,7 @@ out_unlock:
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- if (statfs_gh)
- kfree(statfs_gh);
+ kfree(statfs_gh);
from->count = orig_count - written;
return written ? written : ret;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c992d53013d3..41b6c89e4bf7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -405,10 +405,13 @@ static void do_error(struct gfs2_glock *gl, const int ret)
/**
* demote_incompat_holders - demote incompatible demoteable holders
* @gl: the glock we want to promote
- * @new_gh: the new holder to be promoted
+ * @current_gh: the newly promoted holder
+ *
+ * We're passing the newly promoted holder in @current_gh, but actually, any of
+ * the strong holders would do.
*/
static void demote_incompat_holders(struct gfs2_glock *gl,
- struct gfs2_holder *new_gh)
+ struct gfs2_holder *current_gh)
{
struct gfs2_holder *gh, *tmp;
@@ -424,8 +427,10 @@ static void demote_incompat_holders(struct gfs2_glock *gl,
*/
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
return;
+ if (gh == current_gh)
+ continue;
if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
- !may_grant(gl, new_gh, gh)) {
+ !may_grant(gl, current_gh, gh)) {
/*
* We should not recurse into do_promote because
* __gfs2_glock_dq only calls handle_callback,
@@ -478,8 +483,7 @@ find_first_strong_holder(struct gfs2_glock *gl)
* gfs2_instantiate - Call the glops instantiate function
* @gh: The glock holder
*
- * Returns: 0 if instantiate was successful, 2 if type specific operation is
- * underway, or error.
+ * Returns: 0 if instantiate was successful, or error.
*/
int gfs2_instantiate(struct gfs2_holder *gh)
{
@@ -489,7 +493,7 @@ int gfs2_instantiate(struct gfs2_holder *gh)
again:
if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags))
- return 0;
+ goto done;
/*
* Since we unlock the lockref lock, we set a flag to indicate
@@ -508,78 +512,55 @@ again:
goto again;
}
- ret = glops->go_instantiate(gh);
+ ret = glops->go_instantiate(gl);
if (!ret)
clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
- return ret;
+ if (ret)
+ return ret;
+
+done:
+ if (glops->go_held)
+ return glops->go_held(gh);
+ return 0;
}
/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
- * Returns: 1 if there is a blocked holder at the head of the list, or 2
- * if a type specific operation is underway.
+ * Returns: 1 if there is a blocked holder at the head of the list
*/
static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_lockref.lock)
-__acquires(&gl->gl_lockref.lock)
{
- struct gfs2_holder *gh, *tmp, *first_gh;
+ struct gfs2_holder *gh, *current_gh;
bool incompat_holders_demoted = false;
- bool lock_released;
- int ret;
-restart:
- first_gh = find_first_strong_holder(gl);
- list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- lock_released = false;
+ current_gh = find_first_strong_holder(gl);
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
- if (!may_grant(gl, first_gh, gh)) {
+ if (!may_grant(gl, current_gh, gh)) {
/*
- * If we get here, it means we may not grant this holder for
- * some reason. If this holder is the head of the list, it
- * means we have a blocked holder at the head, so return 1.
+ * If we get here, it means we may not grant this
+ * holder for some reason. If this holder is at the
+ * head of the list, it means we have a blocked holder
+ * at the head, so return 1.
*/
if (list_is_first(&gh->gh_list, &gl->gl_holders))
return 1;
do_error(gl, 0);
break;
}
- if (!incompat_holders_demoted) {
- demote_incompat_holders(gl, first_gh);
- incompat_holders_demoted = true;
- first_gh = gh;
- }
- if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) &&
- !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) {
- lock_released = true;
- spin_unlock(&gl->gl_lockref.lock);
- ret = gfs2_instantiate(gh);
- spin_lock(&gl->gl_lockref.lock);
- if (ret) {
- if (ret == 1)
- return 2;
- gh->gh_error = ret;
- list_del_init(&gh->gh_list);
- trace_gfs2_glock_queue(gh, 0);
- gfs2_holder_wake(gh);
- goto restart;
- }
- }
set_bit(HIF_HOLDER, &gh->gh_iflags);
trace_gfs2_promote(gh);
gfs2_holder_wake(gh);
- /*
- * If we released the gl_lockref.lock the holders list may have
- * changed. For that reason, we start again at the start of
- * the holders queue.
- */
- if (lock_released)
- goto restart;
+ if (!incompat_holders_demoted) {
+ current_gh = gh;
+ demote_incompat_holders(gl, current_gh);
+ incompat_holders_demoted = true;
+ }
}
return 0;
}
@@ -657,7 +638,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh;
unsigned state = ret & LM_OUT_ST_MASK;
- int rv;
spin_lock(&gl->gl_lockref.lock);
trace_gfs2_glock_state_change(gl, state);
@@ -715,6 +695,8 @@ retry:
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
+ int rv;
+
spin_unlock(&gl->gl_lockref.lock);
rv = glops->go_xmote_bh(gl);
spin_lock(&gl->gl_lockref.lock);
@@ -723,13 +705,10 @@ retry:
goto out;
}
}
- rv = do_promote(gl);
- if (rv == 2)
- goto out_locked;
+ do_promote(gl);
}
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
-out_locked:
spin_unlock(&gl->gl_lockref.lock);
}
@@ -886,7 +865,6 @@ __releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
struct gfs2_holder *gh = NULL;
- int ret;
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
return;
@@ -905,18 +883,14 @@ __acquires(&gl->gl_lockref.lock)
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
- ret = do_promote(gl);
- if (ret == 0)
+ if (do_promote(gl) == 0)
goto out_unlock;
- if (ret == 2)
- goto out;
gh = find_first_waiter(gl);
gl->gl_target = gh->gh_state;
if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
do_error(gl, 0); /* Fail queued try locks */
}
do_xmote(gl, gh, gl->gl_target);
-out:
return;
out_sched:
@@ -1314,6 +1288,25 @@ static void gfs2_glock_update_hold_time(struct gfs2_glock *gl,
}
/**
+ * gfs2_glock_holder_ready - holder is ready and its error code can be collected
+ * @gh: the glock holder
+ *
+ * Called when a glock holder no longer needs to be waited for because it is
+ * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has
+ * failed (gh_error != 0).
+ */
+
+int gfs2_glock_holder_ready(struct gfs2_holder *gh)
+{
+ if (gh->gh_error || (gh->gh_flags & GL_SKIP))
+ return gh->gh_error;
+ gh->gh_error = gfs2_instantiate(gh);
+ if (gh->gh_error)
+ gfs2_glock_dq(gh);
+ return gh->gh_error;
+}
+
+/**
* gfs2_glock_wait - wait on a glock acquisition
* @gh: the glock holder
*
@@ -1327,7 +1320,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
might_sleep();
wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
gfs2_glock_update_hold_time(gh->gh_gl, start_time);
- return gh->gh_error;
+ return gfs2_glock_holder_ready(gh);
}
static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs)
@@ -1355,7 +1348,6 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd;
int i, ret = 0, timeout = 0;
unsigned long start_time = jiffies;
- bool keep_waiting;
might_sleep();
/*
@@ -1365,53 +1357,33 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
for (i = 0; i < num_gh; i++)
timeout += ghs[i].gh_gl->gl_hold_time << 1;
-wait_for_dlm:
if (!wait_event_timeout(sdp->sd_async_glock_wait,
- !glocks_pending(num_gh, ghs), timeout))
+ !glocks_pending(num_gh, ghs), timeout)) {
ret = -ESTALE; /* request timed out. */
+ goto out;
+ }
- /*
- * If dlm granted all our requests, we need to adjust the glock
- * minimum hold time values according to how long we waited.
- *
- * If our request timed out, we need to repeatedly release any held
- * glocks we acquired thus far to allow dlm to acquire the remaining
- * glocks without deadlocking. We cannot currently cancel outstanding
- * glock acquisitions.
- *
- * The HIF_WAIT bit tells us which requests still need a response from
- * dlm.
- *
- * If dlm sent us any errors, we return the first error we find.
- */
- keep_waiting = false;
for (i = 0; i < num_gh; i++) {
- /* Skip holders we have already dequeued below. */
- if (!gfs2_holder_queued(&ghs[i]))
- continue;
- /* Skip holders with a pending DLM response. */
- if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) {
- keep_waiting = true;
- continue;
- }
+ struct gfs2_holder *gh = &ghs[i];
+ int ret2;
- if (test_bit(HIF_HOLDER, &ghs[i].gh_iflags)) {
- if (ret == -ESTALE)
- gfs2_glock_dq(&ghs[i]);
- else
- gfs2_glock_update_hold_time(ghs[i].gh_gl,
- start_time);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags)) {
+ gfs2_glock_update_hold_time(gh->gh_gl,
+ start_time);
}
+ ret2 = gfs2_glock_holder_ready(gh);
if (!ret)
- ret = ghs[i].gh_error;
+ ret = ret2;
}
- if (keep_waiting)
- goto wait_for_dlm;
+out:
+ if (ret) {
+ for (i = 0; i < num_gh; i++) {
+ struct gfs2_holder *gh = &ghs[i];
- /*
- * At this point, we've either acquired all locks or released them all.
- */
+ gfs2_glock_dq(gh);
+ }
+ }
return ret;
}
@@ -1490,10 +1462,10 @@ __acquires(&gl->gl_lockref.lock)
if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
if (test_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder *first_gh;
+ struct gfs2_holder *current_gh;
- first_gh = find_first_strong_holder(gl);
- try_futile = !may_grant(gl, first_gh, gh);
+ current_gh = find_first_strong_holder(gl);
+ try_futile = !may_grant(gl, current_gh, gh);
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
goto fail;
@@ -1779,7 +1751,7 @@ static int glock_compare(const void *arg_a, const void *arg_b)
}
/**
- * nq_m_sync - synchonously acquire more than one glock in deadlock free order
+ * nq_m_sync - synchronously acquire more than one glock in deadlock free order
* @num_gh: the number of structures
* @ghs: an array of struct gfs2_holder structures
* @p: placeholder for the holder structure to pass back
@@ -1800,8 +1772,6 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
for (x = 0; x < num_gh; x++) {
- p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
-
error = gfs2_glock_nq(p[x]);
if (error) {
while (x--)
@@ -1818,7 +1788,6 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
* @num_gh: the number of structures
* @ghs: an array of struct gfs2_holder structures
*
- *
* Returns: 0 on success (all glocks acquired),
* errno on failure (no glocks acquired)
*/
@@ -1833,7 +1802,6 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
case 0:
return 0;
case 1:
- ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
return gfs2_glock_nq(ghs);
default:
if (num_gh <= 4)
@@ -2245,20 +2213,6 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
glock_hash_walk(dump_glock_func, sdp);
}
-void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
-{
- struct gfs2_glock *gl = ip->i_gl;
- int ret;
-
- ret = gfs2_truncatei_resume(ip);
- gfs2_glock_assert_withdraw(gl, ret == 0);
-
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- run_queue(gl, 1);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
static const char *state2str(unsigned state)
{
switch(state) {
@@ -2533,7 +2487,7 @@ int __init gfs2_glock_init(void)
return -ENOMEM;
}
- ret = register_shrinker(&glock_shrinker);
+ ret = register_shrinker(&glock_shrinker, "gfs2-glock");
if (ret) {
destroy_workqueue(gfs2_delete_workqueue);
destroy_workqueue(glock_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c0ae9100a0bc..5aed8b500cf5 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -213,6 +213,7 @@ extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
extern int gfs2_glock_poll(struct gfs2_holder *gh);
extern int gfs2_instantiate(struct gfs2_holder *gh);
+extern int gfs2_glock_holder_ready(struct gfs2_holder *gh);
extern int gfs2_glock_wait(struct gfs2_holder *gh);
extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq(struct gfs2_holder *gh);
@@ -273,7 +274,6 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl);
extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
-extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
extern void gfs2_glock_free(struct gfs2_glock *gl);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 392800f082a6..49210a2e7ce7 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -485,35 +485,33 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
* Returns: errno
*/
-static int inode_go_instantiate(struct gfs2_holder *gh)
+static int inode_go_instantiate(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip = gl->gl_object;
+
+ if (!ip) /* no inode to populate - read it in later */
+ return 0;
+
+ return gfs2_inode_refresh(ip);
+}
+
+static int inode_go_held(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
if (!ip) /* no inode to populate - read it in later */
- goto out;
-
- error = gfs2_inode_refresh(ip);
- if (error)
- goto out;
+ return 0;
if (gh->gh_state != LM_ST_DEFERRED)
inode_dio_wait(&ip->i_inode);
if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
(gl->gl_state == LM_ST_EXCLUSIVE) &&
- (gh->gh_state == LM_ST_EXCLUSIVE)) {
- spin_lock(&sdp->sd_trunc_lock);
- if (list_empty(&ip->i_trunc_list))
- list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
- spin_unlock(&sdp->sd_trunc_lock);
- wake_up(&sdp->sd_quota_wait);
- error = 1;
- }
+ (gh->gh_state == LM_ST_EXCLUSIVE))
+ error = gfs2_truncatei_resume(ip);
-out:
return error;
}
@@ -737,6 +735,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
.go_instantiate = inode_go_instantiate,
+ .go_held = inode_go_held,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
.go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8c00fb389ae5..d09d9892cd05 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -219,7 +219,8 @@ struct gfs2_glock_operations {
int (*go_xmote_bh)(struct gfs2_glock *gl);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (const struct gfs2_glock *gl);
- int (*go_instantiate) (struct gfs2_holder *gh);
+ int (*go_instantiate) (struct gfs2_glock *gl);
+ int (*go_held)(struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
@@ -396,7 +397,6 @@ struct gfs2_inode {
atomic_t i_sizehint; /* hint of the write size */
struct rw_semaphore i_rw_mutex;
struct list_head i_ordered;
- struct list_head i_trunc_list;
__be64 *i_hash_cache;
u32 i_entries;
u32 i_diskflags;
@@ -784,8 +784,6 @@ struct gfs2_sbd {
struct mutex sd_quota_mutex;
struct mutex sd_quota_sync_mutex;
wait_queue_head_t sd_quota_wait;
- struct list_head sd_trunc_list;
- spinlock_t sd_trunc_lock;
unsigned int sd_quota_slots;
unsigned long *sd_quota_bitmap;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 2559a79cf14b..6ce369b096d4 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1058,7 +1058,7 @@ restart:
/*
* Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
- * to accomodate the largest slot number. (NB dlm slot numbers start at 1,
+ * to accommodate the largest slot number. (NB dlm slot numbers start at 1,
* gfs2 jids start at 0, so jid = slot - 1)
*/
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eec4159b08aa..723639376ae2 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -131,7 +131,7 @@ __acquires(&sdp->sd_ail_lock)
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
- ret = generic_writepages(mapping, wbc);
+ ret = filemap_fdatawrite_wbc(mapping, wbc);
if (need_resched()) {
blk_finish_plug(plug);
cond_resched();
@@ -222,8 +222,7 @@ out:
spin_unlock(&sdp->sd_ail_lock);
blk_finish_plug(&plug);
if (ret) {
- gfs2_lm(sdp, "gfs2_ail1_start_one (generic_writepages) "
- "returned: %d\n", ret);
+ gfs2_lm(sdp, "gfs2_ail1_start_one returned: %d\n", ret);
gfs2_withdraw(sdp);
}
trace_gfs2_ail_flush(sdp, wbc, 0);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 244187e3e70f..14ae9de76277 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -38,7 +38,6 @@ static void gfs2_init_inode_once(void *foo)
inode_init_once(&ip->i_inode);
atomic_set(&ip->i_sizehint, 0);
init_rwsem(&ip->i_rw_mutex);
- INIT_LIST_HEAD(&ip->i_trunc_list);
INIT_LIST_HEAD(&ip->i_ordered);
ip->i_qadata = NULL;
gfs2_holder_mark_uninitialized(&ip->i_rgd_gh);
@@ -148,7 +147,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_trans_cachep)
goto fail_cachep8;
- error = register_shrinker(&gfs2_qd_shrinker);
+ error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
if (error)
goto fail_shrinker;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c9b423c874a3..549879929c84 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -106,8 +106,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mutex_init(&sdp->sd_quota_mutex);
mutex_init(&sdp->sd_quota_sync_mutex);
init_waitqueue_head(&sdp->sd_quota_wait);
- INIT_LIST_HEAD(&sdp->sd_trunc_list);
- spin_lock_init(&sdp->sd_trunc_lock);
spin_lock_init(&sdp->sd_bitmap_lock);
INIT_LIST_HEAD(&sdp->sd_sc_inodes_list);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c98a7faa67d3..f201eaf59d0d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1517,25 +1517,6 @@ static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
}
}
-static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
-{
- struct gfs2_inode *ip;
-
- while(1) {
- ip = NULL;
- spin_lock(&sdp->sd_trunc_lock);
- if (!list_empty(&sdp->sd_trunc_list)) {
- ip = list_first_entry(&sdp->sd_trunc_list,
- struct gfs2_inode, i_trunc_list);
- list_del_init(&ip->i_trunc_list);
- }
- spin_unlock(&sdp->sd_trunc_lock);
- if (ip == NULL)
- return;
- gfs2_glock_finish_truncate(ip);
- }
-}
-
void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
if (!sdp->sd_statfs_force_sync) {
sdp->sd_statfs_force_sync = 1;
@@ -1558,7 +1539,6 @@ int gfs2_quotad(void *data)
unsigned long quotad_timeo = 0;
unsigned long t = 0;
DEFINE_WAIT(wait);
- int empty;
while (!kthread_should_stop()) {
@@ -1579,19 +1559,13 @@ int gfs2_quotad(void *data)
quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
- /* Check for & recover partially truncated inodes */
- quotad_check_trunc_list(sdp);
-
try_to_freeze();
bypass:
t = min(quotad_timeo, statfs_timeo);
prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
- spin_lock(&sdp->sd_trunc_lock);
- empty = list_empty(&sdp->sd_trunc_list);
- spin_unlock(&sdp->sd_trunc_lock);
- if (empty && !sdp->sd_statfs_force_sync)
+ if (!sdp->sd_statfs_force_sync)
t -= schedule_timeout(t);
else
t = 0;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8a63870eef5a..f602fb844951 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1196,9 +1196,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
* Returns: errno
*/
-int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh)
+int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl)
{
- struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_sbd *sdp = rgd->rd_sbd;
unsigned int length = rgd->rd_length;
@@ -2720,12 +2719,15 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
* gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
* and initialize an array of glock holders for them
* @rlist: the list of resource groups
+ * @state: the state we're requesting
+ * @flags: the modifier flags
*
* FIXME: Don't use NOFAIL
*
*/
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
+ unsigned int state, u16 flags)
{
unsigned int x;
@@ -2733,8 +2735,8 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
sizeof(struct gfs2_holder),
GFP_NOFS | __GFP_NOFAIL);
for (x = 0; x < rlist->rl_rgrps; x++)
- gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE,
- LM_FLAG_NODE_SCOPE, &rlist->rl_ghs[x]);
+ gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, flags,
+ &rlist->rl_ghs[x]);
}
/**
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 46dd94e9e085..00b30cf893af 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh);
+extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
@@ -64,7 +64,8 @@ struct gfs2_rgrp_list {
extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
u64 block);
-extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist);
+extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
+ unsigned int state, u16 flags);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bdb773e5c88f..b5b0f285b27f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1196,7 +1196,7 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
gfs2_glock_dq(gh);
return false;
}
- return true;
+ return gfs2_glock_holder_ready(gh) == 0;
}
/**
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 0c5650fe1fd1..f6a66050380e 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1313,7 +1313,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
else
goto out;
- gfs2_rlist_alloc(&rlist);
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE);
for (x = 0; x < rlist.rl_rgrps; x++) {
rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 20336cb3c040..f7a5b5124d8a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -11,7 +11,6 @@
#include <linux/thread_info.h>
#include <asm/current.h>
-#include <linux/sched/signal.h> /* remove ASAP */
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -40,7 +39,6 @@
#include <linux/uaccess.h>
#include <linux/sched/mm.h>
-static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -284,39 +282,9 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
#endif
-static size_t
-hugetlbfs_read_actor(struct page *page, unsigned long offset,
- struct iov_iter *to, unsigned long size)
-{
- size_t copied = 0;
- int i, chunksize;
-
- /* Find which 4k chunk and offset with in that chunk */
- i = offset >> PAGE_SHIFT;
- offset = offset & ~PAGE_MASK;
-
- while (size) {
- size_t n;
- chunksize = PAGE_SIZE;
- if (offset)
- chunksize -= offset;
- if (chunksize > size)
- chunksize = size;
- n = copy_page_to_iter(&page[i], offset, chunksize, to);
- copied += n;
- if (n != chunksize)
- return copied;
- offset = 0;
- size -= chunksize;
- i++;
- }
- return copied;
-}
-
/*
* Support for read() - Find the page attached to f_mapping and copy out the
- * data. Its *very* similar to generic_file_buffered_read(), we can't use that
- * since it has PAGE_SIZE assumptions.
+ * data. This provides functionality similar to filemap_read().
*/
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -363,7 +331,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
/*
* We have the page, copy it to user space buffer.
*/
- copied = hugetlbfs_read_actor(page, offset, to, nr);
+ copied = copy_page_to_iter(page, offset, nr, to);
put_page(page);
}
offset += copied;
@@ -1082,7 +1050,7 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = huge_page_size(h);
if (sbinfo) {
spin_lock(&sbinfo->stat_lock);
- /* If no limits set, just report 0 for max/free/used
+ /* If no limits set, just report 0 or -1 for max/free/used
* blocks, like simple_statfs() */
if (sbinfo->spool) {
long free_pages;
@@ -1309,7 +1277,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
ps = memparse(param->string, &rest);
ctx->hstate = size_to_hstate(ps);
if (!ctx->hstate) {
- pr_err("Unsupported page size %lu MB\n", ps >> 20);
+ pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
return -EINVAL;
}
return 0;
@@ -1385,7 +1353,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
/*
* Allocate and initialize subpool if maximum or minimum size is
* specified. Any needed reservations (for minimum size) are taken
- * taken when the subpool is created.
+ * when the subpool is created.
*/
if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
sbinfo->spool = hugepage_new_subpool(ctx->hstate,
@@ -1555,7 +1523,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
}
if (IS_ERR(mnt))
pr_err("Cannot mount internal hugetlbfs for page size %luK",
- huge_page_size(h) >> 10);
+ huge_page_size(h) / SZ_1K);
return mnt;
}
diff --git a/fs/inode.c b/fs/inode.c
index 524ee91f74a6..ba1de23c13c1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -422,6 +422,7 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
+ INIT_LIST_HEAD(&inode->i_sb_list);
__address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
}
@@ -1021,7 +1022,6 @@ struct inode *new_inode_pseudo(struct super_block *sb)
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
- INIT_LIST_HEAD(&inode->i_sb_list);
}
return inode;
}
@@ -1165,7 +1165,6 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
- bool creating = inode->i_state & I_CREATING;
again:
spin_lock(&inode_hash_lock);
@@ -1199,7 +1198,12 @@ again:
inode->i_state |= I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- if (!creating)
+
+ /*
+ * Add inode to the sb list if it's not already. It has I_NEW at this
+ * point, so it should be safe to test i_sb_list locklessly.
+ */
+ if (list_empty(&inode->i_sb_list))
inode_sb_list_add(inode);
unlock:
spin_unlock(&inode_hash_lock);
@@ -2014,23 +2018,25 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
- int error;
+ int error = 0;
int kill;
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
kill = dentry_needs_remove_privs(dentry);
- if (kill <= 0)
+ if (kill < 0)
return kill;
- if (flags & IOCB_NOWAIT)
- return -EAGAIN;
+ if (kill) {
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
+ }
- error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
if (!error)
inode_has_no_xattr(inode);
-
return error;
}
@@ -2326,10 +2332,6 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
/* Directories are special, and always inherit S_ISGID */
if (S_ISDIR(mode))
mode |= S_ISGID;
- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
- !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
- !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
- mode &= ~S_ISGID;
} else
inode_fsgid_set(inode, mnt_userns);
inode->i_mode = mode;
@@ -2485,3 +2487,33 @@ struct timespec64 current_time(struct inode *inode)
return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);
+
+/**
+ * mode_strip_sgid - handle the sgid bit for non-directories
+ * @mnt_userns: User namespace of the mount the inode was created from
+ * @dir: parent directory inode
+ * @mode: mode of the file to be created in @dir
+ *
+ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit
+ * raised and @dir has the S_ISGID bit raised ensure that the caller is
+ * either in the group of the parent directory or they have CAP_FSETID
+ * in their user namespace and are privileged over the parent directory.
+ * In all other cases, strip the S_ISGID bit from @mode.
+ *
+ * Return: the new mode to use for the file
+ */
+umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
+ const struct inode *dir, umode_t mode)
+{
+ if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
+ return mode;
+ if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
+ return mode;
+ if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
+ return mode;
+ if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+ return mode;
+
+ return mode & ~S_ISGID;
+}
+EXPORT_SYMBOL(mode_strip_sgid);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 2b82c7f1de88..ca5c62901541 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1529,21 +1529,6 @@ unlock:
}
int
-iomap_writepage(struct page *page, struct writeback_control *wbc,
- struct iomap_writepage_ctx *wpc,
- const struct iomap_writeback_ops *ops)
-{
- int ret;
-
- wpc->ops = ops;
- ret = iomap_do_writepage(page, wbc, wpc);
- if (!wpc->ioend)
- return ret;
- return iomap_submit_ioend(wpc, wpc->ioend, ret);
-}
-EXPORT_SYMBOL_GPL(iomap_writepage);
-
-int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index c75d33d5c3ce..4eb559a16c9e 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -533,7 +533,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_NOWAIT;
}
- if (iter_is_iovec(iter))
+ if (user_backed_iter(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
iomi.flags |= IOMAP_WRITE;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 746132998c57..51bd38da21cd 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -203,7 +203,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
tid_t this_tid;
int result, batch_count = 0;
- jbd_debug(1, "Start checkpoint\n");
+ jbd2_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
@@ -212,7 +212,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
*/
result = jbd2_cleanup_journal_tail(journal);
trace_jbd2_checkpoint(journal, result);
- jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+ jbd2_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
@@ -804,5 +804,5 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
trace_jbd2_drop_transaction(journal, transaction);
- jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+ jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 890b5543a1c5..b2b2bc9b88d9 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -421,7 +421,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Do we need to erase the effects of a prior jbd2_journal_flush? */
if (journal->j_flags & JBD2_FLUSHED) {
- jbd_debug(3, "super block updated\n");
+ jbd2_debug(3, "super block updated\n");
mutex_lock_io(&journal->j_checkpoint_mutex);
/*
* We hold j_checkpoint_mutex so tail cannot change under us.
@@ -435,7 +435,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
REQ_SYNC);
mutex_unlock(&journal->j_checkpoint_mutex);
} else {
- jbd_debug(3, "superblock not updated\n");
+ jbd2_debug(3, "superblock not updated\n");
}
J_ASSERT(journal->j_running_transaction != NULL);
@@ -467,7 +467,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction = journal->j_running_transaction;
trace_jbd2_start_commit(journal, commit_transaction);
- jbd_debug(1, "JBD2: starting commit of transaction %d\n",
+ jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
@@ -540,7 +540,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
__jbd2_journal_clean_checkpoint_list(journal, false);
spin_unlock(&journal->j_list_lock);
- jbd_debug(3, "JBD2: commit phase 1\n");
+ jbd2_debug(3, "JBD2: commit phase 1\n");
/*
* Clear revoked flag to reflect there is no revoked buffers
@@ -553,13 +553,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ write_lock(&journal->j_state_lock);
/*
* Reserved credits cannot be claimed anymore, free them
*/
atomic_sub(atomic_read(&journal->j_reserved_credits),
&commit_transaction->t_outstanding_credits);
- write_lock(&journal->j_state_lock);
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -573,7 +573,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug(3, "JBD2: commit phase 2a\n");
+ jbd2_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -586,7 +586,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
- jbd_debug(3, "JBD2: commit phase 2b\n");
+ jbd2_debug(3, "JBD2: commit phase 2b\n");
/*
* Way to go: we have now written out all of the data for a
@@ -642,7 +642,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (!descriptor) {
J_ASSERT (bufs == 0);
- jbd_debug(4, "JBD2: get descriptor\n");
+ jbd2_debug(4, "JBD2: get descriptor\n");
descriptor = jbd2_journal_get_descriptor_buffer(
commit_transaction,
@@ -652,7 +652,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
continue;
}
- jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
+ jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
(unsigned long long)descriptor->b_blocknr,
descriptor->b_data);
tagp = &descriptor->b_data[sizeof(journal_header_t)];
@@ -737,7 +737,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_buffers == NULL ||
space_left < tag_bytes + 16 + csum_size) {
- jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
+ jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
/* Write an end-of-descriptor marker before
submitting the IOs. "tag" still points to
@@ -839,7 +839,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD2: commit phase 3\n");
+ jbd2_debug(3, "JBD2: commit phase 3\n");
while (!list_empty(&io_bufs)) {
struct buffer_head *bh = list_entry(io_bufs.prev,
@@ -882,7 +882,7 @@ start_journal_io:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD2: commit phase 4\n");
+ jbd2_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
while (!list_empty(&log_bufs)) {
@@ -906,7 +906,7 @@ start_journal_io:
if (err)
jbd2_journal_abort(journal, err);
- jbd_debug(3, "JBD2: commit phase 5\n");
+ jbd2_debug(3, "JBD2: commit phase 5\n");
write_lock(&journal->j_state_lock);
J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -945,7 +945,7 @@ start_journal_io:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD2: commit phase 6\n");
+ jbd2_debug(3, "JBD2: commit phase 6\n");
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -1122,7 +1122,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD2: commit phase 7\n");
+ jbd2_debug(3, "JBD2: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
@@ -1164,7 +1164,7 @@ restart_loop:
journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
trace_jbd2_end_commit(journal, commit_transaction);
- jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+ jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
write_lock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2a1b9da7c3e3..6350d3857c89 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -49,8 +49,7 @@
#include <asm/page.h>
#ifdef CONFIG_JBD2_DEBUG
-ushort jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
+static ushort jbd2_journal_enable_debug __read_mostly;
module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
@@ -81,7 +80,6 @@ EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
-EXPORT_SYMBOL(jbd2_log_start_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -115,7 +113,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
va_end(args);
}
-EXPORT_SYMBOL(__jbd2_debug);
#endif
/* Checksumming functions */
@@ -203,11 +200,11 @@ loop:
if (journal->j_flags & JBD2_UNMOUNT)
goto end_loop;
- jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
+ jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
journal->j_commit_sequence, journal->j_commit_request);
if (journal->j_commit_sequence != journal->j_commit_request) {
- jbd_debug(1, "OK, requests differ\n");
+ jbd2_debug(1, "OK, requests differ\n");
write_unlock(&journal->j_state_lock);
del_timer_sync(&journal->j_commit_timer);
jbd2_journal_commit_transaction(journal);
@@ -222,7 +219,7 @@ loop:
* good idea, because that depends on threads that may
* be already stopped.
*/
- jbd_debug(1, "Now suspending kjournald2\n");
+ jbd2_debug(1, "Now suspending kjournald2\n");
write_unlock(&journal->j_state_lock);
try_to_freeze();
write_lock(&journal->j_state_lock);
@@ -252,7 +249,7 @@ loop:
finish_wait(&journal->j_wait_commit, &wait);
}
- jbd_debug(1, "kjournald2 wakes\n");
+ jbd2_debug(1, "kjournald2 wakes\n");
/*
* Were we woken up by a commit wakeup event?
@@ -260,7 +257,7 @@ loop:
transaction = journal->j_running_transaction;
if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
journal->j_commit_request = transaction->t_tid;
- jbd_debug(1, "woke because of timeout\n");
+ jbd2_debug(1, "woke because of timeout\n");
}
goto loop;
@@ -268,7 +265,7 @@ end_loop:
del_timer_sync(&journal->j_commit_timer);
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
- jbd_debug(1, "Journal thread exiting.\n");
+ jbd2_debug(1, "Journal thread exiting.\n");
write_unlock(&journal->j_state_lock);
return 0;
}
@@ -481,7 +478,7 @@ repeat:
* Called with j_state_lock locked for writing.
* Returns true if a transaction commit was started.
*/
-int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+static int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
/* Return if the txn has already requested to be committed */
if (journal->j_commit_request == target)
@@ -500,7 +497,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
*/
journal->j_commit_request = target;
- jbd_debug(1, "JBD2: requesting commit %u/%u\n",
+ jbd2_debug(1, "JBD2: requesting commit %u/%u\n",
journal->j_commit_request,
journal->j_commit_sequence);
journal->j_running_transaction->t_requested = jiffies;
@@ -705,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
}
#endif
while (tid_gt(tid, journal->j_commit_sequence)) {
- jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
+ jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
tid, journal->j_commit_sequence);
read_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_commit);
@@ -1117,7 +1114,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
freed += journal->j_last - journal->j_first;
trace_jbd2_update_log_tail(journal, tid, block, freed);
- jbd_debug(1,
+ jbd2_debug(1,
"Cleaning journal tail from %u to %u (offset %lu), "
"freeing %lu\n",
journal->j_tail_sequence, tid, block, freed);
@@ -1418,7 +1415,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
goto err_cleanup;
- if (register_shrinker(&journal->j_shrinker)) {
+ if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
+ MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) {
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
goto err_cleanup;
}
@@ -1497,7 +1495,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
return NULL;
}
- jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
+ jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
@@ -1577,7 +1575,7 @@ static int journal_reset(journal_t *journal)
* attempting a write to a potential-readonly device.
*/
if (sb->s_start == 0) {
- jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
+ jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb "
"(start %ld, seq %u, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
@@ -1681,7 +1679,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
}
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+ jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
tail_block, tail_tid);
lock_buffer(journal->j_sb_buffer);
@@ -1722,7 +1720,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
return;
}
- jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
+ jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1865,7 +1863,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
errcode = journal->j_errno;
if (errcode == -ESHUTDOWN)
errcode = 0;
- jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
+ jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
sb->s_errno = cpu_to_be32(errcode);
jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
@@ -2337,7 +2335,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
compat & JBD2_FEATURE_COMPAT_CHECKSUM)
compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
- jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+ jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
@@ -2406,7 +2404,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
{
journal_superblock_t *sb;
- jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+ jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
@@ -2863,7 +2861,7 @@ static struct journal_head *journal_alloc_journal_head(void)
#endif
ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
if (!ret) {
- jbd_debug(1, "out of memory for journal_head\n");
+ jbd2_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index e699d6ab2c0e..f548479615c6 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -245,11 +245,11 @@ static int fc_do_one_pass(journal_t *journal,
return 0;
while (next_fc_block <= journal->j_fc_last) {
- jbd_debug(3, "Fast commit replay: next block %ld\n",
+ jbd2_debug(3, "Fast commit replay: next block %ld\n",
next_fc_block);
err = jread(&bh, journal, next_fc_block);
if (err) {
- jbd_debug(3, "Fast commit replay: read error\n");
+ jbd2_debug(3, "Fast commit replay: read error\n");
break;
}
@@ -263,7 +263,7 @@ static int fc_do_one_pass(journal_t *journal,
}
if (err)
- jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
+ jbd2_debug(3, "Fast commit replay failed, err = %d\n", err);
return err;
}
@@ -297,7 +297,7 @@ int jbd2_journal_recover(journal_t *journal)
*/
if (!sb->s_start) {
- jbd_debug(1, "No recovery required, last transaction %d\n",
+ jbd2_debug(1, "No recovery required, last transaction %d\n",
be32_to_cpu(sb->s_sequence));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
return 0;
@@ -309,10 +309,10 @@ int jbd2_journal_recover(journal_t *journal)
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
- jbd_debug(1, "JBD2: recovery, exit status %d, "
+ jbd2_debug(1, "JBD2: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
- jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
+ jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
@@ -362,7 +362,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
#ifdef CONFIG_JBD2_DEBUG
int dropped = info.end_transaction -
be32_to_cpu(journal->j_superblock->s_sequence);
- jbd_debug(1,
+ jbd2_debug(1,
"JBD2: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
#endif
@@ -484,7 +484,7 @@ static int do_one_pass(journal_t *journal,
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
- jbd_debug(1, "Starting recovery pass %d\n", pass);
+ jbd2_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
@@ -510,7 +510,7 @@ static int do_one_pass(journal_t *journal,
if (tid_geq(next_commit_ID, info->end_transaction))
break;
- jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block,
jbd2_has_feature_fast_commit(journal) ?
journal->j_fc_last : journal->j_last);
@@ -519,7 +519,7 @@ static int do_one_pass(journal_t *journal,
* either the next descriptor block or the final commit
* record. */
- jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
+ jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -542,7 +542,7 @@ static int do_one_pass(journal_t *journal,
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
- jbd_debug(3, "Found magic %d, sequence %d\n",
+ jbd2_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
if (sequence != next_commit_ID) {
@@ -575,7 +575,7 @@ static int do_one_pass(journal_t *journal,
goto failed;
}
need_check_commit_time = true;
- jbd_debug(1,
+ jbd2_debug(1,
"invalid descriptor block found in %lu\n",
next_log_block);
}
@@ -758,7 +758,7 @@ static int do_one_pass(journal_t *journal,
* It likely does not belong to same journal,
* just end this recovery with success.
*/
- jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
+ jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
next_commit_ID);
brelse(bh);
goto done;
@@ -826,7 +826,7 @@ static int do_one_pass(journal_t *journal,
if (pass == PASS_SCAN &&
!jbd2_descriptor_block_csum_verify(journal,
bh->b_data)) {
- jbd_debug(1, "JBD2: invalid revoke block found in %lu\n",
+ jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n",
next_log_block);
need_check_commit_time = true;
}
@@ -845,7 +845,7 @@ static int do_one_pass(journal_t *journal,
continue;
default:
- jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+ jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
brelse(bh);
goto done;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index fa608788b93d..4556e4689024 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -398,7 +398,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
}
handle->h_revoke_credits--;
- jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
+ jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
@@ -428,7 +428,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
- jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+ jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
/* Is the existing Revoke bit valid? If so, we trust it, and
* only perform the full cancel if the revoke bit is set. If
@@ -444,7 +444,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
if (need_cancel) {
record = find_revoke_record(journal, bh->b_blocknr);
if (record) {
- jbd_debug(4, "cancelled existing revoke on "
+ jbd2_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
spin_lock(&journal->j_revoke_lock);
list_del(&record->hash);
@@ -560,7 +560,7 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction,
}
if (descriptor)
flush_descriptor(journal, descriptor, offset);
- jbd_debug(1, "Wrote %d revoke records\n", count);
+ jbd2_debug(1, "Wrote %d revoke records\n", count);
}
/*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e9c308ae475f..e1be93ccd81c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -373,7 +373,7 @@ alloc_transaction:
return -ENOMEM;
}
- jbd_debug(3, "New handle %p going live.\n", handle);
+ jbd2_debug(3, "New handle %p going live.\n", handle);
/*
* We need to hold j_state_lock until t_updates has been incremented,
@@ -453,7 +453,7 @@ repeat:
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
- jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+ jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
handle, blocks,
atomic_read(&transaction->t_outstanding_credits),
jbd2_log_space_left(journal));
@@ -674,7 +674,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
/* Don't extend a locked-down transaction! */
if (transaction->t_state != T_RUNNING) {
- jbd_debug(3, "denied handle %p %d blocks: "
+ jbd2_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
@@ -689,7 +689,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
&transaction->t_outstanding_credits);
if (wanted > journal->j_max_transaction_buffers) {
- jbd_debug(3, "denied handle %p %d blocks: "
+ jbd2_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto error_out;
@@ -707,7 +707,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
handle->h_revoke_credits_requested += revoke_records;
result = 0;
- jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+ jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
error_out:
read_unlock(&journal->j_state_lock);
return result;
@@ -795,7 +795,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
* First unlink the handle from its current transaction, and start the
* commit on that.
*/
- jbd_debug(2, "restarting handle %p\n", handle);
+ jbd2_debug(2, "restarting handle %p\n", handle);
stop_this_handle(handle);
handle->h_transaction = NULL;
@@ -979,7 +979,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
journal = transaction->t_journal;
- jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
+ jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
JBUFFER_TRACE(jh, "entry");
repeat:
@@ -1271,7 +1271,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
int err;
- jbd_debug(5, "journal_head %p\n", jh);
+ jbd2_debug(5, "journal_head %p\n", jh);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
@@ -1486,8 +1486,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int ret = 0;
- if (is_handle_aborted(handle))
- return -EROFS;
if (!buffer_jbd(bh))
return -EUCLEAN;
@@ -1496,7 +1494,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
* of the running transaction.
*/
jh = bh2jh(bh);
- jbd_debug(5, "journal_head %p\n", jh);
+ jbd2_debug(5, "journal_head %p\n", jh);
JBUFFER_TRACE(jh, "entry");
/*
@@ -1534,6 +1532,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
journal = transaction->t_journal;
spin_lock(&jh->b_state_lock);
+ if (is_handle_aborted(handle)) {
+ /*
+ * Check journal aborting with @jh->b_state_lock locked,
+ * since 'jh->b_transaction' could be replaced with
+ * 'jh->b_next_transaction' during old transaction
+ * committing if journal aborted, which may fail
+ * assertion on 'jh->b_frozen_data == NULL'.
+ */
+ ret = -EROFS;
+ goto out_unlock_bh;
+ }
+
if (jh->b_modified == 0) {
/*
* This buffer's got modified and becoming part
@@ -1818,7 +1828,7 @@ int jbd2_journal_stop(handle_t *handle)
pid_t pid;
if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
handle->h_ref);
if (is_handle_aborted(handle))
return -EIO;
@@ -1838,7 +1848,7 @@ int jbd2_journal_stop(handle_t *handle)
if (is_handle_aborted(handle))
err = -EIO;
- jbd_debug(4, "Handle %p going down\n", handle);
+ jbd2_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
tid, handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
@@ -1916,7 +1926,7 @@ int jbd2_journal_stop(handle_t *handle)
* completes the commit thread, it just doesn't write
* anything to disk. */
- jbd_debug(2, "transaction too old, requesting commit for "
+ jbd2_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
jbd2_log_start_commit(journal, tid);
@@ -2662,7 +2672,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
return -EROFS;
journal = transaction->t_journal;
- jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);
spin_lock(&journal->j_list_lock);
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 1b07550485b9..5d826274570c 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -29,15 +29,15 @@
* change between calls to kernel_read_file().
*
* Returns number of bytes read (no single read will be bigger
- * than INT_MAX), or negative on error.
+ * than SSIZE_MAX), or negative on error.
*
*/
-int kernel_read_file(struct file *file, loff_t offset, void **buf,
- size_t buf_size, size_t *file_size,
- enum kernel_read_file_id id)
+ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
{
loff_t i_size, pos;
- size_t copied;
+ ssize_t copied;
void *allocated = NULL;
bool whole_file;
int ret;
@@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf,
goto out;
}
/* The file is too big for sane activities. */
- if (i_size > INT_MAX) {
+ if (i_size > SSIZE_MAX) {
ret = -EFBIG;
goto out;
}
@@ -124,12 +124,12 @@ out:
}
EXPORT_SYMBOL_GPL(kernel_read_file);
-int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
- size_t buf_size, size_t *file_size,
- enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
{
struct file *file;
- int ret;
+ ssize_t ret;
if (!path || !*path)
return -EINVAL;
@@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
- void **buf, size_t buf_size,
- size_t *file_size,
- enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+ void **buf, size_t buf_size,
+ size_t *file_size,
+ enum kernel_read_file_id id)
{
struct file *file;
struct path root;
- int ret;
+ ssize_t ret;
if (!path || !*path)
return -EINVAL;
@@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset,
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
-int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
- size_t buf_size, size_t *file_size,
- enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
{
struct fd f = fdget(fd);
- int ret = -EBADF;
+ ssize_t ret = -EBADF;
if (!f.file || !(f.file->f_mode & FMODE_READ))
goto out;
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index 911444d21267..c5a5c7b90d72 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -121,8 +121,8 @@ out:
return rc;
}
-static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash,
- char *dname)
+static int calc_ntlmv2_hash(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ char *ntlmv2_hash, char *dname)
{
int ret, len, conv_len;
wchar_t *domain = NULL;
@@ -158,7 +158,7 @@ static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash,
}
conv_len = smb_strtoUTF16(uniname, user_name(sess->user), len,
- sess->conn->local_nls);
+ conn->local_nls);
if (conv_len < 0 || conv_len > len) {
ret = -EINVAL;
goto out;
@@ -182,7 +182,7 @@ static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash,
}
conv_len = smb_strtoUTF16((__le16 *)domain, dname, len,
- sess->conn->local_nls);
+ conn->local_nls);
if (conv_len < 0 || conv_len > len) {
ret = -EINVAL;
goto out;
@@ -215,8 +215,9 @@ out:
*
* Return: 0 on success, error number on error
*/
-int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
- int blen, char *domain_name, char *cryptkey)
+int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ struct ntlmv2_resp *ntlmv2, int blen, char *domain_name,
+ char *cryptkey)
{
char ntlmv2_hash[CIFS_ENCPWD_SIZE];
char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE];
@@ -230,7 +231,7 @@ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
return -ENOMEM;
}
- rc = calc_ntlmv2_hash(sess, ntlmv2_hash, domain_name);
+ rc = calc_ntlmv2_hash(conn, sess, ntlmv2_hash, domain_name);
if (rc) {
ksmbd_debug(AUTH, "could not get v2 hash rc %d\n", rc);
goto out;
@@ -333,7 +334,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
/* process NTLMv2 authentication */
ksmbd_debug(AUTH, "decode_ntlmssp_authenticate_blob dname%s\n",
domain_name);
- ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off),
+ ret = ksmbd_auth_ntlmv2(conn, sess,
+ (struct ntlmv2_resp *)((char *)authblob + nt_off),
nt_len - CIFS_ENCPWD_SIZE,
domain_name, conn->ntlmssp.cryptkey);
kfree(domain_name);
@@ -659,8 +661,9 @@ struct derivation {
bool binding;
};
-static int generate_key(struct ksmbd_session *sess, struct kvec label,
- struct kvec context, __u8 *key, unsigned int key_size)
+static int generate_key(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ struct kvec label, struct kvec context, __u8 *key,
+ unsigned int key_size)
{
unsigned char zero = 0x0;
__u8 i[4] = {0, 0, 0, 1};
@@ -720,8 +723,8 @@ static int generate_key(struct ksmbd_session *sess, struct kvec label,
goto smb3signkey_ret;
}
- if (sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
- sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ if (conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
+ conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), L256, 4);
else
rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), L128, 4);
@@ -756,17 +759,17 @@ static int generate_smb3signingkey(struct ksmbd_session *sess,
if (!chann)
return 0;
- if (sess->conn->dialect >= SMB30_PROT_ID && signing->binding)
+ if (conn->dialect >= SMB30_PROT_ID && signing->binding)
key = chann->smb3signingkey;
else
key = sess->smb3signingkey;
- rc = generate_key(sess, signing->label, signing->context, key,
+ rc = generate_key(conn, sess, signing->label, signing->context, key,
SMB3_SIGN_KEY_SIZE);
if (rc)
return rc;
- if (!(sess->conn->dialect >= SMB30_PROT_ID && signing->binding))
+ if (!(conn->dialect >= SMB30_PROT_ID && signing->binding))
memcpy(chann->smb3signingkey, key, SMB3_SIGN_KEY_SIZE);
ksmbd_debug(AUTH, "dumping generated AES signing keys\n");
@@ -820,30 +823,31 @@ struct derivation_twin {
struct derivation decryption;
};
-static int generate_smb3encryptionkey(struct ksmbd_session *sess,
+static int generate_smb3encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess,
const struct derivation_twin *ptwin)
{
int rc;
- rc = generate_key(sess, ptwin->encryption.label,
+ rc = generate_key(conn, sess, ptwin->encryption.label,
ptwin->encryption.context, sess->smb3encryptionkey,
SMB3_ENC_DEC_KEY_SIZE);
if (rc)
return rc;
- rc = generate_key(sess, ptwin->decryption.label,
+ rc = generate_key(conn, sess, ptwin->decryption.label,
ptwin->decryption.context,
sess->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE);
if (rc)
return rc;
ksmbd_debug(AUTH, "dumping generated AES encryption keys\n");
- ksmbd_debug(AUTH, "Cipher type %d\n", sess->conn->cipher_type);
+ ksmbd_debug(AUTH, "Cipher type %d\n", conn->cipher_type);
ksmbd_debug(AUTH, "Session Id %llu\n", sess->id);
ksmbd_debug(AUTH, "Session Key %*ph\n",
SMB2_NTLMV2_SESSKEY_SIZE, sess->sess_key);
- if (sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
- sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) {
+ if (conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
+ conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) {
ksmbd_debug(AUTH, "ServerIn Key %*ph\n",
SMB3_GCM256_CRYPTKEY_SIZE, sess->smb3encryptionkey);
ksmbd_debug(AUTH, "ServerOut Key %*ph\n",
@@ -857,7 +861,8 @@ static int generate_smb3encryptionkey(struct ksmbd_session *sess,
return 0;
}
-int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess)
+int ksmbd_gen_smb30_encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess)
{
struct derivation_twin twin;
struct derivation *d;
@@ -874,10 +879,11 @@ int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess)
d->context.iov_base = "ServerIn ";
d->context.iov_len = 10;
- return generate_smb3encryptionkey(sess, &twin);
+ return generate_smb3encryptionkey(conn, sess, &twin);
}
-int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess)
+int ksmbd_gen_smb311_encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess)
{
struct derivation_twin twin;
struct derivation *d;
@@ -894,7 +900,7 @@ int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess)
d->context.iov_base = sess->Preauth_HashValue;
d->context.iov_len = 64;
- return generate_smb3encryptionkey(sess, &twin);
+ return generate_smb3encryptionkey(conn, sess, &twin);
}
int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h
index 95629651cf26..25b772653de0 100644
--- a/fs/ksmbd/auth.h
+++ b/fs/ksmbd/auth.h
@@ -38,8 +38,9 @@ struct kvec;
int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
unsigned int nvec, int enc);
void ksmbd_copy_gss_neg_header(void *buf);
-int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
- int blen, char *domain_name, char *cryptkey);
+int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ struct ntlmv2_resp *ntlmv2, int blen, char *domain_name,
+ char *cryptkey);
int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
int blob_len, struct ksmbd_conn *conn,
struct ksmbd_session *sess);
@@ -58,8 +59,10 @@ int ksmbd_gen_smb30_signingkey(struct ksmbd_session *sess,
struct ksmbd_conn *conn);
int ksmbd_gen_smb311_signingkey(struct ksmbd_session *sess,
struct ksmbd_conn *conn);
-int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess);
-int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess);
+int ksmbd_gen_smb30_encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess);
+int ksmbd_gen_smb311_encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess);
int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
__u8 *pi_hash);
int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len,
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index e8f476c5f189..756ad631c019 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -36,6 +36,7 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)
list_del(&conn->conns_list);
write_unlock(&conn_list_lock);
+ xa_destroy(&conn->sessions);
kvfree(conn->request_buf);
kfree(conn->preauth_info);
kfree(conn);
@@ -65,13 +66,14 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->outstanding_credits = 0;
init_waitqueue_head(&conn->req_running_q);
+ init_waitqueue_head(&conn->r_count_q);
INIT_LIST_HEAD(&conn->conns_list);
- INIT_LIST_HEAD(&conn->sessions);
INIT_LIST_HEAD(&conn->requests);
INIT_LIST_HEAD(&conn->async_requests);
spin_lock_init(&conn->request_lock);
spin_lock_init(&conn->credits_lock);
ida_init(&conn->async_ida);
+ xa_init(&conn->sessions);
spin_lock_init(&conn->llist_lock);
INIT_LIST_HEAD(&conn->lock_list);
@@ -164,7 +166,6 @@ int ksmbd_conn_write(struct ksmbd_work *work)
struct kvec iov[3];
int iov_idx = 0;
- ksmbd_conn_try_dequeue_request(work);
if (!work->response_buf) {
pr_err("NULL response header\n");
return -EINVAL;
@@ -346,8 +347,8 @@ int ksmbd_conn_handler_loop(void *p)
out:
/* Wait till all reference dropped to the Server object*/
- while (atomic_read(&conn->r_count) > 0)
- schedule_timeout(HZ);
+ wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0);
+
unload_nls(conn->local_nls);
if (default_conn_ops.terminate_fn)
diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h
index 98c1cbe45ec9..e7f7d5707951 100644
--- a/fs/ksmbd/connection.h
+++ b/fs/ksmbd/connection.h
@@ -20,13 +20,6 @@
#define KSMBD_SOCKET_BACKLOG 16
-/*
- * WARNING
- *
- * This is nothing but a HACK. Session status should move to channel
- * or to session. As of now we have 1 tcp_conn : 1 ksmbd_session, but
- * we need to change it to 1 tcp_conn : N ksmbd_sessions.
- */
enum {
KSMBD_SESS_NEW = 0,
KSMBD_SESS_GOOD,
@@ -55,7 +48,7 @@ struct ksmbd_conn {
struct nls_table *local_nls;
struct list_head conns_list;
/* smb session 1 per user */
- struct list_head sessions;
+ struct xarray sessions;
unsigned long last_active;
/* How many request are running currently */
atomic_t req_running;
@@ -65,6 +58,7 @@ struct ksmbd_conn {
unsigned int outstanding_credits;
spinlock_t credits_lock;
wait_queue_head_t req_running_q;
+ wait_queue_head_t r_count_q;
/* Lock to protect requests list*/
spinlock_t request_lock;
struct list_head requests;
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index 52aa0adeb951..e0cbcfa98c7e 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -349,6 +349,7 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_SHARE_FLAG_STREAMS BIT(11)
#define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12)
#define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13)
+#define KSMBD_SHARE_FLAG_UPDATE BIT(14)
/*
* Tree connect request flags.
@@ -364,6 +365,7 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_TREE_CONN_FLAG_READ_ONLY BIT(1)
#define KSMBD_TREE_CONN_FLAG_WRITABLE BIT(2)
#define KSMBD_TREE_CONN_FLAG_ADMIN_ACCOUNT BIT(3)
+#define KSMBD_TREE_CONN_FLAG_UPDATE BIT(4)
/*
* RPC over IPC.
diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c
index cb72d30f5b71..c9bca1c2c834 100644
--- a/fs/ksmbd/mgmt/share_config.c
+++ b/fs/ksmbd/mgmt/share_config.c
@@ -51,12 +51,16 @@ static void kill_share(struct ksmbd_share_config *share)
kfree(share);
}
-void __ksmbd_share_config_put(struct ksmbd_share_config *share)
+void ksmbd_share_config_del(struct ksmbd_share_config *share)
{
down_write(&shares_table_lock);
hash_del(&share->hlist);
up_write(&shares_table_lock);
+}
+void __ksmbd_share_config_put(struct ksmbd_share_config *share)
+{
+ ksmbd_share_config_del(share);
kill_share(share);
}
@@ -222,17 +226,3 @@ bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
}
return false;
}
-
-void ksmbd_share_configs_cleanup(void)
-{
- struct ksmbd_share_config *share;
- struct hlist_node *tmp;
- int i;
-
- down_write(&shares_table_lock);
- hash_for_each_safe(shares_table, i, tmp, share, hlist) {
- hash_del(&share->hlist);
- kill_share(share);
- }
- up_write(&shares_table_lock);
-}
diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h
index 953befc94e84..902f2cb1963a 100644
--- a/fs/ksmbd/mgmt/share_config.h
+++ b/fs/ksmbd/mgmt/share_config.h
@@ -64,6 +64,7 @@ static inline int test_share_config_flag(struct ksmbd_share_config *share,
return share->flags & flag;
}
+void ksmbd_share_config_del(struct ksmbd_share_config *share);
void __ksmbd_share_config_put(struct ksmbd_share_config *share);
static inline void ksmbd_share_config_put(struct ksmbd_share_config *share)
@@ -76,6 +77,4 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share)
struct ksmbd_share_config *ksmbd_share_config_get(char *name);
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
const char *filename);
-void ksmbd_share_configs_cleanup(void);
-
#endif /* __SHARE_CONFIG_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c
index 0d28e723a28c..97ab7987df6e 100644
--- a/fs/ksmbd/mgmt/tree_connect.c
+++ b/fs/ksmbd/mgmt/tree_connect.c
@@ -16,9 +16,10 @@
#include "user_session.h"
struct ksmbd_tree_conn_status
-ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name)
+ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ char *share_name)
{
- struct ksmbd_tree_conn_status status = {-EINVAL, NULL};
+ struct ksmbd_tree_conn_status status = {-ENOENT, NULL};
struct ksmbd_tree_connect_response *resp = NULL;
struct ksmbd_share_config *sc;
struct ksmbd_tree_connect *tree_conn = NULL;
@@ -41,7 +42,7 @@ ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name)
goto out_error;
}
- peer_addr = KSMBD_TCP_PEER_SOCKADDR(sess->conn);
+ peer_addr = KSMBD_TCP_PEER_SOCKADDR(conn);
resp = ksmbd_ipc_tree_connect_request(sess,
sc,
tree_conn,
@@ -56,6 +57,20 @@ ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name)
goto out_error;
tree_conn->flags = resp->connection_flags;
+ if (test_tree_conn_flag(tree_conn, KSMBD_TREE_CONN_FLAG_UPDATE)) {
+ struct ksmbd_share_config *new_sc;
+
+ ksmbd_share_config_del(sc);
+ new_sc = ksmbd_share_config_get(share_name);
+ if (!new_sc) {
+ pr_err("Failed to update stale share config\n");
+ status.ret = -ESTALE;
+ goto out_error;
+ }
+ ksmbd_share_config_put(sc);
+ sc = new_sc;
+ }
+
tree_conn->user = sess->user;
tree_conn->share_conf = sc;
status.tree_conn = tree_conn;
diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h
index 18e2a996e0aa..71e50271dccf 100644
--- a/fs/ksmbd/mgmt/tree_connect.h
+++ b/fs/ksmbd/mgmt/tree_connect.h
@@ -12,6 +12,7 @@
struct ksmbd_share_config;
struct ksmbd_user;
+struct ksmbd_conn;
struct ksmbd_tree_connect {
int id;
@@ -40,7 +41,8 @@ static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn,
struct ksmbd_session;
struct ksmbd_tree_conn_status
-ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name);
+ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ char *share_name);
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tree_conn);
diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c
index 8d8ffd8c6f19..3fa2139a0b30 100644
--- a/fs/ksmbd/mgmt/user_session.c
+++ b/fs/ksmbd/mgmt/user_session.c
@@ -32,11 +32,13 @@ static void free_channel_list(struct ksmbd_session *sess)
{
struct channel *chann, *tmp;
+ write_lock(&sess->chann_lock);
list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list,
chann_list) {
list_del(&chann->chann_list);
kfree(chann);
}
+ write_unlock(&sess->chann_lock);
}
static void __session_rpc_close(struct ksmbd_session *sess,
@@ -149,11 +151,6 @@ void ksmbd_session_destroy(struct ksmbd_session *sess)
if (!sess)
return;
- if (!atomic_dec_and_test(&sess->refcnt))
- return;
-
- list_del(&sess->sessions_entry);
-
down_write(&sessions_table_lock);
hash_del(&sess->hlist);
up_write(&sessions_table_lock);
@@ -181,53 +178,70 @@ static struct ksmbd_session *__session_lookup(unsigned long long id)
return NULL;
}
-void ksmbd_session_register(struct ksmbd_conn *conn,
- struct ksmbd_session *sess)
+int ksmbd_session_register(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess)
{
- sess->conn = conn;
- list_add(&sess->sessions_entry, &conn->sessions);
+ sess->dialect = conn->dialect;
+ memcpy(sess->ClientGUID, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
+ return xa_err(xa_store(&conn->sessions, sess->id, sess, GFP_KERNEL));
}
-void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
+static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess)
{
- struct ksmbd_session *sess;
-
- while (!list_empty(&conn->sessions)) {
- sess = list_entry(conn->sessions.next,
- struct ksmbd_session,
- sessions_entry);
+ struct channel *chann, *tmp;
- ksmbd_session_destroy(sess);
+ write_lock(&sess->chann_lock);
+ list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list,
+ chann_list) {
+ if (chann->conn == conn) {
+ list_del(&chann->chann_list);
+ kfree(chann);
+ write_unlock(&sess->chann_lock);
+ return 0;
+ }
}
-}
+ write_unlock(&sess->chann_lock);
-static bool ksmbd_session_id_match(struct ksmbd_session *sess,
- unsigned long long id)
-{
- return sess->id == id;
+ return -ENOENT;
}
-struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
- unsigned long long id)
+void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
{
- struct ksmbd_session *sess = NULL;
+ struct ksmbd_session *sess;
- list_for_each_entry(sess, &conn->sessions, sessions_entry) {
- if (ksmbd_session_id_match(sess, id))
- return sess;
+ if (conn->binding) {
+ int bkt;
+
+ down_write(&sessions_table_lock);
+ hash_for_each(sessions_table, bkt, sess, hlist) {
+ if (!ksmbd_chann_del(conn, sess)) {
+ up_write(&sessions_table_lock);
+ goto sess_destroy;
+ }
+ }
+ up_write(&sessions_table_lock);
+ } else {
+ unsigned long id;
+
+ xa_for_each(&conn->sessions, id, sess) {
+ if (!ksmbd_chann_del(conn, sess))
+ goto sess_destroy;
+ }
}
- return NULL;
-}
-int get_session(struct ksmbd_session *sess)
-{
- return atomic_inc_not_zero(&sess->refcnt);
+ return;
+
+sess_destroy:
+ if (list_empty(&sess->ksmbd_chann_list)) {
+ xa_erase(&conn->sessions, sess->id);
+ ksmbd_session_destroy(sess);
+ }
}
-void put_session(struct ksmbd_session *sess)
+struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
+ unsigned long long id)
{
- if (atomic_dec_and_test(&sess->refcnt))
- pr_err("get/%s seems to be mismatched.", __func__);
+ return xa_load(&conn->sessions, id);
}
struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id)
@@ -236,10 +250,6 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id)
down_read(&sessions_table_lock);
sess = __session_lookup(id);
- if (sess) {
- if (!get_session(sess))
- sess = NULL;
- }
up_read(&sessions_table_lock);
return sess;
@@ -253,6 +263,8 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
sess = ksmbd_session_lookup(conn, id);
if (!sess && conn->binding)
sess = ksmbd_session_lookup_slowpath(id);
+ if (sess && sess->state != SMB2_SESSION_VALID)
+ sess = NULL;
return sess;
}
@@ -314,12 +326,11 @@ static struct ksmbd_session *__session_create(int protocol)
goto error;
set_session_flag(sess, protocol);
- INIT_LIST_HEAD(&sess->sessions_entry);
xa_init(&sess->tree_conns);
INIT_LIST_HEAD(&sess->ksmbd_chann_list);
INIT_LIST_HEAD(&sess->rpc_handle_list);
sess->sequence_number = 1;
- atomic_set(&sess->refcnt, 1);
+ rwlock_init(&sess->chann_lock);
switch (protocol) {
case CIFDS_SESSION_FLAG_SMB2:
diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h
index e241f16a3851..8934b8ee275b 100644
--- a/fs/ksmbd/mgmt/user_session.h
+++ b/fs/ksmbd/mgmt/user_session.h
@@ -33,8 +33,10 @@ struct preauth_session {
struct ksmbd_session {
u64 id;
+ __u16 dialect;
+ char ClientGUID[SMB2_CLIENT_GUID_SIZE];
+
struct ksmbd_user *user;
- struct ksmbd_conn *conn;
unsigned int sequence_number;
unsigned int flags;
@@ -48,6 +50,7 @@ struct ksmbd_session {
char sess_key[CIFS_KEY_SIZE];
struct hlist_node hlist;
+ rwlock_t chann_lock;
struct list_head ksmbd_chann_list;
struct xarray tree_conns;
struct ida tree_conn_ida;
@@ -57,9 +60,7 @@ struct ksmbd_session {
__u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE];
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
- struct list_head sessions_entry;
struct ksmbd_file_table file_table;
- atomic_t refcnt;
};
static inline int test_session_flag(struct ksmbd_session *sess, int bit)
@@ -84,8 +85,8 @@ void ksmbd_session_destroy(struct ksmbd_session *sess);
struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id);
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
unsigned long long id);
-void ksmbd_session_register(struct ksmbd_conn *conn,
- struct ksmbd_session *sess);
+int ksmbd_session_register(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess);
void ksmbd_sessions_deregister(struct ksmbd_conn *conn);
struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
unsigned long long id);
@@ -100,6 +101,4 @@ void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id);
int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name);
void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id);
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id);
-int get_session(struct ksmbd_session *sess);
-void put_session(struct ksmbd_session *sess);
#endif /* __USER_SESSION_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 8b5560574d4c..9046cff4374b 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -30,6 +30,7 @@ static DEFINE_RWLOCK(lease_list_lock);
static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
u64 id, __u16 Tid)
{
+ struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess = work->sess;
struct oplock_info *opinfo;
@@ -38,7 +39,7 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
return NULL;
opinfo->sess = sess;
- opinfo->conn = sess->conn;
+ opinfo->conn = conn;
opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
opinfo->op_state = OPLOCK_STATE_NONE;
opinfo->pending_break = 0;
@@ -615,18 +616,13 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
struct ksmbd_file *fp;
fp = ksmbd_lookup_durable_fd(br_info->fid);
- if (!fp) {
- atomic_dec(&conn->r_count);
- ksmbd_free_work_struct(work);
- return;
- }
+ if (!fp)
+ goto out;
if (allocate_oplock_break_buf(work)) {
pr_err("smb2_allocate_rsp_buf failed! ");
- atomic_dec(&conn->r_count);
ksmbd_fd_put(work, fp);
- ksmbd_free_work_struct(work);
- return;
+ goto out;
}
rsp_hdr = smb2_get_msg(work->response_buf);
@@ -667,8 +663,16 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
ksmbd_fd_put(work, fp);
ksmbd_conn_write(work);
+
+out:
ksmbd_free_work_struct(work);
- atomic_dec(&conn->r_count);
+ /*
+ * Checking waitqueue to dropping pending requests on
+ * disconnection. waitqueue_active is safe because it
+ * uses atomic operation for condition.
+ */
+ if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
+ wake_up(&conn->r_count_q);
}
/**
@@ -731,9 +735,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
if (allocate_oplock_break_buf(work)) {
ksmbd_debug(OPLOCK, "smb2_allocate_rsp_buf failed! ");
- ksmbd_free_work_struct(work);
- atomic_dec(&conn->r_count);
- return;
+ goto out;
}
rsp_hdr = smb2_get_msg(work->response_buf);
@@ -771,8 +773,16 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
inc_rfc1001_len(work->response_buf, 44);
ksmbd_conn_write(work);
+
+out:
ksmbd_free_work_struct(work);
- atomic_dec(&conn->r_count);
+ /*
+ * Checking waitqueue to dropping pending requests on
+ * disconnection. waitqueue_active is safe because it
+ * uses atomic operation for condition.
+ */
+ if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
+ wake_up(&conn->r_count_q);
}
/**
@@ -972,7 +982,7 @@ int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
}
list_for_each_entry(lb, &lease_table_list, l_entry) {
- if (!memcmp(lb->client_guid, sess->conn->ClientGUID,
+ if (!memcmp(lb->client_guid, sess->ClientGUID,
SMB2_CLIENT_GUID_SIZE))
goto found;
}
@@ -988,7 +998,7 @@ found:
rcu_read_unlock();
if (opinfo->o_fp->f_ci == ci)
goto op_next;
- err = compare_guid_key(opinfo, sess->conn->ClientGUID,
+ err = compare_guid_key(opinfo, sess->ClientGUID,
lctx->lease_key);
if (err) {
err = -EINVAL;
@@ -1122,7 +1132,7 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
struct oplock_info *m_opinfo;
/* is lease already granted ? */
- m_opinfo = same_client_has_lease(ci, sess->conn->ClientGUID,
+ m_opinfo = same_client_has_lease(ci, sess->ClientGUID,
lctx);
if (m_opinfo) {
copy_lease(m_opinfo, opinfo);
@@ -1240,7 +1250,7 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
{
struct oplock_info *op, *brk_op;
struct ksmbd_inode *ci;
- struct ksmbd_conn *conn = work->sess->conn;
+ struct ksmbd_conn *conn = work->conn;
if (!test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_OPLOCKS))
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index 4cd03d661df0..ce42bff42ef9 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -261,7 +261,13 @@ static void handle_ksmbd_work(struct work_struct *wk)
ksmbd_conn_try_dequeue_request(work);
ksmbd_free_work_struct(work);
- atomic_dec(&conn->r_count);
+ /*
+ * Checking waitqueue to dropping pending requests on
+ * disconnection. waitqueue_active is safe because it
+ * uses atomic operation for condition.
+ */
+ if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
+ wake_up(&conn->r_count_q);
}
/**
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
index f8f456377a51..6e25ace36568 100644
--- a/fs/ksmbd/smb2misc.c
+++ b/fs/ksmbd/smb2misc.c
@@ -90,11 +90,6 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
*off = 0;
*len = 0;
- /* error reqeusts do not have data area */
- if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
- (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE)
- return ret;
-
/*
* Following commands have data areas so we have to get the location
* of the data buffer offset and data buffer length for the particular
@@ -136,8 +131,11 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
*len = le16_to_cpu(((struct smb2_read_req *)hdr)->ReadChannelInfoLength);
break;
case SMB2_WRITE:
- if (((struct smb2_write_req *)hdr)->DataOffset) {
- *off = le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset);
+ if (((struct smb2_write_req *)hdr)->DataOffset ||
+ ((struct smb2_write_req *)hdr)->Length) {
+ *off = max_t(unsigned int,
+ le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset),
+ offsetof(struct smb2_write_req, Buffer));
*len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
break;
}
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 353f047e783c..19412ac701a6 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -535,9 +535,10 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
struct smb2_query_info_req *req;
req = smb2_get_msg(work->request_buf);
- if (req->InfoType == SMB2_O_INFO_FILE &&
- (req->FileInfoClass == FILE_FULL_EA_INFORMATION ||
- req->FileInfoClass == FILE_ALL_INFORMATION))
+ if ((req->InfoType == SMB2_O_INFO_FILE &&
+ (req->FileInfoClass == FILE_FULL_EA_INFORMATION ||
+ req->FileInfoClass == FILE_ALL_INFORMATION)) ||
+ req->InfoType == SMB2_O_INFO_SECURITY)
sz = large_sz;
}
@@ -588,10 +589,12 @@ int smb2_check_user_session(struct ksmbd_work *work)
return -EINVAL;
}
-static void destroy_previous_session(struct ksmbd_user *user, u64 id)
+static void destroy_previous_session(struct ksmbd_conn *conn,
+ struct ksmbd_user *user, u64 id)
{
struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id);
struct ksmbd_user *prev_user;
+ struct channel *chann;
if (!prev_sess)
return;
@@ -601,13 +604,14 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id)
if (!prev_user ||
strcmp(user->name, prev_user->name) ||
user->passkey_sz != prev_user->passkey_sz ||
- memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) {
- put_session(prev_sess);
+ memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
return;
- }
- put_session(prev_sess);
- ksmbd_session_destroy(prev_sess);
+ prev_sess->state = SMB2_SESSION_EXPIRED;
+ write_lock(&prev_sess->chann_lock);
+ list_for_each_entry(chann, &prev_sess->ksmbd_chann_list, chann_list)
+ chann->conn->status = KSMBD_SESS_EXITING;
+ write_unlock(&prev_sess->chann_lock);
}
/**
@@ -1139,12 +1143,16 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
status);
rsp->hdr.Status = status;
rc = -EINVAL;
+ kfree(conn->preauth_info);
+ conn->preauth_info = NULL;
goto err_out;
}
rc = init_smb3_11_server(conn);
if (rc < 0) {
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ kfree(conn->preauth_info);
+ conn->preauth_info = NULL;
goto err_out;
}
@@ -1439,7 +1447,7 @@ static int ntlm_authenticate(struct ksmbd_work *work)
/* Check for previous session */
prev_id = le64_to_cpu(req->PreviousSessionId);
if (prev_id && prev_id != sess->id)
- destroy_previous_session(user, prev_id);
+ destroy_previous_session(conn, user, prev_id);
if (sess->state == SMB2_SESSION_VALID) {
/*
@@ -1493,7 +1501,7 @@ static int ntlm_authenticate(struct ksmbd_work *work)
if (smb3_encryption_negotiated(conn) &&
!(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
- rc = conn->ops->generate_encryptionkey(sess);
+ rc = conn->ops->generate_encryptionkey(conn, sess);
if (rc) {
ksmbd_debug(SMB,
"SMB3 encryption key generation failed\n");
@@ -1510,7 +1518,9 @@ static int ntlm_authenticate(struct ksmbd_work *work)
binding_session:
if (conn->dialect >= SMB30_PROT_ID) {
+ read_lock(&sess->chann_lock);
chann = lookup_chann_list(sess, conn);
+ read_unlock(&sess->chann_lock);
if (!chann) {
chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
if (!chann)
@@ -1518,7 +1528,9 @@ binding_session:
chann->conn = conn;
INIT_LIST_HEAD(&chann->chann_list);
+ write_lock(&sess->chann_lock);
list_add(&chann->chann_list, &sess->ksmbd_chann_list);
+ write_unlock(&sess->chann_lock);
}
}
@@ -1561,7 +1573,7 @@ static int krb5_authenticate(struct ksmbd_work *work)
/* Check previous session */
prev_sess_id = le64_to_cpu(req->PreviousSessionId);
if (prev_sess_id && prev_sess_id != sess->id)
- destroy_previous_session(sess->user, prev_sess_id);
+ destroy_previous_session(conn, sess->user, prev_sess_id);
if (sess->state == SMB2_SESSION_VALID)
ksmbd_free_user(sess->user);
@@ -1580,7 +1592,7 @@ static int krb5_authenticate(struct ksmbd_work *work)
sess->sign = true;
if (smb3_encryption_negotiated(conn)) {
- retval = conn->ops->generate_encryptionkey(sess);
+ retval = conn->ops->generate_encryptionkey(conn, sess);
if (retval) {
ksmbd_debug(SMB,
"SMB3 encryption key generation failed\n");
@@ -1592,7 +1604,9 @@ static int krb5_authenticate(struct ksmbd_work *work)
}
if (conn->dialect >= SMB30_PROT_ID) {
+ read_lock(&sess->chann_lock);
chann = lookup_chann_list(sess, conn);
+ read_unlock(&sess->chann_lock);
if (!chann) {
chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
if (!chann)
@@ -1600,7 +1614,9 @@ static int krb5_authenticate(struct ksmbd_work *work)
chann->conn = conn;
INIT_LIST_HEAD(&chann->chann_list);
+ write_lock(&sess->chann_lock);
list_add(&chann->chann_list, &sess->ksmbd_chann_list);
+ write_unlock(&sess->chann_lock);
}
}
@@ -1650,7 +1666,9 @@ int smb2_sess_setup(struct ksmbd_work *work)
goto out_err;
}
rsp->hdr.SessionId = cpu_to_le64(sess->id);
- ksmbd_session_register(conn, sess);
+ rc = ksmbd_session_register(conn, sess);
+ if (rc)
+ goto out_err;
} else if (conn->dialect >= SMB30_PROT_ID &&
(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) &&
req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) {
@@ -1662,7 +1680,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
goto out_err;
}
- if (conn->dialect != sess->conn->dialect) {
+ if (conn->dialect != sess->dialect) {
rc = -EINVAL;
goto out_err;
}
@@ -1672,7 +1690,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
goto out_err;
}
- if (strncmp(conn->ClientGUID, sess->conn->ClientGUID,
+ if (strncmp(conn->ClientGUID, sess->ClientGUID,
SMB2_CLIENT_GUID_SIZE)) {
rc = -ENOENT;
goto out_err;
@@ -1828,6 +1846,7 @@ out_err:
if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION)
try_delay = true;
+ xa_erase(&conn->sessions, sess->id);
ksmbd_session_destroy(sess);
work->sess = NULL;
if (try_delay)
@@ -1873,7 +1892,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n",
name, treename);
- status = ksmbd_tree_conn_connect(sess, name);
+ status = ksmbd_tree_conn_connect(conn, sess, name);
if (status.ret == KSMBD_TREE_CONN_STATUS_OK)
rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id);
else
@@ -1925,8 +1944,10 @@ out_err1:
rsp->hdr.Status = STATUS_SUCCESS;
rc = 0;
break;
+ case -ESTALE:
+ case -ENOENT:
case KSMBD_TREE_CONN_STATUS_NO_SHARE:
- rsp->hdr.Status = STATUS_BAD_NETWORK_PATH;
+ rsp->hdr.Status = STATUS_BAD_NETWORK_NAME;
break;
case -ENOMEM:
case KSMBD_TREE_CONN_STATUS_NOMEM:
@@ -2039,6 +2060,7 @@ int smb2_tree_disconnect(struct ksmbd_work *work)
ksmbd_close_tree_conn_fds(work);
ksmbd_tree_conn_disconnect(sess, tcon);
+ work->tcon = NULL;
return 0;
}
@@ -2308,15 +2330,15 @@ static int smb2_remove_smb_xattrs(struct path *path)
name += strlen(name) + 1) {
ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
- if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
- strncmp(&name[XATTR_USER_PREFIX_LEN], DOS_ATTRIBUTE_PREFIX,
- DOS_ATTRIBUTE_PREFIX_LEN) &&
- strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN))
- continue;
-
- err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, name);
- if (err)
- ksmbd_debug(SMB, "remove xattr failed : %s\n", name);
+ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+ !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX,
+ STREAM_PREFIX_LEN)) {
+ err = ksmbd_vfs_remove_xattr(user_ns, path->dentry,
+ name);
+ if (err)
+ ksmbd_debug(SMB, "remove xattr failed : %s\n",
+ name);
+ }
}
out:
kvfree(xattr_list);
@@ -2969,7 +2991,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
rc = build_sec_desc(user_ns,
- pntsd, NULL,
+ pntsd, NULL, 0,
OWNER_SECINFO |
GROUP_SECINFO |
DACL_SECINFO,
@@ -3022,12 +3044,6 @@ int smb2_open(struct ksmbd_work *work)
list_add(&fp->node, &fp->f_ci->m_fp_list);
write_unlock(&fp->f_ci->m_lock);
- rc = ksmbd_vfs_getattr(&path, &stat);
- if (rc) {
- generic_fillattr(user_ns, d_inode(path.dentry), &stat);
- rc = 0;
- }
-
/* Check delete pending among previous fp before oplock break */
if (ksmbd_inode_pending_delete(fp)) {
rc = -EBUSY;
@@ -3114,6 +3130,10 @@ int smb2_open(struct ksmbd_work *work)
}
}
+ rc = ksmbd_vfs_getattr(&path, &stat);
+ if (rc)
+ goto err_out;
+
if (stat.result_mask & STATX_BTIME)
fp->create_time = ksmbd_UnixTimeToNT(stat.btime);
else
@@ -3129,9 +3149,6 @@ int smb2_open(struct ksmbd_work *work)
memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
- generic_fillattr(user_ns, file_inode(fp->filp),
- &stat);
-
rsp->StructureSize = cpu_to_le16(89);
rcu_read_lock();
opinfo = rcu_dereference(fp->f_opinfo);
@@ -3814,6 +3831,15 @@ static int verify_info_level(int info_level)
return 0;
}
+static int smb2_resp_buf_len(struct ksmbd_work *work, unsigned short hdr2_len)
+{
+ int free_len;
+
+ free_len = (int)(work->response_sz -
+ (get_rfc1002_len(work->response_buf) + 4)) - hdr2_len;
+ return free_len;
+}
+
static int smb2_calc_max_out_buf_len(struct ksmbd_work *work,
unsigned short hdr2_len,
unsigned int out_buf_len)
@@ -3823,9 +3849,7 @@ static int smb2_calc_max_out_buf_len(struct ksmbd_work *work,
if (out_buf_len > work->conn->vals->max_trans_size)
return -EINVAL;
- free_len = (int)(work->response_sz -
- (get_rfc1002_len(work->response_buf) + 4)) -
- hdr2_len;
+ free_len = smb2_resp_buf_len(work, hdr2_len);
if (free_len < 0)
return -EINVAL;
@@ -4858,7 +4882,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
struct smb2_query_info_rsp *rsp)
{
struct ksmbd_session *sess = work->sess;
- struct ksmbd_conn *conn = sess->conn;
+ struct ksmbd_conn *conn = work->conn;
struct ksmbd_share_config *share = work->tcon->share_conf;
int fsinfoclass = 0;
struct kstatfs stfs;
@@ -5088,10 +5112,10 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
struct smb_ntsd *pntsd = (struct smb_ntsd *)rsp->Buffer, *ppntsd = NULL;
struct smb_fattr fattr = {{0}};
struct inode *inode;
- __u32 secdesclen;
+ __u32 secdesclen = 0;
unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
int addition_info = le32_to_cpu(req->AdditionalInformation);
- int rc;
+ int rc = 0, ppntsd_size = 0;
if (addition_info & ~(OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO |
PROTECTED_DACL_SECINFO |
@@ -5137,11 +5161,14 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_ACL_XATTR))
- ksmbd_vfs_get_sd_xattr(work->conn, user_ns,
- fp->filp->f_path.dentry, &ppntsd);
-
- rc = build_sec_desc(user_ns, pntsd, ppntsd, addition_info,
- &secdesclen, &fattr);
+ ppntsd_size = ksmbd_vfs_get_sd_xattr(work->conn, user_ns,
+ fp->filp->f_path.dentry,
+ &ppntsd);
+
+ /* Check if sd buffer size exceeds response buffer size */
+ if (smb2_resp_buf_len(work, 8) > ppntsd_size)
+ rc = build_sec_desc(user_ns, pntsd, ppntsd, ppntsd_size,
+ addition_info, &secdesclen, &fattr);
posix_acl_release(fattr.cf_acls);
posix_acl_release(fattr.cf_dacls);
kfree(ppntsd);
@@ -5776,7 +5803,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
}
next:
return smb2_rename(work, fp, user_ns, rename_info,
- work->sess->conn->local_nls);
+ work->conn->local_nls);
}
static int set_file_disposition_info(struct ksmbd_file *fp,
@@ -5908,7 +5935,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return smb2_create_link(work, work->tcon->share_conf,
(struct smb2_file_link_info *)req->Buffer,
buf_len, fp->filp,
- work->sess->conn->local_nls);
+ work->conn->local_nls);
}
case FILE_DISPOSITION_INFORMATION:
{
@@ -6495,14 +6522,12 @@ int smb2_write(struct ksmbd_work *work)
writethrough = true;
if (is_rdma_channel == false) {
- if ((u64)le16_to_cpu(req->DataOffset) + length >
- get_rfc1002_len(work->request_buf)) {
- pr_err("invalid write data offset %u, smb_len %u\n",
- le16_to_cpu(req->DataOffset),
- get_rfc1002_len(work->request_buf));
+ if (le16_to_cpu(req->DataOffset) <
+ offsetof(struct smb2_write_req, Buffer)) {
err = -EINVAL;
goto out;
}
+
data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
le16_to_cpu(req->DataOffset));
@@ -8356,10 +8381,14 @@ int smb3_check_sign_req(struct ksmbd_work *work)
if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
signing_key = work->sess->smb3signingkey;
} else {
+ read_lock(&work->sess->chann_lock);
chann = lookup_chann_list(work->sess, conn);
- if (!chann)
+ if (!chann) {
+ read_unlock(&work->sess->chann_lock);
return 0;
+ }
signing_key = chann->smb3signingkey;
+ read_unlock(&work->sess->chann_lock);
}
if (!signing_key) {
@@ -8419,10 +8448,14 @@ void smb3_set_sign_rsp(struct ksmbd_work *work)
le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
signing_key = work->sess->smb3signingkey;
} else {
+ read_lock(&work->sess->chann_lock);
chann = lookup_chann_list(work->sess, work->conn);
- if (!chann)
+ if (!chann) {
+ read_unlock(&work->sess->chann_lock);
return;
+ }
signing_key = chann->smb3signingkey;
+ read_unlock(&work->sess->chann_lock);
}
if (!signing_key)
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index e1369b4345a9..318c16fa81da 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -421,7 +421,7 @@ struct smb_version_ops {
int (*check_sign_req)(struct ksmbd_work *work);
void (*set_sign_rsp)(struct ksmbd_work *work);
int (*generate_signingkey)(struct ksmbd_session *sess, struct ksmbd_conn *conn);
- int (*generate_encryptionkey)(struct ksmbd_session *sess);
+ int (*generate_encryptionkey)(struct ksmbd_conn *conn, struct ksmbd_session *sess);
bool (*is_transform_hdr)(void *buf);
int (*decrypt_req)(struct ksmbd_work *work);
int (*encrypt_resp)(struct ksmbd_work *work);
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index 38f23bf981ac..3781bca2c8fc 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -690,6 +690,7 @@ posix_default_acl:
static void set_ntacl_dacl(struct user_namespace *user_ns,
struct smb_acl *pndacl,
struct smb_acl *nt_dacl,
+ unsigned int aces_size,
const struct smb_sid *pownersid,
const struct smb_sid *pgrpsid,
struct smb_fattr *fattr)
@@ -703,9 +704,19 @@ static void set_ntacl_dacl(struct user_namespace *user_ns,
if (nt_num_aces) {
ntace = (struct smb_ace *)((char *)nt_dacl + sizeof(struct smb_acl));
for (i = 0; i < nt_num_aces; i++) {
- memcpy((char *)pndace + size, ntace, le16_to_cpu(ntace->size));
- size += le16_to_cpu(ntace->size);
- ntace = (struct smb_ace *)((char *)ntace + le16_to_cpu(ntace->size));
+ unsigned short nt_ace_size;
+
+ if (offsetof(struct smb_ace, access_req) > aces_size)
+ break;
+
+ nt_ace_size = le16_to_cpu(ntace->size);
+ if (nt_ace_size > aces_size)
+ break;
+
+ memcpy((char *)pndace + size, ntace, nt_ace_size);
+ size += nt_ace_size;
+ aces_size -= nt_ace_size;
+ ntace = (struct smb_ace *)((char *)ntace + nt_ace_size);
num_aces++;
}
}
@@ -878,7 +889,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
/* Convert permission bits from mode to equivalent CIFS ACL */
int build_sec_desc(struct user_namespace *user_ns,
struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd,
- int addition_info, __u32 *secdesclen,
+ int ppntsd_size, int addition_info, __u32 *secdesclen,
struct smb_fattr *fattr)
{
int rc = 0;
@@ -938,15 +949,25 @@ int build_sec_desc(struct user_namespace *user_ns,
if (!ppntsd) {
set_mode_dacl(user_ns, dacl_ptr, fattr);
- } else if (!ppntsd->dacloffset) {
- goto out;
} else {
struct smb_acl *ppdacl_ptr;
+ unsigned int dacl_offset = le32_to_cpu(ppntsd->dacloffset);
+ int ppdacl_size, ntacl_size = ppntsd_size - dacl_offset;
+
+ if (!dacl_offset ||
+ (dacl_offset + sizeof(struct smb_acl) > ppntsd_size))
+ goto out;
+
+ ppdacl_ptr = (struct smb_acl *)((char *)ppntsd + dacl_offset);
+ ppdacl_size = le16_to_cpu(ppdacl_ptr->size);
+ if (ppdacl_size > ntacl_size ||
+ ppdacl_size < sizeof(struct smb_acl))
+ goto out;
- ppdacl_ptr = (struct smb_acl *)((char *)ppntsd +
- le32_to_cpu(ppntsd->dacloffset));
set_ntacl_dacl(user_ns, dacl_ptr, ppdacl_ptr,
- nowner_sid_ptr, ngroup_sid_ptr, fattr);
+ ntacl_size - sizeof(struct smb_acl),
+ nowner_sid_ptr, ngroup_sid_ptr,
+ fattr);
}
pntsd->dacloffset = cpu_to_le32(offset);
offset += le16_to_cpu(dacl_ptr->size);
@@ -980,24 +1001,31 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
struct smb_sid owner_sid, group_sid;
struct dentry *parent = path->dentry->d_parent;
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
- int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0;
- int rc = 0, num_aces, dacloffset, pntsd_type, acl_len;
+ int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0, pdacl_size;
+ int rc = 0, num_aces, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size;
char *aces_base;
bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode);
- acl_len = ksmbd_vfs_get_sd_xattr(conn, user_ns,
- parent, &parent_pntsd);
- if (acl_len <= 0)
+ pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns,
+ parent, &parent_pntsd);
+ if (pntsd_size <= 0)
return -ENOENT;
dacloffset = le32_to_cpu(parent_pntsd->dacloffset);
- if (!dacloffset) {
+ if (!dacloffset || (dacloffset + sizeof(struct smb_acl) > pntsd_size)) {
rc = -EINVAL;
goto free_parent_pntsd;
}
parent_pdacl = (struct smb_acl *)((char *)parent_pntsd + dacloffset);
+ acl_len = pntsd_size - dacloffset;
num_aces = le32_to_cpu(parent_pdacl->num_aces);
pntsd_type = le16_to_cpu(parent_pntsd->type);
+ pdacl_size = le16_to_cpu(parent_pdacl->size);
+
+ if (pdacl_size > acl_len || pdacl_size < sizeof(struct smb_acl)) {
+ rc = -EINVAL;
+ goto free_parent_pntsd;
+ }
aces_base = kmalloc(sizeof(struct smb_ace) * num_aces * 2, GFP_KERNEL);
if (!aces_base) {
@@ -1008,11 +1036,23 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
aces = (struct smb_ace *)aces_base;
parent_aces = (struct smb_ace *)((char *)parent_pdacl +
sizeof(struct smb_acl));
+ aces_size = acl_len - sizeof(struct smb_acl);
if (pntsd_type & DACL_AUTO_INHERITED)
inherited_flags = INHERITED_ACE;
for (i = 0; i < num_aces; i++) {
+ int pace_size;
+
+ if (offsetof(struct smb_ace, access_req) > aces_size)
+ break;
+
+ pace_size = le16_to_cpu(parent_aces->size);
+ if (pace_size > aces_size)
+ break;
+
+ aces_size -= pace_size;
+
flags = parent_aces->flags;
if (!smb_inherit_flags(flags, is_dir))
goto pass;
@@ -1057,8 +1097,7 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size));
ace_cnt++;
pass:
- parent_aces =
- (struct smb_ace *)((char *)parent_aces + le16_to_cpu(parent_aces->size));
+ parent_aces = (struct smb_ace *)((char *)parent_aces + pace_size);
}
if (nt_size > 0) {
@@ -1153,7 +1192,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
struct smb_ntsd *pntsd = NULL;
struct smb_acl *pdacl;
struct posix_acl *posix_acls;
- int rc = 0, acl_size;
+ int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size, dacl_offset;
struct smb_sid sid;
int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE);
struct smb_ace *ace;
@@ -1162,37 +1201,33 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
struct smb_ace *others_ace = NULL;
struct posix_acl_entry *pa_entry;
unsigned int sid_type = SIDOWNER;
- char *end_of_acl;
+ unsigned short ace_size;
ksmbd_debug(SMB, "check permission using windows acl\n");
- acl_size = ksmbd_vfs_get_sd_xattr(conn, user_ns,
- path->dentry, &pntsd);
- if (acl_size <= 0 || !pntsd || !pntsd->dacloffset) {
- kfree(pntsd);
- return 0;
- }
+ pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns,
+ path->dentry, &pntsd);
+ if (pntsd_size <= 0 || !pntsd)
+ goto err_out;
+
+ dacl_offset = le32_to_cpu(pntsd->dacloffset);
+ if (!dacl_offset ||
+ (dacl_offset + sizeof(struct smb_acl) > pntsd_size))
+ goto err_out;
pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset));
- end_of_acl = ((char *)pntsd) + acl_size;
- if (end_of_acl <= (char *)pdacl) {
- kfree(pntsd);
- return 0;
- }
+ acl_size = pntsd_size - dacl_offset;
+ pdacl_size = le16_to_cpu(pdacl->size);
- if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size) ||
- le16_to_cpu(pdacl->size) < sizeof(struct smb_acl)) {
- kfree(pntsd);
- return 0;
- }
+ if (pdacl_size > acl_size || pdacl_size < sizeof(struct smb_acl))
+ goto err_out;
if (!pdacl->num_aces) {
- if (!(le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) &&
+ if (!(pdacl_size - sizeof(struct smb_acl)) &&
*pdaccess & ~(FILE_READ_CONTROL_LE | FILE_WRITE_DAC_LE)) {
rc = -EACCES;
goto err_out;
}
- kfree(pntsd);
- return 0;
+ goto err_out;
}
if (*pdaccess & FILE_MAXIMAL_ACCESS_LE) {
@@ -1200,11 +1235,16 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
DELETE;
ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
+ aces_size = acl_size - sizeof(struct smb_acl);
for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) {
+ if (offsetof(struct smb_ace, access_req) > aces_size)
+ break;
+ ace_size = le16_to_cpu(ace->size);
+ if (ace_size > aces_size)
+ break;
+ aces_size -= ace_size;
granted |= le32_to_cpu(ace->access_req);
ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size));
- if (end_of_acl < (char *)ace)
- goto err_out;
}
if (!pdacl->num_aces)
@@ -1216,7 +1256,15 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
id_to_sid(uid, sid_type, &sid);
ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
+ aces_size = acl_size - sizeof(struct smb_acl);
for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) {
+ if (offsetof(struct smb_ace, access_req) > aces_size)
+ break;
+ ace_size = le16_to_cpu(ace->size);
+ if (ace_size > aces_size)
+ break;
+ aces_size -= ace_size;
+
if (!compare_sids(&sid, &ace->sid) ||
!compare_sids(&sid_unix_NFS_mode, &ace->sid)) {
found = 1;
@@ -1226,8 +1274,6 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
others_ace = ace;
ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size));
- if (end_of_acl < (char *)ace)
- goto err_out;
}
if (*pdaccess & FILE_MAXIMAL_ACCESS_LE && found) {
diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h
index 811af3309429..fcb2c83f2992 100644
--- a/fs/ksmbd/smbacl.h
+++ b/fs/ksmbd/smbacl.h
@@ -193,7 +193,7 @@ struct posix_acl_state {
int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
int acl_len, struct smb_fattr *fattr);
int build_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
- struct smb_ntsd *ppntsd, int addition_info,
+ struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info,
__u32 *secdesclen, struct smb_fattr *fattr);
int init_acl_state(struct posix_acl_state *state, int cnt);
void free_acl_state(struct posix_acl_state *state);
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 7c849024999f..78d01033604c 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -481,12 +481,11 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
char *buf, size_t count, loff_t *pos, bool sync,
ssize_t *written)
{
- struct ksmbd_session *sess = work->sess;
struct file *filp;
loff_t offset = *pos;
int err = 0;
- if (sess->conn->connection_type) {
+ if (work->conn->connection_type) {
if (!(fp->daccess & FILE_WRITE_DATA_LE)) {
pr_err("no right to write(%pd)\n",
fp->filp->f_path.dentry);
@@ -1540,6 +1539,11 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
}
*pntsd = acl.sd_buf;
+ if (acl.sd_size < sizeof(struct smb_ntsd)) {
+ pr_err("sd size is invalid\n");
+ goto out_free;
+ }
+
(*pntsd)->osidoffset = cpu_to_le32(le32_to_cpu((*pntsd)->osidoffset) -
NDR_NTSD_OFFSETOF);
(*pntsd)->gsidoffset = cpu_to_le32(le32_to_cpu((*pntsd)->gsidoffset) -
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index c4d59d2735f0..da9163b00350 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -569,7 +569,7 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp)
atomic_set(&fp->refcount, 1);
fp->filp = filp;
- fp->conn = work->sess->conn;
+ fp->conn = work->conn;
fp->tcon = work->tcon;
fp->volatile_id = KSMBD_NO_FID;
fp->persistent_id = KSMBD_NO_FID;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 176b468a61c7..bf274f23969b 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -32,6 +32,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
if (!nlmsvc_ops)
return nlm_lck_denied_nolocks;
+ if (lock->lock_start > OFFSET_MAX ||
+ (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start))))
+ return nlm4_fbig;
+
/* Obtain host handle */
if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
|| (argp->monitor && nsm_monitor(host) < 0))
@@ -50,6 +54,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Set up the missing parts of the file_lock structure */
lock->fl.fl_file = file->f_file[mode];
lock->fl.fl_pid = current->tgid;
+ lock->fl.fl_start = (loff_t)lock->lock_start;
+ lock->fl.fl_end = lock->lock_len ?
+ (loff_t)(lock->lock_start + lock->lock_len - 1) :
+ OFFSET_MAX;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
if (!lock->fl.fl_owner) {
@@ -87,6 +95,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_host *host;
struct nlm_file *file;
+ struct nlm_lockowner *test_owner;
__be32 rc = rpc_success;
dprintk("lockd: TEST4 called\n");
@@ -96,6 +105,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+ test_owner = argp->lock.fl.fl_owner;
/* Now check for conflicting locks */
resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
if (resp->status == nlm_drop_reply)
@@ -103,7 +113,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
else
dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
- nlmsvc_release_lockowner(&argp->lock);
+ nlmsvc_put_lockowner(test_owner);
nlmsvc_release_host(host);
nlm_release_file(file);
return rc;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cb3658ab9b7a..9c1aa75441e1 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -340,7 +340,7 @@ nlmsvc_get_lockowner(struct nlm_lockowner *lockowner)
return lockowner;
}
-static void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
+void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
{
if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
return;
@@ -590,7 +590,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
int error;
int mode;
__be32 ret;
- struct nlm_lockowner *test_owner;
dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
nlmsvc_file_inode(file)->i_sb->s_id,
@@ -604,9 +603,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
- /* If there's a conflicting lock, remember to clean up the test lock */
- test_owner = (struct nlm_lockowner *)lock->fl.fl_owner;
-
mode = lock_to_openmode(&lock->fl);
error = vfs_test_lock(file->f_file[mode], &lock->fl);
if (error) {
@@ -635,10 +631,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
conflock->fl.fl_end = lock->fl.fl_end;
locks_release_private(&lock->fl);
- /* Clean up the test lock */
- lock->fl.fl_owner = NULL;
- nlmsvc_put_lockowner(test_owner);
-
ret = nlm_lck_denied;
out:
return ret;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 4dc1b40a489a..b09ca35b527c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -116,6 +116,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_host *host;
struct nlm_file *file;
+ struct nlm_lockowner *test_owner;
__be32 rc = rpc_success;
dprintk("lockd: TEST called\n");
@@ -125,6 +126,8 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+ test_owner = argp->lock.fl.fl_owner;
+
/* Now check for conflicting locks */
resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
if (resp->status == nlm_drop_reply)
@@ -133,7 +136,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
dprintk("lockd: TEST status %d vers %d\n",
ntohl(resp->status), rqstp->rq_vers);
- nlmsvc_release_lockowner(&argp->lock);
+ nlmsvc_put_lockowner(test_owner);
nlmsvc_release_host(host);
nlm_release_file(file);
return rc;
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 856267c0864b..712fdfeb8ef0 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -20,13 +20,6 @@
#include "svcxdr.h"
-static inline loff_t
-s64_to_loff_t(__s64 offset)
-{
- return (loff_t)offset;
-}
-
-
static inline s64
loff_t_to_s64(loff_t offset)
{
@@ -70,8 +63,6 @@ static bool
svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
{
struct file_lock *fl = &lock->fl;
- u64 len, start;
- s64 end;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
return false;
@@ -81,20 +72,14 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
return false;
if (xdr_stream_decode_u32(xdr, &lock->svid) < 0)
return false;
- if (xdr_stream_decode_u64(xdr, &start) < 0)
+ if (xdr_stream_decode_u64(xdr, &lock->lock_start) < 0)
return false;
- if (xdr_stream_decode_u64(xdr, &len) < 0)
+ if (xdr_stream_decode_u64(xdr, &lock->lock_len) < 0)
return false;
locks_init_lock(fl);
fl->fl_flags = FL_POSIX;
fl->fl_type = F_RDLCK;
- end = start + len - 1;
- fl->fl_start = s64_to_loff_t(start);
- if (len == 0 || end < 0)
- fl->fl_end = OFFSET_MAX;
- else
- fl->fl_end = s64_to_loff_t(end);
return true;
}
diff --git a/fs/locks.c b/fs/locks.c
index c266cfdc3291..607f94a0e789 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2129,6 +2129,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
else
error = locks_lock_file_wait(f.file, &fl);
+ locks_release_private(&fl);
out_putf:
fdput(f);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 97c54d3a2227..47ccfcbe0a22 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -11,7 +11,7 @@
/*
* Mbcache is a simple key-value store. Keys need not be unique, however
* key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete()).
+ * mb_cache_entry_delete_or_get()).
*
* Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
* Ext4 also uses it for deduplication of xattr values stored in inodes.
@@ -90,7 +90,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
return -ENOMEM;
INIT_LIST_HEAD(&entry->e_list);
- /* One ref for hash, one ref returned */
+ /* Initial hash reference */
atomic_set(&entry->e_refcnt, 1);
entry->e_key = key;
entry->e_value = value;
@@ -106,25 +106,45 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
}
}
hlist_bl_add_head(&entry->e_hash_list, head);
- hlist_bl_unlock(head);
-
+ /*
+ * Add entry to LRU list before it can be found by
+ * mb_cache_entry_delete() to avoid races
+ */
spin_lock(&cache->c_list_lock);
list_add_tail(&entry->e_list, &cache->c_list);
- /* Grab ref for LRU list */
- atomic_inc(&entry->e_refcnt);
cache->c_entry_count++;
spin_unlock(&cache->c_list_lock);
+ hlist_bl_unlock(head);
return 0;
}
EXPORT_SYMBOL(mb_cache_entry_create);
-void __mb_cache_entry_free(struct mb_cache_entry *entry)
+void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry)
{
+ struct hlist_bl_head *head;
+
+ head = mb_cache_entry_head(cache, entry->e_key);
+ hlist_bl_lock(head);
+ hlist_bl_del(&entry->e_hash_list);
+ hlist_bl_unlock(head);
kmem_cache_free(mb_entry_cache, entry);
}
EXPORT_SYMBOL(__mb_cache_entry_free);
+/*
+ * mb_cache_entry_wait_unused - wait to be the last user of the entry
+ *
+ * @entry - entry to work on
+ *
+ * Wait to be the last user of the entry.
+ */
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
+{
+ wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2);
+}
+EXPORT_SYMBOL(mb_cache_entry_wait_unused);
+
static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
struct mb_cache_entry *entry,
u32 key)
@@ -142,10 +162,9 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
while (node) {
entry = hlist_bl_entry(node, struct mb_cache_entry,
e_hash_list);
- if (entry->e_key == key && entry->e_reusable) {
- atomic_inc(&entry->e_refcnt);
+ if (entry->e_key == key && entry->e_reusable &&
+ atomic_inc_not_zero(&entry->e_refcnt))
goto out;
- }
node = node->next;
}
entry = NULL;
@@ -205,10 +224,9 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
- if (entry->e_key == key && entry->e_value == value) {
- atomic_inc(&entry->e_refcnt);
+ if (entry->e_key == key && entry->e_value == value &&
+ atomic_inc_not_zero(&entry->e_refcnt))
goto out;
- }
}
entry = NULL;
out:
@@ -217,42 +235,42 @@ out:
}
EXPORT_SYMBOL(mb_cache_entry_get);
-/* mb_cache_entry_delete - remove a cache entry
+/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
* @cache - cache we work with
* @key - key
* @value - value
*
- * Remove entry from cache @cache with key @key and value @value.
+ * Remove entry from cache @cache with key @key and value @value. The removal
+ * happens only if the entry is unused. The function returns NULL in case the
+ * entry was successfully removed or there's no entry in cache. Otherwise the
+ * function grabs reference of the entry that we failed to delete because it
+ * still has users and return it.
*/
-void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+ u32 key, u64 value)
{
- struct hlist_bl_node *node;
- struct hlist_bl_head *head;
struct mb_cache_entry *entry;
- head = mb_cache_entry_head(cache, key);
- hlist_bl_lock(head);
- hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
- if (entry->e_key == key && entry->e_value == value) {
- /* We keep hash list reference to keep entry alive */
- hlist_bl_del_init(&entry->e_hash_list);
- hlist_bl_unlock(head);
- spin_lock(&cache->c_list_lock);
- if (!list_empty(&entry->e_list)) {
- list_del_init(&entry->e_list);
- if (!WARN_ONCE(cache->c_entry_count == 0,
- "mbcache: attempt to decrement c_entry_count past zero"))
- cache->c_entry_count--;
- atomic_dec(&entry->e_refcnt);
- }
- spin_unlock(&cache->c_list_lock);
- mb_cache_entry_put(cache, entry);
- return;
- }
- }
- hlist_bl_unlock(head);
+ entry = mb_cache_entry_get(cache, key, value);
+ if (!entry)
+ return NULL;
+
+ /*
+ * Drop the ref we got from mb_cache_entry_get() and the initial hash
+ * ref if we are the last user
+ */
+ if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2)
+ return entry;
+
+ spin_lock(&cache->c_list_lock);
+ if (!list_empty(&entry->e_list))
+ list_del_init(&entry->e_list);
+ cache->c_entry_count--;
+ spin_unlock(&cache->c_list_lock);
+ __mb_cache_entry_free(cache, entry);
+ return NULL;
}
-EXPORT_SYMBOL(mb_cache_entry_delete);
+EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
/* mb_cache_entry_touch - cache entry got used
* @cache - cache the entry belongs to
@@ -281,34 +299,24 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
unsigned long nr_to_scan)
{
struct mb_cache_entry *entry;
- struct hlist_bl_head *head;
unsigned long shrunk = 0;
spin_lock(&cache->c_list_lock);
while (nr_to_scan-- && !list_empty(&cache->c_list)) {
entry = list_first_entry(&cache->c_list,
struct mb_cache_entry, e_list);
- if (entry->e_referenced) {
+ /* Drop initial hash reference if there is no user */
+ if (entry->e_referenced ||
+ atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
entry->e_referenced = 0;
list_move_tail(&entry->e_list, &cache->c_list);
continue;
}
list_del_init(&entry->e_list);
cache->c_entry_count--;
- /*
- * We keep LRU list reference so that entry doesn't go away
- * from under us.
- */
spin_unlock(&cache->c_list_lock);
- head = mb_cache_entry_head(cache, entry->e_key);
- hlist_bl_lock(head);
- if (!hlist_bl_unhashed(&entry->e_hash_list)) {
- hlist_bl_del_init(&entry->e_hash_list);
- atomic_dec(&entry->e_refcnt);
- }
- hlist_bl_unlock(head);
- if (mb_cache_entry_put(cache, entry))
- shrunk++;
+ __mb_cache_entry_free(cache, entry);
+ shrunk++;
cond_resched();
spin_lock(&cache->c_list_lock);
}
@@ -367,7 +375,7 @@ struct mb_cache *mb_cache_create(int bucket_bits)
cache->c_shrink.count_objects = mb_cache_count;
cache->c_shrink.scan_objects = mb_cache_scan;
cache->c_shrink.seeks = DEFAULT_SEEKS;
- if (register_shrinker(&cache->c_shrink)) {
+ if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
kfree(cache->c_hash);
kfree(cache);
goto err_out;
@@ -400,11 +408,6 @@ void mb_cache_destroy(struct mb_cache *cache)
* point.
*/
list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
- if (!hlist_bl_unhashed(&entry->e_hash_list)) {
- hlist_bl_del_init(&entry->e_hash_list);
- atomic_dec(&entry->e_refcnt);
- } else
- WARN_ON(1);
list_del(&entry->e_list);
WARN_ON(atomic_read(&entry->e_refcnt) != 1);
mb_cache_entry_put(cache, entry);
diff --git a/fs/namei.c b/fs/namei.c
index ed3ffd9b22a3..53b4bc094db2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3024,6 +3024,65 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
EXPORT_SYMBOL(unlock_rename);
/**
+ * mode_strip_umask - handle vfs umask stripping
+ * @dir: parent directory of the new inode
+ * @mode: mode of the new inode to be created in @dir
+ *
+ * Umask stripping depends on whether or not the filesystem supports POSIX
+ * ACLs. If the filesystem doesn't support it umask stripping is done directly
+ * in here. If the filesystem does support POSIX ACLs umask stripping is
+ * deferred until the filesystem calls posix_acl_create().
+ *
+ * Returns: mode
+ */
+static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
+{
+ if (!IS_POSIXACL(dir))
+ mode &= ~current_umask();
+ return mode;
+}
+
+/**
+ * vfs_prepare_mode - prepare the mode to be used for a new inode
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dir: parent directory of the new inode
+ * @mode: mode of the new inode
+ * @mask_perms: allowed permission by the vfs
+ * @type: type of file to be created
+ *
+ * This helper consolidates and enforces vfs restrictions on the @mode of a new
+ * object to be created.
+ *
+ * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
+ * the kernel documentation for mode_strip_umask()). Moving umask stripping
+ * after setgid stripping allows the same ordering for both non-POSIX ACL and
+ * POSIX ACL supporting filesystems.
+ *
+ * Note that it's currently valid for @type to be 0 if a directory is created.
+ * Filesystems raise that flag individually and we need to check whether each
+ * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
+ * non-zero type.
+ *
+ * Returns: mode to be passed to the filesystem
+ */
+static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns,
+ const struct inode *dir, umode_t mode,
+ umode_t mask_perms, umode_t type)
+{
+ mode = mode_strip_sgid(mnt_userns, dir, mode);
+ mode = mode_strip_umask(dir, mode);
+
+ /*
+ * Apply the vfs mandated allowed permission mask and set the type of
+ * file to be created before we call into the filesystem.
+ */
+ mode &= (mask_perms & ~S_IFMT);
+ mode |= (type & S_IFMT);
+
+ return mode;
+}
+
+/**
* vfs_create - create new file
* @mnt_userns: user namespace of the mount the inode was found from
* @dir: inode of @dentry
@@ -3048,8 +3107,8 @@ int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (!dir->i_op->create)
return -EACCES; /* shouldn't it be ENOSYS? */
- mode &= S_IALLUGO;
- mode |= S_IFREG;
+
+ mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IALLUGO, S_IFREG);
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
@@ -3312,8 +3371,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
if (open_flag & O_CREAT) {
if (open_flag & O_EXCL)
open_flag &= ~O_TRUNC;
- if (!IS_POSIXACL(dir->d_inode))
- mode &= ~current_umask();
+ mode = vfs_prepare_mode(mnt_userns, dir->d_inode, mode, mode, mode);
if (likely(got_write))
create_error = may_o_create(mnt_userns, &nd->path,
dentry, mode);
@@ -3544,6 +3602,7 @@ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
child = d_alloc(dentry, &slash_name);
if (unlikely(!child))
goto out_err;
+ mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode);
error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
if (error)
goto out_err;
@@ -3821,6 +3880,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (!dir->i_op->mknod)
return -EPERM;
+ mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode);
error = devcgroup_inode_mknod(mode, dev);
if (error)
return error;
@@ -3871,9 +3931,8 @@ retry:
if (IS_ERR(dentry))
goto out1;
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
- error = security_path_mknod(&path, dentry, mode, dev);
+ error = security_path_mknod(&path, dentry,
+ mode_strip_umask(path.dentry->d_inode, mode), dev);
if (error)
goto out2;
@@ -3943,7 +4002,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (!dir->i_op->mkdir)
return -EPERM;
- mode &= (S_IRWXUGO|S_ISVTX);
+ mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IRWXUGO | S_ISVTX, 0);
error = security_inode_mkdir(dir, dentry, mode);
if (error)
return error;
@@ -3971,9 +4030,8 @@ retry:
if (IS_ERR(dentry))
goto out_putname;
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
- error = security_path_mkdir(&path, dentry, mode);
+ error = security_path_mkdir(&path, dentry,
+ mode_strip_umask(path.dentry->d_inode, mode));
if (!error) {
struct user_namespace *mnt_userns;
mnt_userns = mnt_user_ns(path.mnt);
diff --git a/fs/namespace.c b/fs/namespace.c
index 68789f896f08..df137ba19d37 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4238,6 +4238,13 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
err = -EPERM;
goto out_fput;
}
+
+ /* We're not controlling the target namespace. */
+ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out_fput;
+ }
+
kattr->mnt_userns = get_user_ns(mnt_userns);
out_fput:
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 5e56da748b2a..fea5f8821da5 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -301,18 +301,14 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
-/*
- * Try to open the udev path for the WWN. At least on Debian the udev
- * by-id path will always point to the dm-multipath device if one exists.
- */
static struct block_device *
-bl_open_udev_path(struct pnfs_block_volume *v)
+bl_open_path(struct pnfs_block_volume *v, const char *prefix)
{
struct block_device *bdev;
const char *devname;
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
- v->scsi.designator_len, v->scsi.designator);
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
+ prefix, v->scsi.designator_len, v->scsi.designator);
if (!devname)
return ERR_PTR(-ENOMEM);
@@ -326,28 +322,6 @@ bl_open_udev_path(struct pnfs_block_volume *v)
return bdev;
}
-/*
- * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the
- * wwn- links will only point to the first discovered SCSI device there.
- */
-static struct block_device *
-bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
-{
- struct block_device *bdev;
- const char *devname;
-
- devname = kasprintf(GFP_KERNEL,
- "/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
- v->scsi.designator_type,
- v->scsi.designator_len, v->scsi.designator);
- if (!devname)
- return ERR_PTR(-ENOMEM);
-
- bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
- kfree(devname);
- return bdev;
-}
-
static int
bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
@@ -360,9 +334,15 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
if (!bl_validate_designator(v))
return -EINVAL;
- bdev = bl_open_dm_mpath_udev_path(v);
+ /*
+ * Try to open the RH/Fedora specific dm-mpath udev path first, as the
+ * wwn- links will only point to the first discovered SCSI device there.
+ * On other distributions like Debian, the default SCSI by-id path will
+ * point to the dm-multipath device if one exists.
+ */
+ bdev = bl_open_path(v, "dm-uuid-mpath-0x");
if (IS_ERR(bdev))
- bdev = bl_open_udev_path(v);
+ bdev = bl_open_path(v, "wwn-0x");
if (IS_ERR(bdev))
return PTR_ERR(bdev);
d->bdev = bdev;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e828504cc396..da8da5cdbbc1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -708,9 +708,9 @@ static int nfs_init_server(struct nfs_server *server,
}
if (ctx->rsize)
- server->rsize = nfs_block_size(ctx->rsize, NULL);
+ server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto);
if (ctx->wsize)
- server->wsize = nfs_block_size(ctx->wsize, NULL);
+ server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto);
server->acregmin = ctx->acregmin * HZ;
server->acregmax = ctx->acregmax * HZ;
@@ -755,18 +755,19 @@ error:
static void nfs_server_set_fsinfo(struct nfs_server *server,
struct nfs_fsinfo *fsinfo)
{
+ struct nfs_client *clp = server->nfs_client;
unsigned long max_rpc_payload, raw_max_rpc_payload;
/* Work out a lot of parameters */
if (server->rsize == 0)
- server->rsize = nfs_block_size(fsinfo->rtpref, NULL);
+ server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto);
if (server->wsize == 0)
- server->wsize = nfs_block_size(fsinfo->wtpref, NULL);
+ server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto);
if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
- server->rsize = nfs_block_size(fsinfo->rtmax, NULL);
+ server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto);
if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
- server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
+ server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto);
raw_max_rpc_payload = rpc_max_payload(server->client);
max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 0c4e8dd6aa96..5d6c2ddc7ea6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1084,7 +1084,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
struct nfs_cache_array *array;
unsigned int i;
- array = kmap(desc->page);
+ array = kmap_local_page(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
struct nfs_cache_array_entry *ent;
@@ -1110,7 +1110,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
if (array->page_is_eof)
desc->eof = !desc->eob;
- kunmap(desc->page);
+ kunmap_local(array);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n",
(unsigned long long)desc->dir_cookie);
}
@@ -1739,6 +1739,10 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
goto out_bad;
}
+ if ((flags & LOOKUP_RENAME_TARGET) && d_count(dentry) < 2 &&
+ nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ goto out_bad;
+
if (nfs_verifier_is_delegated(dentry))
return nfs_lookup_revalidate_delegated(dir, dentry, inode);
@@ -1778,6 +1782,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
int ret;
if (flags & LOOKUP_RCU) {
+ if (dentry->d_fsdata == NFS_FSDATA_BLOCKED)
+ return -ECHILD;
parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
@@ -1786,6 +1792,9 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
} else {
+ /* Wait for unlink to complete */
+ wait_var_event(&dentry->d_fsdata,
+ dentry->d_fsdata != NFS_FSDATA_BLOCKED);
parent = dget_parent(dentry);
ret = reval(d_inode(parent), dentry, flags);
dput(parent);
@@ -2373,7 +2382,8 @@ static void nfs_dentry_remove_handle_error(struct inode *dir,
{
switch (error) {
case -ENOENT:
- d_delete(dentry);
+ if (d_really_is_positive(dentry))
+ d_delete(dentry);
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
break;
case 0:
@@ -2454,7 +2464,6 @@ out:
int nfs_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
- int need_rehash = 0;
dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
dir->i_ino, dentry);
@@ -2469,15 +2478,27 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
error = nfs_sillyrename(dir, dentry);
goto out;
}
- if (!d_unhashed(dentry)) {
- __d_drop(dentry);
- need_rehash = 1;
+ /* We must prevent any concurrent open until the unlink
+ * completes. ->d_revalidate will wait for ->d_fsdata
+ * to clear. We set it here to ensure no lookup succeeds until
+ * the unlink is complete on the server.
+ */
+ error = -ETXTBSY;
+ if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+ WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) {
+ spin_unlock(&dentry->d_lock);
+ goto out;
}
+ if (dentry->d_fsdata)
+ /* old devname */
+ kfree(dentry->d_fsdata);
+ dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+
spin_unlock(&dentry->d_lock);
error = nfs_safe_remove(dentry);
nfs_dentry_remove_handle_error(dir, dentry, error);
- if (need_rehash)
- d_rehash(dentry);
+ dentry->d_fsdata = NULL;
+ wake_up_var(&dentry->d_fsdata);
out:
trace_nfs_unlink_exit(dir, dentry, error);
return error;
@@ -2584,6 +2605,15 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(nfs_link);
+static void
+nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ struct dentry *new_dentry = data->new_dentry;
+
+ new_dentry->d_fsdata = NULL;
+ wake_up_var(&new_dentry->d_fsdata);
+}
+
/*
* RENAME
* FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2614,8 +2644,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct dentry *dentry = NULL, *rehash = NULL;
+ struct dentry *dentry = NULL;
struct rpc_task *task;
+ bool must_unblock = false;
int error = -EBUSY;
if (flags)
@@ -2633,18 +2664,27 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
* the new target.
*/
if (new_inode && !S_ISDIR(new_inode->i_mode)) {
- /*
- * To prevent any new references to the target during the
- * rename, we unhash the dentry in advance.
+ /* We must prevent any concurrent open until the unlink
+ * completes. ->d_revalidate will wait for ->d_fsdata
+ * to clear. We set it here to ensure no lookup succeeds until
+ * the unlink is complete on the server.
*/
- if (!d_unhashed(new_dentry)) {
- d_drop(new_dentry);
- rehash = new_dentry;
+ error = -ETXTBSY;
+ if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+ WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+ goto out;
+ if (new_dentry->d_fsdata) {
+ /* old devname */
+ kfree(new_dentry->d_fsdata);
+ new_dentry->d_fsdata = NULL;
}
+ spin_lock(&new_dentry->d_lock);
if (d_count(new_dentry) > 2) {
int err;
+ spin_unlock(&new_dentry->d_lock);
+
/* copy the target dentry's name */
dentry = d_alloc(new_dentry->d_parent,
&new_dentry->d_name);
@@ -2657,14 +2697,19 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
goto out;
new_dentry = dentry;
- rehash = NULL;
new_inode = NULL;
+ } else {
+ new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ must_unblock = true;
+ spin_unlock(&new_dentry->d_lock);
}
+
}
if (S_ISREG(old_inode->i_mode))
nfs_sync_inode(old_inode);
- task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
+ must_unblock ? nfs_unblock_rename : NULL);
if (IS_ERR(task)) {
error = PTR_ERR(task);
goto out;
@@ -2688,8 +2733,6 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
spin_unlock(&old_inode->i_lock);
}
out:
- if (rehash)
- d_rehash(rehash);
trace_nfs_rename_exit(old_dir, old_dentry,
new_dir, new_dentry, error);
if (!error) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4eb2a8380a28..1707f46b1335 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -60,44 +60,12 @@
#include "iostat.h"
#include "pnfs.h"
#include "fscache.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_VFS
static struct kmem_cache *nfs_direct_cachep;
-struct nfs_direct_req {
- struct kref kref; /* release manager */
-
- /* I/O parameters */
- struct nfs_open_context *ctx; /* file open context info */
- struct nfs_lock_context *l_ctx; /* Lock context info */
- struct kiocb * iocb; /* controlling i/o request */
- struct inode * inode; /* target file of i/o */
-
- /* completion state */
- atomic_t io_count; /* i/os we're waiting for */
- spinlock_t lock; /* protect completion state */
-
- loff_t io_start; /* Start offset for I/O */
- ssize_t count, /* bytes actually processed */
- max_count, /* max expected count */
- bytes_left, /* bytes left to be sent */
- error; /* any reported error */
- struct completion completion; /* wait for i/o completion */
-
- /* commit state */
- struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */
- struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */
- struct work_struct work;
- int flags;
- /* for write */
-#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
-#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
- /* for read */
-#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
-#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */
-};
-
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
@@ -364,13 +332,12 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
size_t pgbase;
unsigned npages, i;
- result = iov_iter_get_pages_alloc(iter, &pagevec,
+ result = iov_iter_get_pages_alloc2(iter, &pagevec,
rsize, &pgbase);
if (result < 0)
break;
bytes = result;
- iov_iter_advance(iter, bytes);
npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
for (i = 0; i < npages; i++) {
struct nfs_page *req;
@@ -478,7 +445,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
- if (iter_is_iovec(iter))
+ if (user_backed_iter(iter))
dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
if (!swap)
@@ -595,14 +562,17 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
struct nfs_page *req;
int status = data->task.tk_status;
+ trace_nfs_direct_commit_complete(dreq);
+
if (status < 0) {
/* Errors in commit are fatal */
dreq->error = status;
dreq->max_count = 0;
dreq->count = 0;
dreq->flags = NFS_ODIRECT_DONE;
- } else if (dreq->flags == NFS_ODIRECT_DONE)
+ } else {
status = dreq->error;
+ }
nfs_init_cinfo_from_dreq(&cinfo, dreq);
@@ -631,6 +601,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
{
struct nfs_direct_req *dreq = cinfo->dreq;
+ trace_nfs_direct_resched_write(dreq);
+
spin_lock(&dreq->lock);
if (dreq->flags != NFS_ODIRECT_DONE)
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
@@ -695,6 +667,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
+ trace_nfs_direct_write_complete(dreq);
queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
}
@@ -705,6 +678,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
int flags = NFS_ODIRECT_DONE;
+ trace_nfs_direct_write_completion(dreq);
+
nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
@@ -714,7 +689,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
- if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+ if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) {
if (!dreq->flags)
dreq->flags = NFS_ODIRECT_DO_COMMIT;
flags = dreq->flags;
@@ -759,6 +734,8 @@ static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
+ trace_nfs_direct_write_reschedule_io(dreq);
+
spin_lock(&dreq->lock);
if (dreq->error == 0) {
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
@@ -799,6 +776,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+ trace_nfs_direct_write_schedule_iovec(dreq);
+
nfs_pageio_init_write(&desc, inode, ioflags, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
@@ -812,13 +791,12 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
size_t pgbase;
unsigned npages, i;
- result = iov_iter_get_pages_alloc(iter, &pagevec,
+ result = iov_iter_get_pages_alloc2(iter, &pagevec,
wsize, &pgbase);
if (result < 0)
break;
bytes = result;
- iov_iter_advance(iter, bytes);
npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
for (i = 0; i < npages; i++) {
struct nfs_page *req;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 549baed76351..e032fe201a36 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -221,8 +221,10 @@ nfs_file_fsync_commit(struct file *file, int datasync)
int
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = file_inode(file);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ long nredirtied;
int ret;
trace_nfs_fsync_enter(inode);
@@ -237,15 +239,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = pnfs_sync_inode(inode, !!datasync);
if (ret != 0)
break;
- if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
+ nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ if (nredirtied == save_nredirtied)
break;
- /*
- * If nfs_file_fsync_commit detected a server reboot, then
- * resend all dirty pages that might have been covered by
- * the NFS_CONTEXT_RESEND_WRITES flag
- */
- start = 0;
- end = LLONG_MAX;
+ save_nredirtied = nredirtied;
}
trace_nfs_fsync_exit(inode, ret);
@@ -661,8 +658,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
result = filemap_fdatawait_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
- if (result < 0)
- goto out;
}
result = generic_write_sync(iocb, written);
if (result < 0)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 2b2661582bbe..ad34a33b0737 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -181,6 +181,8 @@ static int filelayout_async_handle_error(struct rpc_task *task,
case -EIO:
case -ETIMEDOUT:
case -EPIPE:
+ case -EPROTO:
+ case -ENODEV:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 604be402ae13..7d285561e59f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1131,6 +1131,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
case -EIO:
case -ETIMEDOUT:
case -EPIPE:
+ case -EPROTO:
+ case -ENODEV:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
@@ -1236,6 +1238,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -ENOBUFS:
case -EPIPE:
case -EPERM:
+ case -EPROTO:
+ case -ENODEV:
*op_status = status = NFS4ERR_NXIO;
break;
case -EACCES:
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index bfa7202ca7be..e028f5a0ef5f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -113,8 +113,10 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
goto out_err_drain_dsaddrs;
ds_versions[i].version = be32_to_cpup(p++);
ds_versions[i].minor_version = be32_to_cpup(p++);
- ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
- ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
+ ds_versions[i].rsize = nfs_io_size(be32_to_cpup(p++),
+ server->nfs_client->cl_proto);
+ ds_versions[i].wsize = nfs_io_size(be32_to_cpup(p++),
+ server->nfs_client->cl_proto);
ds_versions[i].tightly_coupled = be32_to_cpup(p);
if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 9a16897e8dc6..4da701fd1424 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -21,6 +21,8 @@
#include "nfs.h"
#include "internal.h"
+#include "nfstrace.h"
+
#define NFSDBG_FACILITY NFSDBG_MOUNT
#if IS_ENABLED(CONFIG_NFS_V3)
@@ -284,7 +286,6 @@ static int nfs_verify_server_address(struct sockaddr *addr)
}
}
- dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
return 0;
}
@@ -378,7 +379,7 @@ static int nfs_parse_security_flavors(struct fs_context *fc,
char *string = param->string, *p;
int ret;
- dfprintk(MOUNT, "NFS: parsing %s=%s option\n", param->key, param->string);
+ trace_nfs_mount_assign(param->key, string);
while ((p = strsep(&string, ":")) != NULL) {
if (!*p)
@@ -480,11 +481,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
unsigned int len;
int ret, opt;
- dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", param->key);
+ trace_nfs_mount_option(param);
opt = fs_parse(fc, nfs_fs_parameters, param, &result);
if (opt < 0)
- return ctx->sloppy ? 1 : opt;
+ return (opt == -ENOPARAM && ctx->sloppy) ? 1 : opt;
if (fc->security)
ctx->has_sec_mnt_opts = 1;
@@ -683,6 +684,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
return ret;
break;
case Opt_vers:
+ trace_nfs_mount_assign(param->key, param->string);
ret = nfs_parse_version_string(fc, param->string);
if (ret < 0)
return ret;
@@ -694,6 +696,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_proto:
+ trace_nfs_mount_assign(param->key, param->string);
protofamily = AF_INET;
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
case Opt_xprt_udp6:
@@ -729,6 +732,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_mountproto:
+ trace_nfs_mount_assign(param->key, param->string);
mountfamily = AF_INET;
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
case Opt_xprt_udp6:
@@ -751,6 +755,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_addr:
+ trace_nfs_mount_assign(param->key, param->string);
len = rpc_pton(fc->net_ns, param->string, param->size,
&ctx->nfs_server.address,
sizeof(ctx->nfs_server._address));
@@ -759,16 +764,19 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
ctx->nfs_server.addrlen = len;
break;
case Opt_clientaddr:
+ trace_nfs_mount_assign(param->key, param->string);
kfree(ctx->client_address);
ctx->client_address = param->string;
param->string = NULL;
break;
case Opt_mounthost:
+ trace_nfs_mount_assign(param->key, param->string);
kfree(ctx->mount_server.hostname);
ctx->mount_server.hostname = param->string;
param->string = NULL;
break;
case Opt_mountaddr:
+ trace_nfs_mount_assign(param->key, param->string);
len = rpc_pton(fc->net_ns, param->string, param->size,
&ctx->mount_server.address,
sizeof(ctx->mount_server._address));
@@ -846,7 +854,6 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
*/
case Opt_sloppy:
ctx->sloppy = true;
- dfprintk(MOUNT, "NFS: relaxing parsing rules\n");
break;
}
@@ -879,10 +886,8 @@ static int nfs_parse_source(struct fs_context *fc,
size_t len;
const char *end;
- if (unlikely(!dev_name || !*dev_name)) {
- dfprintk(MOUNT, "NFS: device name not specified\n");
+ if (unlikely(!dev_name || !*dev_name))
return -EINVAL;
- }
/* Is the host name protected with square brakcets? */
if (*dev_name == '[') {
@@ -922,7 +927,7 @@ static int nfs_parse_source(struct fs_context *fc,
if (!ctx->nfs_server.export_path)
goto out_nomem;
- dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", ctx->nfs_server.export_path);
+ trace_nfs_mount_path(ctx->nfs_server.export_path);
return 0;
out_bad_devname:
@@ -1116,7 +1121,6 @@ out_no_sec:
return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS");
out_nomem:
- dfprintk(MOUNT, "NFS: not enough memory to handle mount options");
return -ENOMEM;
out_no_address:
@@ -1248,7 +1252,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
if (IS_ERR(c))
return PTR_ERR(c);
ctx->nfs_server.export_path = c;
- dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+ trace_nfs_mount_path(c);
c = strndup_user(data->client_addr.data, 16);
if (IS_ERR(c))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b4e46b0ffa2d..bea7c005119c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -426,6 +426,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
static void nfs_inode_init_regular(struct nfs_inode *nfsi)
{
atomic_long_set(&nfsi->nrequests, 0);
+ atomic_long_set(&nfsi->redirtied_pages, 0);
INIT_LIST_HEAD(&nfsi->commit_info.list);
atomic_long_set(&nfsi->commit_info.ncommit, 0);
atomic_set(&nfsi->commit_info.rpcs_out, 0);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 437ebe544aaf..27c720d71b4e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -707,6 +707,24 @@ unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
}
/*
+ * Compute and set NFS server rsize / wsize
+ */
+static inline
+unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto)
+{
+ if (iosize < NFS_MIN_FILE_IO_SIZE)
+ iosize = NFS_DEF_FILE_IO_SIZE;
+ else if (iosize >= NFS_MAX_FILE_IO_SIZE)
+ iosize = NFS_MAX_FILE_IO_SIZE;
+ else
+ iosize = iosize & PAGE_MASK;
+
+ if (proto == XPRT_TRANSPORT_UDP)
+ return nfs_block_bits(iosize, NULL);
+ return iosize;
+}
+
+/*
* Determine the maximum file size for a superblock
*/
static inline
@@ -861,3 +879,36 @@ static inline void nfs_set_port(struct sockaddr *sap, int *port,
rpc_set_port(sap, *port);
}
+
+struct nfs_direct_req {
+ struct kref kref; /* release manager */
+
+ /* I/O parameters */
+ struct nfs_open_context *ctx; /* file open context info */
+ struct nfs_lock_context *l_ctx; /* Lock context info */
+ struct kiocb * iocb; /* controlling i/o request */
+ struct inode * inode; /* target file of i/o */
+
+ /* completion state */
+ atomic_t io_count; /* i/os we're waiting for */
+ spinlock_t lock; /* protect completion state */
+
+ loff_t io_start; /* Start offset for I/O */
+ ssize_t count, /* bytes actually processed */
+ max_count, /* max expected count */
+ bytes_left, /* bytes left to be sent */
+ error; /* any reported error */
+ struct completion completion; /* wait for i/o completion */
+
+ /* commit state */
+ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */
+ struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */
+ struct work_struct work;
+ int flags;
+ /* for write */
+#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
+#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
+ /* for read */
+#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
+#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */
+};
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 5601e47360c2..b49359afac88 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -108,7 +108,6 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
- __set_bit(NFS_CS_NOPING, &cl_init.init_flags);
__set_bit(NFS_CS_DS, &cl_init.init_flags);
/* Use the MDS nfs_client cl_ipaddr. */
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index e7b34f7e0614..a9bf09fdf2c3 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void)
if (ret)
goto out2;
- ret = register_shrinker(&nfs4_xattr_cache_shrinker);
+ ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache");
if (ret)
goto out1;
- ret = register_shrinker(&nfs4_xattr_entry_shrinker);
+ ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry");
if (ret)
goto out;
- ret = register_shrinker(&nfs4_xattr_large_entry_shrinker);
+ ret = register_shrinker(&nfs4_xattr_large_entry_shrinker,
+ "nfs-xattr_large_entry");
if (!ret)
return 0;
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 271e5f92ed01..b56f05113d36 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1025,82 +1025,95 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
return decode_op_hdr(xdr, OP_DEALLOCATE);
}
-static int decode_read_plus_data(struct xdr_stream *xdr,
- struct nfs_pgio_args *args,
- struct nfs_pgio_res *res)
-{
- uint32_t count, recvd;
+struct read_plus_segment {
+ enum data_content4 type;
uint64_t offset;
- __be32 *p;
-
- p = xdr_inline_decode(xdr, 8 + 4);
- if (!p)
- return 1;
+ union {
+ struct {
+ uint64_t length;
+ } hole;
+
+ struct {
+ uint32_t length;
+ unsigned int from;
+ } data;
+ };
+};
- p = xdr_decode_hyper(p, &offset);
- count = be32_to_cpup(p);
- recvd = xdr_align_data(xdr, res->count, xdr_align_size(count));
- if (recvd > count)
- recvd = count;
- if (res->count + recvd > args->count) {
- if (args->count > res->count)
- res->count += args->count - res->count;
- return 1;
- }
- res->count += recvd;
- if (count > recvd)
- return 1;
- return 0;
+static inline uint64_t read_plus_segment_length(struct read_plus_segment *seg)
+{
+ return seg->type == NFS4_CONTENT_DATA ? seg->data.length : seg->hole.length;
}
-static int decode_read_plus_hole(struct xdr_stream *xdr,
- struct nfs_pgio_args *args,
- struct nfs_pgio_res *res, uint32_t *eof)
+static int decode_read_plus_segment(struct xdr_stream *xdr,
+ struct read_plus_segment *seg)
{
- uint64_t offset, length, recvd;
__be32 *p;
- p = xdr_inline_decode(xdr, 8 + 8);
+ p = xdr_inline_decode(xdr, 4);
if (!p)
- return 1;
-
- p = xdr_decode_hyper(p, &offset);
- p = xdr_decode_hyper(p, &length);
- if (offset != args->offset + res->count) {
- /* Server returned an out-of-sequence extent */
- if (offset > args->offset + res->count ||
- offset + length < args->offset + res->count) {
- dprintk("NFS: server returned out of sequence extent: "
- "offset/size = %llu/%llu != expected %llu\n",
- (unsigned long long)offset,
- (unsigned long long)length,
- (unsigned long long)(args->offset +
- res->count));
- return 1;
- }
- length -= args->offset + res->count - offset;
- }
- if (length + res->count > args->count) {
- *eof = 0;
- if (unlikely(res->count >= args->count))
- return 1;
- length = args->count - res->count;
- }
- recvd = xdr_expand_hole(xdr, res->count, length);
- res->count += recvd;
+ return -EIO;
+ seg->type = be32_to_cpup(p++);
+
+ p = xdr_inline_decode(xdr, seg->type == NFS4_CONTENT_DATA ? 12 : 16);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &seg->offset);
+
+ if (seg->type == NFS4_CONTENT_DATA) {
+ struct xdr_buf buf;
+ uint32_t len = be32_to_cpup(p);
+
+ seg->data.length = len;
+ seg->data.from = xdr_stream_pos(xdr);
- if (recvd < length)
- return 1;
+ if (!xdr_stream_subsegment(xdr, &buf, xdr_align_size(len)))
+ return -EIO;
+ } else if (seg->type == NFS4_CONTENT_HOLE) {
+ xdr_decode_hyper(p, &seg->hole.length);
+ } else
+ return -EINVAL;
return 0;
}
+static int process_read_plus_segment(struct xdr_stream *xdr,
+ struct nfs_pgio_args *args,
+ struct nfs_pgio_res *res,
+ struct read_plus_segment *seg)
+{
+ unsigned long offset = seg->offset;
+ unsigned long length = read_plus_segment_length(seg);
+ unsigned int bufpos;
+
+ if (offset + length < args->offset)
+ return 0;
+ else if (offset > args->offset + args->count) {
+ res->eof = 0;
+ return 0;
+ } else if (offset < args->offset) {
+ length -= (args->offset - offset);
+ offset = args->offset;
+ } else if (offset + length > args->offset + args->count) {
+ length = (args->offset + args->count) - offset;
+ res->eof = 0;
+ }
+
+ bufpos = xdr->buf->head[0].iov_len + (offset - args->offset);
+ if (seg->type == NFS4_CONTENT_HOLE)
+ return xdr_stream_zero(xdr, bufpos, length);
+ else
+ return xdr_stream_move_subsegment(xdr, seg->data.from, bufpos, length);
+}
+
static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
{
struct nfs_pgio_header *hdr =
container_of(res, struct nfs_pgio_header, res);
struct nfs_pgio_args *args = &hdr->args;
- uint32_t eof, segments, type;
+ uint32_t segments;
+ struct read_plus_segment *segs;
int status, i;
+ char scratch_buf[16];
__be32 *p;
status = decode_op_hdr(xdr, OP_READ_PLUS);
@@ -1112,38 +1125,31 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
return -EIO;
res->count = 0;
- eof = be32_to_cpup(p++);
+ res->eof = be32_to_cpup(p++);
segments = be32_to_cpup(p++);
if (segments == 0)
- goto out;
-
- for (i = 0; i < segments; i++) {
- p = xdr_inline_decode(xdr, 4);
- if (!p)
- goto early_out;
+ return status;
- type = be32_to_cpup(p++);
- if (type == NFS4_CONTENT_DATA)
- status = decode_read_plus_data(xdr, args, res);
- else if (type == NFS4_CONTENT_HOLE)
- status = decode_read_plus_hole(xdr, args, res, &eof);
- else
- return -EINVAL;
+ segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL);
+ if (!segs)
+ return -ENOMEM;
+ xdr_set_scratch_buffer(xdr, &scratch_buf, 32);
+ status = -EIO;
+ for (i = 0; i < segments; i++) {
+ status = decode_read_plus_segment(xdr, &segs[i]);
if (status < 0)
- return status;
- if (status > 0)
- goto early_out;
+ goto out;
}
+ xdr_set_pagelen(xdr, xdr_align_size(args->count));
+ for (i = segments; i > 0; i--)
+ res->count += process_read_plus_segment(xdr, args, res, &segs[i-1]);
+ status = 0;
+
out:
- res->eof = eof;
- return 0;
-early_out:
- if (unlikely(!i))
- return -EIO;
- res->eof = 0;
- return 0;
+ kfree(segs);
+ return status;
}
static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 47a6cf892c95..3c5678aec006 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1161,9 +1161,9 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
return error;
if (ctx->rsize)
- server->rsize = nfs_block_size(ctx->rsize, NULL);
+ server->rsize = nfs_io_size(ctx->rsize, server->nfs_client->cl_proto);
if (ctx->wsize)
- server->wsize = nfs_block_size(ctx->wsize, NULL);
+ server->wsize = nfs_io_size(ctx->wsize, server->nfs_client->cl_proto);
server->acregmin = ctx->acregmin * HZ;
server->acregmax = ctx->acregmax * HZ;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e88f6b18445e..9eb181287879 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -340,6 +340,11 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
goto out;
}
+ if (!S_ISREG(fattr->mode)) {
+ res = ERR_PTR(-EBADF);
+ goto out;
+ }
+
res = ERR_PTR(-ENOMEM);
len = strlen(SSC_READ_NAME_BODY) + 16;
read_name = kzalloc(len, GFP_KERNEL);
@@ -357,6 +362,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
r_ino->i_fop);
if (IS_ERR(filep)) {
res = ERR_CAST(filep);
+ iput(r_ino);
goto out_free_name;
}
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index f331866dd418..ec6afd3c4bca 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -561,22 +561,20 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
return true;
}
-static void
-nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
+static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data,
+ int ret)
{
- struct key *authkey = idmap->idmap_upcall_data->authkey;
-
- kfree(idmap->idmap_upcall_data);
- idmap->idmap_upcall_data = NULL;
- complete_request_key(authkey, ret);
- key_put(authkey);
+ complete_request_key(data->authkey, ret);
+ key_put(data->authkey);
+ kfree(data);
}
-static void
-nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
+static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap,
+ struct idmap_legacy_upcalldata *data,
+ int ret)
{
- if (idmap->idmap_upcall_data != NULL)
- nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+ if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data)
+ nfs_idmap_complete_pipe_upcall(data, ret);
}
static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
@@ -613,7 +611,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
if (ret < 0)
- nfs_idmap_abort_pipe_upcall(idmap, ret);
+ nfs_idmap_abort_pipe_upcall(idmap, data, ret);
return ret;
out2:
@@ -669,6 +667,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
struct request_key_auth *rka;
struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
struct key *authkey;
struct idmap_msg im;
size_t namelen_in;
@@ -678,10 +677,11 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
* will have been woken up and someone else may now have used
* idmap_key_cons - so after this point we may no longer touch it.
*/
- if (idmap->idmap_upcall_data == NULL)
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data == NULL)
goto out_noupcall;
- authkey = idmap->idmap_upcall_data->authkey;
+ authkey = data->authkey;
rka = get_request_key_auth(authkey);
if (mlen != sizeof(im)) {
@@ -703,18 +703,17 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
ret = -EINVAL;
goto out;
-}
+ }
- ret = nfs_idmap_read_and_verify_message(&im,
- &idmap->idmap_upcall_data->idmap_msg,
- rka->target_key, authkey);
+ ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg,
+ rka->target_key, authkey);
if (ret >= 0) {
key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
ret = mlen;
}
out:
- nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+ nfs_idmap_complete_pipe_upcall(data, ret);
out_noupcall:
return ret;
}
@@ -728,7 +727,7 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
struct idmap *idmap = data->idmap;
if (msg->errno)
- nfs_idmap_abort_pipe_upcall(idmap, msg->errno);
+ nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno);
}
static void
@@ -736,8 +735,11 @@ idmap_release_pipe(struct inode *inode)
{
struct rpc_inode *rpci = RPC_I(inode);
struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
- nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data)
+ nfs_idmap_complete_pipe_upcall(data, -EPIPE);
}
int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bb0e84a46d61..3ed14a2a84a4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -784,10 +784,9 @@ static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
slot->seq_nr_highest_sent = seqnr;
}
-static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
- u32 seqnr)
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr)
{
- slot->seq_nr_highest_sent = seqnr;
+ nfs4_slot_sequence_record_sent(slot, seqnr);
slot->seq_nr_last_acked = seqnr;
}
@@ -854,7 +853,6 @@ static int nfs41_sequence_process(struct rpc_task *task,
__func__,
slot->slot_nr,
slot->seq_nr);
- nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto out_retry;
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_SEQ_FALSE_RETRY:
@@ -3098,12 +3096,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
}
out:
- if (opendata->lgp) {
- nfs4_lgopen_release(opendata->lgp);
- opendata->lgp = NULL;
- }
- if (!opendata->cancelled)
+ if (!opendata->cancelled) {
+ if (opendata->lgp) {
+ nfs4_lgopen_release(opendata->lgp);
+ opendata->lgp = NULL;
+ }
nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ }
return ret;
}
@@ -8924,6 +8923,9 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
if (status == 0)
rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
+ else if (rpc_clnt_xprt_switch_has_addr(clnt,
+ (struct sockaddr *)&xprt->addr))
+ rpc_clnt_xprt_switch_remove_xprt(clnt, xprt);
rpc_put_task(task);
}
@@ -9248,6 +9250,13 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
int status;
unsigned *ptr;
struct nfs4_session *session = clp->cl_session;
+ struct nfs4_add_xprt_data xprtdata = {
+ .clp = clp,
+ };
+ struct rpc_add_xprt_test rpcdata = {
+ .add_xprt_test = clp->cl_mvops->session_trunk,
+ .data = &xprtdata,
+ };
dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
@@ -9264,6 +9273,7 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
ptr = (unsigned *)&session->sess_id.data[0];
dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
+ rpc_clnt_probe_trunked_xprts(clp->cl_rpcclient, &rpcdata);
out:
return status;
}
@@ -9293,6 +9303,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
if (status)
dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
"Session has been destroyed regardless...\n", status);
+ rpc_clnt_manage_trunked_xprts(session->clp->cl_rpcclient);
return status;
}
@@ -9477,6 +9488,9 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
rpc_delay(task, NFS4_POLL_RETRY_MAX);
fallthrough;
case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -EACCES:
+ dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n",
+ __func__, task->tk_status, clp->cl_hostname);
return -EAGAIN;
case -NFS4ERR_BADSESSION:
case -NFS4ERR_DEADSESSION:
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 012bd7339862..8c6cc58679ff 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1137,7 +1137,7 @@ TRACE_EVENT(nfs_readpage_done,
__field(u32, arg_count)
__field(u32, res_count)
__field(bool, eof)
- __field(int, status)
+ __field(int, error)
),
TP_fast_assign(
@@ -1146,7 +1146,7 @@ TRACE_EVENT(nfs_readpage_done,
const struct nfs_fh *fh = hdr->args.fh ?
hdr->args.fh : &nfsi->fh;
- __entry->status = task->tk_status;
+ __entry->error = task->tk_status;
__entry->offset = hdr->args.offset;
__entry->arg_count = hdr->args.count;
__entry->res_count = hdr->res.count;
@@ -1157,14 +1157,13 @@ TRACE_EVENT(nfs_readpage_done,
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u status=%d%s",
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u%s", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->arg_count,
- __entry->res_count, __entry->status,
- __entry->eof ? " eof" : ""
+ __entry->res_count, __entry->eof ? " eof" : ""
)
);
@@ -1184,7 +1183,7 @@ TRACE_EVENT(nfs_readpage_short,
__field(u32, arg_count)
__field(u32, res_count)
__field(bool, eof)
- __field(int, status)
+ __field(int, error)
),
TP_fast_assign(
@@ -1193,7 +1192,7 @@ TRACE_EVENT(nfs_readpage_short,
const struct nfs_fh *fh = hdr->args.fh ?
hdr->args.fh : &nfsi->fh;
- __entry->status = task->tk_status;
+ __entry->error = task->tk_status;
__entry->offset = hdr->args.offset;
__entry->arg_count = hdr->args.count;
__entry->res_count = hdr->res.count;
@@ -1204,14 +1203,13 @@ TRACE_EVENT(nfs_readpage_short,
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u status=%d%s",
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u%s", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->arg_count,
- __entry->res_count, __entry->status,
- __entry->eof ? " eof" : ""
+ __entry->res_count, __entry->eof ? " eof" : ""
)
);
@@ -1323,7 +1321,7 @@ TRACE_EVENT(nfs_pgio_error,
__field(u32, arg_count)
__field(u32, res_count)
__field(loff_t, pos)
- __field(int, status)
+ __field(int, error)
),
TP_fast_assign(
@@ -1332,7 +1330,7 @@ TRACE_EVENT(nfs_pgio_error,
const struct nfs_fh *fh = hdr->args.fh ?
hdr->args.fh : &nfsi->fh;
- __entry->status = error;
+ __entry->error = error;
__entry->offset = hdr->args.offset;
__entry->arg_count = hdr->args.count;
__entry->res_count = hdr->res.count;
@@ -1341,12 +1339,12 @@ TRACE_EVENT(nfs_pgio_error,
__entry->fhandle = nfs_fhandle_hash(fh);
),
- TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u pos=%llu status=%d",
+ TP_printk("error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u pos=%llu", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid, __entry->fhandle,
(long long)__entry->offset, __entry->arg_count, __entry->res_count,
- __entry->pos, __entry->status
+ __entry->pos
)
);
@@ -1406,7 +1404,7 @@ TRACE_EVENT(nfs_writeback_done,
__field(loff_t, offset)
__field(u32, arg_count)
__field(u32, res_count)
- __field(int, status)
+ __field(int, error)
__field(unsigned long, stable)
__array(char, verifier, NFS4_VERIFIER_SIZE)
),
@@ -1418,7 +1416,7 @@ TRACE_EVENT(nfs_writeback_done,
hdr->args.fh : &nfsi->fh;
const struct nfs_writeverf *verf = hdr->res.verf;
- __entry->status = task->tk_status;
+ __entry->error = task->tk_status;
__entry->offset = hdr->args.offset;
__entry->arg_count = hdr->args.count;
__entry->res_count = hdr->res.count;
@@ -1432,14 +1430,14 @@ TRACE_EVENT(nfs_writeback_done,
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u status=%d stable=%s "
- "verifier=%s",
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u stable=%s "
+ "verifier=%s", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->arg_count,
- __entry->res_count, __entry->status,
+ __entry->res_count,
show_nfs_stable_how(__entry->stable),
show_nfs4_verifier(__entry->verifier)
)
@@ -1447,44 +1445,50 @@ TRACE_EVENT(nfs_writeback_done,
DECLARE_EVENT_CLASS(nfs_page_error_class,
TP_PROTO(
+ const struct inode *inode,
const struct nfs_page *req,
int error
),
- TP_ARGS(req, error),
+ TP_ARGS(inode, req, error),
TP_STRUCT__entry(
- __field(const void *, req)
- __field(pgoff_t, index)
- __field(unsigned int, offset)
- __field(unsigned int, pgbase)
- __field(unsigned int, bytes)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(unsigned int, count)
__field(int, error)
),
TP_fast_assign(
- __entry->req = req;
- __entry->index = req->wb_index;
- __entry->offset = req->wb_offset;
- __entry->pgbase = req->wb_pgbase;
- __entry->bytes = req->wb_bytes;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->offset = req_offset(req);
+ __entry->count = req->wb_bytes;
__entry->error = error;
),
TP_printk(
- "req=%p index=%lu offset=%u pgbase=%u bytes=%u error=%d",
- __entry->req, __entry->index, __entry->offset,
- __entry->pgbase, __entry->bytes, __entry->error
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u", __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset,
+ __entry->count
)
);
#define DEFINE_NFS_PAGEERR_EVENT(name) \
DEFINE_EVENT(nfs_page_error_class, name, \
TP_PROTO( \
+ const struct inode *inode, \
const struct nfs_page *req, \
int error \
), \
- TP_ARGS(req, error))
+ TP_ARGS(inode, req, error))
DEFINE_NFS_PAGEERR_EVENT(nfs_write_error);
DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error);
@@ -1541,7 +1545,7 @@ TRACE_EVENT(nfs_commit_done,
__field(u32, fhandle)
__field(u64, fileid)
__field(loff_t, offset)
- __field(int, status)
+ __field(int, error)
__field(unsigned long, stable)
__array(char, verifier, NFS4_VERIFIER_SIZE)
),
@@ -1553,7 +1557,7 @@ TRACE_EVENT(nfs_commit_done,
data->args.fh : &nfsi->fh;
const struct nfs_writeverf *verf = data->res.verf;
- __entry->status = task->tk_status;
+ __entry->error = task->tk_status;
__entry->offset = data->args.offset;
__entry->stable = verf->committed;
memcpy(__entry->verifier,
@@ -1565,17 +1569,83 @@ TRACE_EVENT(nfs_commit_done,
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld status=%d stable=%s verifier=%s",
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld stable=%s verifier=%s", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- (long long)__entry->offset, __entry->status,
+ (long long)__entry->offset,
show_nfs_stable_how(__entry->stable),
show_nfs4_verifier(__entry->verifier)
)
);
+#define nfs_show_direct_req_flags(v) \
+ __print_flags(v, "|", \
+ { NFS_ODIRECT_DO_COMMIT, "DO_COMMIT" }, \
+ { NFS_ODIRECT_RESCHED_WRITES, "RESCHED_WRITES" }, \
+ { NFS_ODIRECT_SHOULD_DIRTY, "SHOULD DIRTY" }, \
+ { NFS_ODIRECT_DONE, "DONE" } )
+
+DECLARE_EVENT_CLASS(nfs_direct_req_class,
+ TP_PROTO(
+ const struct nfs_direct_req *dreq
+ ),
+
+ TP_ARGS(dreq),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, offset)
+ __field(ssize_t, count)
+ __field(ssize_t, bytes_left)
+ __field(ssize_t, error)
+ __field(int, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = dreq->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = dreq->io_start;
+ __entry->count = dreq->count;
+ __entry->bytes_left = dreq->bytes_left;
+ __entry->error = dreq->error;
+ __entry->flags = dreq->flags;
+ ),
+
+ TP_printk(
+ "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zd bytes_left=%zd flags=%s",
+ __entry->error, MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset,
+ __entry->count, __entry->bytes_left,
+ nfs_show_direct_req_flags(__entry->flags)
+ )
+);
+
+#define DEFINE_NFS_DIRECT_REQ_EVENT(name) \
+ DEFINE_EVENT(nfs_direct_req_class, name, \
+ TP_PROTO( \
+ const struct nfs_direct_req *dreq \
+ ), \
+ TP_ARGS(dreq))
+
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_commit_complete);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_resched_write);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_complete);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io);
+
TRACE_EVENT(nfs_fh_to_dentry,
TP_PROTO(
const struct super_block *sb,
@@ -1609,6 +1679,65 @@ TRACE_EVENT(nfs_fh_to_dentry,
)
);
+TRACE_EVENT(nfs_mount_assign,
+ TP_PROTO(
+ const char *option,
+ const char *value
+ ),
+
+ TP_ARGS(option, value),
+
+ TP_STRUCT__entry(
+ __string(option, option)
+ __string(value, value)
+ ),
+
+ TP_fast_assign(
+ __assign_str(option, option);
+ __assign_str(value, value);
+ ),
+
+ TP_printk("option %s=%s",
+ __get_str(option), __get_str(value)
+ )
+);
+
+TRACE_EVENT(nfs_mount_option,
+ TP_PROTO(
+ const struct fs_parameter *param
+ ),
+
+ TP_ARGS(param),
+
+ TP_STRUCT__entry(
+ __string(option, param->key)
+ ),
+
+ TP_fast_assign(
+ __assign_str(option, param->key);
+ ),
+
+ TP_printk("option %s", __get_str(option))
+);
+
+TRACE_EVENT(nfs_mount_path,
+ TP_PROTO(
+ const char *path
+ ),
+
+ TP_ARGS(path),
+
+ TP_STRUCT__entry(
+ __string(path, path)
+ ),
+
+ TP_fast_assign(
+ __assign_str(path, path);
+ ),
+
+ TP_printk("path='%s'", __get_str(path))
+);
+
DECLARE_EVENT_CLASS(nfs_xdr_event,
TP_PROTO(
const struct xdr_stream *xdr,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 41a9b6b58fb9..2613b7e36eb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2817,7 +2817,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
/* Resend all requests through the MDS */
nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
hdr->completion_ops);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6ab5eeb000dc..82944e14fcea 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -149,7 +149,7 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
- ret = register_shrinker(&acl_shrinker);
+ ret = register_shrinker(&acl_shrinker, "nfs-acl");
if (ret < 0)
goto error_3;
#ifdef CONFIG_NFS_V4_2
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 69569696dde0..1843fa235d9b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -592,7 +592,8 @@ nfs_lock_and_join_requests(struct page *page)
static void nfs_write_error(struct nfs_page *req, int error)
{
- trace_nfs_write_error(req, error);
+ trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req,
+ error);
nfs_mapping_set_error(req->wb_page, error);
nfs_inode_remove_request(req);
nfs_end_page_writeback(req);
@@ -1000,7 +1001,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
(hdr->good_bytes < bytes)) {
- trace_nfs_comp_error(req, hdr->error);
+ trace_nfs_comp_error(hdr->inode, req, hdr->error);
nfs_mapping_set_error(req->wb_page, hdr->error);
goto remove_req;
}
@@ -1419,10 +1420,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
*/
static void nfs_redirty_request(struct nfs_page *req)
{
+ struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host);
+
/* Bump the transmission count */
req->wb_nio++;
nfs_mark_request_dirty(req);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+ atomic_long_inc(&nfsi->redirtied_pages);
nfs_end_page_writeback(req);
nfs_release_request(req);
}
@@ -1444,8 +1447,6 @@ static void nfs_async_write_error(struct list_head *head, int error)
static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
{
nfs_async_write_error(&hdr->pages, 0);
- filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset,
- hdr->args.offset + hdr->args.count - 1);
}
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
@@ -1576,25 +1577,37 @@ static int nfs_writeback_done(struct rpc_task *task,
nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
trace_nfs_writeback_done(task, hdr);
- if (hdr->res.verf->committed < hdr->args.stable &&
- task->tk_status >= 0) {
- /* We tried a write call, but the server did not
- * commit data to stable storage even though we
- * requested it.
- * Note: There is a known bug in Tru64 < 5.0 in which
- * the server reports NFS_DATA_SYNC, but performs
- * NFS_FILE_SYNC. We therefore implement this checking
- * as a dprintk() in order to avoid filling syslog.
- */
- static unsigned long complain;
+ if (task->tk_status >= 0) {
+ enum nfs3_stable_how committed = hdr->res.verf->committed;
+
+ if (committed == NFS_UNSTABLE) {
+ /*
+ * We have some uncommitted data on the server at
+ * this point, so ensure that we keep track of that
+ * fact irrespective of what later writes do.
+ */
+ set_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags);
+ }
- /* Note this will print the MDS for a DS write */
- if (time_before(complain, jiffies)) {
- dprintk("NFS: faulty NFS server %s:"
- " (committed = %d) != (stable = %d)\n",
- NFS_SERVER(inode)->nfs_client->cl_hostname,
- hdr->res.verf->committed, hdr->args.stable);
- complain = jiffies + 300 * HZ;
+ if (committed < hdr->args.stable) {
+ /* We tried a write call, but the server did not
+ * commit data to stable storage even though we
+ * requested it.
+ * Note: There is a known bug in Tru64 < 5.0 in which
+ * the server reports NFS_DATA_SYNC, but performs
+ * NFS_FILE_SYNC. We therefore implement this checking
+ * as a dprintk() in order to avoid filling syslog.
+ */
+ static unsigned long complain;
+
+ /* Note this will print the MDS for a DS write */
+ if (time_before(complain, jiffies)) {
+ dprintk("NFS: faulty NFS server %s:"
+ " (committed = %d) != (stable = %d)\n",
+ NFS_SERVER(inode)->nfs_client->cl_hostname,
+ committed, hdr->args.stable);
+ complain = jiffies + 300 * HZ;
+ }
}
}
@@ -1872,7 +1885,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
(long long)req_offset(req));
if (status < 0) {
if (req->wb_page) {
- trace_nfs_commit_error(req, status);
+ trace_nfs_commit_error(data->inode, req,
+ status);
nfs_mapping_set_error(req->wb_page, status);
nfs_inode_remove_request(req);
}
@@ -1892,7 +1906,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* We have a mismatch. Write the page again */
dprintk_cont(" mismatch\n");
nfs_mark_request_dirty(req);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+ atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
next:
nfs_unlock_and_release_request(req);
/* Latency breaker */
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index ba14d2f4b64f..4b7324458a94 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -38,6 +38,8 @@
struct nfs4_acl;
struct svc_fh;
struct svc_rqst;
+struct nfsd_attrs;
+enum nfs_ftype4;
int nfs4_acl_bytes(int entries);
int nfs4_acl_get_whotype(char *, u32);
@@ -45,7 +47,7 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
struct nfs4_acl **acl);
-__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct nfs4_acl *acl);
+__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl,
+ struct nfsd_attrs *attr);
#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9cb2d590c036..eeed4ae5b4ad 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -13,6 +13,7 @@
#include <linux/fsnotify_backend.h>
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
+#include <linux/rhashtable.h>
#include "vfs.h"
#include "nfsd.h"
@@ -21,28 +22,19 @@
#include "filecache.h"
#include "trace.h"
-#define NFSDDBG_FACILITY NFSDDBG_FH
-
-/* FIXME: dynamically size this for the machine somehow? */
-#define NFSD_FILE_HASH_BITS 12
-#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS)
#define NFSD_LAUNDRETTE_DELAY (2 * HZ)
-#define NFSD_FILE_SHUTDOWN (1)
-#define NFSD_FILE_LRU_THRESHOLD (4096UL)
-#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2)
+#define NFSD_FILE_CACHE_UP (0)
/* We only care about NFSD_MAY_READ/WRITE for this cache */
#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
-struct nfsd_fcache_bucket {
- struct hlist_head nfb_head;
- spinlock_t nfb_lock;
- unsigned int nfb_count;
- unsigned int nfb_maxcount;
-};
-
static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_releases);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
struct nfsd_fcache_disposal {
struct work_struct work;
@@ -54,21 +46,146 @@ static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
static struct kmem_cache *nfsd_file_slab;
static struct kmem_cache *nfsd_file_mark_slab;
-static struct nfsd_fcache_bucket *nfsd_file_hashtbl;
static struct list_lru nfsd_file_lru;
-static long nfsd_file_lru_flags;
+static unsigned long nfsd_file_flags;
static struct fsnotify_group *nfsd_file_fsnotify_group;
-static atomic_long_t nfsd_filecache_count;
static struct delayed_work nfsd_filecache_laundrette;
+static struct rhashtable nfsd_file_rhash_tbl
+ ____cacheline_aligned_in_smp;
+
+enum nfsd_file_lookup_type {
+ NFSD_FILE_KEY_INODE,
+ NFSD_FILE_KEY_FULL,
+};
+
+struct nfsd_file_lookup_key {
+ struct inode *inode;
+ struct net *net;
+ const struct cred *cred;
+ unsigned char need;
+ enum nfsd_file_lookup_type type;
+};
+
+/*
+ * The returned hash value is based solely on the address of an in-code
+ * inode, a pointer to a slab-allocated object. The entropy in such a
+ * pointer is concentrated in its middle bits.
+ */
+static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed)
+{
+ unsigned long ptr = (unsigned long)inode;
+ u32 k;
+
+ k = ptr >> L1_CACHE_SHIFT;
+ k &= 0x00ffffff;
+ return jhash2(&k, 1, seed);
+}
+
+/**
+ * nfsd_file_key_hashfn - Compute the hash value of a lookup key
+ * @data: key on which to compute the hash value
+ * @len: rhash table's key_len parameter (unused)
+ * @seed: rhash table's random seed of the day
+ *
+ * Return value:
+ * Computed 32-bit hash value
+ */
+static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct nfsd_file_lookup_key *key = data;
+
+ return nfsd_file_inode_hash(key->inode, seed);
+}
+
+/**
+ * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file
+ * @data: object on which to compute the hash value
+ * @len: rhash table's key_len parameter (unused)
+ * @seed: rhash table's random seed of the day
+ *
+ * Return value:
+ * Computed 32-bit hash value
+ */
+static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct nfsd_file *nf = data;
+
+ return nfsd_file_inode_hash(nf->nf_inode, seed);
+}
-static void nfsd_file_gc(void);
+static bool
+nfsd_match_cred(const struct cred *c1, const struct cred *c2)
+{
+ int i;
+
+ if (!uid_eq(c1->fsuid, c2->fsuid))
+ return false;
+ if (!gid_eq(c1->fsgid, c2->fsgid))
+ return false;
+ if (c1->group_info == NULL || c2->group_info == NULL)
+ return c1->group_info == c2->group_info;
+ if (c1->group_info->ngroups != c2->group_info->ngroups)
+ return false;
+ for (i = 0; i < c1->group_info->ngroups; i++) {
+ if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
+ return false;
+ }
+ return true;
+}
+
+/**
+ * nfsd_file_obj_cmpfn - Match a cache item against search criteria
+ * @arg: search criteria
+ * @ptr: cache item to check
+ *
+ * Return values:
+ * %0 - Item matches search criteria
+ * %1 - Item does not match search criteria
+ */
+static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct nfsd_file_lookup_key *key = arg->key;
+ const struct nfsd_file *nf = ptr;
+
+ switch (key->type) {
+ case NFSD_FILE_KEY_INODE:
+ if (nf->nf_inode != key->inode)
+ return 1;
+ break;
+ case NFSD_FILE_KEY_FULL:
+ if (nf->nf_inode != key->inode)
+ return 1;
+ if (nf->nf_may != key->need)
+ return 1;
+ if (nf->nf_net != key->net)
+ return 1;
+ if (!nfsd_match_cred(nf->nf_cred, key->cred))
+ return 1;
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
+ return 1;
+ break;
+ }
+ return 0;
+}
+
+static const struct rhashtable_params nfsd_file_rhash_params = {
+ .key_len = sizeof_field(struct nfsd_file, nf_inode),
+ .key_offset = offsetof(struct nfsd_file, nf_inode),
+ .head_offset = offsetof(struct nfsd_file, nf_rhash),
+ .hashfn = nfsd_file_key_hashfn,
+ .obj_hashfn = nfsd_file_obj_hashfn,
+ .obj_cmpfn = nfsd_file_obj_cmpfn,
+ /* Reduce resizing churn on light workloads */
+ .min_size = 512, /* buckets */
+ .automatic_shrinking = true,
+};
static void
nfsd_file_schedule_laundrette(void)
{
- long count = atomic_long_read(&nfsd_filecache_count);
-
- if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
+ if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) ||
+ test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0)
return;
queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
@@ -111,12 +228,11 @@ nfsd_file_mark_put(struct nfsd_file_mark *nfm)
}
static struct nfsd_file_mark *
-nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
{
int err;
struct fsnotify_mark *mark;
struct nfsd_file_mark *nfm = NULL, *new;
- struct inode *inode = nf->nf_inode;
do {
fsnotify_group_lock(nfsd_file_fsnotify_group);
@@ -167,31 +283,25 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
}
static struct nfsd_file *
-nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
- struct net *net)
+nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
{
struct nfsd_file *nf;
nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
if (nf) {
- INIT_HLIST_NODE(&nf->nf_node);
INIT_LIST_HEAD(&nf->nf_lru);
+ nf->nf_birthtime = ktime_get();
nf->nf_file = NULL;
nf->nf_cred = get_current_cred();
- nf->nf_net = net;
+ nf->nf_net = key->net;
nf->nf_flags = 0;
- nf->nf_inode = inode;
- nf->nf_hashval = hashval;
- refcount_set(&nf->nf_ref, 1);
- nf->nf_may = may & NFSD_FILE_MAY_MASK;
- if (may & NFSD_MAY_NOT_BREAK_LEASE) {
- if (may & NFSD_MAY_WRITE)
- __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags);
- if (may & NFSD_MAY_READ)
- __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
- }
+ __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
+ __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ nf->nf_inode = key->inode;
+ /* nf_ref is pre-incremented for hash table */
+ refcount_set(&nf->nf_ref, 2);
+ nf->nf_may = key->need;
nf->nf_mark = NULL;
- trace_nfsd_file_alloc(nf);
}
return nf;
}
@@ -199,8 +309,12 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
static bool
nfsd_file_free(struct nfsd_file *nf)
{
+ s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
bool flush = false;
+ this_cpu_inc(nfsd_file_releases);
+ this_cpu_add(nfsd_file_total_age, age);
+
trace_nfsd_file_put_final(nf);
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
@@ -210,6 +324,14 @@ nfsd_file_free(struct nfsd_file *nf)
fput(nf->nf_file);
flush = true;
}
+
+ /*
+ * If this item is still linked via nf_lru, that's a bug.
+ * WARN and leak it to preserve system stability.
+ */
+ if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
+ return flush;
+
call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
return flush;
}
@@ -240,31 +362,44 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
static void
nfsd_file_flush(struct nfsd_file *nf)
{
- if (nf->nf_file && vfs_fsync(nf->nf_file, 1) != 0)
+ struct file *file = nf->nf_file;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return;
+ this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
+ if (vfs_fsync(file, 1) != 0)
nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}
-static void
-nfsd_file_do_unhash(struct nfsd_file *nf)
+static void nfsd_file_lru_add(struct nfsd_file *nf)
{
- lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
+ trace_nfsd_file_lru_add(nf);
+}
+static void nfsd_file_lru_remove(struct nfsd_file *nf)
+{
+ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
+ trace_nfsd_file_lru_del(nf);
+}
+
+static void
+nfsd_file_hash_remove(struct nfsd_file *nf)
+{
trace_nfsd_file_unhash(nf);
if (nfsd_file_check_write_error(nf))
nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
- --nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
- hlist_del_rcu(&nf->nf_node);
- atomic_long_dec(&nfsd_filecache_count);
+ rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash,
+ nfsd_file_rhash_params);
}
static bool
nfsd_file_unhash(struct nfsd_file *nf)
{
if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_do_unhash(nf);
- if (!list_empty(&nf->nf_lru))
- list_lru_del(&nfsd_file_lru, &nf->nf_lru);
+ nfsd_file_hash_remove(nf);
return true;
}
return false;
@@ -274,17 +409,16 @@ nfsd_file_unhash(struct nfsd_file *nf)
* Return true if the file was unhashed.
*/
static bool
-nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose)
+nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
{
- lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
-
- trace_nfsd_file_unhash_and_release_locked(nf);
+ trace_nfsd_file_unhash_and_dispose(nf);
if (!nfsd_file_unhash(nf))
return false;
/* keep final reference for nfsd_file_lru_dispose */
if (refcount_dec_not_one(&nf->nf_ref))
return true;
+ nfsd_file_lru_remove(nf);
list_add(&nf->nf_lru, dispose);
return true;
}
@@ -296,6 +430,7 @@ nfsd_file_put_noref(struct nfsd_file *nf)
if (refcount_dec_and_test(&nf->nf_ref)) {
WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
+ nfsd_file_lru_remove(nf);
nfsd_file_free(nf);
}
}
@@ -305,7 +440,7 @@ nfsd_file_put(struct nfsd_file *nf)
{
might_sleep();
- set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ nfsd_file_lru_add(nf);
if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
@@ -314,9 +449,24 @@ nfsd_file_put(struct nfsd_file *nf)
nfsd_file_schedule_laundrette();
} else
nfsd_file_put_noref(nf);
+}
- if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
- nfsd_file_gc();
+/**
+ * nfsd_file_close - Close an nfsd_file
+ * @nf: nfsd_file to close
+ *
+ * If this is the final reference for @nf, free it immediately.
+ * This reflects an on-the-wire CLOSE or DELEGRETURN into the
+ * VFS and exported filesystem.
+ */
+void nfsd_file_close(struct nfsd_file *nf)
+{
+ nfsd_file_put(nf);
+ if (refcount_dec_if_one(&nf->nf_ref)) {
+ nfsd_file_unhash(nf);
+ nfsd_file_lru_remove(nf);
+ nfsd_file_free(nf);
+ }
}
struct nfsd_file *
@@ -334,7 +484,7 @@ nfsd_file_dispose_list(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del(&nf->nf_lru);
+ list_del_init(&nf->nf_lru);
nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
}
@@ -348,7 +498,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del(&nf->nf_lru);
+ list_del_init(&nf->nf_lru);
nfsd_file_flush(nf);
if (!refcount_dec_and_test(&nf->nf_ref))
continue;
@@ -405,8 +555,19 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
}
}
-/*
+/**
+ * nfsd_file_lru_cb - Examine an entry on the LRU list
+ * @item: LRU entry to examine
+ * @lru: controlling LRU
+ * @lock: LRU list lock (unused)
+ * @arg: dispose list
+ *
* Note this can deadlock with nfsd_file_cache_purge.
+ *
+ * Return values:
+ * %LRU_REMOVED: @item was removed from the LRU
+ * %LRU_ROTATE: @item is to be moved to the LRU tail
+ * %LRU_SKIP: @item cannot be evicted
*/
static enum lru_status
nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
@@ -427,55 +588,65 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
* counter. Here we check the counter and then test and clear the flag.
* That order is deliberate to ensure that we can do this locklessly.
*/
- if (refcount_read(&nf->nf_ref) > 1)
- goto out_skip;
+ if (refcount_read(&nf->nf_ref) > 1) {
+ list_lru_isolate(lru, &nf->nf_lru);
+ trace_nfsd_file_gc_in_use(nf);
+ return LRU_REMOVED;
+ }
/*
* Don't throw out files that are still undergoing I/O or
* that have uncleared errors pending.
*/
- if (nfsd_file_check_writeback(nf))
- goto out_skip;
+ if (nfsd_file_check_writeback(nf)) {
+ trace_nfsd_file_gc_writeback(nf);
+ return LRU_SKIP;
+ }
- if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
- goto out_skip;
+ if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
+ trace_nfsd_file_gc_referenced(nf);
+ return LRU_ROTATE;
+ }
- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags))
- goto out_skip;
+ if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ trace_nfsd_file_gc_hashed(nf);
+ return LRU_SKIP;
+ }
list_lru_isolate_move(lru, &nf->nf_lru, head);
+ this_cpu_inc(nfsd_file_evictions);
+ trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
-out_skip:
- return LRU_SKIP;
}
-static unsigned long
-nfsd_file_lru_walk_list(struct shrink_control *sc)
+/*
+ * Unhash items on @dispose immediately, then queue them on the
+ * disposal workqueue to finish releasing them in the background.
+ *
+ * cel: Note that between the time list_lru_shrink_walk runs and
+ * now, these items are in the hash table but marked unhashed.
+ * Why release these outside of lru_cb ? There's no lock ordering
+ * problem since lru_cb currently takes no lock.
+ */
+static void nfsd_file_gc_dispose_list(struct list_head *dispose)
{
- LIST_HEAD(head);
struct nfsd_file *nf;
- unsigned long ret;
- if (sc)
- ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
- nfsd_file_lru_cb, &head);
- else
- ret = list_lru_walk(&nfsd_file_lru,
- nfsd_file_lru_cb,
- &head, LONG_MAX);
- list_for_each_entry(nf, &head, nf_lru) {
- spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
- nfsd_file_do_unhash(nf);
- spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
- }
- nfsd_file_dispose_list_delayed(&head);
- return ret;
+ list_for_each_entry(nf, dispose, nf_lru)
+ nfsd_file_hash_remove(nf);
+ nfsd_file_dispose_list_delayed(dispose);
}
static void
nfsd_file_gc(void)
{
- nfsd_file_lru_walk_list(NULL);
+ LIST_HEAD(dispose);
+ unsigned long ret;
+
+ ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
+ &dispose, list_lru_count(&nfsd_file_lru));
+ trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
+ nfsd_file_gc_dispose_list(&dispose);
}
static void
@@ -494,7 +665,14 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
static unsigned long
nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
{
- return nfsd_file_lru_walk_list(sc);
+ LIST_HEAD(dispose);
+ unsigned long ret;
+
+ ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
+ nfsd_file_lru_cb, &dispose);
+ trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
+ nfsd_file_gc_dispose_list(&dispose);
+ return ret;
}
static struct shrinker nfsd_file_shrinker = {
@@ -503,39 +681,47 @@ static struct shrinker nfsd_file_shrinker = {
.seeks = 1,
};
-static void
-__nfsd_file_close_inode(struct inode *inode, unsigned int hashval,
- struct list_head *dispose)
+/*
+ * Find all cache items across all net namespaces that match @inode and
+ * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
+ */
+static unsigned int
+__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
{
- struct nfsd_file *nf;
- struct hlist_node *tmp;
+ struct nfsd_file_lookup_key key = {
+ .type = NFSD_FILE_KEY_INODE,
+ .inode = inode,
+ };
+ unsigned int count = 0;
+ struct nfsd_file *nf;
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
- if (inode == nf->nf_inode)
- nfsd_file_unhash_and_release_locked(nf, dispose);
- }
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ rcu_read_lock();
+ do {
+ nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
+ nfsd_file_rhash_params);
+ if (!nf)
+ break;
+ nfsd_file_unhash_and_dispose(nf, dispose);
+ count++;
+ } while (1);
+ rcu_read_unlock();
+ return count;
}
/**
* nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Walk the whole hash bucket, looking for any files that correspond to "inode".
- * If any do, then unhash them and put the hashtable reference to them and
- * destroy any that had their last reference put. Also ensure that any of the
- * fputs also have their final __fput done as well.
+ * Unhash and put, then flush and fput all cache items associated with @inode.
*/
void
nfsd_file_close_inode_sync(struct inode *inode)
{
- unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
- NFSD_FILE_HASH_BITS);
LIST_HEAD(dispose);
+ unsigned int count;
- __nfsd_file_close_inode(inode, hashval, &dispose);
- trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose));
+ count = __nfsd_file_close_inode(inode, &dispose);
+ trace_nfsd_file_close_inode_sync(inode, count);
nfsd_file_dispose_list_sync(&dispose);
}
@@ -543,19 +729,16 @@ nfsd_file_close_inode_sync(struct inode *inode)
* nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Walk the whole hash bucket, looking for any files that correspond to "inode".
- * If any do, then unhash them and put the hashtable reference to them and
- * destroy any that had their last reference put.
+ * Unhash and put all cache item associated with @inode.
*/
static void
nfsd_file_close_inode(struct inode *inode)
{
- unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
- NFSD_FILE_HASH_BITS);
LIST_HEAD(dispose);
+ unsigned int count;
- __nfsd_file_close_inode(inode, hashval, &dispose);
- trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
+ count = __nfsd_file_close_inode(inode, &dispose);
+ trace_nfsd_file_close_inode(inode, count);
nfsd_file_dispose_list_delayed(&dispose);
}
@@ -630,25 +813,21 @@ static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
int
nfsd_file_cache_init(void)
{
- int ret = -ENOMEM;
- unsigned int i;
+ int ret;
- clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
-
- if (nfsd_file_hashtbl)
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
return 0;
+ ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params);
+ if (ret)
+ return ret;
+
+ ret = -ENOMEM;
nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0);
if (!nfsd_filecache_wq)
goto out;
- nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
- sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
- if (!nfsd_file_hashtbl) {
- pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
- goto out_err;
- }
-
nfsd_file_slab = kmem_cache_create("nfsd_file",
sizeof(struct nfsd_file), 0, 0, NULL);
if (!nfsd_file_slab) {
@@ -670,7 +849,7 @@ nfsd_file_cache_init(void)
goto out_err;
}
- ret = register_shrinker(&nfsd_file_shrinker);
+ ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
if (ret) {
pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
goto out_lru;
@@ -692,11 +871,6 @@ nfsd_file_cache_init(void)
goto out_notifier;
}
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
- spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
- }
-
INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker);
out:
return ret;
@@ -711,46 +885,47 @@ out_err:
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kvfree(nfsd_file_hashtbl);
- nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
+ rhashtable_destroy(&nfsd_file_rhash_tbl);
goto out;
}
/*
* Note this can deadlock with nfsd_file_lru_cb.
*/
-void
-nfsd_file_cache_purge(struct net *net)
+static void
+__nfsd_file_cache_purge(struct net *net)
{
- unsigned int i;
- struct nfsd_file *nf;
- struct hlist_node *next;
+ struct rhashtable_iter iter;
+ struct nfsd_file *nf;
LIST_HEAD(dispose);
bool del;
- if (!nfsd_file_hashtbl)
- return;
-
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+ rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
+ do {
+ rhashtable_walk_start(&iter);
- spin_lock(&nfb->nfb_lock);
- hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+ nf = rhashtable_walk_next(&iter);
+ while (!IS_ERR_OR_NULL(nf)) {
if (net && nf->nf_net != net)
continue;
- del = nfsd_file_unhash_and_release_locked(nf, &dispose);
+ del = nfsd_file_unhash_and_dispose(nf, &dispose);
/*
* Deadlock detected! Something marked this entry as
* unhased, but hasn't removed it from the hash list.
*/
WARN_ON_ONCE(!del);
+
+ nf = rhashtable_walk_next(&iter);
}
- spin_unlock(&nfb->nfb_lock);
- nfsd_file_dispose_list(&dispose);
- }
+
+ rhashtable_walk_stop(&iter);
+ } while (nf == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+
+ nfsd_file_dispose_list(&dispose);
}
static struct nfsd_fcache_disposal *
@@ -793,6 +968,19 @@ nfsd_file_cache_start_net(struct net *net)
return nn->fcache_disposal ? 0 : -ENOMEM;
}
+/**
+ * nfsd_file_cache_purge - Remove all cache items associated with @net
+ * @net: target net namespace
+ *
+ */
+void
+nfsd_file_cache_purge(struct net *net)
+{
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
+ __nfsd_file_cache_purge(net);
+}
+
void
nfsd_file_cache_shutdown_net(struct net *net)
{
@@ -803,7 +991,11 @@ nfsd_file_cache_shutdown_net(struct net *net)
void
nfsd_file_cache_shutdown(void)
{
- set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+ int i;
+
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_and_clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0)
+ return;
lease_unregister_notifier(&nfsd_file_lease_notifier);
unregister_shrinker(&nfsd_file_shrinker);
@@ -812,7 +1004,7 @@ nfsd_file_cache_shutdown(void)
* calling nfsd_file_cache_purge
*/
cancel_delayed_work_sync(&nfsd_filecache_laundrette);
- nfsd_file_cache_purge(NULL);
+ __nfsd_file_cache_purge(NULL);
list_lru_destroy(&nfsd_file_lru);
rcu_barrier();
fsnotify_put_group(nfsd_file_fsnotify_group);
@@ -822,124 +1014,96 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kvfree(nfsd_file_hashtbl);
- nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
-}
-
-static bool
-nfsd_match_cred(const struct cred *c1, const struct cred *c2)
-{
- int i;
-
- if (!uid_eq(c1->fsuid, c2->fsuid))
- return false;
- if (!gid_eq(c1->fsgid, c2->fsgid))
- return false;
- if (c1->group_info == NULL || c2->group_info == NULL)
- return c1->group_info == c2->group_info;
- if (c1->group_info->ngroups != c2->group_info->ngroups)
- return false;
- for (i = 0; i < c1->group_info->ngroups; i++) {
- if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
- return false;
- }
- return true;
-}
-
-static struct nfsd_file *
-nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
- unsigned int hashval, struct net *net)
-{
- struct nfsd_file *nf;
- unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
-
- hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
- nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) {
- if (nf->nf_may != need)
- continue;
- if (nf->nf_inode != inode)
- continue;
- if (nf->nf_net != net)
- continue;
- if (!nfsd_match_cred(nf->nf_cred, current_cred()))
- continue;
- if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
- continue;
- if (nfsd_file_get(nf) != NULL)
- return nf;
+ rhashtable_destroy(&nfsd_file_rhash_tbl);
+
+ for_each_possible_cpu(i) {
+ per_cpu(nfsd_file_cache_hits, i) = 0;
+ per_cpu(nfsd_file_acquisitions, i) = 0;
+ per_cpu(nfsd_file_releases, i) = 0;
+ per_cpu(nfsd_file_total_age, i) = 0;
+ per_cpu(nfsd_file_pages_flushed, i) = 0;
+ per_cpu(nfsd_file_evictions, i) = 0;
}
- return NULL;
}
/**
- * nfsd_file_is_cached - are there any cached open files for this fh?
- * @inode: inode of the file to check
+ * nfsd_file_is_cached - are there any cached open files for this inode?
+ * @inode: inode to check
+ *
+ * The lookup matches inodes in all net namespaces and is atomic wrt
+ * nfsd_file_acquire().
*
- * Scan the hashtable for open files that match this fh. Returns true if there
- * are any, and false if not.
+ * Return values:
+ * %true: filecache contains at least one file matching this inode
+ * %false: filecache contains no files matching this inode
*/
bool
nfsd_file_is_cached(struct inode *inode)
{
- bool ret = false;
- struct nfsd_file *nf;
- unsigned int hashval;
-
- hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
- nf_node) {
- if (inode == nf->nf_inode) {
- ret = true;
- break;
- }
- }
- rcu_read_unlock();
- trace_nfsd_file_is_cached(inode, hashval, (int)ret);
+ struct nfsd_file_lookup_key key = {
+ .type = NFSD_FILE_KEY_INODE,
+ .inode = inode,
+ };
+ bool ret = false;
+
+ if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
+ nfsd_file_rhash_params) != NULL)
+ ret = true;
+ trace_nfsd_file_is_cached(inode, (int)ret);
return ret;
}
static __be32
-nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf, bool open)
{
- __be32 status;
- struct net *net = SVC_NET(rqstp);
+ struct nfsd_file_lookup_key key = {
+ .type = NFSD_FILE_KEY_FULL,
+ .need = may_flags & NFSD_FILE_MAY_MASK,
+ .net = SVC_NET(rqstp),
+ };
struct nfsd_file *nf, *new;
- struct inode *inode;
- unsigned int hashval;
bool retry = true;
+ __be32 status;
- /* FIXME: skip this if fh_dentry is already set? */
status = fh_verify(rqstp, fhp, S_IFREG,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
if (status != nfs_ok)
return status;
+ key.inode = d_inode(fhp->fh_dentry);
+ key.cred = get_current_cred();
- inode = d_inode(fhp->fh_dentry);
- hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
retry:
- rcu_read_lock();
- nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
- rcu_read_unlock();
+ /* Avoid allocation if the item is already in cache */
+ nf = rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
+ nfsd_file_rhash_params);
+ if (nf)
+ nf = nfsd_file_get(nf);
if (nf)
goto wait_for_construction;
- new = nfsd_file_alloc(inode, may_flags, hashval, net);
+ new = nfsd_file_alloc(&key, may_flags);
if (!new) {
- trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
- NULL, nfserr_jukebox);
- return nfserr_jukebox;
+ status = nfserr_jukebox;
+ goto out_status;
}
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
- if (nf == NULL)
+ nf = rhashtable_lookup_get_insert_key(&nfsd_file_rhash_tbl,
+ &key, &new->nf_rhash,
+ nfsd_file_rhash_params);
+ if (!nf) {
+ nf = new;
goto open_file;
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ }
+ if (IS_ERR(nf))
+ goto insert_err;
+ nf = nfsd_file_get(nf);
+ if (nf == NULL) {
+ nf = new;
+ goto open_file;
+ }
nfsd_file_slab_free(&new->nf_rcu);
wait_for_construction:
@@ -947,6 +1111,7 @@ wait_for_construction:
/* Did construction of this file fail? */
if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf);
if (!retry) {
status = nfserr_jukebox;
goto out;
@@ -956,49 +1121,29 @@ wait_for_construction:
goto retry;
}
+ nfsd_file_lru_remove(nf);
this_cpu_inc(nfsd_file_cache_hits);
- if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) {
- bool write = (may_flags & NFSD_MAY_WRITE);
-
- if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) ||
- (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) {
- status = nfserrno(nfsd_open_break_lease(
- file_inode(nf->nf_file), may_flags));
- if (status == nfs_ok) {
- clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
- if (write)
- clear_bit(NFSD_FILE_BREAK_WRITE,
- &nf->nf_flags);
- }
- }
- }
+ status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
out:
if (status == nfs_ok) {
+ if (open)
+ this_cpu_inc(nfsd_file_acquisitions);
*pnf = nf;
} else {
nfsd_file_put(nf);
nf = NULL;
}
- trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status);
+out_status:
+ put_cred(key.cred);
+ if (open)
+ trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status);
return status;
+
open_file:
- nf = new;
- /* Take reference for the hashtable */
- refcount_inc(&nf->nf_ref);
- __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
- __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
- list_lru_add(&nfsd_file_lru, &nf->nf_lru);
- hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head);
- ++nfsd_file_hashtbl[hashval].nfb_count;
- nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
- nfsd_file_hashtbl[hashval].nfb_count);
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
- if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD)
- nfsd_file_gc();
-
- nf->nf_mark = nfsd_file_mark_find_or_create(nf);
+ trace_nfsd_file_alloc(nf);
+ nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode);
if (nf->nf_mark) {
if (open) {
status = nfsd_open_verified(rqstp, fhp, may_flags,
@@ -1012,18 +1157,20 @@ open_file:
* If construction failed, or we raced with a call to unlink()
* then unhash.
*/
- if (status != nfs_ok || inode->i_nlink == 0) {
- bool do_free;
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- do_free = nfsd_file_unhash(nf);
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
- if (do_free)
+ if (status != nfs_ok || key.inode->i_nlink == 0)
+ if (nfsd_file_unhash(nf))
nfsd_file_put_noref(nf);
- }
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
goto out;
+
+insert_err:
+ nfsd_file_slab_free(&new->nf_rcu);
+ trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, PTR_ERR(nf));
+ nf = NULL;
+ status = nfserr_jukebox;
+ goto out_status;
}
/**
@@ -1040,7 +1187,7 @@ __be32
nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, true);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true);
}
/**
@@ -1057,7 +1204,7 @@ __be32
nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, false);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false);
}
/*
@@ -1067,29 +1214,49 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*/
static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
{
- unsigned int i, count = 0, longest = 0;
- unsigned long hits = 0;
+ unsigned long releases = 0, pages_flushed = 0, evictions = 0;
+ unsigned long hits = 0, acquisitions = 0;
+ unsigned int i, count = 0, buckets = 0;
+ unsigned long lru = 0, total_age = 0;
- /*
- * No need for spinlocks here since we're not terribly interested in
- * accuracy. We do take the nfsd_mutex simply to ensure that we
- * don't end up racing with server shutdown
- */
+ /* Serialize with server shutdown */
mutex_lock(&nfsd_mutex);
- if (nfsd_file_hashtbl) {
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- count += nfsd_file_hashtbl[i].nfb_count;
- longest = max(longest, nfsd_file_hashtbl[i].nfb_count);
- }
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) {
+ struct bucket_table *tbl;
+ struct rhashtable *ht;
+
+ lru = list_lru_count(&nfsd_file_lru);
+
+ rcu_read_lock();
+ ht = &nfsd_file_rhash_tbl;
+ count = atomic_read(&ht->nelems);
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+ buckets = tbl->size;
+ rcu_read_unlock();
}
mutex_unlock(&nfsd_mutex);
- for_each_possible_cpu(i)
+ for_each_possible_cpu(i) {
hits += per_cpu(nfsd_file_cache_hits, i);
+ acquisitions += per_cpu(nfsd_file_acquisitions, i);
+ releases += per_cpu(nfsd_file_releases, i);
+ total_age += per_cpu(nfsd_file_total_age, i);
+ evictions += per_cpu(nfsd_file_evictions, i);
+ pages_flushed += per_cpu(nfsd_file_pages_flushed, i);
+ }
seq_printf(m, "total entries: %u\n", count);
- seq_printf(m, "longest chain: %u\n", longest);
+ seq_printf(m, "hash buckets: %u\n", buckets);
+ seq_printf(m, "lru entries: %lu\n", lru);
seq_printf(m, "cache hits: %lu\n", hits);
+ seq_printf(m, "acquisitions: %lu\n", acquisitions);
+ seq_printf(m, "releases: %lu\n", releases);
+ seq_printf(m, "evictions: %lu\n", evictions);
+ if (releases)
+ seq_printf(m, "mean age (ms): %ld\n", total_age / releases);
+ else
+ seq_printf(m, "mean age (ms): -\n");
+ seq_printf(m, "pages flushed: %lu\n", pages_flushed);
return 0;
}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 1da0c79a5580..8e8c0c47d67d 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -29,7 +29,7 @@ struct nfsd_file_mark {
* never be dereferenced, only used for comparison.
*/
struct nfsd_file {
- struct hlist_node nf_node;
+ struct rhash_head nf_rhash;
struct list_head nf_lru;
struct rcu_head nf_rcu;
struct file *nf_file;
@@ -37,15 +37,13 @@ struct nfsd_file {
struct net *nf_net;
#define NFSD_FILE_HASHED (0)
#define NFSD_FILE_PENDING (1)
-#define NFSD_FILE_BREAK_READ (2)
-#define NFSD_FILE_BREAK_WRITE (3)
-#define NFSD_FILE_REFERENCED (4)
+#define NFSD_FILE_REFERENCED (2)
unsigned long nf_flags;
- struct inode *nf_inode;
- unsigned int nf_hashval;
+ struct inode *nf_inode; /* don't deref */
refcount_t nf_ref;
unsigned char nf_may;
struct nfsd_file_mark *nf_mark;
+ ktime_t nf_birthtime;
};
int nfsd_file_cache_init(void);
@@ -54,6 +52,7 @@ void nfsd_file_cache_shutdown(void);
int nfsd_file_cache_start_net(struct net *net);
void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
+void nfsd_file_close(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
bool nfsd_file_is_cached(struct inode *inode);
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 1b1a962a1804..ffe17743cc74 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -189,6 +189,9 @@ struct nfsd_net {
struct nfsd_fcache_disposal *fcache_disposal;
siphash_key_t siphash_key;
+
+ atomic_t nfs4_client_count;
+ int nfs4_max_clients;
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b5760801d377..9edd3c1a30fb 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -111,7 +111,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
if (error)
goto out_errno;
- fh_lock(fh);
+ inode_lock(inode);
error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
argp->acl_access);
@@ -122,7 +122,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
if (error)
goto out_drop_lock;
- fh_unlock(fh);
+ inode_unlock(inode);
fh_drop_write(fh);
@@ -136,7 +136,7 @@ out:
return rpc_success;
out_drop_lock:
- fh_unlock(fh);
+ inode_unlock(inode);
fh_drop_write(fh);
out_errno:
resp->status = nfserrno(error);
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 35b2ebda14da..9446c6743664 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -101,7 +101,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
if (error)
goto out_errno;
- fh_lock(fh);
+ inode_lock(inode);
error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
argp->acl_access);
@@ -111,7 +111,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
argp->acl_default);
out_drop_lock:
- fh_unlock(fh);
+ inode_unlock(inode);
fh_drop_write(fh);
out_errno:
resp->status = nfserrno(error);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 981a3a7a6e16..a41cca619338 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -67,12 +67,15 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp)
{
struct nfsd3_sattrargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
dprintk("nfsd: SETATTR(3) %s\n",
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- resp->status = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,
+ resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs,
argp->check_guard, argp->guardtime);
return rpc_success;
}
@@ -233,6 +236,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
{
struct iattr *iap = &argp->attrs;
struct dentry *parent, *child;
+ struct nfsd_attrs attrs = {
+ .na_iattr = iap,
+ };
__u32 v_mtime, v_atime;
struct inode *inode;
__be32 status;
@@ -254,7 +260,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err)
return nfserrno(host_err);
- fh_lock_nested(fhp, I_MUTEX_PARENT);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
child = lookup_one_len(argp->name, parent, argp->len);
if (IS_ERR(child)) {
@@ -312,11 +318,13 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!IS_POSIXACL(inode))
iap->ia_mode &= ~current_umask();
+ fh_fill_pre_attrs(fhp);
host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true);
if (host_err < 0) {
status = nfserrno(host_err);
goto out;
}
+ fh_fill_post_attrs(fhp);
/* A newly created file already has a file size of zero. */
if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
@@ -331,10 +339,10 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
set_attr:
- status = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
+ status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
out:
- fh_unlock(fhp);
+ inode_unlock(inode);
if (child && !IS_ERR(child))
dput(child);
fh_drop_write(fhp);
@@ -368,6 +376,9 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
{
struct nfsd3_createargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
dprintk("nfsd: MKDIR(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -378,8 +389,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
- &argp->attrs, S_IFDIR, 0, &resp->fh);
- fh_unlock(&resp->dirfh);
+ &attrs, S_IFDIR, 0, &resp->fh);
return rpc_success;
}
@@ -388,6 +398,9 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
{
struct nfsd3_symlinkargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
if (argp->tlen == 0) {
resp->status = nfserr_inval;
@@ -414,7 +427,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->ffh);
fh_init(&resp->fh, NFS3_FHSIZE);
resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname,
- argp->flen, argp->tname, &resp->fh);
+ argp->flen, argp->tname, &attrs, &resp->fh);
kfree(argp->tname);
out:
return rpc_success;
@@ -428,6 +441,9 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
{
struct nfsd3_mknodargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
int type;
dev_t rdev = 0;
@@ -453,8 +469,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
type = nfs3_ftypes[argp->ftype];
resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
- &argp->attrs, type, rdev, &resp->fh);
- fh_unlock(&resp->dirfh);
+ &attrs, type, rdev, &resp->fh);
out:
return rpc_success;
}
@@ -477,7 +492,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR,
argp->name, argp->len);
- fh_unlock(&resp->fh);
return rpc_success;
}
@@ -498,7 +512,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR,
argp->name, argp->len);
- fh_unlock(&resp->fh);
return rpc_success;
}
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index eaa3a0cf38f1..bb8e2f6d7d03 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -751,58 +751,26 @@ out_estate:
return ret;
}
-__be32
-nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct nfs4_acl *acl)
+__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl,
+ struct nfsd_attrs *attr)
{
- __be32 error;
int host_error;
- struct dentry *dentry;
- struct inode *inode;
- struct posix_acl *pacl = NULL, *dpacl = NULL;
unsigned int flags = 0;
- /* Get inode */
- error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
- if (error)
- return error;
-
- dentry = fhp->fh_dentry;
- inode = d_inode(dentry);
+ if (!acl)
+ return nfs_ok;
- if (S_ISDIR(inode->i_mode))
+ if (type == NF4DIR)
flags = NFS4_ACL_DIR;
- host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+ host_error = nfs4_acl_nfsv4_to_posix(acl, &attr->na_pacl,
+ &attr->na_dpacl, flags);
if (host_error == -EINVAL)
return nfserr_attrnotsupp;
- if (host_error < 0)
- goto out_nfserr;
-
- fh_lock(fhp);
-
- host_error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, pacl);
- if (host_error < 0)
- goto out_drop_lock;
-
- if (S_ISDIR(inode->i_mode)) {
- host_error = set_posix_acl(&init_user_ns, inode,
- ACL_TYPE_DEFAULT, dpacl);
- }
-
-out_drop_lock:
- fh_unlock(fhp);
-
- posix_acl_release(pacl);
- posix_acl_release(dpacl);
-out_nfserr:
- if (host_error == -EOPNOTSUPP)
- return nfserr_attrnotsupp;
else
return nfserrno(host_error);
}
-
static short
ace2type(struct nfs4_ace *ace)
{
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 11f8715d92d6..4ce328209f61 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -679,7 +679,7 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
* case NFS4_OK:
* write_response4 coa_resok4;
* default:
- * length4 coa_bytes_copied;
+ * length4 coa_bytes_copied;
* };
* struct CB_OFFLOAD4args {
* nfs_fh4 coa_fh;
@@ -688,21 +688,22 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
* };
*/
static void encode_offload_info4(struct xdr_stream *xdr,
- __be32 nfserr,
- const struct nfsd4_copy *cp)
+ const struct nfsd4_cb_offload *cbo)
{
__be32 *p;
p = xdr_reserve_space(xdr, 4);
- *p++ = nfserr;
- if (!nfserr) {
+ *p = cbo->co_nfserr;
+ switch (cbo->co_nfserr) {
+ case nfs_ok:
p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
p = xdr_encode_empty_array(p);
- p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written);
- *p++ = cpu_to_be32(cp->cp_res.wr_stable_how);
- p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data,
+ p = xdr_encode_hyper(p, cbo->co_res.wr_bytes_written);
+ *p++ = cpu_to_be32(cbo->co_res.wr_stable_how);
+ p = xdr_encode_opaque_fixed(p, cbo->co_res.wr_verifier.data,
NFS4_VERIFIER_SIZE);
- } else {
+ break;
+ default:
p = xdr_reserve_space(xdr, 8);
/* We always return success if bytes were written */
p = xdr_encode_hyper(p, 0);
@@ -710,18 +711,16 @@ static void encode_offload_info4(struct xdr_stream *xdr,
}
static void encode_cb_offload4args(struct xdr_stream *xdr,
- __be32 nfserr,
- const struct knfsd_fh *fh,
- const struct nfsd4_copy *cp,
+ const struct nfsd4_cb_offload *cbo,
struct nfs4_cb_compound_hdr *hdr)
{
__be32 *p;
p = xdr_reserve_space(xdr, 4);
- *p++ = cpu_to_be32(OP_CB_OFFLOAD);
- encode_nfs_fh4(xdr, fh);
- encode_stateid4(xdr, &cp->cp_res.cb_stateid);
- encode_offload_info4(xdr, nfserr, cp);
+ *p = cpu_to_be32(OP_CB_OFFLOAD);
+ encode_nfs_fh4(xdr, &cbo->co_fh);
+ encode_stateid4(xdr, &cbo->co_res.cb_stateid);
+ encode_offload_info4(xdr, cbo);
hdr->nops++;
}
@@ -731,8 +730,8 @@ static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
const void *data)
{
const struct nfsd4_callback *cb = data;
- const struct nfsd4_copy *cp =
- container_of(cb, struct nfsd4_copy, cp_cb);
+ const struct nfsd4_cb_offload *cbo =
+ container_of(cb, struct nfsd4_cb_offload, co_cb);
struct nfs4_cb_compound_hdr hdr = {
.ident = 0,
.minorversion = cb->cb_clp->cl_minorversion,
@@ -740,7 +739,7 @@ static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
encode_cb_compound4args(xdr, &hdr);
encode_cb_sequence4args(xdr, cb, &hdr);
- encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr);
+ encode_cb_offload4args(xdr, cbo, &hdr);
encode_cb_nops(&hdr);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 3895eb52d2b1..a72ab97f77ef 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -64,36 +64,6 @@ MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
"idle msecs before unmount export from source server");
#endif
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-#include <linux/security.h>
-
-static inline void
-nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
-{
- struct inode *inode = d_inode(resfh->fh_dentry);
- int status;
-
- inode_lock(inode);
- status = security_inode_setsecctx(resfh->fh_dentry,
- label->data, label->len);
- inode_unlock(inode);
-
- if (status)
- /*
- * XXX: We should really fail the whole open, but we may
- * already have created a new file, so it may be too
- * late. For now this seems the least of evils:
- */
- bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-
- return;
-}
-#else
-static inline void
-nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
-{ }
-#endif
-
#define NFSDDBG_FACILITY NFSDDBG_PROC
static u32 nfsd_attrmask[] = {
@@ -158,26 +128,6 @@ is_create_with_attrs(struct nfsd4_open *open)
|| open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
}
-/*
- * if error occurs when setting the acl, just clear the acl bit
- * in the returned attr bitmap.
- */
-static void
-do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct nfs4_acl *acl, u32 *bmval)
-{
- __be32 status;
-
- status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
- if (status)
- /*
- * We should probably fail the whole open at this point,
- * but we've already created the file, so it's too late;
- * So this seems the least of evils:
- */
- bmval[0] &= ~FATTR4_WORD0_ACL;
-}
-
static inline void
fh_dup2(struct svc_fh *dst, struct svc_fh *src)
{
@@ -286,6 +236,10 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct svc_fh *resfhp, struct nfsd4_open *open)
{
struct iattr *iap = &open->op_iattr;
+ struct nfsd_attrs attrs = {
+ .na_iattr = iap,
+ .na_seclabel = &open->op_label,
+ };
struct dentry *parent, *child;
__u32 v_mtime, v_atime;
struct inode *inode;
@@ -307,7 +261,10 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err)
return nfserrno(host_err);
- fh_lock_nested(fhp, I_MUTEX_PARENT);
+ if (is_create_with_attrs(open))
+ nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs);
+
+ inode_lock_nested(inode, I_MUTEX_PARENT);
child = lookup_one_len(open->op_fname, parent, open->op_fnamelen);
if (IS_ERR(child)) {
@@ -345,6 +302,11 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (d_really_is_positive(child)) {
status = nfs_ok;
+ /* NFSv4 protocol requires change attributes even though
+ * no change happened.
+ */
+ fh_fill_both_attrs(fhp);
+
switch (open->op_createmode) {
case NFS4_CREATE_UNCHECKED:
if (!d_is_reg(child))
@@ -386,10 +348,12 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!IS_POSIXACL(inode))
iap->ia_mode &= ~current_umask();
+ fh_fill_pre_attrs(fhp);
status = nfsd4_vfs_create(fhp, child, open);
if (status != nfs_ok)
goto out;
open->op_created = true;
+ fh_fill_post_attrs(fhp);
/* A newly created file already has a file size of zero. */
if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
@@ -404,10 +368,15 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
set_attr:
- status = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
+ status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
+ if (attrs.na_labelerr)
+ open->op_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (attrs.na_aclerr)
+ open->op_bmval[0] &= ~FATTR4_WORD0_ACL;
out:
- fh_unlock(fhp);
+ inode_unlock(inode);
+ nfsd_attrs_free(&attrs);
if (child && !IS_ERR(child))
dput(child);
fh_drop_write(fhp);
@@ -447,9 +416,6 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
status = nfsd4_create_file(rqstp, current_fh, *resfh, open);
current->fs->umask = 0;
- if (!status && open->op_label.len)
- nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
-
/*
* Following rfc 3530 14.2.16, and rfc 5661 18.16.4
* use the returned bitmask to indicate which attributes
@@ -458,24 +424,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0)
open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
FATTR4_WORD1_TIME_MODIFY);
- } else
- /*
- * Note this may exit with the parent still locked.
- * We will hold the lock until nfsd4_open's final
- * lookup, to prevent renames or unlinks until we've had
- * a chance to an acquire a delegation if appropriate.
- */
+ } else {
status = nfsd_lookup(rqstp, current_fh,
open->op_fname, open->op_fnamelen, *resfh);
+ if (!status)
+ /* NFSv4 protocol requires change attributes even though
+ * no change happened.
+ */
+ fh_fill_both_attrs(current_fh);
+ }
if (status)
goto out;
status = nfsd_check_obj_isreg(*resfh);
if (status)
goto out;
- if (is_create_with_attrs(open) && open->op_acl != NULL)
- do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval);
-
nfsd4_set_open_owner_reply_cache(cstate, open, *resfh);
accmode = NFSD_MAY_NOP;
if (open->op_created ||
@@ -547,6 +510,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
open->op_openowner);
open->op_filp = NULL;
+ open->op_rqstp = rqstp;
/* This check required by spec. */
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
@@ -630,9 +594,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
status = nfsd4_process_open2(rqstp, resfh, open);
- WARN(status && open->op_created,
- "nfsd4_process_open2 failed to open newly-created file! status=%u\n",
- be32_to_cpu(status));
+ if (status && open->op_created)
+ pr_warn("nfsd4_process_open2 failed to open newly-created file: status=%u\n",
+ be32_to_cpu(status));
if (reclaim && !status)
nn->somebody_reclaimed = true;
out:
@@ -786,6 +750,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_create *create = &u->create;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &create->cr_iattr,
+ .na_seclabel = &create->cr_label,
+ };
struct svc_fh resfh;
__be32 status;
dev_t rdev;
@@ -801,12 +769,13 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
+ status = nfsd4_acl_to_attr(create->cr_type, create->cr_acl, &attrs);
current->fs->umask = create->cr_umask;
switch (create->cr_type) {
case NF4LNK:
status = nfsd_symlink(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- create->cr_data, &resfh);
+ create->cr_data, &attrs, &resfh);
break;
case NF4BLK:
@@ -817,7 +786,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out_umask;
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr, S_IFBLK, rdev, &resfh);
+ &attrs, S_IFBLK, rdev, &resfh);
break;
case NF4CHR:
@@ -828,26 +797,26 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out_umask;
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr,S_IFCHR, rdev, &resfh);
+ &attrs, S_IFCHR, rdev, &resfh);
break;
case NF4SOCK:
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr, S_IFSOCK, 0, &resfh);
+ &attrs, S_IFSOCK, 0, &resfh);
break;
case NF4FIFO:
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr, S_IFIFO, 0, &resfh);
+ &attrs, S_IFIFO, 0, &resfh);
break;
case NF4DIR:
create->cr_iattr.ia_valid &= ~ATTR_SIZE;
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr, S_IFDIR, 0, &resfh);
+ &attrs, S_IFDIR, 0, &resfh);
break;
default:
@@ -857,20 +826,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- if (create->cr_label.len)
- nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
-
- if (create->cr_acl != NULL)
- do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
- create->cr_bmval);
-
- fh_unlock(&cstate->current_fh);
+ if (attrs.na_labelerr)
+ create->cr_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (attrs.na_aclerr)
+ create->cr_bmval[0] &= ~FATTR4_WORD0_ACL;
set_change_info(&create->cr_cinfo, &cstate->current_fh);
fh_dup2(&cstate->current_fh, &resfh);
out:
fh_put(&resfh);
out_umask:
current->fs->umask = 0;
+ nfsd_attrs_free(&attrs);
return status;
}
@@ -1043,10 +1009,8 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
- if (!status) {
- fh_unlock(&cstate->current_fh);
+ if (!status)
set_change_info(&remove->rm_cinfo, &cstate->current_fh);
- }
return status;
}
@@ -1086,7 +1050,6 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&exp, &dentry);
if (err)
return err;
- fh_unlock(&cstate->current_fh);
if (d_really_is_negative(dentry)) {
exp_put(exp);
err = nfserr_noent;
@@ -1141,6 +1104,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_setattr *setattr = &u->setattr;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &setattr->sa_iattr,
+ .na_seclabel = &setattr->sa_label,
+ };
+ struct inode *inode;
__be32 status = nfs_ok;
int err;
@@ -1163,19 +1131,18 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- if (setattr->sa_acl != NULL)
- status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
- setattr->sa_acl);
- if (status)
- goto out;
- if (setattr->sa_label.len)
- status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
- &setattr->sa_label);
+ inode = cstate->current_fh.fh_dentry->d_inode;
+ status = nfsd4_acl_to_attr(S_ISDIR(inode->i_mode) ? NF4DIR : NF4REG,
+ setattr->sa_acl, &attrs);
+
if (status)
goto out;
- status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
+ status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs,
0, (time64_t)0);
+ if (!status)
+ status = nfserrno(attrs.na_labelerr);
out:
+ nfsd_attrs_free(&attrs);
fh_drop_write(&cstate->current_fh);
return status;
}
@@ -1285,30 +1252,17 @@ out:
return status;
}
-void nfs4_put_copy(struct nfsd4_copy *copy)
+static void nfs4_put_copy(struct nfsd4_copy *copy)
{
if (!refcount_dec_and_test(&copy->refcount))
return;
+ kfree(copy->cp_src);
kfree(copy);
}
-static bool
-check_and_set_stop_copy(struct nfsd4_copy *copy)
-{
- bool value;
-
- spin_lock(&copy->cp_clp->async_lock);
- value = copy->stopped;
- if (!copy->stopped)
- copy->stopped = true;
- spin_unlock(&copy->cp_clp->async_lock);
- return value;
-}
-
static void nfsd4_stop_copy(struct nfsd4_copy *copy)
{
- /* only 1 thread should stop the copy */
- if (!check_and_set_stop_copy(copy))
+ if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags))
kthread_stop(copy->copy_task);
nfs4_put_copy(copy);
}
@@ -1389,7 +1343,7 @@ try_again:
return 0;
}
if (work) {
- strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
+ strlcpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
refcount_set(&work->nsui_refcnt, 2);
work->nsui_busy = true;
list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
@@ -1549,7 +1503,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
if (status)
goto out;
- status = nfsd4_interssc_connect(&copy->cp_src, rqstp, mount);
+ status = nfsd4_interssc_connect(copy->cp_src, rqstp, mount);
if (status)
goto out;
@@ -1567,7 +1521,7 @@ out:
}
static void
-nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
+nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
struct nfsd_file *dst)
{
bool found = false;
@@ -1576,9 +1530,9 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id);
- nfs42_ssc_close(src->nf_file);
+ nfs42_ssc_close(filp);
nfsd_file_put(dst);
- fput(src->nf_file);
+ fput(filp);
if (!nn) {
mntput(ss_mnt);
@@ -1621,7 +1575,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
}
static void
-nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
+nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
struct nfsd_file *dst)
{
}
@@ -1658,9 +1612,10 @@ nfsd4_cleanup_intra_ssc(struct nfsd_file *src, struct nfsd_file *dst)
static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
{
- struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb);
+ struct nfsd4_cb_offload *cbo =
+ container_of(cb, struct nfsd4_cb_offload, co_cb);
- nfs4_put_copy(copy);
+ kfree(cbo);
}
static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
@@ -1677,15 +1632,16 @@ static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
{
copy->cp_res.wr_stable_how =
- copy->committed ? NFS_FILE_SYNC : NFS_UNSTABLE;
- copy->cp_synchronous = sync;
+ test_bit(NFSD4_COPY_F_COMMITTED, &copy->cp_flags) ?
+ NFS_FILE_SYNC : NFS_UNSTABLE;
+ nfsd4_copy_set_sync(copy, sync);
gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
}
-static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
+static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
+ struct file *dst,
+ struct file *src)
{
- struct file *dst = copy->nf_dst->nf_file;
- struct file *src = copy->nf_src->nf_file;
errseq_t since;
ssize_t bytes_copied = 0;
u64 bytes_total = copy->cp_count;
@@ -1707,26 +1663,29 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
copy->cp_res.wr_bytes_written += bytes_copied;
src_pos += bytes_copied;
dst_pos += bytes_copied;
- } while (bytes_total > 0 && !copy->cp_synchronous);
+ } while (bytes_total > 0 && nfsd4_copy_is_async(copy));
/* for a non-zero asynchronous copy do a commit of data */
- if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) {
+ if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) {
since = READ_ONCE(dst->f_wb_err);
status = vfs_fsync_range(dst, copy->cp_dst_pos,
copy->cp_res.wr_bytes_written, 0);
if (!status)
status = filemap_check_wb_err(dst->f_mapping, since);
if (!status)
- copy->committed = true;
+ set_bit(NFSD4_COPY_F_COMMITTED, &copy->cp_flags);
}
return bytes_copied;
}
-static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
+static __be32 nfsd4_do_copy(struct nfsd4_copy *copy,
+ struct file *src, struct file *dst,
+ bool sync)
{
__be32 status;
ssize_t bytes;
- bytes = _nfsd_copy_file_range(copy);
+ bytes = _nfsd_copy_file_range(copy, dst, src);
+
/* for async copy, we ignore the error, client can always retry
* to get the error
*/
@@ -1736,13 +1695,6 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
nfsd4_init_copy_res(copy, sync);
status = nfs_ok;
}
-
- if (!copy->cp_intra) /* Inter server SSC */
- nfsd4_cleanup_inter_ssc(copy->ss_mnt, copy->nf_src,
- copy->nf_dst);
- else
- nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
-
return status;
}
@@ -1751,17 +1703,17 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
dst->cp_src_pos = src->cp_src_pos;
dst->cp_dst_pos = src->cp_dst_pos;
dst->cp_count = src->cp_count;
- dst->cp_synchronous = src->cp_synchronous;
+ dst->cp_flags = src->cp_flags;
memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
memcpy(&dst->fh, &src->fh, sizeof(src->fh));
dst->cp_clp = src->cp_clp;
dst->nf_dst = nfsd_file_get(src->nf_dst);
- dst->cp_intra = src->cp_intra;
- if (src->cp_intra) /* for inter, file_src doesn't exist yet */
+ /* for inter, nf_src doesn't exist yet */
+ if (!nfsd4_ssc_is_inter(src))
dst->nf_src = nfsd_file_get(src->nf_src);
memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
- memcpy(&dst->cp_src, &src->cp_src, sizeof(struct nl4_server));
+ memcpy(dst->cp_src, src->cp_src, sizeof(struct nl4_server));
memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid));
memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh));
dst->ss_mnt = src->ss_mnt;
@@ -1771,7 +1723,7 @@ static void cleanup_async_copy(struct nfsd4_copy *copy)
{
nfs4_free_copy_state(copy);
nfsd_file_put(copy->nf_dst);
- if (copy->cp_intra)
+ if (!nfsd4_ssc_is_inter(copy))
nfsd_file_put(copy->nf_src);
spin_lock(&copy->cp_clp->async_lock);
list_del(&copy->copies);
@@ -1779,45 +1731,58 @@ static void cleanup_async_copy(struct nfsd4_copy *copy)
nfs4_put_copy(copy);
}
+static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr)
+{
+ struct nfsd4_cb_offload *cbo;
+
+ cbo = kzalloc(sizeof(*cbo), GFP_KERNEL);
+ if (!cbo)
+ return;
+
+ memcpy(&cbo->co_res, &copy->cp_res, sizeof(copy->cp_res));
+ memcpy(&cbo->co_fh, &copy->fh, sizeof(copy->fh));
+ cbo->co_nfserr = nfserr;
+
+ nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops,
+ NFSPROC4_CLNT_CB_OFFLOAD);
+ trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid,
+ &cbo->co_fh, copy->cp_count, nfserr);
+ nfsd4_run_cb(&cbo->co_cb);
+}
+
+/**
+ * nfsd4_do_async_copy - kthread function for background server-side COPY
+ * @data: arguments for COPY operation
+ *
+ * Return values:
+ * %0: Copy operation is done.
+ */
static int nfsd4_do_async_copy(void *data)
{
struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
- struct nfsd4_copy *cb_copy;
+ __be32 nfserr;
- if (!copy->cp_intra) { /* Inter server SSC */
- copy->nf_src = kzalloc(sizeof(struct nfsd_file), GFP_KERNEL);
- if (!copy->nf_src) {
- copy->nfserr = nfserr_serverfault;
- nfsd4_interssc_disconnect(copy->ss_mnt);
- goto do_callback;
- }
- copy->nf_src->nf_file = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
- &copy->stateid);
- if (IS_ERR(copy->nf_src->nf_file)) {
- copy->nfserr = nfserr_offload_denied;
+ if (nfsd4_ssc_is_inter(copy)) {
+ struct file *filp;
+
+ filp = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
+ &copy->stateid);
+ if (IS_ERR(filp)) {
+ nfserr = nfserr_offload_denied;
nfsd4_interssc_disconnect(copy->ss_mnt);
goto do_callback;
}
+ nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
+ false);
+ nfsd4_cleanup_inter_ssc(copy->ss_mnt, filp, copy->nf_dst);
+ } else {
+ nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file,
+ copy->nf_dst->nf_file, false);
+ nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
}
- copy->nfserr = nfsd4_do_copy(copy, 0);
do_callback:
- cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
- if (!cb_copy)
- goto out;
- refcount_set(&cb_copy->refcount, 1);
- memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
- cb_copy->cp_clp = copy->cp_clp;
- cb_copy->nfserr = copy->nfserr;
- memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
- nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
- &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
- trace_nfsd_cb_offload(copy->cp_clp, &copy->cp_res.cb_stateid,
- &copy->fh, copy->cp_count, copy->nfserr);
- nfsd4_run_cb(&cb_copy->cp_cb);
-out:
- if (!copy->cp_intra)
- kfree(copy->nf_src);
+ nfsd4_send_cb_offload(copy, nfserr);
cleanup_async_copy(copy);
return 0;
}
@@ -1830,8 +1795,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_copy *async_copy = NULL;
- if (!copy->cp_intra) { /* Inter server SSC */
- if (!inter_copy_offload_enable || copy->cp_synchronous) {
+ if (nfsd4_ssc_is_inter(copy)) {
+ if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) {
status = nfserr_notsupp;
goto out;
}
@@ -1848,13 +1813,16 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
copy->cp_clp = cstate->clp;
memcpy(&copy->fh, &cstate->current_fh.fh_handle,
sizeof(struct knfsd_fh));
- if (!copy->cp_synchronous) {
+ if (nfsd4_copy_is_async(copy)) {
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
status = nfserrno(-ENOMEM);
async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
if (!async_copy)
goto out_err;
+ async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL);
+ if (!async_copy->cp_src)
+ goto out_err;
if (!nfs4_init_copy_state(nn, copy))
goto out_err;
refcount_set(&async_copy->refcount, 1);
@@ -1872,7 +1840,9 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
wake_up_process(async_copy->copy_task);
status = nfs_ok;
} else {
- status = nfsd4_do_copy(copy, 1);
+ status = nfsd4_do_copy(copy, copy->nf_src->nf_file,
+ copy->nf_dst->nf_file, true);
+ nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
}
out:
return status;
@@ -1880,7 +1850,7 @@ out_err:
if (async_copy)
cleanup_async_copy(async_copy);
status = nfserrno(-ENOMEM);
- if (!copy->cp_intra)
+ if (nfsd4_ssc_is_inter(copy))
nfsd4_interssc_disconnect(copy->ss_mnt);
goto out;
}
@@ -1953,9 +1923,9 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* For now, only return one server address in cpn_src, the
* address used by the client to connect to this server.
*/
- cn->cpn_src.nl4_type = NL4_NETADDR;
+ cn->cpn_src->nl4_type = NL4_NETADDR;
status = nfsd4_set_netaddr((struct sockaddr *)&rqstp->rq_daddr,
- &cn->cpn_src.u.nl4_addr);
+ &cn->cpn_src->u.nl4_addr);
WARN_ON_ONCE(status);
if (status) {
nfs4_put_cpntf_state(nn, cps);
@@ -2609,7 +2579,7 @@ check_if_stalefh_allowed(struct nfsd4_compoundargs *args)
return;
}
putfh = (struct nfsd4_putfh *)&saved_op->u;
- if (!copy->cp_intra)
+ if (nfsd4_ssc_is_inter(copy))
putfh->no_verify = true;
}
}
@@ -2711,7 +2681,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) {
/*
* Don't execute this op if we couldn't encode a
- * succesful reply:
+ * successful reply:
*/
u32 plen = op->opdesc->op_rsize_bop(rqstp, op);
/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9409a0dc1b76..c5d199d7e6b4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -820,9 +820,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
swap(f2, fp->fi_fds[O_RDWR]);
spin_unlock(&fp->fi_lock);
if (f1)
- nfsd_file_put(f1);
+ nfsd_file_close(f1);
if (f2)
- nfsd_file_put(f2);
+ nfsd_file_close(f2);
}
}
@@ -1131,7 +1131,6 @@ static void block_delegations(struct knfsd_fh *fh)
static struct nfs4_delegation *
alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
- struct svc_fh *current_fh,
struct nfs4_clnt_odstate *odstate)
{
struct nfs4_delegation *dp;
@@ -1141,7 +1140,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
n = atomic_long_inc_return(&num_delegations);
if (n < 0 || n > max_delegations)
goto out_dec;
- if (delegation_blocked(&current_fh->fh_handle))
+ if (delegation_blocked(&fp->fi_fhandle))
goto out_dec;
dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
if (dp == NULL)
@@ -2053,11 +2052,16 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
* This type of memory management is somewhat inefficient, but we use it
* anyway since SETCLIENTID is not a common operation.
*/
-static struct nfs4_client *alloc_client(struct xdr_netobj name)
+static struct nfs4_client *alloc_client(struct xdr_netobj name,
+ struct nfsd_net *nn)
{
struct nfs4_client *clp;
int i;
+ if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) {
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ return NULL;
+ }
clp = kmem_cache_zalloc(client_slab, GFP_KERNEL);
if (clp == NULL)
return NULL;
@@ -2076,6 +2080,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
atomic_set(&clp->cl_rpc_users, 0);
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
clp->cl_state = NFSD4_ACTIVE;
+ atomic_inc(&nn->nfs4_client_count);
atomic_set(&clp->cl_delegs_in_recall, 0);
INIT_LIST_HEAD(&clp->cl_idhash);
INIT_LIST_HEAD(&clp->cl_openowners);
@@ -2183,6 +2188,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp)
static void
__destroy_client(struct nfs4_client *clp)
{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
int i;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
@@ -2226,6 +2232,7 @@ __destroy_client(struct nfs4_client *clp)
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+ atomic_add_unless(&nn->nfs4_client_count, -1, 0);
free_client(clp);
wake_up_all(&expiry_wq);
}
@@ -2564,7 +2571,7 @@ static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f)
static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
{
- struct inode *inode = f->nf_inode;
+ struct inode *inode = file_inode(f->nf_file);
seq_printf(s, "superblock: \"%02x:%02x:%ld\"",
MAJOR(inode->i_sb->s_dev),
@@ -2848,7 +2855,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct dentry *dentries[ARRAY_SIZE(client_files)];
- clp = alloc_client(name);
+ clp = alloc_client(name, nn);
if (clp == NULL)
return NULL;
@@ -4330,6 +4337,27 @@ out:
return -ENOMEM;
}
+void nfsd4_init_leases_net(struct nfsd_net *nn)
+{
+ struct sysinfo si;
+ u64 max_clients;
+
+ nn->nfsd4_lease = 90; /* default lease time */
+ nn->nfsd4_grace = 90;
+ nn->somebody_reclaimed = false;
+ nn->track_reclaim_completes = false;
+ nn->clverifier_counter = prandom_u32();
+ nn->clientid_base = prandom_u32();
+ nn->clientid_counter = nn->clientid_base + 1;
+ nn->s2s_cp_cl_id = nn->clientid_counter++;
+
+ atomic_set(&nn->nfs4_client_count, 0);
+ si_meminfo(&si);
+ max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024);
+ max_clients *= NFS4_CLIENTS_PER_GB;
+ nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
+}
+
static void init_nfs4_replay(struct nfs4_replay *rp)
{
rp->rp_status = nfserr_serverfault;
@@ -5032,11 +5060,14 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
.ia_valid = ATTR_SIZE,
.ia_size = 0,
};
+ struct nfsd_attrs attrs = {
+ .na_iattr = &iattr,
+ };
if (!open->op_truncate)
return 0;
if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
return nfserr_inval;
- return nfsd_setattr(rqstp, fh, &iattr, 0, (time64_t)0);
+ return nfsd_setattr(rqstp, fh, &attrs, 0, (time64_t)0);
}
static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
@@ -5104,6 +5135,7 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
goto out_put_access;
nf->nf_file = open->op_filp;
open->op_filp = NULL;
+ trace_nfsd_file_create(rqstp, access, nf);
}
spin_lock(&fp->fi_lock);
@@ -5259,11 +5291,41 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
return 0;
}
+/*
+ * It's possible that between opening the dentry and setting the delegation,
+ * that it has been renamed or unlinked. Redo the lookup to verify that this
+ * hasn't happened.
+ */
+static int
+nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp,
+ struct svc_fh *parent)
+{
+ struct svc_export *exp;
+ struct dentry *child;
+ __be32 err;
+
+ err = nfsd_lookup_dentry(open->op_rqstp, parent,
+ open->op_fname, open->op_fnamelen,
+ &exp, &child);
+
+ if (err)
+ return -EAGAIN;
+
+ dput(child);
+ if (child != file_dentry(fp->fi_deleg_file->nf_file))
+ return -EAGAIN;
+
+ return 0;
+}
+
static struct nfs4_delegation *
-nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
- struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
+nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
+ struct svc_fh *parent)
{
int status = 0;
+ struct nfs4_client *clp = stp->st_stid.sc_client;
+ struct nfs4_file *fp = stp->st_stid.sc_file;
+ struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate;
struct nfs4_delegation *dp;
struct nfsd_file *nf;
struct file_lock *fl;
@@ -5305,7 +5367,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
return ERR_PTR(status);
status = -ENOMEM;
- dp = alloc_init_deleg(clp, fp, fh, odstate);
+ dp = alloc_init_deleg(clp, fp, odstate);
if (!dp)
goto out_delegees;
@@ -5318,6 +5380,13 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
locks_free_lock(fl);
if (status)
goto out_clnt_odstate;
+
+ if (parent) {
+ status = nfsd4_verify_deleg_dentry(open, fp, parent);
+ if (status)
+ goto out_unlock;
+ }
+
status = nfsd4_check_conflicting_opens(clp, fp);
if (status)
goto out_unlock;
@@ -5373,12 +5442,13 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
* proper support for them.
*/
static void
-nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
- struct nfs4_ol_stateid *stp)
+nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
+ struct svc_fh *currentfh)
{
struct nfs4_delegation *dp;
struct nfs4_openowner *oo = openowner(stp->st_stateowner);
struct nfs4_client *clp = stp->st_stid.sc_client;
+ struct svc_fh *parent = NULL;
int cb_up;
int status = 0;
@@ -5392,6 +5462,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
goto out_no_deleg;
break;
case NFS4_OPEN_CLAIM_NULL:
+ parent = currentfh;
+ fallthrough;
case NFS4_OPEN_CLAIM_FH:
/*
* Let's not give out any delegations till everyone's
@@ -5406,7 +5478,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
default:
goto out_no_deleg;
}
- dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate);
+ dp = nfs4_set_delegation(open, stp, parent);
if (IS_ERR(dp))
goto out_no_deleg;
@@ -5538,7 +5610,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* Attempt to hand out a delegation. No error return, because the
* OPEN succeeds even if we fail.
*/
- nfs4_open_delegation(current_fh, open, stp);
+ nfs4_open_delegation(open, stp, &resp->cstate.current_fh);
nodeleg:
status = nfs_ok;
trace_nfsd_open(&stp->st_stid.sc_stateid);
@@ -5792,9 +5864,12 @@ static void
nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
struct laundry_time *lt)
{
+ unsigned int maxreap, reapcnt = 0;
struct list_head *pos, *next;
struct nfs4_client *clp;
+ maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
+ NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
INIT_LIST_HEAD(reaplist);
spin_lock(&nn->client_lock);
list_for_each_safe(pos, next, &nn->client_lru) {
@@ -5805,14 +5880,15 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
break;
if (!atomic_read(&clp->cl_rpc_users))
clp->cl_state = NFSD4_COURTESY;
- if (!client_has_state(clp) ||
- ktime_get_boottime_seconds() >=
- (clp->cl_time + NFSD_COURTESY_CLIENT_TIMEOUT))
+ if (!client_has_state(clp))
goto exp_client;
- if (nfs4_anylock_blockers(clp)) {
+ if (!nfs4_anylock_blockers(clp))
+ if (reapcnt >= maxreap)
+ continue;
exp_client:
- if (!mark_client_expired_locked(clp))
- list_add(&clp->cl_lru, reaplist);
+ if (!mark_client_expired_locked(clp)) {
+ list_add(&clp->cl_lru, reaplist);
+ reapcnt++;
}
}
spin_unlock(&nn->client_lock);
@@ -7321,21 +7397,22 @@ out:
static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
{
struct nfsd_file *nf;
+ struct inode *inode;
__be32 err;
err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
if (err)
return err;
- fh_lock(fhp); /* to block new leases till after test_lock: */
- err = nfserrno(nfsd_open_break_lease(fhp->fh_dentry->d_inode,
- NFSD_MAY_READ));
+ inode = fhp->fh_dentry->d_inode;
+ inode_lock(inode); /* to block new leases till after test_lock: */
+ err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
if (err)
goto out;
lock->fl_file = nf->nf_file;
err = nfserrno(vfs_test_lock(nf->nf_file, lock));
lock->fl_file = NULL;
out:
- fh_unlock(fhp);
+ inode_unlock(inode);
nfsd_file_put(nf);
return err;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2acea7792bb2..1e9690a061ec 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1810,7 +1810,7 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
for (i = 0; i < test_stateid->ts_num_ids; i++) {
stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
if (!stateid)
- return nfserrno(-ENOMEM); /* XXX: not jukebox? */
+ return nfserr_jukebox;
INIT_LIST_HEAD(&stateid->ts_id_list);
list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
status = nfsd4_decode_stateid4(argp, &stateid->ts_id_stateid);
@@ -1896,8 +1896,8 @@ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
{
+ u32 consecutive, i, count, sync;
struct nl4_server *ns_dummy;
- u32 consecutive, i, count;
__be32 status;
status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid);
@@ -1915,25 +1915,28 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
/* ca_consecutive: we always do consecutive copies */
if (xdr_stream_decode_u32(argp->xdr, &consecutive) < 0)
return nfserr_bad_xdr;
- if (xdr_stream_decode_u32(argp->xdr, &copy->cp_synchronous) < 0)
+ if (xdr_stream_decode_bool(argp->xdr, &sync) < 0)
return nfserr_bad_xdr;
+ nfsd4_copy_set_sync(copy, sync);
if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
return nfserr_bad_xdr;
- copy->cp_intra = false;
+ copy->cp_src = svcxdr_tmpalloc(argp, sizeof(*copy->cp_src));
+ if (copy->cp_src == NULL)
+ return nfserr_jukebox;
if (count == 0) { /* intra-server copy */
- copy->cp_intra = true;
+ __set_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
return nfs_ok;
}
/* decode all the supplied server addresses but use only the first */
- status = nfsd4_decode_nl4_server(argp, &copy->cp_src);
+ status = nfsd4_decode_nl4_server(argp, copy->cp_src);
if (status)
return status;
ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL);
if (ns_dummy == NULL)
- return nfserrno(-ENOMEM); /* XXX: jukebox? */
+ return nfserr_jukebox;
for (i = 0; i < count - 1; i++) {
status = nfsd4_decode_nl4_server(argp, ns_dummy);
if (status) {
@@ -1952,10 +1955,17 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
{
__be32 status;
+ cn->cpn_src = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_src));
+ if (cn->cpn_src == NULL)
+ return nfserr_jukebox;
+ cn->cpn_dst = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_dst));
+ if (cn->cpn_dst == NULL)
+ return nfserr_jukebox;
+
status = nfsd4_decode_stateid4(argp, &cn->cpn_src_stateid);
if (status)
return status;
- return nfsd4_decode_nl4_server(argp, &cn->cpn_dst);
+ return nfsd4_decode_nl4_server(argp, cn->cpn_dst);
}
static __be32
@@ -2828,10 +2838,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
struct kstat stat;
struct svc_fh *tempfh = NULL;
struct kstatfs statfs;
- __be32 *p;
+ __be32 *p, *attrlen_p;
int starting_len = xdr->buf->len;
int attrlen_offset;
- __be32 attrlen;
u32 dummy;
u64 dummy64;
u32 rdattr_err = 0;
@@ -2919,10 +2928,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
goto out;
attrlen_offset = xdr->buf->len;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!attrlen_p)
goto out_resource;
- p++; /* to be backfilled later */
if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
u32 supp[3];
@@ -3344,8 +3352,7 @@ out_acl:
*p++ = cpu_to_be32(err == 0);
}
- attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
- write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
+ *attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
status = nfs_ok;
out:
@@ -3882,16 +3889,15 @@ static __be32 nfsd4_encode_splice_read(
struct xdr_stream *xdr = resp->xdr;
struct xdr_buf *buf = xdr->buf;
int status, space_left;
- u32 eof;
__be32 nfserr;
- __be32 *p = xdr->p - 2;
/* Make sure there will be room for padding if needed */
if (xdr->end - xdr->p < 1)
return nfserr_resource;
nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
- file, read->rd_offset, &maxcount, &eof);
+ file, read->rd_offset, &maxcount,
+ &read->rd_eof);
read->rd_length = maxcount;
if (nfserr)
goto out_err;
@@ -3902,9 +3908,6 @@ static __be32 nfsd4_encode_splice_read(
goto out_err;
}
- *(p++) = htonl(eof);
- *(p++) = htonl(maxcount);
-
buf->page_len = maxcount;
buf->len += maxcount;
xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
@@ -3946,11 +3949,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct file *file, unsigned long maxcount)
{
struct xdr_stream *xdr = resp->xdr;
- u32 eof;
- int starting_len = xdr->buf->len - 8;
+ unsigned int starting_len = xdr->buf->len;
+ __be32 zero = xdr_zero;
__be32 nfserr;
- __be32 tmp;
- int pad;
read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
if (read->rd_vlen < 0)
@@ -3958,31 +3959,24 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
- &eof);
+ &read->rd_eof);
read->rd_length = maxcount;
if (nfserr)
return nfserr;
- if (svc_encode_result_payload(resp->rqstp, starting_len + 8, maxcount))
+ if (svc_encode_result_payload(resp->rqstp, starting_len, maxcount))
return nfserr_io;
- xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount));
-
- tmp = htonl(eof);
- write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
- tmp = htonl(maxcount);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
-
- tmp = xdr_zero;
- pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
- &tmp, pad);
- return 0;
+ xdr_truncate_encode(xdr, starting_len + xdr_align_size(maxcount));
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + maxcount, &zero,
+ xdr_pad_size(maxcount));
+ return nfs_ok;
}
static __be32
nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_read *read)
{
+ bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
unsigned long maxcount;
struct xdr_stream *xdr = resp->xdr;
struct file *file;
@@ -3995,11 +3989,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
if (!p) {
- WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
+ WARN_ON_ONCE(splice_ok);
return nfserr_resource;
}
- if (resp->xdr->buf->page_len &&
- test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
+ if (resp->xdr->buf->page_len && splice_ok) {
WARN_ON_ONCE(1);
return nfserr_resource;
}
@@ -4008,31 +4001,30 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
maxcount = min_t(unsigned long, read->rd_length,
(xdr->buf->buflen - xdr->buf->len));
- if (file->f_op->splice_read &&
- test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
+ if (file->f_op->splice_read && splice_ok)
nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
-
- if (nfserr)
+ if (nfserr) {
xdr_truncate_encode(xdr, starting_len);
+ return nfserr;
+ }
- return nfserr;
+ p = xdr_encode_bool(p, read->rd_eof);
+ *p = cpu_to_be32(read->rd_length);
+ return nfs_ok;
}
static __be32
nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
{
- int maxcount;
- __be32 wire_count;
- int zero = 0;
+ __be32 *p, *maxcount_p, zero = xdr_zero;
struct xdr_stream *xdr = resp->xdr;
int length_offset = xdr->buf->len;
- int status;
- __be32 *p;
+ int maxcount, status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ maxcount_p = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!maxcount_p)
return nfserr_resource;
maxcount = PAGE_SIZE;
@@ -4057,14 +4049,11 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
nfserr = nfserrno(status);
goto out_err;
}
-
- wire_count = htonl(maxcount);
- write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4);
- xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4));
- if (maxcount & 3)
- write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
- &zero, 4 - (maxcount&3));
- return 0;
+ *maxcount_p = cpu_to_be32(maxcount);
+ xdr_truncate_encode(xdr, length_offset + 4 + xdr_align_size(maxcount));
+ write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero,
+ xdr_pad_size(maxcount));
+ return nfs_ok;
out_err:
xdr_truncate_encode(xdr, length_offset);
@@ -4715,13 +4704,13 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
__be32 *p;
nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
- !!copy->cp_synchronous);
+ nfsd4_copy_is_sync(copy));
if (nfserr)
return nfserr;
p = xdr_reserve_space(resp->xdr, 4 + 4);
*p++ = xdr_one; /* cr_consecutive */
- *p++ = cpu_to_be32(copy->cp_synchronous);
+ *p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero;
return 0;
}
@@ -4919,7 +4908,8 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
*p++ = cpu_to_be32(1);
- return nfsd42_encode_nl4_server(resp, &cn->cpn_src);
+ nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src);
+ return nfserr;
}
static __be32
@@ -5373,8 +5363,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
so->so_replay.rp_buf, len);
}
status:
- /* Note that op->status is already in network byte order: */
- write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4);
+ *p = op->status;
}
/*
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 7da88bdc0d6c..9b31e1103e7b 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
nn->nfsd_reply_cache_shrinker.seeks = 1;
- status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
+ status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
+ "nfsd-reply:%s", nn->nfsd_name);
if (status)
goto out_stats_destroy;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 0621c2faf242..917fa1892fd2 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -25,6 +25,7 @@
#include "state.h"
#include "netns.h"
#include "pnfs.h"
+#include "filecache.h"
/*
* We have a single directory with several nodes in it.
@@ -45,6 +46,7 @@ enum {
NFSD_Ports,
NFSD_MaxBlkSize,
NFSD_MaxConnections,
+ NFSD_Filecache,
NFSD_SupportedEnctypes,
/*
* The below MUST come last. Otherwise we leave a hole in nfsd_files[]
@@ -229,6 +231,13 @@ static const struct file_operations reply_cache_stats_operations = {
.release = single_release,
};
+static const struct file_operations filecache_ops = {
+ .open = nfsd_file_cache_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
/*----------------------------------------------------------------------------*/
/*
* payload - write methods
@@ -633,7 +642,6 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
}
/* Now write current state into reply buffer */
- len = 0;
sep = "";
remaining = SIMPLE_TRANSACTION_LIMIT;
for (num=2 ; num <= 4 ; num++) {
@@ -1371,6 +1379,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
+ [NFSD_Filecache] = {"filecache", &filecache_ops, S_IRUGO},
#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
@@ -1475,14 +1484,7 @@ static __net_init int nfsd_init_net(struct net *net)
retval = nfsd_reply_cache_init(nn);
if (retval)
goto out_drc_error;
- nn->nfsd4_lease = 90; /* default lease time */
- nn->nfsd4_grace = 90;
- nn->somebody_reclaimed = false;
- nn->track_reclaim_completes = false;
- nn->clverifier_counter = prandom_u32();
- nn->clientid_base = prandom_u32();
- nn->clientid_counter = nn->clientid_base + 1;
- nn->s2s_cp_cl_id = nn->clientid_counter++;
+ nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
@@ -1517,7 +1519,6 @@ static struct pernet_operations nfsd_net_ops = {
static int __init init_nfsd(void)
{
int retval;
- printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
retval = nfsd4_init_slabs();
if (retval)
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 9a8b09afc173..57a468ed85c3 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -341,6 +341,8 @@ void nfsd_lockd_shutdown(void);
#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */
+#define NFSD_CLIENT_MAX_TRIM_PER_RUN 128
+#define NFS4_CLIENTS_PER_GB 1024
/*
* The following attributes are currently not supported by the NFSv4 server:
@@ -496,12 +498,16 @@ extern void unregister_cld_notifier(void);
extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
#endif
+extern void nfsd4_init_leases_net(struct nfsd_net *nn);
+
#else /* CONFIG_NFSD_V4 */
static inline int nfsd4_is_junction(struct dentry *dentry)
{
return 0;
}
+static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {};
+
#define register_cld_notifier() 0
#define unregister_cld_notifier() do { } while(0)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c29baa03dfaf..a5b71526cee0 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -331,8 +331,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
struct dentry *dentry;
__be32 error;
- dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
-
if (!fhp->fh_dentry) {
error = nfsd_set_fh_dentry(rqstp, fhp);
if (error)
@@ -340,6 +338,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
}
dentry = fhp->fh_dentry;
exp = fhp->fh_export;
+
+ trace_nfsd_fh_verify(rqstp, fhp, type, access);
+
/*
* We still have to do all these permission checks, even when
* fh_dentry is already set:
@@ -548,7 +549,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
if (ref_fh == fhp)
fh_put(ref_fh);
- if (fhp->fh_locked || fhp->fh_dentry) {
+ if (fhp->fh_dentry) {
printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n",
dentry);
}
@@ -671,6 +672,25 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
}
+/**
+ * fh_fill_both_attrs - Fill pre-op and post-op attributes
+ * @fhp: file handle to be updated
+ *
+ * This is used when the directory wasn't changed, but wcc attributes
+ * are needed anyway.
+ */
+void fh_fill_both_attrs(struct svc_fh *fhp)
+{
+ fh_fill_post_attrs(fhp);
+ if (!fhp->fh_post_saved)
+ return;
+ fhp->fh_pre_change = fhp->fh_post_change;
+ fhp->fh_pre_mtime = fhp->fh_post_attr.mtime;
+ fhp->fh_pre_ctime = fhp->fh_post_attr.ctime;
+ fhp->fh_pre_size = fhp->fh_post_attr.size;
+ fhp->fh_pre_saved = true;
+}
+
/*
* Release a file handle.
*/
@@ -680,7 +700,6 @@ fh_put(struct svc_fh *fhp)
struct dentry * dentry = fhp->fh_dentry;
struct svc_export * exp = fhp->fh_export;
if (dentry) {
- fh_unlock(fhp);
fhp->fh_dentry = NULL;
dput(dentry);
fh_clear_pre_post_attrs(fhp);
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index fb9d358a267e..c3ae6414fc5c 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -81,7 +81,6 @@ typedef struct svc_fh {
struct dentry * fh_dentry; /* validated dentry */
struct svc_export * fh_export; /* export pointer */
- bool fh_locked; /* inode locked by us */
bool fh_want_write; /* remount protection taken */
bool fh_no_wcc; /* no wcc data needed */
bool fh_no_atomic_attr;
@@ -93,7 +92,7 @@ typedef struct svc_fh {
bool fh_post_saved; /* post-op attrs saved */
bool fh_pre_saved; /* pre-op attrs saved */
- /* Pre-op attributes saved during fh_lock */
+ /* Pre-op attributes saved when inode is locked */
__u64 fh_pre_size; /* size before operation */
struct timespec64 fh_pre_mtime; /* mtime before oper */
struct timespec64 fh_pre_ctime; /* ctime before oper */
@@ -103,7 +102,7 @@ typedef struct svc_fh {
*/
u64 fh_pre_change;
- /* Post-op attributes saved in fh_unlock */
+ /* Post-op attributes saved in fh_fill_post_attrs() */
struct kstat fh_post_attr; /* full attrs after operation */
u64 fh_post_change; /* nfsv4 change; see above */
} svc_fh;
@@ -223,8 +222,8 @@ void fh_put(struct svc_fh *);
static __inline__ struct svc_fh *
fh_copy(struct svc_fh *dst, struct svc_fh *src)
{
- WARN_ON(src->fh_dentry || src->fh_locked);
-
+ WARN_ON(src->fh_dentry);
+
*dst = *src;
return dst;
}
@@ -322,52 +321,5 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat,
extern void fh_fill_pre_attrs(struct svc_fh *fhp);
extern void fh_fill_post_attrs(struct svc_fh *fhp);
-
-
-/*
- * Lock a file handle/inode
- * NOTE: both fh_lock and fh_unlock are done "by hand" in
- * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once
- * so, any changes here should be reflected there.
- */
-
-static inline void
-fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
-{
- struct dentry *dentry = fhp->fh_dentry;
- struct inode *inode;
-
- BUG_ON(!dentry);
-
- if (fhp->fh_locked) {
- printk(KERN_WARNING "fh_lock: %pd2 already locked!\n",
- dentry);
- return;
- }
-
- inode = d_inode(dentry);
- inode_lock_nested(inode, subclass);
- fh_fill_pre_attrs(fhp);
- fhp->fh_locked = true;
-}
-
-static inline void
-fh_lock(struct svc_fh *fhp)
-{
- fh_lock_nested(fhp, I_MUTEX_NORMAL);
-}
-
-/*
- * Unlock a file handle/inode
- */
-static inline void
-fh_unlock(struct svc_fh *fhp)
-{
- if (fhp->fh_locked) {
- fh_fill_post_attrs(fhp);
- inode_unlock(d_inode(fhp->fh_dentry));
- fhp->fh_locked = false;
- }
-}
-
+extern void fh_fill_both_attrs(struct svc_fh *fhp);
#endif /* _LINUX_NFSD_NFSFH_H */
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index fcdab8a8a41f..7381972f1677 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -51,6 +51,9 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
struct nfsd_sattrargs *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
struct iattr *iap = &argp->attrs;
+ struct nfsd_attrs attrs = {
+ .na_iattr = iap,
+ };
struct svc_fh *fhp;
dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
@@ -100,7 +103,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
}
}
- resp->status = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0);
+ resp->status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0);
if (resp->status != nfs_ok)
goto out;
@@ -260,6 +263,9 @@ nfsd_proc_create(struct svc_rqst *rqstp)
svc_fh *dirfhp = &argp->fh;
svc_fh *newfhp = &resp->fh;
struct iattr *attr = &argp->attrs;
+ struct nfsd_attrs attrs = {
+ .na_iattr = attr,
+ };
struct inode *inode;
struct dentry *dchild;
int type, mode;
@@ -285,7 +291,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
goto done;
}
- fh_lock_nested(dirfhp, I_MUTEX_PARENT);
+ inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT);
dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
if (IS_ERR(dchild)) {
resp->status = nfserrno(PTR_ERR(dchild));
@@ -385,7 +391,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
if (!inode) {
/* File doesn't exist. Create it and set attrs */
resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name,
- argp->len, attr, type, rdev,
+ argp->len, &attrs, type, rdev,
newfhp);
} else if (type == S_IFREG) {
dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
@@ -396,13 +402,12 @@ nfsd_proc_create(struct svc_rqst *rqstp)
*/
attr->ia_valid &= ATTR_SIZE;
if (attr->ia_valid)
- resp->status = nfsd_setattr(rqstp, newfhp, attr, 0,
+ resp->status = nfsd_setattr(rqstp, newfhp, &attrs, 0,
(time64_t)0);
}
out_unlock:
- /* We don't really need to unlock, as fh_put does it. */
- fh_unlock(dirfhp);
+ inode_unlock(dirfhp->fh_dentry->d_inode);
fh_drop_write(dirfhp);
done:
fh_put(dirfhp);
@@ -472,6 +477,9 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
{
struct nfsd_symlinkargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
struct svc_fh newfh;
if (argp->tlen > NFS_MAXPATHLEN) {
@@ -493,7 +501,7 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
fh_init(&newfh, NFS_FHSIZE);
resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
- argp->tname, &newfh);
+ argp->tname, &attrs, &newfh);
kfree(argp->tname);
fh_put(&argp->ffh);
@@ -511,6 +519,9 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
{
struct nfsd_createargs *argp = rqstp->rq_argp;
struct nfsd_diropres *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -522,7 +533,7 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
argp->attrs.ia_valid &= ~ATTR_SIZE;
fh_init(&resp->fh, NFS_FHSIZE);
resp->status = nfsd_create(rqstp, &argp->fh, argp->name, argp->len,
- &argp->attrs, S_IFDIR, 0, &resp->fh);
+ &attrs, S_IFDIR, 0, &resp->fh);
fh_put(&argp->fh);
if (resp->status != nfs_ok)
goto out;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index f3d6313914ed..ae596dbf8667 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -703,7 +703,6 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name
extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
void put_nfs4_file(struct nfs4_file *fi);
-extern void nfs4_put_copy(struct nfsd4_copy *copy);
extern struct nfsd4_copy *
find_async_copy(struct nfs4_client *clp, stateid_t *staetid);
extern void nfs4_put_cpntf_state(struct nfsd_net *nn,
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index a60ead3b227a..9ebd67d461f9 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -171,6 +171,52 @@ TRACE_EVENT(nfsd_compound_encode_err,
__entry->opnum, __entry->status)
);
+#define show_fs_file_type(x) \
+ __print_symbolic(x, \
+ { S_IFLNK, "LNK" }, \
+ { S_IFREG, "REG" }, \
+ { S_IFDIR, "DIR" }, \
+ { S_IFCHR, "CHR" }, \
+ { S_IFBLK, "BLK" }, \
+ { S_IFIFO, "FIFO" }, \
+ { S_IFSOCK, "SOCK" })
+
+TRACE_EVENT(nfsd_fh_verify,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ umode_t type,
+ int access
+ ),
+ TP_ARGS(rqstp, fhp, type, access),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __sockaddr(server, rqstp->rq_xprt->xpt_remotelen)
+ __sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(void *, inode)
+ __field(unsigned long, type)
+ __field(unsigned long, access)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = SVC_NET(rqstp)->ns.inum;
+ __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local,
+ rqstp->rq_xprt->xpt_locallen);
+ __assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->inode = d_inode(fhp->fh_dentry);
+ __entry->type = type;
+ __entry->access = access;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x inode=%p type=%s access=%s",
+ __entry->xid, __entry->fh_hash, __entry->inode,
+ show_fs_file_type(__entry->type),
+ show_nfsd_may_flags(__entry->access)
+ )
+);
DECLARE_EVENT_CLASS(nfsd_fh_err_class,
TP_PROTO(struct svc_rqst *rqstp,
@@ -696,15 +742,12 @@ DEFINE_CLID_EVENT(confirmed_r);
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
{ 1 << NFSD_FILE_PENDING, "PENDING" }, \
- { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \
- { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \
{ 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
DECLARE_EVENT_CLASS(nfsd_file_class,
TP_PROTO(struct nfsd_file *nf),
TP_ARGS(nf),
TP_STRUCT__entry(
- __field(unsigned int, nf_hashval)
__field(void *, nf_inode)
__field(int, nf_ref)
__field(unsigned long, nf_flags)
@@ -712,15 +755,13 @@ DECLARE_EVENT_CLASS(nfsd_file_class,
__field(struct file *, nf_file)
),
TP_fast_assign(
- __entry->nf_hashval = nf->nf_hashval;
__entry->nf_inode = nf->nf_inode;
__entry->nf_ref = refcount_read(&nf->nf_ref);
__entry->nf_flags = nf->nf_flags;
__entry->nf_may = nf->nf_may;
__entry->nf_file = nf->nf_file;
),
- TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p",
- __entry->nf_hashval,
+ TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p",
__entry->nf_inode,
__entry->nf_ref,
show_nf_flags(__entry->nf_flags),
@@ -733,34 +774,59 @@ DEFINE_EVENT(nfsd_file_class, name, \
TP_PROTO(struct nfsd_file *nf), \
TP_ARGS(nf))
-DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
+
+TRACE_EVENT(nfsd_file_alloc,
+ TP_PROTO(
+ const struct nfsd_file *nf
+ ),
+ TP_ARGS(nf),
+ TP_STRUCT__entry(
+ __field(const void *, nf_inode)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(unsigned int, nf_ref)
+ ),
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_may = nf->nf_may;
+ ),
+ TP_printk("inode=%p ref=%u flags=%s may=%s",
+ __entry->nf_inode, __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may)
+ )
+);
TRACE_EVENT(nfsd_file_acquire,
- TP_PROTO(struct svc_rqst *rqstp, unsigned int hash,
- struct inode *inode, unsigned int may_flags,
- struct nfsd_file *nf, __be32 status),
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct inode *inode,
+ unsigned int may_flags,
+ const struct nfsd_file *nf,
+ __be32 status
+ ),
- TP_ARGS(rqstp, hash, inode, may_flags, nf, status),
+ TP_ARGS(rqstp, inode, may_flags, nf, status),
TP_STRUCT__entry(
__field(u32, xid)
- __field(unsigned int, hash)
- __field(void *, inode)
+ __field(const void *, inode)
__field(unsigned long, may_flags)
- __field(int, nf_ref)
+ __field(unsigned int, nf_ref)
__field(unsigned long, nf_flags)
__field(unsigned long, nf_may)
- __field(struct file *, nf_file)
+ __field(const void *, nf_file)
__field(u32, status)
),
TP_fast_assign(
__entry->xid = be32_to_cpu(rqstp->rq_xid);
- __entry->hash = hash;
__entry->inode = inode;
__entry->may_flags = may_flags;
__entry->nf_ref = nf ? refcount_read(&nf->nf_ref) : 0;
@@ -770,19 +836,117 @@ TRACE_EVENT(nfsd_file_acquire,
__entry->status = be32_to_cpu(status);
),
- TP_printk("xid=0x%x hash=0x%x inode=%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=%p status=%u",
- __entry->xid, __entry->hash, __entry->inode,
+ TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p status=%u",
+ __entry->xid, __entry->inode,
show_nfsd_may_flags(__entry->may_flags),
__entry->nf_ref, show_nf_flags(__entry->nf_flags),
show_nfsd_may_flags(__entry->nf_may),
- __entry->nf_file, __entry->status)
+ __entry->nf_file, __entry->status
+ )
+);
+
+TRACE_EVENT(nfsd_file_create,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ unsigned int may_flags,
+ const struct nfsd_file *nf
+ ),
+
+ TP_ARGS(rqstp, may_flags, nf),
+
+ TP_STRUCT__entry(
+ __field(const void *, nf_inode)
+ __field(const void *, nf_file)
+ __field(unsigned long, may_flags)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(unsigned int, nf_ref)
+ __field(u32, xid)
+ ),
+
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_file = nf->nf_file;
+ __entry->may_flags = may_flags;
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ ),
+
+ TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p",
+ __entry->xid, __entry->nf_inode,
+ show_nfsd_may_flags(__entry->may_flags),
+ __entry->nf_ref, show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may), __entry->nf_file
+ )
+);
+
+TRACE_EVENT(nfsd_file_insert_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct inode *inode,
+ unsigned int may_flags,
+ long error
+ ),
+ TP_ARGS(rqstp, inode, may_flags, error),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(const void *, inode)
+ __field(unsigned long, may_flags)
+ __field(long, error)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->inode = inode;
+ __entry->may_flags = may_flags;
+ __entry->error = error;
+ ),
+ TP_printk("xid=0x%x inode=%p may_flags=%s error=%ld",
+ __entry->xid, __entry->inode,
+ show_nfsd_may_flags(__entry->may_flags),
+ __entry->error
+ )
+);
+
+TRACE_EVENT(nfsd_file_cons_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct inode *inode,
+ unsigned int may_flags,
+ const struct nfsd_file *nf
+ ),
+ TP_ARGS(rqstp, inode, may_flags, nf),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(const void *, inode)
+ __field(unsigned long, may_flags)
+ __field(unsigned int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(const void *, nf_file)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->inode = inode;
+ __entry->may_flags = may_flags;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p",
+ __entry->xid, __entry->inode,
+ show_nfsd_may_flags(__entry->may_flags), __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may), __entry->nf_file
+ )
);
TRACE_EVENT(nfsd_file_open,
TP_PROTO(struct nfsd_file *nf, __be32 status),
TP_ARGS(nf, status),
TP_STRUCT__entry(
- __field(unsigned int, nf_hashval)
__field(void *, nf_inode) /* cannot be dereferenced */
__field(int, nf_ref)
__field(unsigned long, nf_flags)
@@ -790,15 +954,13 @@ TRACE_EVENT(nfsd_file_open,
__field(void *, nf_file) /* cannot be dereferenced */
),
TP_fast_assign(
- __entry->nf_hashval = nf->nf_hashval;
__entry->nf_inode = nf->nf_inode;
__entry->nf_ref = refcount_read(&nf->nf_ref);
__entry->nf_flags = nf->nf_flags;
__entry->nf_may = nf->nf_may;
__entry->nf_file = nf->nf_file;
),
- TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p",
- __entry->nf_hashval,
+ TP_printk("inode=%p ref=%d flags=%s may=%s file=%p",
__entry->nf_inode,
__entry->nf_ref,
show_nf_flags(__entry->nf_flags),
@@ -807,30 +969,53 @@ TRACE_EVENT(nfsd_file_open,
)
DECLARE_EVENT_CLASS(nfsd_file_search_class,
- TP_PROTO(struct inode *inode, unsigned int hash, int found),
- TP_ARGS(inode, hash, found),
+ TP_PROTO(
+ const struct inode *inode,
+ unsigned int count
+ ),
+ TP_ARGS(inode, count),
TP_STRUCT__entry(
- __field(struct inode *, inode)
- __field(unsigned int, hash)
- __field(int, found)
+ __field(const struct inode *, inode)
+ __field(unsigned int, count)
),
TP_fast_assign(
__entry->inode = inode;
- __entry->hash = hash;
- __entry->found = found;
+ __entry->count = count;
),
- TP_printk("hash=0x%x inode=%p found=%d", __entry->hash,
- __entry->inode, __entry->found)
+ TP_printk("inode=%p count=%u",
+ __entry->inode, __entry->count)
);
#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \
DEFINE_EVENT(nfsd_file_search_class, name, \
- TP_PROTO(struct inode *inode, unsigned int hash, int found), \
- TP_ARGS(inode, hash, found))
+ TP_PROTO( \
+ const struct inode *inode, \
+ unsigned int count \
+ ), \
+ TP_ARGS(inode, count))
DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached);
+
+TRACE_EVENT(nfsd_file_is_cached,
+ TP_PROTO(
+ const struct inode *inode,
+ int found
+ ),
+ TP_ARGS(inode, found),
+ TP_STRUCT__entry(
+ __field(const struct inode *, inode)
+ __field(int, found)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->found = found;
+ ),
+ TP_printk("inode=%p is %scached",
+ __entry->inode,
+ __entry->found ? "" : "not "
+ )
+);
TRACE_EVENT(nfsd_file_fsnotify_handle_event,
TP_PROTO(struct inode *inode, u32 mask),
@@ -851,6 +1036,76 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event,
__entry->nlink, __entry->mode, __entry->mask)
);
+DECLARE_EVENT_CLASS(nfsd_file_gc_class,
+ TP_PROTO(
+ const struct nfsd_file *nf
+ ),
+ TP_ARGS(nf),
+ TP_STRUCT__entry(
+ __field(void *, nf_inode)
+ __field(void *, nf_file)
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ ),
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_file = nf->nf_file;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ ),
+ TP_printk("inode=%p ref=%d nf_flags=%s nf_file=%p",
+ __entry->nf_inode, __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ __entry->nf_file
+ )
+);
+
+#define DEFINE_NFSD_FILE_GC_EVENT(name) \
+DEFINE_EVENT(nfsd_file_gc_class, name, \
+ TP_PROTO( \
+ const struct nfsd_file *nf \
+ ), \
+ TP_ARGS(nf))
+
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_hashed);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed);
+
+DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class,
+ TP_PROTO(
+ unsigned long removed,
+ unsigned long remaining
+ ),
+ TP_ARGS(removed, remaining),
+ TP_STRUCT__entry(
+ __field(unsigned long, removed)
+ __field(unsigned long, remaining)
+ ),
+ TP_fast_assign(
+ __entry->removed = removed;
+ __entry->remaining = remaining;
+ ),
+ TP_printk("%lu entries removed, %lu remaining",
+ __entry->removed, __entry->remaining)
+);
+
+#define DEFINE_NFSD_FILE_LRUWALK_EVENT(name) \
+DEFINE_EVENT(nfsd_file_lruwalk_class, name, \
+ TP_PROTO( \
+ unsigned long removed, \
+ unsigned long remaining \
+ ), \
+ TP_ARGS(removed, remaining))
+
+DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed);
+DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed);
+
#include "cache.h"
TRACE_DEFINE_ENUM(RC_DROPIT);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d79db56475d4..9f486b788ed0 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -199,27 +199,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out_nfserr;
}
} else {
- /*
- * In the nfsd4_open() case, this may be held across
- * subsequent open and delegation acquisition which may
- * need to take the child's i_mutex:
- */
- fh_lock_nested(fhp, I_MUTEX_PARENT);
- dentry = lookup_one_len(name, dparent, len);
+ dentry = lookup_one_len_unlocked(name, dparent, len);
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
if (nfsd_mountpoint(dentry, exp)) {
- /*
- * We don't need the i_mutex after all. It's
- * still possible we could open this (regular
- * files can be mountpoints too), but the
- * i_mutex is just there to prevent renames of
- * something that we might be about to delegate,
- * and a mountpoint won't be renamed:
- */
- fh_unlock(fhp);
- if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+ host_err = nfsd_cross_mnt(rqstp, &dentry, &exp);
+ if (host_err) {
dput(dentry);
goto out_nfserr;
}
@@ -234,7 +220,15 @@ out_nfserr:
return nfserrno(host_err);
}
-/*
+/**
+ * nfsd_lookup - look up a single path component for nfsd
+ *
+ * @rqstp: the request context
+ * @fhp: the file handle of the directory
+ * @name: the component name, or %NULL to look up parent
+ * @len: length of name to examine
+ * @resfh: pointer to pre-initialised filehandle to hold result.
+ *
* Look up one component of a pathname.
* N.B. After this call _both_ fhp and resfh need an fh_put
*
@@ -244,11 +238,11 @@ out_nfserr:
* returned. Otherwise the covered directory is returned.
* NOTE: this mountpoint crossing is not supported properly by all
* clients and is explicitly disallowed for NFSv3
- * NeilBrown <neilb@cse.unsw.edu.au>
+ *
*/
__be32
nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
- unsigned int len, struct svc_fh *resfh)
+ unsigned int len, struct svc_fh *resfh)
{
struct svc_export *exp;
struct dentry *dentry;
@@ -349,11 +343,13 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
* Set various file attributes. After this call fhp needs an fh_put.
*/
__be32
-nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_attrs *attr,
int check_guard, time64_t guardtime)
{
struct dentry *dentry;
struct inode *inode;
+ struct iattr *iap = attr->na_iattr;
int accmode = NFSD_MAY_SATTR;
umode_t ftype = 0;
__be32 err;
@@ -420,7 +416,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
return err;
}
- fh_lock(fhp);
+ inode_lock(inode);
if (size_change) {
/*
* RFC5661, Section 18.30.4:
@@ -456,7 +452,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
host_err = notify_change(&init_user_ns, dentry, iap, NULL);
out_unlock:
- fh_unlock(fhp);
+ if (attr->na_seclabel && attr->na_seclabel->len)
+ attr->na_labelerr = security_inode_setsecctx(dentry,
+ attr->na_seclabel->data, attr->na_seclabel->len);
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl)
+ attr->na_aclerr = set_posix_acl(&init_user_ns,
+ inode, ACL_TYPE_ACCESS,
+ attr->na_pacl);
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) &&
+ !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode))
+ attr->na_aclerr = set_posix_acl(&init_user_ns,
+ inode, ACL_TYPE_DEFAULT,
+ attr->na_dpacl);
+ inode_unlock(inode);
if (size_change)
put_write_access(inode);
out:
@@ -494,32 +502,6 @@ int nfsd4_is_junction(struct dentry *dentry)
return 0;
return 1;
}
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct xdr_netobj *label)
-{
- __be32 error;
- int host_error;
- struct dentry *dentry;
-
- error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
- if (error)
- return error;
-
- dentry = fhp->fh_dentry;
-
- inode_lock(d_inode(dentry));
- host_error = security_inode_setsecctx(dentry, label->data, label->len);
- inode_unlock(d_inode(dentry));
- return nfserrno(host_error);
-}
-#else
-__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct xdr_netobj *label)
-{
- return nfserr_notsupp;
-}
-#endif
static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp)
{
@@ -1202,14 +1184,15 @@ out:
* @rqstp: RPC transaction being executed
* @fhp: NFS filehandle of parent directory
* @resfhp: NFS filehandle of new object
- * @iap: requested attributes of new object
+ * @attrs: requested attributes of new object
*
* Returns nfs_ok on success, or an nfsstat in network byte order.
*/
__be32
nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct svc_fh *resfhp, struct iattr *iap)
+ struct svc_fh *resfhp, struct nfsd_attrs *attrs)
{
+ struct iattr *iap = attrs->na_iattr;
__be32 status;
/*
@@ -1230,7 +1213,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
* if the attributes have not changed.
*/
if (iap->ia_valid)
- status = nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
+ status = nfsd_setattr(rqstp, resfhp, attrs, 0, (time64_t)0);
else
status = nfserrno(commit_metadata(resfhp));
@@ -1269,11 +1252,12 @@ nfsd_check_ignore_resizing(struct iattr *iap)
/* The parent directory should already be locked: */
__be32
nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen, struct iattr *iap,
- int type, dev_t rdev, struct svc_fh *resfhp)
+ char *fname, int flen, struct nfsd_attrs *attrs,
+ int type, dev_t rdev, struct svc_fh *resfhp)
{
struct dentry *dentry, *dchild;
struct inode *dirp;
+ struct iattr *iap = attrs->na_iattr;
__be32 err;
int host_err;
@@ -1281,13 +1265,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
dirp = d_inode(dentry);
dchild = dget(resfhp->fh_dentry);
- if (!fhp->fh_locked) {
- WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n",
- dentry);
- err = nfserr_io;
- goto out;
- }
-
err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1347,7 +1324,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err < 0)
goto out_nfserr;
- err = nfsd_create_setattr(rqstp, fhp, resfhp, iap);
+ err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
out:
dput(dchild);
@@ -1366,8 +1343,8 @@ out_nfserr:
*/
__be32
nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen, struct iattr *iap,
- int type, dev_t rdev, struct svc_fh *resfhp)
+ char *fname, int flen, struct nfsd_attrs *attrs,
+ int type, dev_t rdev, struct svc_fh *resfhp)
{
struct dentry *dentry, *dchild = NULL;
__be32 err;
@@ -1386,11 +1363,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err)
return nfserrno(host_err);
- fh_lock_nested(fhp, I_MUTEX_PARENT);
+ inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
dchild = lookup_one_len(fname, dentry, flen);
host_err = PTR_ERR(dchild);
- if (IS_ERR(dchild))
- return nfserrno(host_err);
+ if (IS_ERR(dchild)) {
+ err = nfserrno(host_err);
+ goto out_unlock;
+ }
err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
/*
* We unconditionally drop our ref to dchild as fh_compose will have
@@ -1398,9 +1377,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*/
dput(dchild);
if (err)
- return err;
- return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
- rdev, resfhp);
+ goto out_unlock;
+ fh_fill_pre_attrs(fhp);
+ err = nfsd_create_locked(rqstp, fhp, fname, flen, attrs, type,
+ rdev, resfhp);
+ fh_fill_post_attrs(fhp);
+out_unlock:
+ inode_unlock(dentry->d_inode);
+ return err;
}
/*
@@ -1441,15 +1425,25 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
return 0;
}
-/*
- * Create a symlink and look up its inode
+/**
+ * nfsd_symlink - Create a symlink and look up its inode
+ * @rqstp: RPC transaction being executed
+ * @fhp: NFS filehandle of parent directory
+ * @fname: filename of the new symlink
+ * @flen: length of @fname
+ * @path: content of the new symlink (NUL-terminated)
+ * @attrs: requested attributes of new object
+ * @resfhp: NFS filehandle of new object
+ *
* N.B. After this call _both_ fhp and resfhp need an fh_put
+ *
+ * Returns nfs_ok on success, or an nfsstat in network byte order.
*/
__be32
nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen,
- char *path,
- struct svc_fh *resfhp)
+ char *fname, int flen,
+ char *path, struct nfsd_attrs *attrs,
+ struct svc_fh *resfhp)
{
struct dentry *dentry, *dnew;
__be32 err, cerr;
@@ -1467,33 +1461,35 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
+ if (host_err) {
+ err = nfserrno(host_err);
+ goto out;
+ }
- fh_lock(fhp);
dentry = fhp->fh_dentry;
+ inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
dnew = lookup_one_len(fname, dentry, flen);
- host_err = PTR_ERR(dnew);
- if (IS_ERR(dnew))
- goto out_nfserr;
-
+ if (IS_ERR(dnew)) {
+ err = nfserrno(PTR_ERR(dnew));
+ inode_unlock(dentry->d_inode);
+ goto out_drop_write;
+ }
+ fh_fill_pre_attrs(fhp);
host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
err = nfserrno(host_err);
- fh_unlock(fhp);
+ cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
+ if (!err)
+ nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
+ fh_fill_post_attrs(fhp);
+ inode_unlock(dentry->d_inode);
if (!err)
err = nfserrno(commit_metadata(fhp));
-
- fh_drop_write(fhp);
-
- cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
dput(dnew);
if (err==0) err = cerr;
+out_drop_write:
+ fh_drop_write(fhp);
out:
return err;
-
-out_nfserr:
- err = nfserrno(host_err);
- goto out;
}
/*
@@ -1531,22 +1527,25 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
goto out;
}
- fh_lock_nested(ffhp, I_MUTEX_PARENT);
ddir = ffhp->fh_dentry;
dirp = d_inode(ddir);
+ inode_lock_nested(dirp, I_MUTEX_PARENT);
dnew = lookup_one_len(name, ddir, len);
- host_err = PTR_ERR(dnew);
- if (IS_ERR(dnew))
- goto out_nfserr;
+ if (IS_ERR(dnew)) {
+ err = nfserrno(PTR_ERR(dnew));
+ goto out_unlock;
+ }
dold = tfhp->fh_dentry;
err = nfserr_noent;
if (d_really_is_negative(dold))
goto out_dput;
+ fh_fill_pre_attrs(ffhp);
host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
- fh_unlock(ffhp);
+ fh_fill_post_attrs(ffhp);
+ inode_unlock(dirp);
if (!host_err) {
err = nfserrno(commit_metadata(ffhp));
if (!err)
@@ -1557,17 +1556,17 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
else
err = nfserrno(host_err);
}
-out_dput:
dput(dnew);
-out_unlock:
- fh_unlock(ffhp);
+out_drop_write:
fh_drop_write(tfhp);
out:
return err;
-out_nfserr:
- err = nfserrno(host_err);
- goto out_unlock;
+out_dput:
+ dput(dnew);
+out_unlock:
+ inode_unlock(dirp);
+ goto out_drop_write;
}
static void
@@ -1628,10 +1627,7 @@ retry:
goto out;
}
- /* cannot use fh_lock as we need deadlock protective ordering
- * so do it by hand */
trap = lock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = true;
fh_fill_pre_attrs(ffhp);
fh_fill_pre_attrs(tfhp);
@@ -1687,17 +1683,12 @@ retry:
dput(odentry);
out_nfserr:
err = nfserrno(host_err);
- /*
- * We cannot rely on fh_unlock on the two filehandles,
- * as that would do the wrong thing if the two directories
- * were the same, so again we do it by hand.
- */
+
if (!close_cached) {
fh_fill_post_attrs(ffhp);
fh_fill_post_attrs(tfhp);
}
unlock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = false;
fh_drop_write(ffhp);
/*
@@ -1741,19 +1732,19 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (host_err)
goto out_nfserr;
- fh_lock_nested(fhp, I_MUTEX_PARENT);
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
+ inode_lock_nested(dirp, I_MUTEX_PARENT);
rdentry = lookup_one_len(fname, dentry, flen);
host_err = PTR_ERR(rdentry);
if (IS_ERR(rdentry))
- goto out_drop_write;
+ goto out_unlock;
if (d_really_is_negative(rdentry)) {
dput(rdentry);
host_err = -ENOENT;
- goto out_drop_write;
+ goto out_unlock;
}
rinode = d_inode(rdentry);
ihold(rinode);
@@ -1761,6 +1752,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!type)
type = d_inode(rdentry)->i_mode & S_IFMT;
+ fh_fill_pre_attrs(fhp);
if (type != S_IFDIR) {
if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
nfsd_close_cached_files(rdentry);
@@ -1768,8 +1760,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
} else {
host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
}
+ fh_fill_post_attrs(fhp);
- fh_unlock(fhp);
+ inode_unlock(dirp);
if (!host_err)
host_err = commit_metadata(fhp);
dput(rdentry);
@@ -1791,6 +1784,9 @@ out_nfserr:
}
out:
return err;
+out_unlock:
+ inode_unlock(dirp);
+ goto out_drop_write;
}
/*
@@ -2144,13 +2140,16 @@ out:
return err;
}
-/*
- * Removexattr and setxattr need to call fh_lock to both lock the inode
- * and set the change attribute. Since the top-level vfs_removexattr
- * and vfs_setxattr calls already do their own inode_lock calls, call
- * the _locked variant. Pass in a NULL pointer for delegated_inode,
- * and let the client deal with NFS4ERR_DELAY (same as with e.g.
- * setattr and remove).
+/**
+ * nfsd_removexattr - Remove an extended attribute
+ * @rqstp: RPC transaction being executed
+ * @fhp: NFS filehandle of object with xattr to remove
+ * @name: name of xattr to remove (NUL-terminate)
+ *
+ * Pass in a NULL pointer for delegated_inode, and let the client deal
+ * with NFS4ERR_DELAY (same as with e.g. setattr and remove).
+ *
+ * Returns nfs_ok on success, or an nfsstat in network byte order.
*/
__be32
nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
@@ -2166,12 +2165,14 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
if (ret)
return nfserrno(ret);
- fh_lock(fhp);
+ inode_lock(fhp->fh_dentry->d_inode);
+ fh_fill_pre_attrs(fhp);
ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry,
name, NULL);
- fh_unlock(fhp);
+ fh_fill_post_attrs(fhp);
+ inode_unlock(fhp->fh_dentry->d_inode);
fh_drop_write(fhp);
return nfsd_xattr_errno(ret);
@@ -2191,12 +2192,13 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
ret = fh_want_write(fhp);
if (ret)
return nfserrno(ret);
- fh_lock(fhp);
+ inode_lock(fhp->fh_dentry->d_inode);
+ fh_fill_pre_attrs(fhp);
ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf,
len, flags, NULL);
-
- fh_unlock(fhp);
+ fh_fill_post_attrs(fhp);
+ inode_unlock(fhp->fh_dentry->d_inode);
fh_drop_write(fhp);
return nfsd_xattr_errno(ret);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 26347d76f44a..c95cd414b4bb 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -6,6 +6,8 @@
#ifndef LINUX_NFSD_VFS_H
#define LINUX_NFSD_VFS_H
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
#include "nfsfh.h"
#include "nfsd.h"
@@ -42,6 +44,22 @@ struct nfsd_file;
typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
/* nfsd/vfs.c */
+struct nfsd_attrs {
+ struct iattr *na_iattr; /* input */
+ struct xdr_netobj *na_seclabel; /* input */
+ struct posix_acl *na_pacl; /* input */
+ struct posix_acl *na_dpacl; /* input */
+
+ int na_labelerr; /* output */
+ int na_aclerr; /* output */
+};
+
+static inline void nfsd_attrs_free(struct nfsd_attrs *attrs)
+{
+ posix_acl_release(attrs->na_pacl);
+ posix_acl_release(attrs->na_dpacl);
+}
+
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct svc_export **expp);
__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
@@ -50,11 +68,9 @@ __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
const char *, unsigned int,
struct svc_export **, struct dentry **);
__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
- struct iattr *, int, time64_t);
+ struct nfsd_attrs *, int, time64_t);
int nfsd_mountpoint(struct dentry *, struct svc_export *);
#ifdef CONFIG_NFSD_V4
-__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
- struct xdr_netobj *);
__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
struct file *, loff_t, loff_t, int);
__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
@@ -63,14 +79,14 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
u64 count, bool sync);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
- char *name, int len, struct iattr *attrs,
+ char *name, int len, struct nfsd_attrs *attrs,
int type, dev_t rdev, struct svc_fh *res);
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
- char *name, int len, struct iattr *attrs,
+ char *name, int len, struct nfsd_attrs *attrs,
int type, dev_t rdev, struct svc_fh *res);
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct svc_fh *resfhp, struct iattr *iap);
+ struct svc_fh *resfhp, struct nfsd_attrs *iap);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
u64 offset, u32 count, __be32 *verf);
#ifdef CONFIG_NFSD_V4
@@ -110,8 +126,9 @@ __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
char *, int *);
__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
- char *name, int len, char *path,
- struct svc_fh *res);
+ char *name, int len, char *path,
+ struct nfsd_attrs *attrs,
+ struct svc_fh *res);
__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
char *, int, struct svc_fh *);
ssize_t nfsd_copy_file_range(struct file *, u64,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 7b744011f2d3..96267258e629 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -279,6 +279,7 @@ struct nfsd4_open {
struct nfs4_clnt_odstate *op_odstate; /* used during processing */
struct nfs4_acl *op_acl;
struct xdr_netobj op_label;
+ struct svc_rqst *op_rqstp;
};
struct nfsd4_open_confirm {
@@ -302,9 +303,10 @@ struct nfsd4_read {
u32 rd_length; /* request */
int rd_vlen;
struct nfsd_file *rd_nf;
-
+
struct svc_rqst *rd_rqstp; /* response */
- struct svc_fh *rd_fhp; /* response */
+ struct svc_fh *rd_fhp; /* response */
+ u32 rd_eof; /* response */
};
struct nfsd4_readdir {
@@ -532,6 +534,13 @@ struct nfsd42_write_res {
stateid_t cb_stateid;
};
+struct nfsd4_cb_offload {
+ struct nfsd4_callback co_cb;
+ struct nfsd42_write_res co_res;
+ __be32 co_nfserr;
+ struct knfsd_fh co_fh;
+};
+
struct nfsd4_copy {
/* request */
stateid_t cp_src_stateid;
@@ -539,18 +548,16 @@ struct nfsd4_copy {
u64 cp_src_pos;
u64 cp_dst_pos;
u64 cp_count;
- struct nl4_server cp_src;
- bool cp_intra;
+ struct nl4_server *cp_src;
- /* both */
- u32 cp_synchronous;
+ unsigned long cp_flags;
+#define NFSD4_COPY_F_STOPPED (0)
+#define NFSD4_COPY_F_INTRA (1)
+#define NFSD4_COPY_F_SYNCHRONOUS (2)
+#define NFSD4_COPY_F_COMMITTED (3)
/* response */
struct nfsd42_write_res cp_res;
-
- /* for cb_offload */
- struct nfsd4_callback cp_cb;
- __be32 nfserr;
struct knfsd_fh fh;
struct nfs4_client *cp_clp;
@@ -563,14 +570,35 @@ struct nfsd4_copy {
struct list_head copies;
struct task_struct *copy_task;
refcount_t refcount;
- bool stopped;
struct vfsmount *ss_mnt;
struct nfs_fh c_fh;
nfs4_stateid stateid;
- bool committed;
};
+static inline void nfsd4_copy_set_sync(struct nfsd4_copy *copy, bool sync)
+{
+ if (sync)
+ set_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ else
+ clear_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+}
+
+static inline bool nfsd4_copy_is_sync(const struct nfsd4_copy *copy)
+{
+ return test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+}
+
+static inline bool nfsd4_copy_is_async(const struct nfsd4_copy *copy)
+{
+ return !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+}
+
+static inline bool nfsd4_ssc_is_inter(const struct nfsd4_copy *copy)
+{
+ return !test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+}
+
struct nfsd4_seek {
/* request */
stateid_t seek_stateid;
@@ -594,19 +622,20 @@ struct nfsd4_offload_status {
struct nfsd4_copy_notify {
/* request */
stateid_t cpn_src_stateid;
- struct nl4_server cpn_dst;
+ struct nl4_server *cpn_dst;
/* response */
stateid_t cpn_cnr_stateid;
u64 cpn_sec;
u32 cpn_nsec;
- struct nl4_server cpn_src;
+ struct nl4_server *cpn_src;
};
struct nfsd4_op {
u32 opnum;
- const struct nfsd4_operation * opdesc;
__be32 status;
+ const struct nfsd4_operation *opdesc;
+ struct nfs4_replay *replay;
union nfsd4_op_u {
struct nfsd4_access access;
struct nfsd4_close close;
@@ -670,7 +699,6 @@ struct nfsd4_op {
struct nfsd4_listxattrs listxattrs;
struct nfsd4_removexattr removexattr;
} u;
- struct nfs4_replay * replay;
};
bool nfsd4_cache_this_op(struct nfsd4_op *);
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index e8c00dda42ad..71f870d497ae 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -84,8 +84,8 @@ static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi,
/*
* attr_load_runs - Load all runs stored in @attr.
*/
-int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
- struct runs_tree *run, const CLST *vcn)
+static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
+ struct runs_tree *run, const CLST *vcn)
{
int err;
CLST svcn = le64_to_cpu(attr->nres.svcn);
@@ -140,7 +140,10 @@ failed:
}
if (lcn != SPARSE_LCN) {
- mark_as_free_ex(sbi, lcn, clen, trim);
+ if (sbi) {
+ /* mark bitmap range [lcn + clen) as free and trim clusters. */
+ mark_as_free_ex(sbi, lcn, clen, trim);
+ }
dn += clen;
}
@@ -173,7 +176,6 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
{
int err;
CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0;
- struct wnd_bitmap *wnd = &sbi->used.bitmap;
size_t cnt = run->count;
for (;;) {
@@ -196,9 +198,7 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
/* Add new fragment into run storage. */
if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) {
/* Undo last 'ntfs_look_for_free_space' */
- down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
- wnd_set_free(wnd, lcn, flen);
- up_write(&wnd->rw_lock);
+ mark_as_free_ex(sbi, lcn, len, false);
err = -ENOMEM;
goto out;
}
@@ -320,7 +320,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
err = ni_insert_nonresident(ni, attr_s->type, attr_name(attr_s),
attr_s->name_len, run, 0, alen,
- attr_s->flags, &attr, NULL);
+ attr_s->flags, &attr, NULL, NULL);
if (err)
goto out3;
@@ -419,40 +419,44 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
struct mft_inode *mi, *mi_b;
CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn;
CLST next_svcn, pre_alloc = -1, done = 0;
- bool is_ext;
+ bool is_ext, is_bad = false;
u32 align;
struct MFT_REC *rec;
again:
+ alen = 0;
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL,
&mi_b);
if (!attr_b) {
err = -ENOENT;
- goto out;
+ goto bad_inode;
}
if (!attr_b->non_res) {
err = attr_set_size_res(ni, attr_b, le_b, mi_b, new_size, run,
&attr_b);
- if (err || !attr_b->non_res)
- goto out;
+ if (err)
+ return err;
+
+ /* Return if file is still resident. */
+ if (!attr_b->non_res)
+ goto ok1;
/* Layout of records may be changed, so do a full search. */
goto again;
}
is_ext = is_attr_ext(attr_b);
-
-again_1:
align = sbi->cluster_size;
-
if (is_ext)
align <<= attr_b->nres.c_unit;
old_valid = le64_to_cpu(attr_b->nres.valid_size);
old_size = le64_to_cpu(attr_b->nres.data_size);
old_alloc = le64_to_cpu(attr_b->nres.alloc_size);
+
+again_1:
old_alen = old_alloc >> cluster_bits;
new_alloc = (new_size + align - 1) & ~(u64)(align - 1);
@@ -475,24 +479,27 @@ again_1:
mi = mi_b;
} else if (!le_b) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
} else {
le = le_b;
attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &vcn,
&mi);
if (!attr) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
next_le_1:
svcn = le64_to_cpu(attr->nres.svcn);
evcn = le64_to_cpu(attr->nres.evcn);
}
-
+ /*
+ * Here we have:
+ * attr,mi,le - last attribute segment (containing 'vcn').
+ * attr_b,mi_b,le_b - base (primary) attribute segment.
+ */
next_le:
rec = mi->mrec;
-
err = attr_load_runs(attr, ni, run, NULL);
if (err)
goto out;
@@ -507,6 +514,13 @@ next_le:
goto ok;
}
+ /*
+ * Add clusters. In simple case we have to:
+ * - allocate space (vcn, lcn, len)
+ * - update packed run in 'mi'
+ * - update attr->nres.evcn
+ * - update attr_b->nres.data_size/attr_b->nres.alloc_size
+ */
to_allocate = new_alen - old_alen;
add_alloc_in_same_attr_seg:
lcn = 0;
@@ -520,9 +534,11 @@ add_alloc_in_same_attr_seg:
pre_alloc = 0;
if (type == ATTR_DATA && !name_len &&
sbi->options->prealloc) {
- CLST new_alen2 = bytes_to_cluster(
- sbi, get_pre_allocated(new_size));
- pre_alloc = new_alen2 - new_alen;
+ pre_alloc =
+ bytes_to_cluster(
+ sbi,
+ get_pre_allocated(new_size)) -
+ new_alen;
}
/* Get the last LCN to allocate from. */
@@ -580,7 +596,7 @@ add_alloc_in_same_attr_seg:
pack_runs:
err = mi_pack_runs(mi, attr, run, vcn - svcn);
if (err)
- goto out;
+ goto undo_1;
next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
new_alloc_tmp = (u64)next_svcn << cluster_bits;
@@ -614,7 +630,7 @@ pack_runs:
if (type == ATTR_LIST) {
err = ni_expand_list(ni);
if (err)
- goto out;
+ goto undo_2;
if (next_svcn < vcn)
goto pack_runs;
@@ -624,8 +640,9 @@ pack_runs:
if (!ni->attr_list.size) {
err = ni_create_attr_list(ni);
+ /* In case of error layout of records is not changed. */
if (err)
- goto out;
+ goto undo_2;
/* Layout of records is changed. */
}
@@ -637,48 +654,57 @@ pack_runs:
/* Insert new attribute segment. */
err = ni_insert_nonresident(ni, type, name, name_len, run,
next_svcn, vcn - next_svcn,
- attr_b->flags, &attr, &mi);
- if (err)
- goto out;
-
- if (!is_mft)
- run_truncate_head(run, evcn + 1);
-
- svcn = le64_to_cpu(attr->nres.svcn);
- evcn = le64_to_cpu(attr->nres.evcn);
+ attr_b->flags, &attr, &mi, NULL);
- le_b = NULL;
/*
* Layout of records maybe changed.
* Find base attribute to update.
*/
+ le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len,
NULL, &mi_b);
if (!attr_b) {
- err = -ENOENT;
- goto out;
+ err = -EINVAL;
+ goto bad_inode;
}
- attr_b->nres.alloc_size = cpu_to_le64((u64)vcn << cluster_bits);
- attr_b->nres.data_size = attr_b->nres.alloc_size;
- attr_b->nres.valid_size = attr_b->nres.alloc_size;
+ if (err) {
+ /* ni_insert_nonresident failed. */
+ attr = NULL;
+ goto undo_2;
+ }
+
+ if (!is_mft)
+ run_truncate_head(run, evcn + 1);
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+
+ /*
+ * Attribute is in consistency state.
+ * Save this point to restore to if next steps fail.
+ */
+ old_valid = old_size = old_alloc = (u64)vcn << cluster_bits;
+ attr_b->nres.valid_size = attr_b->nres.data_size =
+ attr_b->nres.alloc_size = cpu_to_le64(old_size);
mi_b->dirty = true;
goto again_1;
}
if (new_size != old_size ||
(new_alloc != old_alloc && !keep_prealloc)) {
+ /*
+ * Truncate clusters. In simple case we have to:
+ * - update packed run in 'mi'
+ * - update attr->nres.evcn
+ * - update attr_b->nres.data_size/attr_b->nres.alloc_size
+ * - mark and trim clusters as free (vcn, lcn, len)
+ */
+ CLST dlen = 0;
+
vcn = max(svcn, new_alen);
new_alloc_tmp = (u64)vcn << cluster_bits;
- alen = 0;
- err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &alen,
- true);
- if (err)
- goto out;
-
- run_truncate(run, vcn);
-
if (vcn > svcn) {
err = mi_pack_runs(mi, attr, run, vcn - svcn);
if (err)
@@ -697,7 +723,7 @@ pack_runs:
if (!al_remove_le(ni, le)) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz);
@@ -723,12 +749,20 @@ pack_runs:
attr_b->nres.valid_size =
attr_b->nres.alloc_size;
}
+ mi_b->dirty = true;
- if (is_ext)
+ err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen,
+ true);
+ if (err)
+ goto out;
+
+ if (is_ext) {
+ /* dlen - really deallocated clusters. */
le64_sub_cpu(&attr_b->nres.total_size,
- ((u64)alen << cluster_bits));
+ ((u64)dlen << cluster_bits));
+ }
- mi_b->dirty = true;
+ run_truncate(run, vcn);
if (new_alloc_tmp <= new_alloc)
goto ok;
@@ -747,7 +781,7 @@ pack_runs:
if (le->type != type || le->name_len != name_len ||
memcmp(le_name(le), name, name_len * sizeof(short))) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
err = ni_load_mi(ni, le, &mi);
@@ -757,7 +791,7 @@ pack_runs:
attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id);
if (!attr) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
goto next_le_1;
}
@@ -772,13 +806,13 @@ ok:
}
}
-out:
- if (!err && attr_b && ret)
+ok1:
+ if (ret)
*ret = attr_b;
/* Update inode_set_bytes. */
- if (!err && ((type == ATTR_DATA && !name_len) ||
- (type == ATTR_ALLOC && name == I30_NAME))) {
+ if (((type == ATTR_DATA && !name_len) ||
+ (type == ATTR_ALLOC && name == I30_NAME))) {
bool dirty = false;
if (ni->vfs_inode.i_size != new_size) {
@@ -786,7 +820,7 @@ out:
dirty = true;
}
- if (attr_b && attr_b->non_res) {
+ if (attr_b->non_res) {
new_alloc = le64_to_cpu(attr_b->nres.alloc_size);
if (inode_get_bytes(&ni->vfs_inode) != new_alloc) {
inode_set_bytes(&ni->vfs_inode, new_alloc);
@@ -800,6 +834,47 @@ out:
}
}
+ return 0;
+
+undo_2:
+ vcn -= alen;
+ attr_b->nres.data_size = cpu_to_le64(old_size);
+ attr_b->nres.valid_size = cpu_to_le64(old_valid);
+ attr_b->nres.alloc_size = cpu_to_le64(old_alloc);
+
+ /* Restore 'attr' and 'mi'. */
+ if (attr)
+ goto restore_run;
+
+ if (le64_to_cpu(attr_b->nres.svcn) <= svcn &&
+ svcn <= le64_to_cpu(attr_b->nres.evcn)) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, type, name, name_len,
+ &svcn, &mi);
+ if (!attr)
+ goto bad_inode;
+ }
+
+restore_run:
+ if (mi_pack_runs(mi, attr, run, evcn - svcn + 1))
+ is_bad = true;
+
+undo_1:
+ run_deallocate_ex(sbi, run, vcn, alen, NULL, false);
+
+ run_truncate(run, vcn);
+out:
+ if (is_bad) {
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ }
return err;
}
@@ -855,7 +930,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
goto out;
}
- asize = le64_to_cpu(attr_b->nres.alloc_size) >> sbi->cluster_bits;
+ asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits;
if (vcn >= asize) {
err = -EINVAL;
goto out;
@@ -1047,7 +1122,7 @@ ins_ext:
if (evcn1 > next_svcn) {
err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
next_svcn, evcn1 - next_svcn,
- attr_b->flags, &attr, &mi);
+ attr_b->flags, &attr, &mi, NULL);
if (err)
goto out;
}
@@ -1173,7 +1248,7 @@ int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type,
{
struct ntfs_sb_info *sbi = ni->mi.sbi;
u8 cluster_bits = sbi->cluster_bits;
- CLST vcn = from >> cluster_bits;
+ CLST vcn;
CLST vcn_last = (to - 1) >> cluster_bits;
CLST lcn, clen;
int err;
@@ -1647,7 +1722,7 @@ ins_ext:
if (evcn1 > next_svcn) {
err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
next_svcn, evcn1 - next_svcn,
- attr_b->flags, &attr, &mi);
+ attr_b->flags, &attr, &mi, NULL);
if (err)
goto out;
}
@@ -1812,18 +1887,12 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
err = ni_insert_nonresident(
ni, ATTR_DATA, NULL, 0, run, next_svcn,
evcn1 - eat - next_svcn, a_flags, &attr,
- &mi);
+ &mi, &le);
if (err)
goto out;
/* Layout of records maybe changed. */
attr_b = NULL;
- le = al_find_ex(ni, NULL, ATTR_DATA, NULL, 0,
- &next_svcn);
- if (!le) {
- err = -EINVAL;
- goto out;
- }
}
/* Free all allocated memory. */
@@ -1918,7 +1987,7 @@ next_attr:
out:
up_write(&ni->file.run_lock);
if (err)
- make_bad_inode(&ni->vfs_inode);
+ _ntfs_bad_inode(&ni->vfs_inode);
return err;
}
@@ -1936,9 +2005,11 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
- CLST svcn, evcn1, vcn, len, end, alen, dealloc;
+ CLST svcn, evcn1, vcn, len, end, alen, hole, next_svcn;
u64 total_size, alloc_size;
u32 mask;
+ __le16 a_flags;
+ struct runs_tree run2;
if (!bytes)
return 0;
@@ -1990,6 +2061,9 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
}
down_write(&ni->file.run_lock);
+ run_init(&run2);
+ run_truncate(run, 0);
+
/*
* Enumerate all attribute segments and punch hole where necessary.
*/
@@ -1997,10 +2071,11 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
vcn = vbo >> sbi->cluster_bits;
len = bytes >> sbi->cluster_bits;
end = vcn + len;
- dealloc = 0;
+ hole = 0;
svcn = le64_to_cpu(attr_b->nres.svcn);
evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+ a_flags = attr_b->flags;
if (svcn <= vcn && vcn < evcn1) {
attr = attr_b;
@@ -2008,14 +2083,14 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
mi = mi_b;
} else if (!le_b) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
} else {
le = le_b;
attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
&mi);
if (!attr) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
svcn = le64_to_cpu(attr->nres.svcn);
@@ -2023,49 +2098,91 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
}
while (svcn < end) {
- CLST vcn1, zero, dealloc2;
+ CLST vcn1, zero, hole2 = hole;
err = attr_load_runs(attr, ni, run, &svcn);
if (err)
- goto out;
+ goto done;
vcn1 = max(vcn, svcn);
zero = min(end, evcn1) - vcn1;
- dealloc2 = dealloc;
- err = run_deallocate_ex(sbi, run, vcn1, zero, &dealloc, true);
+ /*
+ * Check range [vcn1 + zero).
+ * Calculate how many clusters there are.
+ * Don't do any destructive actions.
+ */
+ err = run_deallocate_ex(NULL, run, vcn1, zero, &hole2, false);
if (err)
- goto out;
+ goto done;
- if (dealloc2 == dealloc) {
- /* Looks like the required range is already sparsed. */
- } else {
- if (!run_add_entry(run, vcn1, SPARSE_LCN, zero,
- false)) {
- err = -ENOMEM;
- goto out;
- }
+ /* Check if required range is already hole. */
+ if (hole2 == hole)
+ goto next_attr;
+
+ /* Make a clone of run to undo. */
+ err = run_clone(run, &run2);
+ if (err)
+ goto done;
+
+ /* Make a hole range (sparse) [vcn1 + zero). */
+ if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, false)) {
+ err = -ENOMEM;
+ goto done;
+ }
- err = mi_pack_runs(mi, attr, run, evcn1 - svcn);
+ /* Update run in attribute segment. */
+ err = mi_pack_runs(mi, attr, run, evcn1 - svcn);
+ if (err)
+ goto done;
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+ if (next_svcn < evcn1) {
+ /* Insert new attribute segment. */
+ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
+ next_svcn,
+ evcn1 - next_svcn, a_flags,
+ &attr, &mi, &le);
if (err)
- goto out;
+ goto undo_punch;
+
+ /* Layout of records maybe changed. */
+ attr_b = NULL;
}
+
+ /* Real deallocate. Should not fail. */
+ run_deallocate_ex(sbi, &run2, vcn1, zero, &hole, true);
+
+next_attr:
/* Free all allocated memory. */
run_truncate(run, 0);
if (evcn1 >= alen)
break;
+ /* Get next attribute segment. */
attr = ni_enum_attr_ex(ni, attr, &le, &mi);
if (!attr) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
svcn = le64_to_cpu(attr->nres.svcn);
evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
}
- total_size -= (u64)dealloc << sbi->cluster_bits;
+done:
+ if (!hole)
+ goto out;
+
+ if (!attr_b) {
+ attr_b = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+ }
+
+ total_size -= (u64)hole << sbi->cluster_bits;
attr_b->nres.total_size = cpu_to_le64(total_size);
mi_b->dirty = true;
@@ -2075,9 +2192,263 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
mark_inode_dirty(&ni->vfs_inode);
out:
+ run_close(&run2);
up_write(&ni->file.run_lock);
+ return err;
+
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ goto out;
+
+undo_punch:
+ /*
+ * Restore packed runs.
+ * 'mi_pack_runs' should not fail, cause we restore original.
+ */
+ if (mi_pack_runs(mi, attr, &run2, evcn1 - svcn))
+ goto bad_inode;
+
+ goto done;
+}
+
+/*
+ * attr_insert_range - Insert range (hole) in file.
+ * Not for normal files.
+ */
+int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
+{
+ int err = 0;
+ struct runs_tree *run = &ni->file.run;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTR_LIST_ENTRY *le, *le_b;
+ struct mft_inode *mi, *mi_b;
+ CLST vcn, svcn, evcn1, len, next_svcn;
+ u64 data_size, alloc_size;
+ u32 mask;
+ __le16 a_flags;
+
+ if (!bytes)
+ return 0;
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
+ if (!attr_b)
+ return -ENOENT;
+
+ if (!is_attr_ext(attr_b)) {
+ /* It was checked above. See fallocate. */
+ return -EOPNOTSUPP;
+ }
+
+ if (!attr_b->non_res) {
+ data_size = le32_to_cpu(attr_b->res.data_size);
+ alloc_size = data_size;
+ mask = sbi->cluster_mask; /* cluster_size - 1 */
+ } else {
+ data_size = le64_to_cpu(attr_b->nres.data_size);
+ alloc_size = le64_to_cpu(attr_b->nres.alloc_size);
+ mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1;
+ }
+
+ if (vbo > data_size) {
+ /* Insert range after the file size is not allowed. */
+ return -EINVAL;
+ }
+
+ if ((vbo & mask) || (bytes & mask)) {
+ /* Allow to insert only frame aligned ranges. */
+ return -EINVAL;
+ }
+
+ /*
+ * valid_size <= data_size <= alloc_size
+ * Check alloc_size for maximum possible.
+ */
+ if (bytes > sbi->maxbytes_sparse - alloc_size)
+ return -EFBIG;
+
+ vcn = vbo >> sbi->cluster_bits;
+ len = bytes >> sbi->cluster_bits;
+
+ down_write(&ni->file.run_lock);
+
+ if (!attr_b->non_res) {
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0, run,
+ data_size + bytes, NULL, false, NULL);
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ if (err)
+ goto out;
+
+ if (!attr_b->non_res) {
+ /* Still resident. */
+ char *data = Add2Ptr(attr_b, attr_b->res.data_off);
+
+ memmove(data + bytes, data, bytes);
+ memset(data, 0, bytes);
+ goto done;
+ }
+
+ /* Resident files becomes nonresident. */
+ data_size = le64_to_cpu(attr_b->nres.data_size);
+ alloc_size = le64_to_cpu(attr_b->nres.alloc_size);
+ }
+
+ /*
+ * Enumerate all attribute segments and shift start vcn.
+ */
+ a_flags = attr_b->flags;
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ run_truncate(run, 0); /* clear cached values. */
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+
+ if (!run_insert_range(run, vcn, len)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Try to pack in current record as much as possible. */
+ err = mi_pack_runs(mi, attr, run, evcn1 + len - svcn);
if (err)
- make_bad_inode(&ni->vfs_inode);
+ goto out;
+
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) &&
+ attr->type == ATTR_DATA && !attr->name_len) {
+ le64_add_cpu(&attr->nres.svcn, len);
+ le64_add_cpu(&attr->nres.evcn, len);
+ if (le) {
+ le->vcn = attr->nres.svcn;
+ ni->attr_list.dirty = true;
+ }
+ mi->dirty = true;
+ }
+
+ if (next_svcn < evcn1 + len) {
+ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
+ next_svcn, evcn1 + len - next_svcn,
+ a_flags, NULL, NULL, NULL);
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ if (err) {
+ /* ni_insert_nonresident failed. Try to undo. */
+ goto undo_insert_range;
+ }
+ }
+
+ /*
+ * Update primary attribute segment.
+ */
+ if (vbo <= ni->i_valid)
+ ni->i_valid += bytes;
+
+ attr_b->nres.data_size = le64_to_cpu(data_size + bytes);
+ attr_b->nres.alloc_size = le64_to_cpu(alloc_size + bytes);
+
+ /* ni->valid may be not equal valid_size (temporary). */
+ if (ni->i_valid > data_size + bytes)
+ attr_b->nres.valid_size = attr_b->nres.data_size;
+ else
+ attr_b->nres.valid_size = cpu_to_le64(ni->i_valid);
+ mi_b->dirty = true;
+
+done:
+ ni->vfs_inode.i_size += bytes;
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ mark_inode_dirty(&ni->vfs_inode);
+
+out:
+ run_truncate(run, 0); /* clear cached values. */
+
+ up_write(&ni->file.run_lock);
return err;
+
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ goto out;
+
+undo_insert_range:
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ goto bad_inode;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ if (attr_load_runs(attr, ni, run, NULL))
+ goto bad_inode;
+
+ if (!run_collapse_range(run, vcn, len))
+ goto bad_inode;
+
+ if (mi_pack_runs(mi, attr, run, evcn1 + len - svcn))
+ goto bad_inode;
+
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) &&
+ attr->type == ATTR_DATA && !attr->name_len) {
+ le64_sub_cpu(&attr->nres.svcn, len);
+ le64_sub_cpu(&attr->nres.evcn, len);
+ if (le) {
+ le->vcn = attr->nres.svcn;
+ ni->attr_list.dirty = true;
+ }
+ mi->dirty = true;
+ }
+
+ goto out;
}
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index aa184407520f..5d44ceac855b 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -51,11 +51,6 @@ void ntfs3_exit_bitmap(void)
kmem_cache_destroy(ntfs_enode_cachep);
}
-static inline u32 wnd_bits(const struct wnd_bitmap *wnd, size_t i)
-{
- return i + 1 == wnd->nwnd ? wnd->bits_last : wnd->sb->s_blocksize * 8;
-}
-
/*
* wnd_scan
*
@@ -1333,9 +1328,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
if (!new_free)
return -ENOMEM;
- if (new_free != wnd->free_bits)
- memcpy(new_free, wnd->free_bits,
- wnd->nwnd * sizeof(short));
+ memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short));
memset(new_free + wnd->nwnd, 0,
(new_wnd - wnd->nwnd) * sizeof(short));
kfree(wnd->free_bits);
@@ -1395,9 +1388,8 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len)
{
- size_t zlen;
+ size_t zlen = wnd->zone_end - wnd->zone_bit;
- zlen = wnd->zone_end - wnd->zone_bit;
if (zlen)
wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 4a21745711fe..4f2ffc7ef296 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -530,21 +530,35 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
{
struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(inode);
loff_t end = vbo + len;
loff_t vbo_down = round_down(vbo, PAGE_SIZE);
- loff_t i_size;
+ bool is_supported_holes = is_sparsed(ni) || is_compressed(ni);
+ loff_t i_size, new_size;
+ bool map_locked;
int err;
/* No support for dir. */
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- /* Return error if mode is not supported. */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE)) {
+ /*
+ * vfs_fallocate checks all possible combinations of mode.
+ * Do additional checks here before ntfs_set_state(dirty).
+ */
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (!is_supported_holes)
+ return -EOPNOTSUPP;
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ if (!is_supported_holes)
+ return -EOPNOTSUPP;
+ } else if (mode &
+ ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)) {
ntfs_inode_warn(inode, "fallocate(0x%x) is not supported",
mode);
return -EOPNOTSUPP;
@@ -554,6 +568,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
inode_lock(inode);
i_size = inode->i_size;
+ new_size = max(end, i_size);
+ map_locked = false;
if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
/* Should never be here, see ntfs_file_open. */
@@ -561,38 +577,27 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
goto out;
}
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE)) {
+ inode_dio_wait(inode);
+ filemap_invalidate_lock(mapping);
+ map_locked = true;
+ }
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
u32 frame_size;
loff_t mask, vbo_a, end_a, tmp;
- if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- err = -EINVAL;
- goto out;
- }
-
- err = filemap_write_and_wait_range(inode->i_mapping, vbo,
- end - 1);
+ err = filemap_write_and_wait_range(mapping, vbo, end - 1);
if (err)
goto out;
- err = filemap_write_and_wait_range(inode->i_mapping, end,
- LLONG_MAX);
+ err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (err)
goto out;
- inode_dio_wait(inode);
-
truncate_pagecache(inode, vbo_down);
- if (!is_sparsed(ni) && !is_compressed(ni)) {
- /*
- * Normal file, can't make hole.
- * TODO: Try to find way to save info about hole.
- */
- err = -EOPNOTSUPP;
- goto out;
- }
-
ni_lock(ni);
err = attr_punch_hole(ni, vbo, len, &frame_size);
ni_unlock(ni);
@@ -624,17 +629,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_unlock(ni);
}
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- if (mode & ~FALLOC_FL_COLLAPSE_RANGE) {
- err = -EINVAL;
- goto out;
- }
-
/*
* Write tail of the last page before removed range since
* it will get removed from the page cache below.
*/
- err = filemap_write_and_wait_range(inode->i_mapping, vbo_down,
- vbo);
+ err = filemap_write_and_wait_range(mapping, vbo_down, vbo);
if (err)
goto out;
@@ -642,34 +641,58 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
* Write data that will be shifted to preserve them
* when discarding page cache below.
*/
- err = filemap_write_and_wait_range(inode->i_mapping, end,
- LLONG_MAX);
+ err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (err)
goto out;
- /* Wait for existing dio to complete. */
- inode_dio_wait(inode);
-
truncate_pagecache(inode, vbo_down);
ni_lock(ni);
err = attr_collapse_range(ni, vbo, len);
ni_unlock(ni);
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ /* Check new size. */
+ err = inode_newsize_ok(inode, new_size);
+ if (err)
+ goto out;
+
+ /* Write out all dirty pages. */
+ err = filemap_write_and_wait_range(mapping, vbo_down,
+ LLONG_MAX);
+ if (err)
+ goto out;
+ truncate_pagecache(inode, vbo_down);
+
+ ni_lock(ni);
+ err = attr_insert_range(ni, vbo, len);
+ ni_unlock(ni);
} else {
- /*
- * Normal file: Allocate clusters, do not change 'valid' size.
- */
- loff_t new_size = max(end, i_size);
+ /* Check new size. */
+
+ /* generic/213: expected -ENOSPC instead of -EFBIG. */
+ if (!is_supported_holes) {
+ loff_t to_alloc = new_size - inode_get_bytes(inode);
+
+ if (to_alloc > 0 &&
+ (to_alloc >> sbi->cluster_bits) >
+ wnd_zeroes(&sbi->used.bitmap)) {
+ err = -ENOSPC;
+ goto out;
+ }
+ }
err = inode_newsize_ok(inode, new_size);
if (err)
goto out;
+ /*
+ * Allocate clusters, do not change 'valid' size.
+ */
err = ntfs_set_size(inode, new_size);
if (err)
goto out;
- if (is_sparsed(ni) || is_compressed(ni)) {
+ if (is_supported_holes) {
CLST vcn_v = ni->i_valid >> sbi->cluster_bits;
CLST vcn = vbo >> sbi->cluster_bits;
CLST cend = bytes_to_cluster(sbi, end);
@@ -717,8 +740,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
}
out:
- if (err == -EFBIG)
- err = -ENOSPC;
+ if (map_locked)
+ filemap_invalidate_unlock(mapping);
if (!err) {
inode->i_ctime = inode->i_mtime = current_time(inode);
@@ -989,7 +1012,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
if (bytes > count)
bytes = count;
- frame = pos >> frame_bits;
frame_vbo = pos & ~(frame_size - 1);
index = frame_vbo >> PAGE_SHIFT;
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 18842998c8fa..381a38a06ec2 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -7,6 +7,7 @@
#include <linux/fiemap.h>
#include <linux/fs.h>
+#include <linux/minmax.h>
#include <linux/vmalloc.h>
#include "debug.h"
@@ -468,7 +469,7 @@ ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi,
&ref, &le);
if (err) {
/* No memory or no space. */
- return NULL;
+ return ERR_PTR(err);
}
le_added = true;
@@ -649,6 +650,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
struct mft_inode *mi;
u32 asize, free;
struct MFT_REF ref;
+ struct MFT_REC *mrec;
__le16 id;
if (!ni->attr_list.dirty)
@@ -692,11 +694,17 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
free -= asize;
}
+ /* Make a copy of primary record to restore if error. */
+ mrec = kmemdup(ni->mi.mrec, sbi->record_size, GFP_NOFS);
+ if (!mrec)
+ return 0; /* Not critical. */
+
/* It seems that attribute list can be removed from primary record. */
mi_remove_attr(NULL, &ni->mi, attr_list);
/*
- * Repeat the cycle above and move all attributes to primary record.
+ * Repeat the cycle above and copy all attributes to primary record.
+ * Do not remove original attributes from subrecords!
* It should be success!
*/
le = NULL;
@@ -707,14 +715,14 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
mi = ni_find_mi(ni, ino_get(&le->ref));
if (!mi) {
/* Should never happened, 'cause already checked. */
- goto bad;
+ goto out;
}
attr = mi_find_attr(mi, NULL, le->type, le_name(le),
le->name_len, &le->id);
if (!attr) {
/* Should never happened, 'cause already checked. */
- goto bad;
+ goto out;
}
asize = le32_to_cpu(attr->size);
@@ -724,18 +732,33 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
le16_to_cpu(attr->name_off));
if (!attr_ins) {
/*
- * Internal error.
- * Either no space in primary record (already checked).
- * Either tried to insert another
- * non indexed attribute (logic error).
+ * No space in primary record (already checked).
*/
- goto bad;
+ goto out;
}
/* Copy all except id. */
id = attr_ins->id;
memcpy(attr_ins, attr, asize);
attr_ins->id = id;
+ }
+
+ /*
+ * Repeat the cycle above and remove all attributes from subrecords.
+ */
+ le = NULL;
+ while ((le = al_enumerate(ni, le))) {
+ if (!memcmp(&le->ref, &ref, sizeof(ref)))
+ continue;
+
+ mi = ni_find_mi(ni, ino_get(&le->ref));
+ if (!mi)
+ continue;
+
+ attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ le->name_len, &le->id);
+ if (!attr)
+ continue;
/* Remove from original record. */
mi_remove_attr(NULL, mi, attr);
@@ -748,11 +771,13 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
ni->attr_list.le = NULL;
ni->attr_list.dirty = false;
+ kfree(mrec);
+ return 0;
+out:
+ /* Restore primary record. */
+ swap(mrec, ni->mi.mrec);
+ kfree(mrec);
return 0;
-bad:
- ntfs_inode_err(&ni->vfs_inode, "Internal error");
- make_bad_inode(&ni->vfs_inode);
- return -EINVAL;
}
/*
@@ -986,6 +1011,8 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le,
name_off, svcn, ins_le);
if (!attr)
continue;
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
if (ins_attr)
*ins_attr = attr;
@@ -1007,8 +1034,15 @@ insert_ext:
attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
name_off, svcn, ins_le);
- if (!attr)
+ if (!attr) {
+ err = -EINVAL;
goto out2;
+ }
+
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out2;
+ }
if (ins_attr)
*ins_attr = attr;
@@ -1020,10 +1054,9 @@ insert_ext:
out2:
ni_remove_mi(ni, mi);
mi_put(mi);
- err = -EINVAL;
out1:
- ntfs_mark_rec_free(sbi, rno);
+ ntfs_mark_rec_free(sbi, rno, is_mft);
out:
return err;
@@ -1076,6 +1109,11 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
if (asize <= free) {
attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len,
asize, name_off, svcn, ins_le);
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
if (attr) {
if (ins_attr)
*ins_attr = attr;
@@ -1173,6 +1211,11 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
goto out;
}
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
if (ins_attr)
*ins_attr = attr;
if (ins_mi)
@@ -1218,7 +1261,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
mft_min = mft_new;
mi_min = mi_new;
} else {
- ntfs_mark_rec_free(sbi, mft_new);
+ ntfs_mark_rec_free(sbi, mft_new, true);
mft_new = 0;
ni_remove_mi(ni, mi_new);
}
@@ -1262,7 +1305,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
done = asize - run_size - SIZEOF_NONRESIDENT;
le32_sub_cpu(&ni->mi.mrec->used, done);
- /* Estimate the size of second part: run_buf=NULL. */
+ /* Estimate packed size (run_buf=NULL). */
err = run_pack(run, svcn, evcn + 1 - svcn, NULL, sbi->record_size,
&plen);
if (err < 0)
@@ -1288,10 +1331,16 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
goto out;
}
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
attr->non_res = 1;
attr->name_off = SIZEOF_NONRESIDENT_LE;
attr->flags = 0;
+ /* This function can't fail - cause already checked above. */
run_pack(run, svcn, evcn + 1 - svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT),
run_size, &plen);
@@ -1301,7 +1350,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
out:
if (mft_new) {
- ntfs_mark_rec_free(sbi, mft_new);
+ ntfs_mark_rec_free(sbi, mft_new, true);
ni_remove_mi(ni, mi_new);
}
@@ -1367,8 +1416,6 @@ int ni_expand_list(struct ntfs_inode *ni)
/* Split MFT data as much as possible. */
err = ni_expand_mft_list(ni);
- if (err)
- goto out;
out:
return !err && !done ? -EOPNOTSUPP : err;
@@ -1381,7 +1428,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
const __le16 *name, u8 name_len,
const struct runs_tree *run, CLST svcn, CLST len,
__le16 flags, struct ATTRIB **new_attr,
- struct mft_inode **mi)
+ struct mft_inode **mi, struct ATTR_LIST_ENTRY **le)
{
int err;
CLST plen;
@@ -1394,6 +1441,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
u32 run_size, asize;
struct ntfs_sb_info *sbi = ni->mi.sbi;
+ /* Estimate packed size (run_buf=NULL). */
err = run_pack(run, svcn, len, NULL, sbi->max_bytes_per_attr - run_off,
&plen);
if (err < 0)
@@ -1414,7 +1462,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
}
err = ni_insert_attr(ni, type, name, name_len, asize, name_off, svcn,
- &attr, mi, NULL);
+ &attr, mi, le);
if (err)
goto out;
@@ -1423,12 +1471,12 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
attr->name_off = cpu_to_le16(name_off);
attr->flags = flags;
+ /* This function can't fail - cause already checked above. */
run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size, &plen);
attr->nres.svcn = cpu_to_le64(svcn);
attr->nres.evcn = cpu_to_le64((u64)svcn + len - 1);
- err = 0;
if (new_attr)
*new_attr = attr;
@@ -1560,7 +1608,7 @@ int ni_delete_all(struct ntfs_inode *ni)
mi->dirty = true;
mi_write(mi, 0);
- ntfs_mark_rec_free(sbi, mi->rno);
+ ntfs_mark_rec_free(sbi, mi->rno, false);
ni_remove_mi(ni, mi);
mi_put(mi);
node = next;
@@ -1571,7 +1619,7 @@ int ni_delete_all(struct ntfs_inode *ni)
ni->mi.dirty = true;
err = mi_write(&ni->mi, 0);
- ntfs_mark_rec_free(sbi, ni->mi.rno);
+ ntfs_mark_rec_free(sbi, ni->mi.rno, false);
return err;
}
@@ -1589,7 +1637,8 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
struct ATTRIB *attr = NULL;
struct ATTR_FILE_NAME *fname;
- *le = NULL;
+ if (le)
+ *le = NULL;
/* Enumerate all names. */
next:
@@ -1605,7 +1654,7 @@ next:
goto next;
if (!uni)
- goto next;
+ return fname;
if (uni->len != fname->name_len)
goto next;
@@ -2302,10 +2351,8 @@ remove_wof:
out:
kfree(pages);
- if (err) {
- make_bad_inode(inode);
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
- }
+ if (err)
+ _ntfs_bad_inode(inode);
return err;
}
@@ -2944,7 +2991,7 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
}
/*
- * ni_add_name - Add new name in MFT and in directory.
+ * ni_add_name - Add new name into MFT and into directory.
*/
int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct NTFS_DE *de)
@@ -2953,13 +3000,20 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct ATTRIB *attr;
struct ATTR_LIST_ENTRY *le;
struct mft_inode *mi;
+ struct ATTR_FILE_NAME *fname;
struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1);
u16 de_key_size = le16_to_cpu(de->key_size);
mi_get_ref(&ni->mi, &de->ref);
mi_get_ref(&dir_ni->mi, &de_name->home);
- /* Insert new name in MFT. */
+ /* Fill duplicate from any ATTR_NAME. */
+ fname = ni_fname_name(ni, NULL, NULL, NULL, NULL);
+ if (fname)
+ memcpy(&de_name->dup, &fname->dup, sizeof(fname->dup));
+ de_name->dup.fa = ni->std_fa;
+
+ /* Insert new name into MFT. */
err = ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr,
&mi, &le);
if (err)
@@ -2967,7 +3021,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size);
- /* Insert new name in directory. */
+ /* Insert new name into directory. */
err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0);
if (err)
ni_remove_attr_le(ni, attr, mi, le);
@@ -2991,7 +3045,7 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
* 1) Add new name and remove old name.
* 2) Remove old name and add new name.
*
- * In most cases (not all!) adding new name in MFT and in directory can
+ * In most cases (not all!) adding new name into MFT and into directory can
* allocate additional cluster(s).
* Second way may result to bad inode if we can't add new name
* and then can't restore (add) old name.
@@ -3261,7 +3315,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
err = err2;
if (is_empty) {
- ntfs_mark_rec_free(sbi, mi->rno);
+ ntfs_mark_rec_free(sbi, mi->rno, false);
rb_erase(node, &ni->mi_tree);
mi_put(mi);
}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 49b7df616778..e7c494005122 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -3843,6 +3843,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
memset(&rst_info2, 0, sizeof(struct restart_info));
err = log_read_rst(log, l_size, false, &rst_info2);
+ if (err)
+ goto out;
/* Determine which restart area to use. */
if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn)
@@ -5057,7 +5059,7 @@ undo_action_next:
goto add_allocated_vcns;
vcn = le64_to_cpu(lrh->target_vcn);
- vcn &= ~(log->clst_per_page - 1);
+ vcn &= ~(u64)(log->clst_per_page - 1);
add_allocated_vcns:
for (i = 0, vcn = le64_to_cpu(lrh->target_vcn),
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 1835e35199c2..4ed15f64b17f 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -703,12 +703,14 @@ out:
/*
* ntfs_mark_rec_free - Mark record as free.
+ * is_mft - true if we are changing MFT
*/
-void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno)
+void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft)
{
struct wnd_bitmap *wnd = &sbi->mft.bitmap;
- down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);
+ if (!is_mft)
+ down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);
if (rno >= wnd->nbits)
goto out;
@@ -727,7 +729,8 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno)
sbi->mft.next_free = rno;
out:
- up_write(&wnd->rw_lock);
+ if (!is_mft)
+ up_write(&wnd->rw_lock);
}
/*
@@ -780,7 +783,7 @@ out:
*/
int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
{
- CLST zone_limit, zone_max, lcn, vcn, len;
+ CLST lcn, vcn, len;
size_t lcn_s, zlen;
struct wnd_bitmap *wnd = &sbi->used.bitmap;
struct ntfs_inode *ni = sbi->mft.ni;
@@ -789,16 +792,6 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
if (wnd_zone_len(wnd))
return 0;
- /*
- * Compute the MFT zone at two steps.
- * It would be nice if we are able to allocate 1/8 of
- * total clusters for MFT but not more then 512 MB.
- */
- zone_limit = (512 * 1024 * 1024) >> sbi->cluster_bits;
- zone_max = wnd->nbits >> 3;
- if (zone_max > zone_limit)
- zone_max = zone_limit;
-
vcn = bytes_to_cluster(sbi,
(u64)sbi->mft.bitmap.nbits << sbi->record_bits);
@@ -812,13 +805,7 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
lcn_s = lcn + 1;
/* Try to allocate clusters after last MFT run. */
- zlen = wnd_find(wnd, zone_max, lcn_s, 0, &lcn_s);
- if (!zlen) {
- ntfs_notice(sbi->sb, "MftZone: unavailable");
- return 0;
- }
-
- /* Truncate too large zone. */
+ zlen = wnd_find(wnd, sbi->zone_max, lcn_s, 0, &lcn_s);
wnd_zone_set(wnd, lcn_s, zlen);
return 0;
@@ -827,16 +814,21 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
/*
* ntfs_update_mftmirr - Update $MFTMirr data.
*/
-int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
+void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
{
int err;
struct super_block *sb = sbi->sb;
- u32 blocksize = sb->s_blocksize;
+ u32 blocksize;
sector_t block1, block2;
u32 bytes;
+ if (!sb)
+ return;
+
+ blocksize = sb->s_blocksize;
+
if (!(sbi->flags & NTFS_FLAGS_MFTMIRR))
- return 0;
+ return;
err = 0;
bytes = sbi->mft.recs_mirr << sbi->record_bits;
@@ -847,16 +839,13 @@ int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
struct buffer_head *bh1, *bh2;
bh1 = sb_bread(sb, block1++);
- if (!bh1) {
- err = -EIO;
- goto out;
- }
+ if (!bh1)
+ return;
bh2 = sb_getblk(sb, block2++);
if (!bh2) {
put_bh(bh1);
- err = -EIO;
- goto out;
+ return;
}
if (buffer_locked(bh2))
@@ -876,13 +865,24 @@ int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
put_bh(bh2);
if (err)
- goto out;
+ return;
}
sbi->flags &= ~NTFS_FLAGS_MFTMIRR;
+}
-out:
- return err;
+/*
+ * ntfs_bad_inode
+ *
+ * Marks inode as bad and marks fs as 'dirty'
+ */
+void ntfs_bad_inode(struct inode *inode, const char *hint)
+{
+ struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+
+ ntfs_inode_err(inode, "%s", hint);
+ make_bad_inode(inode);
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
}
/*
@@ -1395,7 +1395,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
if (buffer_locked(bh))
__wait_on_buffer(bh);
- lock_buffer(nb->bh[idx]);
+ lock_buffer(bh);
bh_data = bh->b_data + off;
end_data = Add2Ptr(bh_data, op);
@@ -2424,7 +2424,7 @@ static inline void ntfs_unmap_and_discard(struct ntfs_sb_info *sbi, CLST lcn,
void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
{
- CLST end, i;
+ CLST end, i, zone_len, zlen;
struct wnd_bitmap *wnd = &sbi->used.bitmap;
down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
@@ -2459,6 +2459,28 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
ntfs_unmap_and_discard(sbi, lcn, len);
wnd_set_free(wnd, lcn, len);
+ /* append to MFT zone, if possible. */
+ zone_len = wnd_zone_len(wnd);
+ zlen = min(zone_len + len, sbi->zone_max);
+
+ if (zlen == zone_len) {
+ /* MFT zone already has maximum size. */
+ } else if (!zone_len) {
+ /* Create MFT zone only if 'zlen' is large enough. */
+ if (zlen == sbi->zone_max)
+ wnd_zone_set(wnd, lcn, zlen);
+ } else {
+ CLST zone_lcn = wnd_zone_bit(wnd);
+
+ if (lcn + len == zone_lcn) {
+ /* Append into head MFT zone. */
+ wnd_zone_set(wnd, lcn, zlen);
+ } else if (zone_lcn + zone_len == lcn) {
+ /* Append into tail MFT zone. */
+ wnd_zone_set(wnd, zone_lcn, zlen);
+ }
+ }
+
out:
up_write(&wnd->rw_lock);
}
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 6f81e3a49abf..440328147e7e 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1042,19 +1042,16 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
{
int err;
struct NTFS_DE *e;
- const struct INDEX_HDR *hdr;
struct indx_node *node;
if (!root)
root = indx_get_root(&ni->dir, ni, NULL, NULL);
if (!root) {
- err = -EINVAL;
- goto out;
+ /* Should not happen. */
+ return -EINVAL;
}
- hdr = &root->ihdr;
-
/* Check cache. */
e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de;
if (e && !de_is_last(e) &&
@@ -1068,39 +1065,35 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
fnd_clear(fnd);
/* Lookup entry that is <= to the search value. */
- e = hdr_find_e(indx, hdr, key, key_len, ctx, diff);
+ e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff);
if (!e)
return -EINVAL;
fnd->root_de = e;
- err = 0;
for (;;) {
node = NULL;
- if (*diff >= 0 || !de_has_vcn_ex(e)) {
- *entry = e;
- goto out;
- }
+ if (*diff >= 0 || !de_has_vcn_ex(e))
+ break;
/* Read next level. */
err = indx_read(indx, ni, de_get_vbn(e), &node);
if (err)
- goto out;
+ return err;
/* Lookup entry that is <= to the search value. */
e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx,
diff);
if (!e) {
- err = -EINVAL;
put_indx_node(node);
- goto out;
+ return -EINVAL;
}
fnd_push(fnd, node, e);
}
-out:
- return err;
+ *entry = e;
+ return 0;
}
int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni,
@@ -1354,7 +1347,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
goto out;
err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len,
- &run, 0, len, 0, &alloc, NULL);
+ &run, 0, len, 0, &alloc, NULL, NULL);
if (err)
goto out1;
@@ -1685,8 +1678,8 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
{
int err;
const struct NTFS_DE *sp;
- struct NTFS_DE *e, *de_t, *up_e = NULL;
- struct indx_node *n2 = NULL;
+ struct NTFS_DE *e, *de_t, *up_e;
+ struct indx_node *n2;
struct indx_node *n1 = fnd->nodes[level];
struct INDEX_HDR *hdr1 = &n1->index->ihdr;
struct INDEX_HDR *hdr2;
@@ -1994,7 +1987,7 @@ static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni,
const struct NTFS_DE *e, bool trim)
{
int err;
- struct indx_node *n;
+ struct indx_node *n = NULL;
struct INDEX_HDR *hdr;
CLST vbn = de_get_vbn(e);
size_t i;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 80104afeb2cd..51363d4e8636 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -430,6 +430,7 @@ end_enum:
} else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) &&
fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) {
/* Records in $Extend are not a files or general directories. */
+ inode->i_op = &ntfs_file_inode_operations;
} else {
err = -EINVAL;
goto out;
@@ -500,7 +501,7 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
inode = ntfs_read_mft(inode, name, ref);
else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) {
/* Inode overlaps? */
- make_bad_inode(inode);
+ _ntfs_bad_inode(inode);
}
return inode;
@@ -1632,7 +1633,7 @@ out4:
ni->mi.dirty = false;
discard_new_inode(inode);
out3:
- ntfs_mark_rec_free(sbi, ino);
+ ntfs_mark_rec_free(sbi, ino, false);
out2:
__putname(new_de);
@@ -1655,7 +1656,6 @@ int ntfs_link_inode(struct inode *inode, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
struct NTFS_DE *de;
- struct ATTR_FILE_NAME *de_name;
/* Allocate PATH_MAX bytes. */
de = __getname();
@@ -1670,15 +1670,6 @@ int ntfs_link_inode(struct inode *inode, struct dentry *dentry)
if (err)
goto out;
- de_name = (struct ATTR_FILE_NAME *)(de + 1);
- /* Fill duplicate info. */
- de_name->dup.cr_time = de_name->dup.m_time = de_name->dup.c_time =
- de_name->dup.a_time = kernel2nt(&inode->i_ctime);
- de_name->dup.alloc_size = de_name->dup.data_size =
- cpu_to_le64(inode->i_size);
- de_name->dup.fa = ni->std_fa;
- de_name->dup.ea_size = de_name->dup.reparse = 0;
-
err = ni_add_name(ntfs_i(d_inode(dentry->d_parent)), ni, de);
out:
__putname(de);
@@ -1731,9 +1722,7 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
if (inode->i_nlink)
mark_inode_dirty(inode);
} else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) {
- make_bad_inode(inode);
- ntfs_inode_err(inode, "failed to undo unlink");
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ _ntfs_bad_inode(inode);
} else {
if (ni_is_dirty(dir))
mark_inode_dirty(dir);
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index bc741213ad84..bc22cc321a74 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -208,7 +208,7 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
}
/*
- * ntfs_rmdir - inode_operations::rm_dir
+ * ntfs_rmdir - inode_operations::rmdir
*/
static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
{
@@ -308,9 +308,7 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
if (is_bad) {
/* Restore after failed rename failed too. */
- make_bad_inode(inode);
- ntfs_inode_err(inode, "failed to undo rename");
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ _ntfs_bad_inode(inode);
} else if (!err) {
inode->i_ctime = dir->i_ctime = dir->i_mtime =
current_time(dir);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 8dbdca03e1af..2c791222c4e2 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -220,6 +220,7 @@ struct ntfs_sb_info {
u32 flags; // See NTFS_FLAGS_XXX.
+ CLST zone_max; // Maximum MFT zone length in clusters
CLST bad_clusters; // The count of marked bad clusters.
u16 max_bytes_per_attr; // Maximum attribute size in record.
@@ -408,8 +409,6 @@ enum REPARSE_SIGN {
};
/* Functions from attrib.c */
-int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
- struct runs_tree *run, const CLST *vcn);
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
@@ -440,6 +439,7 @@ int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr,
int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
u64 new_valid);
int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
+int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size);
/* Functions from attrlist.c */
@@ -528,7 +528,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
const __le16 *name, u8 name_len,
const struct runs_tree *run, CLST svcn, CLST len,
__le16 flags, struct ATTRIB **new_attr,
- struct mft_inode **mi);
+ struct mft_inode **mi, struct ATTR_LIST_ENTRY **le);
int ni_insert_resident(struct ntfs_inode *ni, u32 data_size,
enum ATTR_TYPE type, const __le16 *name, u8 name_len,
struct ATTRIB **new_attr, struct mft_inode **mi,
@@ -589,10 +589,12 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
enum ALLOCATE_OPT opt);
int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft,
struct ntfs_inode *ni, struct mft_inode **mi);
-void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno);
+void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft);
int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to);
int ntfs_refresh_zone(struct ntfs_sb_info *sbi);
-int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait);
+void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait);
+void ntfs_bad_inode(struct inode *inode, const char *hint);
+#define _ntfs_bad_inode(i) ntfs_bad_inode(i, __func__)
enum NTFS_DIRTY_FLAGS {
NTFS_DIRTY_CLEAR = 0,
NTFS_DIRTY_DIRTY = 1,
@@ -738,7 +740,6 @@ static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec,
int mi_write(struct mft_inode *mi, int wait);
int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
__le16 flags, bool is_mft);
-void mi_mark_free(struct mft_inode *mi);
struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
const __le16 *name, u8 name_len, u32 asize,
u16 name_off);
@@ -780,10 +781,10 @@ bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn,
void run_truncate(struct runs_tree *run, CLST vcn);
void run_truncate_head(struct runs_tree *run, CLST vcn);
void run_truncate_around(struct runs_tree *run, CLST vcn);
-bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *Index);
bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len,
bool is_mft);
bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len);
+bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len);
bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
CLST *lcn, CLST *len);
bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn);
@@ -802,6 +803,7 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
#define run_unpack_ex run_unpack
#endif
int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn);
+int run_clone(const struct runs_tree *run, struct runs_tree *new_run);
/* Globals from super.c */
void *ntfs_set_shared(void *ptr, u32 bytes);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 861e35791506..7d2fac5ee215 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -395,28 +395,6 @@ int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
}
/*
- * mi_mark_free - Mark record as unused and marks it as free in bitmap.
- */
-void mi_mark_free(struct mft_inode *mi)
-{
- CLST rno = mi->rno;
- struct ntfs_sb_info *sbi = mi->sbi;
-
- if (rno >= MFT_REC_RESERVED && rno < MFT_REC_FREE) {
- ntfs_clear_mft_tail(sbi, rno, rno + 1);
- mi->dirty = false;
- return;
- }
-
- if (mi->mrec) {
- clear_rec_inuse(mi->mrec);
- mi->dirty = true;
- mi_write(mi, 0);
- }
- ntfs_mark_rec_free(sbi, rno);
-}
-
-/*
* mi_insert_attr - Reserve space for new attribute.
*
* Return: Not full constructed attribute or NULL if not possible to create.
@@ -445,12 +423,11 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
attr = NULL;
while ((attr = mi_enum_attr(mi, attr))) {
diff = compare_attr(attr, type, name, name_len, upcase);
- if (diff > 0)
- break;
+
if (diff < 0)
continue;
- if (!is_attr_indexed(attr))
+ if (!diff && !is_attr_indexed(attr))
return NULL;
break;
}
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index a8fec651f973..aaaa0d3d35a2 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -31,7 +31,7 @@ struct ntfs_run {
* Case of entry missing from list 'index' will be set to
* point to insertion position for the entry question.
*/
-bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index)
+static bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index)
{
size_t min_idx, max_idx, mid_idx;
struct ntfs_run *r;
@@ -547,6 +547,48 @@ bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len)
return true;
}
+/* run_insert_range
+ *
+ * Helper for attr_insert_range(),
+ * which is helper for fallocate(insert_range).
+ */
+bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len)
+{
+ size_t index;
+ struct ntfs_run *r, *e;
+
+ if (WARN_ON(!run_lookup(run, vcn, &index)))
+ return false; /* Should never be here. */
+
+ e = run->runs + run->count;
+ r = run->runs + index;
+
+ if (vcn > r->vcn)
+ r += 1;
+
+ for (; r < e; r++)
+ r->vcn += len;
+
+ r = run->runs + index;
+
+ if (vcn > r->vcn) {
+ /* split fragment. */
+ CLST len1 = vcn - r->vcn;
+ CLST len2 = r->len - len1;
+ CLST lcn2 = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len1);
+
+ r->len = len1;
+
+ if (!run_add_entry(run, vcn + len, lcn2, len2, false))
+ return false;
+ }
+
+ if (!run_add_entry(run, vcn, SPARSE_LCN, len, false))
+ return false;
+
+ return true;
+}
+
/*
* run_get_entry - Return index-th mapped region.
*/
@@ -778,26 +820,36 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
CLST next_vcn, vcn, lcn;
CLST prev_lcn = 0;
CLST evcn1 = svcn + len;
+ const struct ntfs_run *r, *r_end;
int packed_size = 0;
size_t i;
- bool ok;
s64 dlcn;
int offset_size, size_size, tmp;
- next_vcn = vcn = svcn;
-
*packed_vcns = 0;
if (!len)
goto out;
- ok = run_lookup_entry(run, vcn, &lcn, &len, &i);
+ /* Check all required entries [svcn, encv1) available. */
+ if (!run_lookup(run, svcn, &i))
+ return -ENOENT;
+
+ r_end = run->runs + run->count;
+ r = run->runs + i;
- if (!ok)
- goto error;
+ for (next_vcn = r->vcn + r->len; next_vcn < evcn1;
+ next_vcn = r->vcn + r->len) {
+ if (++r >= r_end || r->vcn != next_vcn)
+ return -ENOENT;
+ }
- if (next_vcn != vcn)
- goto error;
+ /* Repeat cycle above and pack runs. Assume no errors. */
+ r = run->runs + i;
+ len = svcn - r->vcn;
+ vcn = svcn;
+ lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len);
+ len = r->len - len;
for (;;) {
next_vcn = vcn + len;
@@ -846,12 +898,10 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1)
goto out;
- ok = run_get_entry(run, ++i, &vcn, &lcn, &len);
- if (!ok)
- goto error;
-
- if (next_vcn != vcn)
- goto error;
+ r += 1;
+ vcn = r->vcn;
+ lcn = r->lcn;
+ len = r->len;
}
out:
@@ -860,9 +910,6 @@ out:
run_buf[0] = 0;
return packed_size + 1;
-
-error:
- return -EOPNOTSUPP;
}
/*
@@ -1109,3 +1156,28 @@ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn)
*highest_vcn = vcn64 - 1;
return 0;
}
+
+/*
+ * run_clone
+ *
+ * Make a copy of run
+ */
+int run_clone(const struct runs_tree *run, struct runs_tree *new_run)
+{
+ size_t bytes = run->count * sizeof(struct ntfs_run);
+
+ if (bytes > new_run->allocated) {
+ struct ntfs_run *new_ptr = kvmalloc(bytes, GFP_KERNEL);
+
+ if (!new_ptr)
+ return -ENOMEM;
+
+ kvfree(new_run->runs);
+ new_run->runs = new_ptr;
+ new_run->allocated = bytes;
+ }
+
+ memcpy(new_run->runs, run->runs, bytes);
+ new_run->count = run->count;
+ return 0;
+}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 0c6de6287737..47012c9bf505 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -30,6 +30,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/log2.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/nls.h>
#include <linux/seq_file.h>
@@ -390,7 +391,7 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
return -EINVAL;
}
- memcpy(sbi->options, new_opts, sizeof(*new_opts));
+ swap(sbi->options, fc->fs_private);
return 0;
}
@@ -870,6 +871,13 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
#endif
+ /*
+ * Compute the MFT zone at two steps.
+ * It would be nice if we are able to allocate 1/8 of
+ * total clusters for MFT but not more then 512 MB.
+ */
+ sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3);
+
err = 0;
out:
@@ -900,6 +908,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.high = 0;
sbi->sb = sb;
+ sbi->options = fc->fs_private;
+ fc->fs_private = NULL;
sb->s_flags |= SB_NODIRATIME;
sb->s_magic = 0x7366746e; // "ntfs"
sb->s_op = &ntfs_sops;
@@ -1262,8 +1272,6 @@ load_root:
goto put_inode_out;
}
- fc->fs_private = NULL;
-
return 0;
put_inode_out:
@@ -1378,7 +1386,7 @@ static const struct fs_context_operations ntfs_context_ops = {
/*
* ntfs_init_fs_context - Initialize spi and opts
*
- * This will called when mount/remount. We will first initiliaze
+ * This will called when mount/remount. We will first initialize
* options so that if remount we can use just that.
*/
static int ntfs_init_fs_context(struct fs_context *fc)
@@ -1416,7 +1424,6 @@ static int ntfs_init_fs_context(struct fs_context *fc)
mutex_init(&sbi->compress.mtx_lzx);
#endif
- sbi->options = opts;
fc->s_fs_info = sbi;
ok:
fc->fs_private = opts;
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 5e0e0280e70d..6ae1f56b7358 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -118,7 +118,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
run_init(&run);
- err = attr_load_runs(attr_ea, ni, &run, NULL);
+ err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &run, 0, size);
if (!err)
err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL);
run_close(&run);
@@ -444,6 +444,11 @@ update_ea:
/* Delete xattr, ATTR_EA */
ni_remove_attr_le(ni, attr, mi, le);
} else if (attr->non_res) {
+ err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &ea_run, 0,
+ size);
+ if (err)
+ goto out;
+
err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0);
if (err)
goto out;
@@ -478,8 +483,7 @@ out:
}
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
- struct inode *inode, int type,
+static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
int locked)
{
struct ntfs_inode *ni = ntfs_i(inode);
@@ -514,7 +518,7 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
/* Translate extended attribute to acl. */
if (err >= 0) {
- acl = posix_acl_from_xattr(mnt_userns, buf, err);
+ acl = posix_acl_from_xattr(&init_user_ns, buf, err);
} else if (err == -ENODATA) {
acl = NULL;
} else {
@@ -537,8 +541,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
if (rcu)
return ERR_PTR(-ECHILD);
- /* TODO: init_user_ns? */
- return ntfs_get_acl_ex(&init_user_ns, inode, type, 0);
+ return ntfs_get_acl_ex(inode, type, 0);
}
static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
@@ -547,28 +550,23 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
{
const char *name;
size_t size, name_len;
- void *value = NULL;
- int err = 0;
+ void *value;
+ int err;
int flags;
+ umode_t mode;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
+ mode = inode->i_mode;
switch (type) {
case ACL_TYPE_ACCESS:
/* Do not change i_mode if we are in init_acl */
if (acl && !init_acl) {
- umode_t mode;
-
err = posix_acl_update_mode(mnt_userns, inode, &mode,
&acl);
if (err)
- goto out;
-
- if (inode->i_mode != mode) {
- inode->i_mode = mode;
- mark_inode_dirty(inode);
- }
+ return err;
}
name = XATTR_NAME_POSIX_ACL_ACCESS;
name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1;
@@ -595,7 +593,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
value = kmalloc(size, GFP_NOFS);
if (!value)
return -ENOMEM;
- err = posix_acl_to_xattr(mnt_userns, acl, value, size);
+ err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
if (err < 0)
goto out;
flags = 0;
@@ -604,8 +602,13 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
if (err == -ENODATA && !size)
err = 0; /* Removing non existed xattr. */
- if (!err)
+ if (!err) {
set_cached_acl(inode, type, acl);
+ if (inode->i_mode != mode) {
+ inode->i_mode = mode;
+ mark_inode_dirty(inode);
+ }
+ }
out:
kfree(value);
@@ -641,7 +644,7 @@ static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns,
if (!acl)
return -ENODATA;
- err = posix_acl_to_xattr(mnt_userns, acl, buffer, size);
+ err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
posix_acl_release(acl);
return err;
@@ -665,12 +668,12 @@ static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns,
if (!value) {
acl = NULL;
} else {
- acl = posix_acl_from_xattr(mnt_userns, value, size);
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) {
- err = posix_acl_valid(mnt_userns, acl);
+ err = posix_acl_valid(&init_user_ns, acl);
if (err)
goto release_and_out;
}
@@ -706,13 +709,13 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_default_acl = NULL;
}
- if (!acl)
- inode->i_acl = NULL;
- else {
+ if (acl) {
if (!err)
err = ntfs_set_acl_ex(mnt_userns, inode, acl,
ACL_TYPE_ACCESS, true);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return err;
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index e360543ad7e7..8b2020f92b5f 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -296,17 +296,25 @@ static void dlmfs_evict_inode(struct inode *inode)
{
int status;
struct dlmfs_inode_private *ip;
+ struct user_lock_res *lockres;
+ int teardown;
clear_inode(inode);
mlog(0, "inode %lu\n", inode->i_ino);
ip = DLMFS_I(inode);
+ lockres = &ip->ip_lockres;
if (S_ISREG(inode->i_mode)) {
- status = user_dlm_destroy_lock(&ip->ip_lockres);
- if (status < 0)
- mlog_errno(status);
+ spin_lock(&lockres->l_lock);
+ teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN);
+ spin_unlock(&lockres->l_lock);
+ if (!teardown) {
+ status = user_dlm_destroy_lock(lockres);
+ if (status < 0)
+ mlog_errno(status);
+ }
iput(ip->ip_parent);
goto clear_fields;
}
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 801e60bab955..c28bc983a7b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3403,10 +3403,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
- ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
- osb->cconn = NULL;
+ if (osb->cconn) {
+ ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+ osb->cconn = NULL;
- ocfs2_dlm_shutdown_debug(osb);
+ ocfs2_dlm_shutdown_debug(osb);
+ }
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 9099d8fc7599..22da768e65b7 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -2,12 +2,13 @@
/*
* heartbeat.c
*
- * Register ourselves with the heartbaet service, keep our node maps
+ * Register ourselves with the heartbeat service, keep our node maps
* up to date, and fire off recovery when needed.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*/
+#include <linux/bitmap.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
@@ -24,18 +25,12 @@
#include "buffer_head_io.h"
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
- int bit);
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
- int bit);
-
/* special case -1 for now
* TODO: should *really* make sure the calling func never passes -1!! */
static void ocfs2_node_map_init(struct ocfs2_node_map *map)
{
map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
- memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
- sizeof(unsigned long));
+ bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES);
}
void ocfs2_init_node_maps(struct ocfs2_super *osb)
@@ -65,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data)
ocfs2_recovery_thread(osb, node_num);
}
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
- int bit)
-{
- set_bit(bit, map->map);
-}
-
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
@@ -79,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_set_bit(map, bit);
+ set_bit(bit, map->map);
spin_unlock(&osb->node_map_lock);
}
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
- int bit)
-{
- clear_bit(bit, map->map);
-}
-
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
@@ -97,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_clear_bit(map, bit);
+ clear_bit(bit, map->map);
spin_unlock(&osb->node_map_lock);
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c75fd54b9185..961d1cf54388 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -197,6 +197,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
* callers. */
if (S_ISDIR(mode))
set_nlink(inode, 2);
+ mode = mode_strip_sgid(&init_user_ns, dir, mode);
inode_init_owner(&init_user_ns, inode, dir, mode);
status = dquot_initialize(inode);
if (status)
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 0b6f551a342a..dc9f76ab7e13 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -412,7 +412,7 @@ out_unlock:
goto out_err;
}
-/* Write information to global quota file. Expects exlusive lock on quota
+/* Write information to global quota file. Expects exclusive lock on quota
* file inode and quota info */
static int __ocfs2_global_write_info(struct super_block *sb, int type)
{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 013a727bd7c8..e2cc9eec287c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1914,8 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
!ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
- if (osb->cconn)
- ocfs2_dlm_shutdown(osb, hangup_needed);
+ ocfs2_dlm_shutdown(osb, hangup_needed);
ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
debugfs_remove_recursive(osb->osb_debug_root);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 2eada97bbd23..e065a5b9a442 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -259,7 +259,7 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
return FILEID_INVALID;
dentry = d_find_any_alias(inode);
- if (WARN_ON(!dentry))
+ if (!dentry)
return FILEID_INVALID;
bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 7922b619f6c8..0fbcb590af84 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -454,14 +454,18 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
return res;
}
+#ifdef CONFIG_FS_POSIX_ACL
/*
* Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
* of the POSIX ACLs retrieved from the lower layer to this function to not
* alter the POSIX ACLs for the underlying filesystem.
*/
-static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns,
+static void ovl_idmap_posix_acl(struct inode *realinode,
+ struct user_namespace *mnt_userns,
struct posix_acl *acl)
{
+ struct user_namespace *fs_userns = i_user_ns(realinode);
+
for (unsigned int i = 0; i < acl->a_count; i++) {
vfsuid_t vfsuid;
vfsgid_t vfsgid;
@@ -469,11 +473,11 @@ static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns,
struct posix_acl_entry *e = &acl->a_entries[i];
switch (e->e_tag) {
case ACL_USER:
- vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid);
+ vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid);
e->e_uid = vfsuid_into_kuid(vfsuid);
break;
case ACL_GROUP:
- vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid);
+ vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid);
e->e_gid = vfsgid_into_kgid(vfsgid);
break;
}
@@ -497,7 +501,7 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
struct posix_acl *acl, *clone;
struct path realpath;
- if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
+ if (!IS_POSIXACL(realinode))
return NULL;
/* Careful in RCU walk mode */
@@ -535,7 +539,7 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
if (!clone)
clone = ERR_PTR(-ENOMEM);
else
- ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone);
+ ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone);
/*
* Since we're not in RCU path walk we always need to release the
* original ACLs.
@@ -543,6 +547,7 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
posix_acl_release(acl);
return clone;
}
+#endif
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
{
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 65c4346a5b43..69dc577974f8 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -42,7 +42,7 @@ static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d,
* One of the ancestor path elements in an absolute path
* lookup in ovl_lookup_layer() could have been opaque and
* that will stop further lookup in lower layers (d->stop=true)
- * But we have found an absolute redirect in decendant path
+ * But we have found an absolute redirect in descendant path
* element and that should force continue lookup in lower
* layers (reset d->stop).
*/
@@ -648,7 +648,7 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name)
* If the index dentry for a copy up origin inode is positive, but points
* to an inode different than the upper inode, then either the upper inode
* has been copied up and not indexed or it was indexed, but since then
- * index dir was cleared. Either way, that index cannot be used to indentify
+ * index dir was cleared. Either way, that index cannot be used to identify
* the overlay inode.
*/
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6ec815b84d48..87759165d32b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -590,7 +590,13 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
+
+#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu);
+#else
+#define ovl_get_acl NULL
+#endif
+
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
bool ovl_is_private_xattr(struct super_block *sb, const char *name);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e0a2e0468ee7..ec746d447f1b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -301,7 +301,7 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
/**
* ovl_statfs
- * @sb: The overlayfs super block
+ * @dentry: The dentry to query
* @buf: The struct kstatfs to fill in with stats
*
* Get the filesystem statistics. As writes always target the upper layer
@@ -349,6 +349,8 @@ static inline int ovl_xino_def(void)
/**
* ovl_show_options
+ * @m: the seq_file handle
+ * @dentry: The dentry to query
*
* Prints the mount options for a given superblock.
* Returns zero; does not fail.
@@ -1412,11 +1414,12 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
*/
err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
if (err) {
+ pr_warn("failed to set xattr on upper\n");
ofs->noxattr = true;
if (ofs->config.index || ofs->config.metacopy) {
ofs->config.index = false;
ofs->config.metacopy = false;
- pr_warn("upper fs does not support xattr, falling back to index=off,metacopy=off.\n");
+ pr_warn("...falling back to index=off,metacopy=off.\n");
}
/*
* xattr support is required for persistent st_ino.
@@ -1424,8 +1427,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
*/
if (ofs->config.xino == OVL_XINO_AUTO) {
ofs->config.xino = OVL_XINO_OFF;
- pr_warn("upper fs does not support xattr, falling back to xino=off.\n");
+ pr_warn("...falling back to xino=off.\n");
}
+ if (err == -EPERM && !ofs->config.userxattr)
+ pr_info("try mounting with 'userxattr' option\n");
err = 0;
} else {
ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
@@ -2032,7 +2037,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
atomic_long_set(&ofs->last_ino, 1);
- /* Assume underlaying fs uses 32bit inodes unless proven otherwise */
+ /* Assume underlying fs uses 32bit inodes unless proven otherwise */
if (ofs->config.xino != OVL_XINO_OFF) {
ofs->xino_mode = BITS_PER_LONG - 32;
if (!ofs->xino_mode) {
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 1d17d7b13dcd..5af33800743e 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -361,6 +361,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
const struct posix_acl *acl, int want)
{
const struct posix_acl_entry *pa, *pe, *mask_obj;
+ struct user_namespace *fs_userns = i_user_ns(inode);
int found = 0;
vfsuid_t vfsuid;
vfsgid_t vfsgid;
@@ -376,7 +377,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
goto check_perm;
break;
case ACL_USER:
- vfsuid = make_vfsuid(mnt_userns, &init_user_ns,
+ vfsuid = make_vfsuid(mnt_userns, fs_userns,
pa->e_uid);
if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
goto mask;
@@ -390,7 +391,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
}
break;
case ACL_GROUP:
- vfsgid = make_vfsgid(mnt_userns, &init_user_ns,
+ vfsgid = make_vfsgid(mnt_userns, fs_userns,
pa->e_gid);
if (vfsgid_in_group_p(vfsgid)) {
found = 1;
@@ -736,6 +737,7 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
{
struct posix_acl_xattr_header *header = value;
struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+ struct user_namespace *fs_userns = i_user_ns(inode);
int count;
vfsuid_t vfsuid;
vfsgid_t vfsgid;
@@ -753,13 +755,13 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
switch (le16_to_cpu(entry->e_tag)) {
case ACL_USER:
uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid);
+ vfsuid = make_vfsuid(mnt_userns, fs_userns, uid);
entry->e_id = cpu_to_le32(from_kuid(&init_user_ns,
vfsuid_into_kuid(vfsuid)));
break;
case ACL_GROUP:
gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid);
+ vfsgid = make_vfsgid(mnt_userns, fs_userns, gid);
entry->e_id = cpu_to_le32(from_kgid(&init_user_ns,
vfsgid_into_kgid(vfsgid)));
break;
@@ -775,6 +777,7 @@ void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
{
struct posix_acl_xattr_header *header = value;
struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+ struct user_namespace *fs_userns = i_user_ns(inode);
int count;
vfsuid_t vfsuid;
vfsgid_t vfsgid;
@@ -793,13 +796,13 @@ void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
case ACL_USER:
uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
vfsuid = VFSUIDT_INIT(uid);
- uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid);
+ uid = from_vfsuid(mnt_userns, fs_userns, vfsuid);
entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid));
break;
case ACL_GROUP:
gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
vfsgid = VFSGIDT_INIT(gid);
- gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid);
+ gid = from_vfsgid(mnt_userns, fs_userns, vfsgid);
entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid));
break;
default:
diff --git a/fs/proc/array.c b/fs/proc/array.c
index eb815759842c..99fcbfda8e25 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -69,7 +69,6 @@
#include <linux/sched/cputime.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
-#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
@@ -100,6 +99,10 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
char tcomm[64];
+ /*
+ * Test before PF_KTHREAD because all workqueue worker threads are
+ * kernel threads.
+ */
if (p->flags & PF_WQ_WORKER)
wq_worker_comm(tcomm, sizeof(tcomm), p);
else if (p->flags & PF_KTHREAD)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8dfa36a99c74..93f7e3d971e4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1885,7 +1885,7 @@ void proc_pid_evict_inode(struct proc_inode *ei)
put_pid(pid);
}
-struct inode *proc_pid_make_inode(struct super_block * sb,
+struct inode *proc_pid_make_inode(struct super_block *sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
@@ -1914,11 +1914,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/* Let the pid remember us for quick removal */
ei->pid = pid;
- if (S_ISDIR(mode)) {
- spin_lock(&pid->lock);
- hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
- spin_unlock(&pid->lock);
- }
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
@@ -1931,6 +1926,39 @@ out_unlock:
return NULL;
}
+/*
+ * Generating an inode and adding it into @pid->inodes, so that task will
+ * invalidate inode's dentry before being released.
+ *
+ * This helper is used for creating dir-type entries under '/proc' and
+ * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>'
+ * can be released by invalidating '/proc/<tgid>' dentry.
+ * In theory, dentries under '/proc/<tgid>/task' can also be released by
+ * invalidating '/proc/<tgid>' dentry, we reserve it to handle single
+ * thread exiting situation: Any one of threads should invalidate its
+ * '/proc/<tgid>/task/<pid>' dentry before released.
+ */
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+ struct task_struct *task, umode_t mode)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct pid *pid;
+
+ inode = proc_pid_make_inode(sb, task, mode);
+ if (!inode)
+ return NULL;
+
+ /* Let proc_flush_pid find this directory inode */
+ ei = PROC_I(inode);
+ pid = ei->pid;
+ spin_lock(&pid->lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->lock);
+
+ return inode;
+}
+
int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -3369,7 +3397,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
{
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
return ERR_PTR(-ENOENT);
@@ -3671,7 +3700,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
return ERR_PTR(-ENOENT);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 73aeb4e6d32e..f495fdb39151 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -26,8 +26,6 @@
#include <linux/mount.h>
#include <linux/bug.h>
-#include <linux/uaccess.h>
-
#include "internal.h"
static void proc_evict_inode(struct inode *inode)
@@ -214,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde)
complete(pde->pde_unload_completion);
}
-/* pde is locked on entry, unlocked on exit */
+/*
+ * At most 2 contexts can enter this function: the one doing the last
+ * close on the descriptor and whoever is deleting PDE itself.
+ *
+ * First to enter calls ->proc_release hook and signals its completion
+ * to the second one which waits and then does nothing.
+ *
+ * PDE is locked on entry, unlocked on exit.
+ */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
__releases(&pde->pde_unload_lock)
{
@@ -224,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
*
* rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
* "struct file" needs to be available at the right moment.
- *
- * Therefore, first process to enter this function does ->release() and
- * signals its completion to the other process which does nothing.
*/
if (pdeo->closing) {
/* somebody else is doing that, just wait */
@@ -240,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
+
file = pdeo->file;
pde->proc_ops->proc_release(file_inode(file), file);
+
spin_lock(&pde->pde_unload_lock);
- /* After ->release. */
+ /* Strictly after ->proc_release, see above. */
list_del(&pdeo->lh);
c = pdeo->c;
spin_unlock(&pde->pde_unload_lock);
@@ -489,6 +494,9 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
+ if (!pde->proc_ops->proc_lseek)
+ file->f_mode &= ~FMODE_LSEEK;
+
if (pde_is_permanent(pde)) {
open = pde->proc_ops->proc_open;
if (open)
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index b38ad552887f..592e6dc7c110 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -15,7 +15,6 @@
#include <linux/fs.h>
#include <linux/syslog.h>
-#include <linux/uaccess.h>
#include <asm/io.h>
extern wait_queue_head_t log_wait;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 13452b32e2bd..4d3493579458 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
-#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/div64.h>
#include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 913e5acefbb6..856839b8ae8b 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -8,9 +8,6 @@
*
* proc net directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -353,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net)
kgid_t gid;
int err;
+ /*
+ * This PDE acts only as an anchor for /proc/${pid}/net hierarchy.
+ * Corresponding inode (PDE(inode) == net->proc_net) is never
+ * instantiated therefore blanket zeroing is fine.
+ * net->proc_net_stat inode is instantiated normally.
+ */
err = -ENOMEM;
netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!netd)
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index c69ff191e5d8..5c6a5ceab2f1 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -4,8 +4,6 @@
*
* Copyright 1997, Theodore Ts'o
*/
-
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef8..3c2ee3eb1138 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -6,9 +6,6 @@
*
* proc root directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -305,6 +302,11 @@ void __init proc_root_init(void)
proc_mkdir("bus", NULL);
proc_sys_init();
+ /*
+ * Last things last. It is not like userspace processes eager
+ * to open /proc files exist at this point but register last
+ * anyway.
+ */
register_filesystem(&proc_fs_type);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2d04e3470d4c..4e0023643f8b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -406,6 +406,7 @@ struct mem_size_stats {
u64 pss_anon;
u64 pss_file;
u64 pss_shmem;
+ u64 pss_dirty;
u64 pss_locked;
u64 swap_pss;
};
@@ -427,6 +428,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
mss->pss_locked += pss;
if (dirty || PageDirty(page)) {
+ mss->pss_dirty += pss;
if (private)
mss->private_dirty += size;
else
@@ -525,10 +527,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
- bool migration = false;
+ bool migration = false, young = false, dirty = false;
if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
+ young = pte_young(*pte);
+ dirty = pte_dirty(*pte);
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
@@ -558,8 +562,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
- locked, migration);
+ smaps_account(mss, page, false, young, dirty, locked, migration);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -808,6 +811,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
{
SEQ_PUT_DEC("Rss: ", mss->resident);
SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
if (rollup_mode) {
/*
* These are meaningful only for smaps_rollup, otherwise two of
@@ -860,7 +864,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- transparent_hugepage_active(vma));
+ hugepage_vma_check(vma, vma->vm_flags, true, false));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1792,7 +1796,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return NULL;
page = vm_normal_page(vma, addr, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
return NULL;
if (PageReserved(page))
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4eaeb645e759..f2aa86c421f2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,7 +25,6 @@
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
-#include <linux/uaccess.h>
#include <linux/uio.h>
#include <linux/cc_platform.h>
#include <asm/io.h>
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 49650e54d2f8..846f9455ae22 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -86,7 +86,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
static inline void mangle(struct seq_file *m, const char *s)
{
- seq_escape(m, s, " \t\n\\");
+ seq_escape(m, s, " \t\n\\#");
}
static void show_type(struct seq_file *m, struct super_block *sb)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 28966da7834e..0427b44bfee5 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -3002,7 +3002,7 @@ static int __init dquot_init(void)
pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
- if (register_shrinker(&dqcache_shrinker))
+ if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
panic("Cannot register dquot shrinker");
return 0;
diff --git a/fs/read_write.c b/fs/read_write.c
index ea59dd0095c2..1a261dcf1778 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -378,14 +378,13 @@ EXPORT_SYMBOL(rw_verify_area);
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
- struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_init(&iter, READ, &iov, 1, len);
+ iov_iter_ubuf(&iter, READ, buf, len);
ret = call_read_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -481,14 +480,13 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
- struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_init(&iter, WRITE, &iov, 1, len);
+ iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len);
ret = call_write_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 046a513dbc3a..654912d06862 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include "internal.h"
#include <linux/uaccess.h>
@@ -263,9 +264,11 @@ out_error:
* If there's an error, then the usual negative error code is returned.
* Otherwise returns 0 with *len set to the request length.
*/
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
+int
+__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *dax_read_ops)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -325,8 +328,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
- ret = vfs_dedupe_file_range_compare(file_in, pos_in,
- file_out, pos_out, *len, &is_same);
+ if (*len == 0)
+ return 0;
+
+ if (!IS_DAX(inode_in))
+ ret = vfs_dedupe_file_range_compare(file_in, pos_in,
+ file_out, pos_out, *len, &is_same);
+ else if (dax_read_ops)
+ ret = dax_dedupe_file_range_compare(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same,
+ dax_read_ops);
+ else
+ return -EINVAL;
if (ret)
return ret;
if (!is_same)
@@ -344,6 +357,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
return ret;
}
+
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, NULL);
+}
EXPORT_SYMBOL(generic_remap_file_range_prep);
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
diff --git a/fs/splice.c b/fs/splice.c
index 93a2c9bf6249..0878b852b355 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -301,11 +301,9 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
{
struct iov_iter to;
struct kiocb kiocb;
- unsigned int i_head;
int ret;
iov_iter_pipe(&to, READ, pipe, len);
- i_head = to.head;
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -313,9 +311,8 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
*ppos = kiocb.ki_pos;
file_accessed(in);
} else if (ret < 0) {
- to.head = i_head;
- to.iov_offset = 0;
- iov_iter_advance(&to, 0); /* to free what was emitted */
+ /* free what was emitted */
+ pipe_discard_from(pipe, to.start_head);
/*
* callers of ->splice_read() expect -EAGAIN on
* "can't put anything in there", rather than -EFAULT.
@@ -1161,39 +1158,40 @@ static int iter_to_pipe(struct iov_iter *from,
};
size_t total = 0;
int ret = 0;
- bool failed = false;
- while (iov_iter_count(from) && !failed) {
+ while (iov_iter_count(from)) {
struct page *pages[16];
- ssize_t copied;
+ ssize_t left;
size_t start;
- int n;
+ int i, n;
- copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
- if (copied <= 0) {
- ret = copied;
+ left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
+ if (left <= 0) {
+ ret = left;
break;
}
- for (n = 0; copied; n++, start = 0) {
- int size = min_t(int, copied, PAGE_SIZE - start);
- if (!failed) {
- buf.page = pages[n];
- buf.offset = start;
- buf.len = size;
- ret = add_to_pipe(pipe, &buf);
- if (unlikely(ret < 0)) {
- failed = true;
- } else {
- iov_iter_advance(from, ret);
- total += ret;
- }
- } else {
- put_page(pages[n]);
+ n = DIV_ROUND_UP(left + start, PAGE_SIZE);
+ for (i = 0; i < n; i++) {
+ int size = min_t(int, left, PAGE_SIZE - start);
+
+ buf.page = pages[i];
+ buf.offset = start;
+ buf.len = size;
+ ret = add_to_pipe(pipe, &buf);
+ if (unlikely(ret < 0)) {
+ iov_iter_revert(from, left);
+ // this one got dropped by add_to_pipe()
+ while (++i < n)
+ put_page(pages[i]);
+ goto out;
}
- copied -= size;
+ total += ret;
+ left -= size;
+ start = 0;
}
}
+out:
return total ? total : ret;
}
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7bd9b8b856d0..477c89a519ee 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,9 +5,9 @@
obj-$(CONFIG_SQUASHFS) += squashfs.o
squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o
squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
-squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8879d052f96c..833aca92301f 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,12 +34,15 @@ static int copy_bio_to_actor(struct bio *bio,
struct squashfs_page_actor *actor,
int offset, int req_length)
{
- void *actor_addr = squashfs_first_page(actor);
+ void *actor_addr;
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
int copied_bytes = 0;
int actor_offset = 0;
+ squashfs_actor_nobuff(actor);
+ actor_addr = squashfs_first_page(actor);
+
if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all)))
return 0;
@@ -49,8 +52,9 @@ static int copy_bio_to_actor(struct bio *bio,
bytes_to_copy = min_t(int, bytes_to_copy,
req_length - copied_bytes);
- memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset,
- bytes_to_copy);
+ if (!IS_ERR(actor_addr))
+ memcpy(actor_addr + actor_offset, bvec_virt(bvec) +
+ offset, bytes_to_copy);
actor_offset += bytes_to_copy;
copied_bytes += bytes_to_copy;
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 1b9ccfd0aa51..19ab60834389 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -20,6 +20,7 @@ struct squashfs_decompressor {
struct bio *, int, int, struct squashfs_page_actor *);
int id;
char *name;
+ int alloc_buffer;
int supported;
};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 7f0904b20329..e56510964b22 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -39,6 +39,7 @@
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "page_actor.h"
/*
* Locate cache slot in range [offset, index] for specified inode. If
@@ -496,7 +497,137 @@ out:
return res;
}
+static int squashfs_readahead_fragment(struct page **page,
+ unsigned int pages, unsigned int expected)
+{
+ struct inode *inode = page[0]->mapping->host;
+ struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
+ squashfs_i(inode)->fragment_block,
+ squashfs_i(inode)->fragment_size);
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
+
+ if (buffer->error)
+ goto out;
+
+ expected += squashfs_i(inode)->fragment_offset;
+
+ for (n = 0; n < pages; n++) {
+ unsigned int base = (page[n]->index & mask) << PAGE_SHIFT;
+ unsigned int offset = base + squashfs_i(inode)->fragment_offset;
+
+ if (expected > offset) {
+ unsigned int avail = min_t(unsigned int, expected -
+ offset, PAGE_SIZE);
+
+ squashfs_fill_page(page[n], buffer, offset, avail);
+ }
+
+ unlock_page(page[n]);
+ put_page(page[n]);
+ }
+
+out:
+ squashfs_cache_put(buffer);
+ return buffer->error;
+}
+
+static void squashfs_readahead(struct readahead_control *ractl)
+{
+ struct inode *inode = ractl->mapping->host;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ size_t mask = (1UL << msblk->block_log) - 1;
+ unsigned short shift = msblk->block_log - PAGE_SHIFT;
+ loff_t start = readahead_pos(ractl) & ~mask;
+ size_t len = readahead_length(ractl) + readahead_pos(ractl) - start;
+ struct squashfs_page_actor *actor;
+ unsigned int nr_pages = 0;
+ struct page **pages;
+ int i, file_end = i_size_read(inode) >> msblk->block_log;
+ unsigned int max_pages = 1UL << shift;
+
+ readahead_expand(ractl, start, (len | mask) + 1);
+
+ pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL);
+ if (!pages)
+ return;
+
+ for (;;) {
+ pgoff_t index;
+ int res, bsize;
+ u64 block = 0;
+ unsigned int expected;
+
+ nr_pages = __readahead_batch(ractl, pages, max_pages);
+ if (!nr_pages)
+ break;
+
+ if (readahead_pos(ractl) >= i_size_read(inode))
+ goto skip_pages;
+
+ index = pages[0]->index >> shift;
+ if ((pages[nr_pages - 1]->index >> shift) != index)
+ goto skip_pages;
+
+ expected = index == file_end ?
+ (i_size_read(inode) & (msblk->block_size - 1)) :
+ msblk->block_size;
+
+ if (index == file_end && squashfs_i(inode)->fragment_block !=
+ SQUASHFS_INVALID_BLK) {
+ res = squashfs_readahead_fragment(pages, nr_pages,
+ expected);
+ if (res)
+ goto skip_pages;
+ continue;
+ }
+
+ bsize = read_blocklist(inode, index, &block);
+ if (bsize == 0)
+ goto skip_pages;
+
+ actor = squashfs_page_actor_init_special(msblk, pages, nr_pages,
+ expected);
+ if (!actor)
+ goto skip_pages;
+
+ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+
+ squashfs_page_actor_free(actor);
+
+ if (res == expected) {
+ int bytes;
+
+ /* Last page (if present) may have trailing bytes not filled */
+ bytes = res % PAGE_SIZE;
+ if (pages[nr_pages - 1]->index == file_end && bytes)
+ memzero_page(pages[nr_pages - 1], bytes,
+ PAGE_SIZE - bytes);
+
+ for (i = 0; i < nr_pages; i++) {
+ flush_dcache_page(pages[i]);
+ SetPageUptodate(pages[i]);
+ }
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+
+ kfree(pages);
+ return;
+
+skip_pages:
+ for (i = 0; i < nr_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+}
const struct address_space_operations squashfs_aops = {
- .read_folio = squashfs_read_folio
+ .read_folio = squashfs_read_folio,
+ .readahead = squashfs_readahead
};
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index a4894cc59447..f1ccad519e28 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -18,9 +18,6 @@
#include "squashfs.h"
#include "page_actor.h"
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
- int pages, struct page **page, int bytes);
-
/* Read separately compressed datablock directly into page cache */
int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
int expected)
@@ -33,7 +30,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
int start_index = target_page->index & ~mask;
int end_index = start_index | mask;
- int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+ int i, n, pages, bytes, res = -ENOMEM;
struct page **page;
struct squashfs_page_actor *actor;
void *pageaddr;
@@ -47,50 +44,38 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
if (page == NULL)
return res;
- /*
- * Create a "page actor" which will kmap and kunmap the
- * page cache pages appropriately within the decompressor
- */
- actor = squashfs_page_actor_init_special(page, pages, 0);
- if (actor == NULL)
- goto out;
-
/* Try to grab all the pages covered by the Squashfs block */
- for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+ for (i = 0, n = start_index; n <= end_index; n++) {
page[i] = (n == target_page->index) ? target_page :
grab_cache_page_nowait(target_page->mapping, n);
- if (page[i] == NULL) {
- missing_pages++;
+ if (page[i] == NULL)
continue;
- }
if (PageUptodate(page[i])) {
unlock_page(page[i]);
put_page(page[i]);
- page[i] = NULL;
- missing_pages++;
+ continue;
}
+
+ i++;
}
- if (missing_pages) {
- /*
- * Couldn't get one or more pages, this page has either
- * been VM reclaimed, but others are still in the page cache
- * and uptodate, or we're racing with another thread in
- * squashfs_readpage also trying to grab them. Fall back to
- * using an intermediate buffer.
- */
- res = squashfs_read_cache(target_page, block, bsize, pages,
- page, expected);
- if (res < 0)
- goto mark_errored;
+ pages = i;
+ /*
+ * Create a "page actor" which will kmap and kunmap the
+ * page cache pages appropriately within the decompressor
+ */
+ actor = squashfs_page_actor_init_special(msblk, page, pages, expected);
+ if (actor == NULL)
goto out;
- }
/* Decompress directly into the page cache buffers */
res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+
+ squashfs_page_actor_free(actor);
+
if (res < 0)
goto mark_errored;
@@ -99,12 +84,12 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
goto mark_errored;
}
- /* Last page may have trailing bytes not filled */
+ /* Last page (if present) may have trailing bytes not filled */
bytes = res % PAGE_SIZE;
- if (bytes) {
- pageaddr = kmap_atomic(page[pages - 1]);
+ if (page[pages - 1]->index == end_index && bytes) {
+ pageaddr = kmap_local_page(page[pages - 1]);
memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
- kunmap_atomic(pageaddr);
+ kunmap_local(pageaddr);
}
/* Mark pages as uptodate, unlock and release */
@@ -116,7 +101,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
put_page(page[i]);
}
- kfree(actor);
kfree(page);
return 0;
@@ -135,40 +119,6 @@ mark_errored:
}
out:
- kfree(actor);
kfree(page);
return res;
}
-
-
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
- int pages, struct page **page, int bytes)
-{
- struct inode *i = target_page->mapping->host;
- struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
- block, bsize);
- int res = buffer->error, n, offset = 0;
-
- if (res) {
- ERROR("Unable to read page, block %llx, size %x\n", block,
- bsize);
- goto out;
- }
-
- for (n = 0; n < pages && bytes > 0; n++,
- bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
- int avail = min_t(int, bytes, PAGE_SIZE);
-
- if (page[n] == NULL)
- continue;
-
- squashfs_fill_page(page[n], buffer, offset, avail);
- unlock_page(page[n]);
- if (page[n] != target_page)
- put_page(page[n]);
- }
-
-out:
- squashfs_cache_put(buffer);
- return res;
-}
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index b685b6238316..49797729f143 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -119,10 +119,12 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
buff = stream->output;
while (data) {
if (bytes <= PAGE_SIZE) {
- memcpy(data, buff, bytes);
+ if (!IS_ERR(data))
+ memcpy(data, buff, bytes);
break;
}
- memcpy(data, buff, PAGE_SIZE);
+ if (!IS_ERR(data))
+ memcpy(data, buff, PAGE_SIZE);
buff += PAGE_SIZE;
bytes -= PAGE_SIZE;
data = squashfs_next_page(output);
@@ -139,5 +141,6 @@ const struct squashfs_decompressor squashfs_lz4_comp_ops = {
.decompress = lz4_uncompress,
.id = LZ4_COMPRESSION,
.name = "lz4",
+ .alloc_buffer = 0,
.supported = 1
};
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index cb510a631968..d216aeefa865 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -93,10 +93,12 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
buff = stream->output;
while (data) {
if (bytes <= PAGE_SIZE) {
- memcpy(data, buff, bytes);
+ if (!IS_ERR(data))
+ memcpy(data, buff, bytes);
break;
} else {
- memcpy(data, buff, PAGE_SIZE);
+ if (!IS_ERR(data))
+ memcpy(data, buff, PAGE_SIZE);
buff += PAGE_SIZE;
bytes -= PAGE_SIZE;
data = squashfs_next_page(output);
@@ -116,5 +118,6 @@ const struct squashfs_decompressor squashfs_lzo_comp_ops = {
.decompress = lzo_uncompress,
.id = LZO_COMPRESSION,
.name = "lzo",
+ .alloc_buffer = 0,
.supported = 1
};
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 520d323a99ce..54b93bf4a25c 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -7,6 +7,8 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
#include "page_actor.h"
/*
@@ -50,6 +52,7 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
actor->buffer = buffer;
actor->pages = pages;
actor->next_page = 0;
+ actor->tmp_buffer = NULL;
actor->squashfs_first_page = cache_first_page;
actor->squashfs_next_page = cache_next_page;
actor->squashfs_finish_page = cache_finish_page;
@@ -57,40 +60,72 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
}
/* Implementation of page_actor for decompressing directly into page cache. */
+static void *handle_next_page(struct squashfs_page_actor *actor)
+{
+ int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ if (actor->returned_pages == max_pages)
+ return NULL;
+
+ if ((actor->next_page == actor->pages) ||
+ (actor->next_index != actor->page[actor->next_page]->index)) {
+ actor->next_index++;
+ actor->returned_pages++;
+ return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM);
+ }
+
+ actor->next_index++;
+ actor->returned_pages++;
+ return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]);
+}
+
static void *direct_first_page(struct squashfs_page_actor *actor)
{
- actor->next_page = 1;
- return actor->pageaddr = kmap_atomic(actor->page[0]);
+ return handle_next_page(actor);
}
static void *direct_next_page(struct squashfs_page_actor *actor)
{
- if (actor->pageaddr)
- kunmap_atomic(actor->pageaddr);
+ if (actor->pageaddr) {
+ kunmap_local(actor->pageaddr);
+ actor->pageaddr = NULL;
+ }
- return actor->pageaddr = actor->next_page == actor->pages ? NULL :
- kmap_atomic(actor->page[actor->next_page++]);
+ return handle_next_page(actor);
}
static void direct_finish_page(struct squashfs_page_actor *actor)
{
if (actor->pageaddr)
- kunmap_atomic(actor->pageaddr);
+ kunmap_local(actor->pageaddr);
}
-struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
- int pages, int length)
+struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk,
+ struct page **page, int pages, int length)
{
struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
if (actor == NULL)
return NULL;
+ if (msblk->decompressor->alloc_buffer) {
+ actor->tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+
+ if (actor->tmp_buffer == NULL) {
+ kfree(actor);
+ return NULL;
+ }
+ } else
+ actor->tmp_buffer = NULL;
+
actor->length = length ? : pages * PAGE_SIZE;
actor->page = page;
actor->pages = pages;
actor->next_page = 0;
+ actor->returned_pages = 0;
+ actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1);
actor->pageaddr = NULL;
+ actor->alloc_buffer = msblk->decompressor->alloc_buffer;
actor->squashfs_first_page = direct_first_page;
actor->squashfs_next_page = direct_next_page;
actor->squashfs_finish_page = direct_finish_page;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 2e3073ace009..95ffbb543d91 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -6,63 +6,34 @@
* Phillip Lougher <phillip@squashfs.org.uk>
*/
-#ifndef CONFIG_SQUASHFS_FILE_DIRECT
-struct squashfs_page_actor {
- void **page;
- int pages;
- int length;
- int next_page;
-};
-
-static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
- int pages, int length)
-{
- struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
-
- if (actor == NULL)
- return NULL;
-
- actor->length = length ? : pages * PAGE_SIZE;
- actor->page = page;
- actor->pages = pages;
- actor->next_page = 0;
- return actor;
-}
-
-static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
-{
- actor->next_page = 1;
- return actor->page[0];
-}
-
-static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
-{
- return actor->next_page == actor->pages ? NULL :
- actor->page[actor->next_page++];
-}
-
-static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
-{
- /* empty */
-}
-#else
struct squashfs_page_actor {
union {
void **buffer;
struct page **page;
};
void *pageaddr;
+ void *tmp_buffer;
void *(*squashfs_first_page)(struct squashfs_page_actor *);
void *(*squashfs_next_page)(struct squashfs_page_actor *);
void (*squashfs_finish_page)(struct squashfs_page_actor *);
int pages;
int length;
int next_page;
+ int alloc_buffer;
+ int returned_pages;
+ pgoff_t next_index;
};
-extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
-extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
- **, int, int);
+extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
+ int pages, int length);
+extern struct squashfs_page_actor *squashfs_page_actor_init_special(
+ struct squashfs_sb_info *msblk,
+ struct page **page, int pages, int length);
+static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor)
+{
+ kfree(actor->tmp_buffer);
+ kfree(actor);
+}
static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
{
return actor->squashfs_first_page(actor);
@@ -75,5 +46,8 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
{
actor->squashfs_finish_page(actor);
}
-#endif
+static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
+{
+ actor->alloc_buffer = 0;
+}
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6d594ba2ed28..32565dafa7f3 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -29,7 +29,6 @@
#include <linux/module.h>
#include <linux/magic.h>
#include <linux/xattr.h>
-#include <linux/backing-dev.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(
return decompressor;
}
-static int squashfs_bdi_init(struct super_block *sb)
-{
- int err;
- unsigned int major = MAJOR(sb->s_dev);
- unsigned int minor = MINOR(sb->s_dev);
-
- bdi_put(sb->s_bdi);
- sb->s_bdi = &noop_backing_dev_info;
-
- err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor);
- if (err)
- return err;
-
- sb->s_bdi->ra_pages = 0;
- sb->s_bdi->io_pages = 0;
-
- return 0;
-}
static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
@@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
TRACE("Entered squashfs_fill_superblock\n");
- /*
- * squashfs provides 'backing_dev_info' in order to disable read-ahead. For
- * squashfs, I/O is not deferred, it is done immediately in read_folio,
- * which means the user would always have to wait their own I/O. So the effect
- * of readahead is very weak for squashfs. squashfs_bdi_init will set
- * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for
- * squashfs.
- */
- err = squashfs_bdi_init(sb);
- if (err) {
- errorf(fc, "squashfs init bdi failed");
- return err;
- }
-
sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
ERROR("Failed to allocate squashfs_sb_info\n");
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 68f6d09bb3a2..6c49481a2f8c 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -131,6 +131,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->buf.out_pos = 0;
stream->buf.out_size = PAGE_SIZE;
stream->buf.out = squashfs_first_page(output);
+ if (IS_ERR(stream->buf.out)) {
+ error = PTR_ERR(stream->buf.out);
+ goto finish;
+ }
for (;;) {
enum xz_ret xz_err;
@@ -156,7 +160,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (stream->buf.out_pos == stream->buf.out_size) {
stream->buf.out = squashfs_next_page(output);
- if (stream->buf.out != NULL) {
+ if (IS_ERR(stream->buf.out)) {
+ error = PTR_ERR(stream->buf.out);
+ break;
+ } else if (stream->buf.out != NULL) {
stream->buf.out_pos = 0;
total += PAGE_SIZE;
}
@@ -171,6 +178,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
}
+finish:
squashfs_finish_page(output);
return error ? error : total + stream->buf.out_pos;
@@ -183,5 +191,6 @@ const struct squashfs_decompressor squashfs_xz_comp_ops = {
.decompress = squashfs_xz_uncompress,
.id = XZ_COMPRESSION,
.name = "xz",
+ .alloc_buffer = 1,
.supported = 1
};
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index a20e9042146b..cbb7afe7bc46 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -62,6 +62,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->next_out = squashfs_first_page(output);
stream->avail_in = 0;
+ if (IS_ERR(stream->next_out)) {
+ error = PTR_ERR(stream->next_out);
+ goto finish;
+ }
+
for (;;) {
int zlib_err;
@@ -85,7 +90,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (stream->avail_out == 0) {
stream->next_out = squashfs_next_page(output);
- if (stream->next_out != NULL)
+ if (IS_ERR(stream->next_out)) {
+ error = PTR_ERR(stream->next_out);
+ break;
+ } else if (stream->next_out != NULL)
stream->avail_out = PAGE_SIZE;
}
@@ -107,6 +115,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
}
+finish:
squashfs_finish_page(output);
if (!error)
@@ -122,6 +131,7 @@ const struct squashfs_decompressor squashfs_zlib_comp_ops = {
.decompress = zlib_uncompress,
.id = ZLIB_COMPRESSION,
.name = "zlib",
+ .alloc_buffer = 1,
.supported = 1
};
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index c40445dbf38c..0e407c4d8b3b 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -80,6 +80,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
out_buf.size = PAGE_SIZE;
out_buf.dst = squashfs_first_page(output);
+ if (IS_ERR(out_buf.dst)) {
+ error = PTR_ERR(out_buf.dst);
+ goto finish;
+ }
for (;;) {
size_t zstd_err;
@@ -104,7 +108,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (out_buf.pos == out_buf.size) {
out_buf.dst = squashfs_next_page(output);
- if (out_buf.dst == NULL) {
+ if (IS_ERR(out_buf.dst)) {
+ error = PTR_ERR(out_buf.dst);
+ break;
+ } else if (out_buf.dst == NULL) {
/* Shouldn't run out of pages
* before stream is done.
*/
@@ -129,6 +136,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
}
+finish:
+
squashfs_finish_page(output);
return error ? error : total_out;
@@ -140,5 +149,6 @@ const struct squashfs_decompressor squashfs_zstd_comp_ops = {
.decompress = zstd_uncompress,
.id = ZSTD_COMPRESSION,
.name = "zstd",
+ .alloc_buffer = 1,
.supported = 1
};
diff --git a/fs/super.c b/fs/super.c
index 60f57c7bc0a6..734ed584a946 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -265,7 +265,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
- if (prealloc_shrinker(&s->s_shrink))
+ if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name))
goto fail;
if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
goto fail;
@@ -423,6 +423,35 @@ bool trylock_super(struct super_block *sb)
}
/**
+ * retire_super - prevents superblock from being reused
+ * @sb: superblock to retire
+ *
+ * The function marks superblock to be ignored in superblock test, which
+ * prevents it from being reused for any new mounts. If the superblock has
+ * a private bdi, it also unregisters it, but doesn't reduce the refcount
+ * of the superblock to prevent potential races. The refcount is reduced
+ * by generic_shutdown_super(). The function can not be called
+ * concurrently with generic_shutdown_super(). It is safe to call the
+ * function multiple times, subsequent calls have no effect.
+ *
+ * The marker will affect the re-use only for block-device-based
+ * superblocks. Other superblocks will still get marked if this function
+ * is used, but that will not affect their reusability.
+ */
+void retire_super(struct super_block *sb)
+{
+ WARN_ON(!sb->s_bdev);
+ down_write(&sb->s_umount);
+ if (sb->s_iflags & SB_I_PERSB_BDI) {
+ bdi_unregister(sb->s_bdi);
+ sb->s_iflags &= ~SB_I_PERSB_BDI;
+ }
+ sb->s_iflags |= SB_I_RETIRED;
+ up_write(&sb->s_umount);
+}
+EXPORT_SYMBOL(retire_super);
+
+/**
* generic_shutdown_super - common helper for ->kill_sb()
* @sb: superblock to kill
*
@@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
- return s->s_bdev == fc->sget_key;
+ return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key;
}
/**
@@ -1288,6 +1317,8 @@ int get_tree_bdev(struct fs_context *fc,
} else {
s->s_mode = mode;
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+ fc->fs_type->name, s->s_id);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, fc);
if (error) {
@@ -1307,7 +1338,7 @@ EXPORT_SYMBOL(get_tree_bdev);
static int test_bdev_super(struct super_block *s, void *data)
{
- return (void *)s->s_bdev == data;
+ return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data;
}
struct dentry *mount_bdev(struct file_system_type *fs_type,
@@ -1363,6 +1394,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
} else {
s->s_mode = mode;
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+ fs_type->name, s->s_id);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 81d26abf486f..da85b3979195 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -141,6 +141,8 @@ struct tracefs_mount_opts {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ /* Opt_* bitfield. */
+ unsigned int opts;
};
enum {
@@ -241,6 +243,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
kgid_t gid;
char *p;
+ opts->opts = 0;
opts->mode = TRACEFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
@@ -275,24 +278,36 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
* but traditionally tracefs has ignored all mount options
*/
}
+
+ opts->opts |= BIT(token);
}
return 0;
}
-static int tracefs_apply_options(struct super_block *sb)
+static int tracefs_apply_options(struct super_block *sb, bool remount)
{
struct tracefs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
struct tracefs_mount_opts *opts = &fsi->mount_opts;
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ /*
+ * On remount, only reset mode/uid/gid if they were provided as mount
+ * options.
+ */
+
+ if (!remount || opts->opts & BIT(Opt_mode)) {
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+ }
- inode->i_uid = opts->uid;
+ if (!remount || opts->opts & BIT(Opt_uid))
+ inode->i_uid = opts->uid;
- /* Set all the group ids to the mount option */
- set_gid(sb->s_root, opts->gid);
+ if (!remount || opts->opts & BIT(Opt_gid)) {
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
+ }
return 0;
}
@@ -307,7 +322,7 @@ static int tracefs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto fail;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, true);
fail:
return err;
@@ -359,7 +374,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &tracefs_super_operations;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, false);
return 0;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 0978d01b0ea4..d0c9a09988bc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2430,7 +2430,7 @@ static int __init ubifs_init(void)
if (!ubifs_inode_slab)
return -ENOMEM;
- err = register_shrinker(&ubifs_shrinker_info);
+ err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab");
if (err)
goto out_slab;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index de86f5b2859f..175de70e3adf 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1601,6 +1601,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
}
+ /* Reset ptes for the whole vma range if wr-protected */
+ if (userfaultfd_wp(vma))
+ uffd_wp_range(mm, vma, start, vma_end - start, false);
+
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
@@ -1925,10 +1929,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
- features = uffdio_api.features;
- ret = -EINVAL;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
- goto err_out;
+ /* Ignore unsupported features (userspace built against newer kernel) */
+ features = uffdio_api.features & UFFD_API_FEATURES;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b056cfc6398e..03135a1c31b6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -106,6 +106,7 @@ xfs-y += xfs_log.o \
xfs_icreate_item.o \
xfs_inode_item.o \
xfs_inode_item_recover.o \
+ xfs_iunlink_item.o \
xfs_refcount_item.o \
xfs_rmap_item.o \
xfs_log_recover.o \
@@ -129,6 +130,11 @@ xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
+# notify failure
+ifeq ($(CONFIG_MEMORY_FAILURE),y)
+xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
+endif
+
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 3e920cf1b454..bb0c700afe3c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -120,18 +120,18 @@ xfs_initialize_perag_data(
for (index = 0; index < agcount; index++) {
/*
- * read the agf, then the agi. This gets us
- * all the information we need and populates the
- * per-ag structures for us.
+ * Read the AGF and AGI buffers to populate the per-ag
+ * structures for us.
*/
- error = xfs_alloc_pagf_init(mp, NULL, index, 0);
- if (error)
+ pag = xfs_perag_get(mp, index);
+ error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
+ if (!error)
+ error = xfs_ialloc_read_agi(pag, NULL, NULL);
+ if (error) {
+ xfs_perag_put(pag);
return error;
+ }
- error = xfs_ialloc_pagi_init(mp, NULL, index);
- if (error)
- return error;
- pag = xfs_perag_get(mp, index);
ifree += pag->pagi_freecount;
ialloc += pag->pagi_count;
bfree += pag->pagf_freeblks;
@@ -194,17 +194,76 @@ xfs_free_perag(
XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
call_rcu(&pag->rcu_head, __xfs_free_perag);
}
}
+/* Find the size of the AG, in blocks. */
+static xfs_agblock_t
+__xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks)
+{
+ ASSERT(agno < agcount);
+
+ if (agno < agcount - 1)
+ return mp->m_sb.sb_agblocks;
+ return dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+xfs_agblock_t
+xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks);
+}
+
+/* Calculate the first and last possible inode number in an AG. */
+static void
+__xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agblock_t eoag,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ xfs_agblock_t bno;
+
+ /*
+ * Calculate the first inode, which will be in the first
+ * cluster-aligned block after the AGFL.
+ */
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
+ *first = XFS_AGB_TO_AGINO(mp, bno);
+
+ /*
+ * Calculate the last inode, which will be at the end of the
+ * last (aligned) cluster that can be allocated in the AG.
+ */
+ bno = round_down(eoag, M_IGEO(mp)->cluster_align);
+ *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
+}
+
+void
+xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
+}
+
int
xfs_initialize_perag(
struct xfs_mount *mp,
xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks,
xfs_agnumber_t *maxagi)
{
struct xfs_perag *pag;
@@ -263,13 +322,18 @@ xfs_initialize_perag(
if (error)
goto out_remove_pag;
- error = xfs_iunlink_init(pag);
- if (error)
- goto out_hash_destroy;
-
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
+
+ /*
+ * Pre-calculated geometry
+ */
+ pag->block_count = __xfs_ag_block_count(mp, index, agcount,
+ dblocks);
+ pag->min_block = XFS_AGFL_BLOCK(mp);
+ __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -280,8 +344,6 @@ xfs_initialize_perag(
mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
return 0;
-out_hash_destroy:
- xfs_buf_hash_destroy(pag);
out_remove_pag:
radix_tree_delete(&mp->m_perag_tree, index);
out_free_pag:
@@ -293,7 +355,6 @@ out_unwind_new_pags:
if (!pag)
break;
xfs_buf_hash_destroy(pag);
- xfs_iunlink_destroy(pag);
kmem_free(pag);
}
return error;
@@ -321,12 +382,6 @@ xfs_get_aghdr_buf(
return 0;
}
-static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
-{
- return mp->m_sb.sb_logstart > 0 &&
- id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
-}
-
/*
* Generic btree root block init function
*/
@@ -352,7 +407,7 @@ xfs_freesp_init_recs(
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
struct xfs_alloc_rec *nrec;
xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp,
mp->m_sb.sb_logstart);
@@ -479,7 +534,7 @@ xfs_rmaproot_init(
}
/* account for the log space */
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
rrec = XFS_RMAP_REC_ADDR(block,
be16_to_cpu(block->bb_numrecs) + 1);
rrec->rm_startblock = cpu_to_be32(
@@ -550,7 +605,7 @@ xfs_agfblock_init(
agf->agf_refcount_blocks = cpu_to_be32(1);
}
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
int64_t logblocks = mp->m_sb.sb_logblocks;
be32_add_cpu(&agf->agf_freeblks, -logblocks);
@@ -761,11 +816,11 @@ xfs_ag_init_headers(
int
xfs_ag_shrink_space(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans **tpp,
- xfs_agnumber_t agno,
xfs_extlen_t delta)
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_alloc_arg args = {
.tp = *tpp,
.mp = mp,
@@ -782,14 +837,14 @@ xfs_ag_shrink_space(
xfs_agblock_t aglen;
int error, err2;
- ASSERT(agno == mp->m_sb.sb_agcount - 1);
- error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp);
+ ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
+ error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
if (error)
return error;
agi = agibp->b_addr;
- error = xfs_alloc_read_agf(mp, *tpp, agno, 0, &agfbp);
+ error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp);
if (error)
return error;
@@ -801,13 +856,14 @@ xfs_ag_shrink_space(
if (delta >= aglen)
return -EINVAL;
- args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta);
+ args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta);
/*
* Make sure that the last inode cluster cannot overlap with the new
* end of the AG, even if it's sparse.
*/
- error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta);
+ error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp,
+ aglen - delta);
if (error)
return error;
@@ -815,7 +871,7 @@ xfs_ag_shrink_space(
* Disable perag reservations so it doesn't cause the allocation request
* to fail. We'll reestablish reservation before we return.
*/
- error = xfs_ag_resv_free(agibp->b_pag);
+ error = xfs_ag_resv_free(pag);
if (error)
return error;
@@ -844,7 +900,7 @@ xfs_ag_shrink_space(
be32_add_cpu(&agi->agi_length, -delta);
be32_add_cpu(&agf->agf_length, -delta);
- err2 = xfs_ag_resv_init(agibp->b_pag, *tpp);
+ err2 = xfs_ag_resv_init(pag, *tpp);
if (err2) {
be32_add_cpu(&agi->agi_length, delta);
be32_add_cpu(&agf->agf_length, delta);
@@ -868,8 +924,9 @@ xfs_ag_shrink_space(
xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
return 0;
+
resv_init_out:
- err2 = xfs_ag_resv_init(agibp->b_pag, *tpp);
+ err2 = xfs_ag_resv_init(pag, *tpp);
if (!err2)
return error;
resv_err:
@@ -883,9 +940,8 @@ resv_err:
*/
int
xfs_ag_extend_space(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- struct aghdr_init_data *id,
xfs_extlen_t len)
{
struct xfs_buf *bp;
@@ -893,23 +949,20 @@ xfs_ag_extend_space(
struct xfs_agf *agf;
int error;
- /*
- * Change the agi length.
- */
- error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
+ ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
+
+ error = xfs_ialloc_read_agi(pag, tp, &bp);
if (error)
return error;
agi = bp->b_addr;
be32_add_cpu(&agi->agi_length, len);
- ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
- be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
/*
* Change agf length.
*/
- error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &bp);
if (error)
return error;
@@ -924,49 +977,49 @@ xfs_ag_extend_space(
* XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
* this doesn't actually exist in the rmap btree.
*/
- error = xfs_rmap_free(tp, bp, bp->b_pag,
- be32_to_cpu(agf->agf_length) - len,
+ error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len,
len, &XFS_RMAP_OINFO_SKIP_UPDATE);
if (error)
return error;
- return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
+ error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
be32_to_cpu(agf->agf_length) - len),
len, &XFS_RMAP_OINFO_SKIP_UPDATE,
XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
+ /* Update perag geometry */
+ pag->block_count = be32_to_cpu(agf->agf_length);
+ __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ return 0;
}
/* Retrieve AG geometry. */
int
xfs_ag_get_geometry(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_ag_geometry *ageo)
{
struct xfs_buf *agi_bp;
struct xfs_buf *agf_bp;
struct xfs_agi *agi;
struct xfs_agf *agf;
- struct xfs_perag *pag;
unsigned int freeblks;
int error;
- if (agno >= mp->m_sb.sb_agcount)
- return -EINVAL;
-
/* Lock the AG headers. */
- error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
if (error)
return error;
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
if (error)
goto out_agi;
- pag = agi_bp->b_pag;
-
/* Fill out form. */
memset(ageo, 0, sizeof(*ageo));
- ageo->ag_number = agno;
+ ageo->ag_number = pag->pag_agno;
agi = agi_bp->b_addr;
ageo->ag_icount = be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index e411d51c2589..517a138faa66 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -67,6 +67,12 @@ struct xfs_perag {
/* for rcu-safe freeing */
struct rcu_head rcu_head;
+ /* Precalculated geometry info */
+ xfs_agblock_t block_count;
+ xfs_agblock_t min_block;
+ xfs_agino_t agino_min;
+ xfs_agino_t agino_max;
+
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
@@ -97,17 +103,11 @@ struct xfs_perag {
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
- /*
- * Unlinked inode information. This incore information reflects
- * data stored in the AGI, so callers must hold the AGI buffer lock
- * or have some other means to control concurrency.
- */
- struct rhashtable pagi_unlinked_hash;
#endif /* __KERNEL__ */
};
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
- xfs_agnumber_t *maxagi);
+ xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
void xfs_free_perag(struct xfs_mount *mp);
@@ -117,6 +117,56 @@ struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
void xfs_perag_put(struct xfs_perag *pag);
/*
+ * Per-ag geometry infomation and validation
+ */
+xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t *first, xfs_agino_t *last);
+
+static inline bool
+xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
+{
+ if (agbno >= pag->block_count)
+ return false;
+ if (agbno <= pag->min_block)
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino < pag->agino_min)
+ return false;
+ if (agino > pag->agino_max)
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+static inline bool
+xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino == NULLAGINO)
+ return true;
+ return xfs_verify_agino(pag, agino);
+}
+
+static inline bool
+xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+ return mp->m_sb.sb_logstart > 0 &&
+ agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
+/*
* Perag iteration APIs
*/
static inline struct xfs_perag *
@@ -168,11 +218,10 @@ struct aghdr_init_data {
};
int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
-int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans **tpp,
- xfs_agnumber_t agno, xfs_extlen_t delta);
-int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
- struct aghdr_init_data *id, xfs_extlen_t len);
-int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_ag_geometry *ageo);
+int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp,
+ xfs_extlen_t delta);
+int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp,
+ xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index fe94058d4e9e..5af123d13a63 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -322,7 +322,7 @@ out:
* address.
*/
if (has_resv) {
- error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
+ error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
if (error2)
return error2;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d3f2886fdc08..e2bdf089c0a3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -84,7 +84,7 @@ xfs_prealloc_blocks(
/*
* The number of blocks per AG that we withhold from xfs_mod_fdblocks to
* guarantee that we can refill the AGFL prior to allocating space in a nearly
- * full AG. Although the the space described by the free space btrees, the
+ * full AG. Although the space described by the free space btrees, the
* blocks used by the freesp btrees themselves, and the blocks owned by the
* AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
* free space in the AG drop so low that the free space btrees cannot refill an
@@ -248,7 +248,7 @@ xfs_alloc_get_rec(
int *stat) /* output: success/failure */
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
@@ -263,11 +263,11 @@ xfs_alloc_get_rec(
goto out_bad_rec;
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, *bno))
+ if (!xfs_verify_agbno(pag, *bno))
goto out_bad_rec;
if (*bno > *bno + *len)
goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
+ if (!xfs_verify_agbno(pag, *bno + *len - 1))
goto out_bad_rec;
return 0;
@@ -275,7 +275,8 @@ xfs_alloc_get_rec(
out_bad_rec:
xfs_warn(mp,
"%s Freespace BTree record corruption in AG %d detected!",
- cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
+ cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
+ pag->pag_agno);
xfs_warn(mp,
"start block 0x%x block count 0x%x", *bno, *len);
return -EFSCORRUPTED;
@@ -703,20 +704,19 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = {
/*
* Read in the allocation group free block array.
*/
-int /* error */
+int
xfs_alloc_read_agfl(
- xfs_mount_t *mp, /* mount point structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp) /* buffer for the ag free block array */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **bpp)
{
- struct xfs_buf *bp; /* return value */
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_buf *bp;
+ int error;
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
@@ -1075,7 +1075,8 @@ xfs_alloc_ag_vextent_small(
be32_to_cpu(agf->agf_flcount) <= args->minleft)
goto out;
- error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
+ &fbno, 0);
if (error)
goto error;
if (fbno == NULLAGBLOCK)
@@ -2609,7 +2610,7 @@ xfs_alloc_fix_freelist(
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
if (!pag->pagf_init) {
- error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
if (error == -EAGAIN)
@@ -2639,7 +2640,7 @@ xfs_alloc_fix_freelist(
* Can fail if we're not blocking on locks, and it's held.
*/
if (!agbp) {
- error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
if (error == -EAGAIN)
@@ -2697,7 +2698,7 @@ xfs_alloc_fix_freelist(
else
targs.oinfo = XFS_RMAP_OINFO_AG;
while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
- error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+ error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
@@ -2712,7 +2713,7 @@ xfs_alloc_fix_freelist(
targs.alignment = targs.minlen = targs.prod = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
- error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
if (error)
goto out_agbp_relse;
@@ -2741,7 +2742,7 @@ xfs_alloc_fix_freelist(
* Put each allocated block on the list.
*/
for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
- error = xfs_alloc_put_freelist(tp, agbp,
+ error = xfs_alloc_put_freelist(pag, tp, agbp,
agflbp, bno, 0);
if (error)
goto out_agflbp_relse;
@@ -2767,6 +2768,7 @@ out_no_agbp:
*/
int
xfs_alloc_get_freelist(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
xfs_agblock_t *bnop,
@@ -2779,7 +2781,6 @@ xfs_alloc_get_freelist(
int error;
uint32_t logflags;
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_perag *pag;
/*
* Freelist is empty, give up.
@@ -2791,8 +2792,7 @@ xfs_alloc_get_freelist(
/*
* Read the array of free blocks.
*/
- error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
- &agflbp);
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
if (error)
return error;
@@ -2807,7 +2807,6 @@ xfs_alloc_get_freelist(
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
- pag = agbp->b_pag;
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, -1);
pag->pagf_flcount--;
@@ -2868,29 +2867,11 @@ xfs_alloc_log_agf(
}
/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int /* error */
-xfs_alloc_pagf_init(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags) /* XFS_ALLOC_FLAGS_... */
-{
- struct xfs_buf *bp;
- int error;
-
- error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
- if (!error)
- xfs_trans_brelse(tp, bp);
- return error;
-}
-
-/*
* Put the block on the freelist for the allocation group.
*/
int
xfs_alloc_put_freelist(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
struct xfs_buf *agflbp,
@@ -2899,21 +2880,22 @@ xfs_alloc_put_freelist(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_agf *agf = agbp->b_addr;
- struct xfs_perag *pag;
__be32 *blockp;
int error;
uint32_t logflags;
__be32 *agfl_bno;
int startoff;
- if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
- be32_to_cpu(agf->agf_seqno), &agflbp)))
- return error;
+ if (!agflbp) {
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ return error;
+ }
+
be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
- pag = agbp->b_pag;
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, 1);
pag->pagf_flcount++;
@@ -3070,61 +3052,57 @@ const struct xfs_buf_ops xfs_agf_buf_ops = {
/*
* Read in the allocation group header (free/alloc section).
*/
-int /* error */
+int
xfs_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_BUF_ */
- struct xfs_buf **bpp) /* buffer for the ag freelist header */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
{
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
- trace_xfs_read_agf(mp, agno);
+ trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
if (error)
return error;
- ASSERT(!(*bpp)->b_error);
- xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+ xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
return 0;
}
/*
- * Read in the allocation group header (free/alloc section).
+ * Read in the allocation group header (free/alloc section) and initialise the
+ * perag structure if necessary. If the caller provides @agfbpp, then return the
+ * locked buffer to the caller, otherwise free it.
*/
-int /* error */
+int
xfs_alloc_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_ALLOC_FLAG_... */
- struct xfs_buf **bpp) /* buffer for the ag freelist header */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
{
- struct xfs_agf *agf; /* ag freelist header */
- struct xfs_perag *pag; /* per allocation group data */
+ struct xfs_buf *agfbp;
+ struct xfs_agf *agf;
int error;
int allocbt_blks;
- trace_xfs_alloc_read_agf(mp, agno);
+ trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
/* We don't support trylock when freeing. */
ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
- ASSERT(agno != NULLAGNUMBER);
- error = xfs_read_agf(mp, tp, agno,
+ error = xfs_read_agf(pag, tp,
(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
- bpp);
+ &agfbp);
if (error)
return error;
- ASSERT(!(*bpp)->b_error);
- agf = (*bpp)->b_addr;
- pag = (*bpp)->b_pag;
+ agf = agfbp->b_addr;
if (!pag->pagf_init) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -3138,7 +3116,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
pag->pagf_init = 1;
- pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
+ pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
/*
* Update the in-core allocbt counter. Filter out the rmapbt
@@ -3148,13 +3126,14 @@ xfs_alloc_read_agf(
* counter only tracks non-root blocks.
*/
allocbt_blks = pag->pagf_btreeblks;
- if (xfs_has_rmapbt(mp))
+ if (xfs_has_rmapbt(pag->pag_mount))
allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
if (allocbt_blks > 0)
- atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
+ atomic64_add(allocbt_blks,
+ &pag->pag_mount->m_allocbt_blks);
}
#ifdef DEBUG
- else if (!xfs_is_shutdown(mp)) {
+ else if (!xfs_is_shutdown(pag->pag_mount)) {
ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
@@ -3165,6 +3144,10 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
}
#endif
+ if (agfbpp)
+ *agfbpp = agfbp;
+ else
+ xfs_trans_brelse(tp, agfbp);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 84ca09b2223f..2c3f762dfb58 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -95,6 +95,11 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag,
xfs_extlen_t need, xfs_extlen_t reserved);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
+int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk);
+int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, struct xfs_buf *agflbp,
+ xfs_agblock_t bno, int btreeblk);
/*
* Compute and fill in value of m_alloc_maxlevels.
@@ -104,17 +109,6 @@ xfs_alloc_compute_maxlevels(
struct xfs_mount *mp); /* file system mount structure */
/*
- * Get a block from the freelist.
- * Returns with the buffer for the block gotten.
- */
-int /* error */
-xfs_alloc_get_freelist(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer containing the agf structure */
- xfs_agblock_t *bnop, /* block address retrieved from freelist */
- int btreeblk); /* destination is a AGF btree */
-
-/*
* Log the given fields from the agf structure.
*/
void
@@ -124,38 +118,6 @@ xfs_alloc_log_agf(
uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int /* error */
-xfs_alloc_pagf_init(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags); /* XFS_ALLOC_FLAGS_... */
-
-/*
- * Put the block on the freelist for the allocation group.
- */
-int /* error */
-xfs_alloc_put_freelist(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for a.g. freelist header */
- struct xfs_buf *agflbp,/* buffer for a.g. free block array */
- xfs_agblock_t bno, /* block being freed */
- int btreeblk); /* owner was a AGF btree */
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int /* error */
-xfs_alloc_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_ALLOC_FLAG_... */
- struct xfs_buf **bpp); /* buffer for the ag freelist header */
-
-/*
* Allocate an extent (variable-size).
*/
int /* error */
@@ -206,10 +168,12 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
-int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, struct xfs_buf **bpp);
+int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **bpp);
int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
struct xfs_buf *, struct xfs_owner_info *);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 8c9f73cc0bee..549a3cba0234 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -60,8 +60,8 @@ xfs_allocbt_alloc_block(
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
- &bno, 1);
+ error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp, &bno, 1);
if (error)
return error;
@@ -71,7 +71,7 @@ xfs_allocbt_alloc_block(
}
atomic64_inc(&cur->bc_mp->m_allocbt_blks);
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false);
new->s = cpu_to_be32(bno);
@@ -89,7 +89,8 @@ xfs_allocbt_free_block(
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
- error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL,
+ bno, 1);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 224649a76cbb..e28d93d232de 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -67,12 +67,10 @@ int
xfs_inode_hasattr(
struct xfs_inode *ip)
{
- if (!XFS_IFORK_Q(ip))
+ if (!xfs_inode_has_attr_fork(ip))
return 0;
- if (!ip->i_afp)
- return 0;
- if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0)
+ if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0)
return 0;
return 1;
}
@@ -85,7 +83,7 @@ bool
xfs_attr_is_leaf(
struct xfs_inode *ip)
{
- struct xfs_ifork *ifp = ip->i_afp;
+ struct xfs_ifork *ifp = &ip->i_af;
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec imap;
@@ -231,7 +229,7 @@ xfs_attr_get_ilocked(
if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
- if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+ if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_getvalue(args);
if (xfs_attr_is_leaf(args->dp))
return xfs_attr_leaf_get(args);
@@ -354,7 +352,7 @@ xfs_attr_try_sf_addname(
/*
* Build initial attribute list (if required).
*/
- if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
+ if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS)
xfs_attr_shortform_create(args);
error = xfs_attr_shortform_addname(args);
@@ -864,7 +862,7 @@ xfs_attr_lookup(
if (!xfs_inode_hasattr(dp))
return -ENOATTR;
- if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_sf_findname(args, NULL, NULL);
if (xfs_attr_is_leaf(dp)) {
@@ -1001,7 +999,7 @@ xfs_attr_set(
* If the inode doesn't have an attribute fork, add one.
* (inode must not be locked when we call this routine)
*/
- if (XFS_IFORK_Q(dp) == 0) {
+ if (xfs_inode_has_attr_fork(dp) == 0) {
int sf_size = sizeof(struct xfs_attr_sf_hdr) +
xfs_attr_sf_entsize_byname(args->namelen,
args->valuelen);
@@ -1101,7 +1099,7 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
{
struct xfs_attr_shortform *sf;
- sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
return be16_to_cpu(sf->hdr.totsize);
}
@@ -1558,7 +1556,7 @@ xfs_attr_node_get(
* If not in a transaction, we have to release all the buffers.
*/
out_release:
- for (i = 0; state != NULL && i < state->path.active; i++) {
+ for (i = 0; i < state->path.active; i++) {
xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index dfb47fa63c6d..81be9b3e4004 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -560,9 +560,9 @@ static inline bool
xfs_attr_is_shortform(
struct xfs_inode *ip)
{
- return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0);
+ return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0);
}
static inline enum xfs_delattr_state
@@ -573,10 +573,10 @@ xfs_attr_init_add_state(struct xfs_da_args *args)
* next state, the attribute fork may be null. This can occur only occur
* on a pure remove, but we grab the next state before we check if a
* replace operation is being performed. If we are called from any other
- * context, i_afp is guaranteed to exist. Hence if the attr fork is
+ * context, i_af is guaranteed to exist. Hence if the attr fork is
* null, we were called from a pure remove operation and so we are done.
*/
- if (!args->dp->i_afp)
+ if (!xfs_inode_has_attr_fork(args->dp))
return XFS_DAS_DONE;
args->op_flags |= XFS_DA_OP_ADDNAME;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 8f47396f8dd2..beee51ad75ce 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -590,7 +590,7 @@ xfs_attr_shortform_bytesfit(
* to real extents, or the delalloc conversion will take care of the
* literal area rebalancing.
*/
- if (bytes <= XFS_IFORK_ASIZE(dp))
+ if (bytes <= xfs_inode_attr_fork_size(dp))
return dp->i_forkoff;
/*
@@ -682,7 +682,7 @@ xfs_attr_shortform_create(
struct xfs_da_args *args)
{
struct xfs_inode *dp = args->dp;
- struct xfs_ifork *ifp = dp->i_afp;
+ struct xfs_ifork *ifp = &dp->i_af;
struct xfs_attr_sf_hdr *hdr;
trace_xfs_attr_sf_create(args);
@@ -719,7 +719,7 @@ xfs_attr_sf_findname(
int end;
int i;
- sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
sfe = &sf->list[0];
end = sf->hdr.count;
for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
@@ -764,7 +764,7 @@ xfs_attr_shortform_add(
mp = dp->i_mount;
dp->i_forkoff = forkoff;
- ifp = dp->i_afp;
+ ifp = &dp->i_af;
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
@@ -797,11 +797,9 @@ xfs_attr_fork_remove(
struct xfs_inode *ip,
struct xfs_trans *tp)
{
- ASSERT(ip->i_afp->if_nextents == 0);
+ ASSERT(ip->i_af.if_nextents == 0);
- xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_cache, ip->i_afp);
- ip->i_afp = NULL;
+ xfs_ifork_zap_attr(ip);
ip->i_forkoff = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -825,7 +823,7 @@ xfs_attr_sf_removename(
dp = args->dp;
mp = dp->i_mount;
- sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
error = xfs_attr_sf_findname(args, &sfe, &base);
@@ -889,7 +887,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
trace_xfs_attr_sf_lookup(args);
- ifp = args->dp->i_afp;
+ ifp = &args->dp->i_af;
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
sfe = &sf->list[0];
@@ -917,8 +915,8 @@ xfs_attr_shortform_getvalue(
struct xfs_attr_sf_entry *sfe;
int i;
- ASSERT(args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+ ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
sfe = xfs_attr_sf_nextentry(sfe), i++) {
@@ -948,7 +946,7 @@ xfs_attr_shortform_to_leaf(
trace_xfs_attr_sf_to_leaf(args);
dp = args->dp;
- ifp = dp->i_afp;
+ ifp = &dp->i_af;
sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, 0);
@@ -1055,8 +1053,8 @@ xfs_attr_shortform_verify(
int i;
int64_t size;
- ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL);
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+ ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = ifp->if_bytes;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 7298c148f848..d440393b40eb 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -543,6 +543,7 @@ xfs_attr_rmtval_stale(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buf *bp;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -550,14 +551,18 @@ xfs_attr_rmtval_stale(
XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
return -EFSCORRUPTED;
- bp = xfs_buf_incore(mp->m_ddev_targp,
+ error = xfs_buf_incore(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map->br_startblock),
- XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
- if (bp) {
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
+ XFS_FSB_TO_BB(mp, map->br_blockcount),
+ incore_flags, &bp);
+ if (error) {
+ if (error == -ENOENT)
+ return 0;
+ return error;
}
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 6833110d1bd4..e56723dc9cd5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -128,7 +128,7 @@ xfs_bmbt_lookup_first(
*/
static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
return whichfork != XFS_COW_FORK &&
ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
@@ -140,7 +140,7 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
*/
static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
return whichfork != XFS_COW_FORK &&
ifp->if_format == XFS_DINODE_FMT_BTREE &&
@@ -319,7 +319,7 @@ xfs_bmap_check_leaf_extents(
int whichfork) /* data or attr fork */
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_block *block; /* current btree block */
xfs_fsblock_t bno; /* block # of "block" */
struct xfs_buf *bp; /* buffer for "block" */
@@ -538,7 +538,7 @@ xfs_bmap_btree_to_extents(
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
@@ -616,7 +616,7 @@ xfs_bmap_extents_to_btree(
mp = ip->i_mount;
ASSERT(whichfork != XFS_COW_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
/*
@@ -745,7 +745,7 @@ xfs_bmap_local_to_extents_empty(
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
@@ -785,7 +785,7 @@ xfs_bmap_local_to_extents(
* So sending the data fork of a regular inode is invalid.
*/
ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
@@ -880,7 +880,7 @@ xfs_bmap_add_attrfork_btree(
mp = ip->i_mount;
- if (XFS_BMAP_BMDR_SPACE(block) <= XFS_IFORK_DSIZE(ip))
+ if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
*flags |= XFS_ILOG_DBROOT;
else {
cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
@@ -920,7 +920,7 @@ xfs_bmap_add_attrfork_extents(
int error; /* error return value */
if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
- XFS_IFORK_DSIZE(ip))
+ xfs_inode_data_fork_size(ip))
return 0;
cur = NULL;
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
@@ -951,7 +951,7 @@ xfs_bmap_add_attrfork_local(
{
struct xfs_da_args dargs; /* args for dir/attr code */
- if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+ if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip))
return 0;
if (S_ISDIR(VFS_I(ip)->i_mode)) {
@@ -1023,7 +1023,7 @@ xfs_bmap_add_attrfork(
int logflags; /* logging flags */
int error; /* error return value */
- ASSERT(XFS_IFORK_Q(ip) == 0);
+ ASSERT(xfs_inode_has_attr_fork(ip) == 0);
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -1034,16 +1034,15 @@ xfs_bmap_add_attrfork(
rsvd, &tp);
if (error)
return error;
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
goto trans_cancel;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_bmap_set_attrforkoff(ip, size, &version);
if (error)
goto trans_cancel;
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
logflags = 0;
switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_LOCAL:
@@ -1116,7 +1115,7 @@ xfs_iread_bmbt_block(
xfs_extnum_t num_recs;
xfs_extnum_t j;
int whichfork = cur->bc_ino.whichfork;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
block = xfs_btree_get_block(cur, level, &bp);
@@ -1164,7 +1163,7 @@ xfs_iread_extents(
int whichfork)
{
struct xfs_iread_state ir;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_cur *cur;
int error;
@@ -1208,7 +1207,7 @@ xfs_bmap_first_unused(
xfs_fileoff_t *first_unused, /* unused block */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
xfs_fileoff_t lastaddr = 0;
@@ -1255,7 +1254,7 @@ xfs_bmap_last_before(
xfs_fileoff_t *last_block, /* last block */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
int error;
@@ -1289,7 +1288,7 @@ xfs_bmap_last_extent(
struct xfs_bmbt_irec *rec,
int *is_empty)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_iext_cursor icur;
int error;
@@ -1355,7 +1354,7 @@ xfs_bmap_last_offset(
xfs_fileoff_t *last_block,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec rec;
int is_empty;
int error;
@@ -1389,7 +1388,7 @@ xfs_bmap_add_extent_delay_real(
int whichfork)
{
struct xfs_mount *mp = bma->ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
struct xfs_bmbt_irec *new = &bma->got;
int error; /* error return value */
int i; /* temp state */
@@ -1955,7 +1954,7 @@ xfs_bmap_add_extent_unwritten_real(
*logflagsp = 0;
cur = *curp;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(!isnullstartblock(new->br_startblock));
@@ -2480,7 +2479,7 @@ xfs_bmap_add_extent_hole_delay(
uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(isnullstartblock(new->br_startblock));
/*
@@ -2616,7 +2615,7 @@ xfs_bmap_add_extent_hole_real(
int *logflagsp,
uint32_t flags)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_cur *cur = *curp;
int error; /* error return value */
@@ -3185,7 +3184,8 @@ xfs_bmap_longest_free_extent(
pag = xfs_perag_get(mp, ag);
if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK,
+ NULL);
if (error) {
/* Couldn't lock the AGF, so skip this AG. */
if (error == -EAGAIN) {
@@ -3866,7 +3866,7 @@ xfs_bmapi_read(
{
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
xfs_fileoff_t obno;
xfs_fileoff_t end;
@@ -3959,7 +3959,7 @@ xfs_bmapi_reserve_delalloc(
int eof)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
int error;
@@ -4086,7 +4086,7 @@ xfs_bmapi_allocate(
{
struct xfs_mount *mp = bma->ip->i_mount;
int whichfork = xfs_bmapi_whichfork(bma->flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4185,7 +4185,7 @@ xfs_bmapi_convert_unwritten(
uint32_t flags)
{
int whichfork = xfs_bmapi_whichfork(flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4262,7 +4262,7 @@ xfs_bmapi_minleft(
struct xfs_inode *ip,
int fork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork);
if (tp && tp->t_firstblock != NULLFSBLOCK)
return 0;
@@ -4283,7 +4283,7 @@ xfs_bmapi_finish(
int whichfork,
int error)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
ifp->if_format != XFS_DINODE_FMT_EXTENTS)
@@ -4322,7 +4322,7 @@ xfs_bmapi_write(
};
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
@@ -4503,7 +4503,7 @@ xfs_bmapi_convert_delalloc(
struct iomap *iomap,
unsigned int *seq)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmalloca bma = { NULL };
@@ -4640,7 +4640,7 @@ xfs_bmapi_remap(
int whichfork = xfs_bmapi_whichfork(flags);
int logflags = 0, error;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(len > 0);
ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4797,7 +4797,7 @@ xfs_bmap_del_extent_delay(
struct xfs_bmbt_irec *del)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec new;
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
@@ -4924,7 +4924,7 @@ xfs_bmap_del_extent_cow(
struct xfs_bmbt_irec *del)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec new;
xfs_fileoff_t del_endoff, got_endoff;
uint32_t state = BMAP_COWFORK;
@@ -5022,7 +5022,7 @@ xfs_bmap_del_extent_real(
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(del->br_blockcount > 0);
xfs_iext_get_extent(ifp, icur, &got);
ASSERT(got.br_startoff <= del->br_startoff);
@@ -5288,7 +5288,7 @@ __xfs_bunmapi(
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
return -EFSCORRUPTED;
if (xfs_is_shutdown(mp))
@@ -5629,7 +5629,7 @@ xfs_bmse_merge(
struct xfs_btree_cur *cur,
int *logflags) /* output */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec new;
xfs_filblks_t blockcount;
int error, i;
@@ -5750,7 +5750,7 @@ xfs_bmap_collapse_extents(
{
int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got, prev;
struct xfs_iext_cursor icur;
@@ -5865,7 +5865,7 @@ xfs_bmap_insert_extents(
{
int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got, next;
struct xfs_iext_cursor icur;
@@ -5965,7 +5965,7 @@ xfs_bmap_split_extent(
xfs_fileoff_t split_fsb)
{
int whichfork = XFS_DATA_FORK;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got;
struct xfs_bmbt_irec new; /* split extent */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 2b77d45c215f..cfa052d40105 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -304,7 +304,7 @@ xfs_bmbt_get_minrecs(
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
@@ -322,7 +322,7 @@ xfs_bmbt_get_maxrecs(
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
@@ -550,7 +550,7 @@ xfs_bmbt_init_cursor(
struct xfs_inode *ip, /* inode owning the btree */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
@@ -564,7 +564,7 @@ xfs_bmbt_init_cursor(
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork);
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
cur->bc_ino.ip = ip;
cur->bc_ino.allocated = 0;
cur->bc_ino.flags = 0;
@@ -664,7 +664,7 @@ xfs_bmbt_change_owner(
ASSERT(tp || buffer_list);
ASSERT(!(tp && buffer_list));
- ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
+ ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 2eecc49fc1b2..4c16c8c31fcb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -91,10 +91,9 @@ xfs_btree_check_lblock_siblings(
static inline xfs_failaddr_t
xfs_btree_check_sblock_siblings(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_btree_cur *cur,
int level,
- xfs_agnumber_t agno,
xfs_agblock_t agbno,
__be32 dsibling)
{
@@ -110,7 +109,7 @@ xfs_btree_check_sblock_siblings(
if (!xfs_btree_check_sptr(cur, sibling, level + 1))
return __this_address;
} else {
- if (!xfs_verify_agbno(mp, agno, sibling))
+ if (!xfs_verify_agbno(pag, sibling))
return __this_address;
}
return NULL;
@@ -195,11 +194,11 @@ __xfs_btree_check_sblock(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
xfs_failaddr_t fa;
xfs_agblock_t agbno = NULLAGBLOCK;
- xfs_agnumber_t agno = NULLAGNUMBER;
if (crc) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -217,16 +216,14 @@ __xfs_btree_check_sblock(
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (bp) {
+ if (bp)
agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
- agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
- }
- fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
- agbno, block->bb_u.s.bb_rightsib);
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_rightsib);
return fa;
}
@@ -288,7 +285,7 @@ xfs_btree_check_sptr(
{
if (level <= 0)
return false;
- return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
+ return xfs_verify_agbno(cur->bc_ag.pag, agbno);
}
/*
@@ -725,7 +722,7 @@ xfs_btree_ifork_ptr(
if (cur->bc_flags & XFS_BTREE_STAGING)
return cur->bc_ino.ifake->if_fork;
- return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
+ return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
}
/*
@@ -3559,7 +3556,7 @@ xfs_btree_kill_iroot(
{
int whichfork = cur->bc_ino.whichfork;
struct xfs_inode *ip = cur->bc_ino.ip;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_block *block;
struct xfs_btree_block *cblock;
union xfs_btree_key *kp;
@@ -4595,7 +4592,6 @@ xfs_btree_sblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- xfs_agnumber_t agno;
xfs_agblock_t agbno;
xfs_failaddr_t fa;
@@ -4604,12 +4600,11 @@ xfs_btree_sblock_verify(
return __this_address;
/* sibling pointer verification */
- agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
- fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
- fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
block->bb_u.s.bb_rightsib);
return fa;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 3cd51fa3837b..76eedc2756b3 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -193,7 +193,7 @@ xfs_dir_isempty(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
if (dp->i_disk_size == 0) /* might happen during shutdown. */
return 1;
- if (dp->i_disk_size > XFS_IFORK_DSIZE(dp))
+ if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
return 0;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
return !sfp->count;
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index df0869bba275..00f960a703b2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -842,7 +842,7 @@ xfs_dir2_block_removename(
* See if the size as a shortform is good enough.
*/
size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return 0;
/*
@@ -1055,7 +1055,7 @@ xfs_dir2_leaf_to_block(
* Now see if the resulting block can be shrunken to shortform.
*/
size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return 0;
return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
@@ -1071,7 +1071,7 @@ xfs_dir2_sf_to_block(
struct xfs_trans *tp = args->trans;
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
struct xfs_da_geometry *geo = args->geo;
xfs_dir2_db_t blkno; /* dir-relative block # (0) */
xfs_dir2_data_hdr_t *hdr; /* block header */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 5a97a87eaa20..003812fd7d35 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -237,7 +237,7 @@ xfs_dir2_block_sfsize(
(i8count ? /* inumber */
count * XFS_INO64_SIZE :
count * XFS_INO32_SIZE);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return size; /* size value is a failure */
}
/*
@@ -406,7 +406,7 @@ xfs_dir2_sf_addname(
* Won't fit as shortform any more (due to size),
* or the pick routine says it won't (due to offset values).
*/
- if (new_isize > XFS_IFORK_DSIZE(dp) ||
+ if (new_isize > xfs_inode_data_fork_size(dp) ||
(pick =
xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
/*
@@ -710,7 +710,7 @@ xfs_dir2_sf_verify(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
struct xfs_dir2_sf_hdr *sfp;
struct xfs_dir2_sf_entry *sfep;
struct xfs_dir2_sf_entry *next_sfep;
@@ -1031,7 +1031,7 @@ xfs_dir2_sf_replace_needblock(
newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
return inum > XFS_DIR2_MAX_SHORT_INUM &&
- sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp);
+ sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp);
}
/*
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index afdfc8108c5f..b55bdfa9c8a8 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -704,7 +704,7 @@ struct xfs_agfl {
* When the bigtime feature is enabled, ondisk inode timestamps become an
* unsigned 64-bit nanoseconds counter. This means that the bigtime inode
* timestamp epoch is the start of the classic timestamp range, which is
- * Dec 31 20:45:52 UTC 1901. Because the epochs are not the same, callers
+ * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers
* /must/ use the bigtime conversion functions when encoding and decoding raw
* timestamps.
*/
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index bf2f4bc89193..6cdfd64bc56b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -105,7 +105,6 @@ xfs_inobt_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
union xfs_btree_rec *rec;
int error;
uint64_t realfree;
@@ -116,7 +115,7 @@ xfs_inobt_get_rec(
xfs_inobt_btrec_to_irec(mp, rec, irec);
- if (!xfs_verify_agino(mp, agno, irec->ir_startino))
+ if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
goto out_bad_rec;
if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
irec->ir_count > XFS_INODES_PER_CHUNK)
@@ -137,7 +136,8 @@ xfs_inobt_get_rec(
out_bad_rec:
xfs_warn(mp,
"%s Inode BTree record corruption in AG %d detected!",
- cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno);
+ cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
+ cur->bc_ag.pag->pag_agno);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
@@ -1610,7 +1610,7 @@ xfs_dialloc_good_ag(
return false;
if (!pag->pagi_init) {
- error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno);
+ error = xfs_ialloc_read_agi(pag, tp, NULL);
if (error)
return false;
}
@@ -1621,7 +1621,7 @@ xfs_dialloc_good_ag(
return false;
if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags);
+ error = xfs_alloc_read_agf(pag, tp, flags, NULL);
if (error)
return false;
}
@@ -1679,7 +1679,7 @@ xfs_dialloc_try_ag(
* Then read in the AGI buffer and recheck with the AGI buffer
* lock held.
*/
- error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
if (error)
return error;
@@ -2169,7 +2169,7 @@ xfs_difree(
/*
* Get the allocation group header.
*/
- error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error) {
xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
__func__, error);
@@ -2215,7 +2215,7 @@ xfs_imap_lookup(
int error;
int i;
- error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error) {
xfs_alert(mp,
"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
@@ -2571,47 +2571,48 @@ const struct xfs_buf_ops xfs_agi_buf_ops = {
*/
int
xfs_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp) /* allocation group hdr buf */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
{
+ struct xfs_mount *mp = pag->pag_mount;
int error;
- trace_xfs_read_agi(mp, agno);
+ trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
if (error)
return error;
if (tp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF);
+ xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
- xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+ xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
return 0;
}
+/*
+ * Read in the agi and initialise the per-ag data. If the caller supplies a
+ * @agibpp, return the locked AGI buffer to them, otherwise release it.
+ */
int
xfs_ialloc_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp) /* allocation group hdr buf */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
{
- struct xfs_agi *agi; /* allocation group header */
- struct xfs_perag *pag; /* per allocation group data */
+ struct xfs_buf *agibp;
+ struct xfs_agi *agi;
int error;
- trace_xfs_ialloc_read_agi(mp, agno);
+ trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
- error = xfs_read_agi(mp, tp, agno, bpp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
return error;
- agi = (*bpp)->b_addr;
- pag = (*bpp)->b_pag;
+ agi = agibp->b_addr;
if (!pag->pagi_init) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -2623,27 +2624,11 @@ xfs_ialloc_read_agi(
* we are in the middle of a forced shutdown.
*/
ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- xfs_is_shutdown(mp));
- return 0;
-}
-
-/*
- * Read in the agi to initialise the per-ag data in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno) /* allocation group number */
-{
- struct xfs_buf *bp = NULL;
- int error;
-
- error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
- if (error)
- return error;
- if (bp)
- xfs_trans_brelse(tp, bp);
+ xfs_is_shutdown(pag->pag_mount));
+ if (agibpp)
+ *agibpp = agibp;
+ else
+ xfs_trans_brelse(tp, agibp);
return 0;
}
@@ -2912,8 +2897,7 @@ xfs_ialloc_calc_rootino(
* allocation group, or very odd geometries created by old mkfs
* versions on very small filesystems.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0)
+ if (xfs_ag_contains_log(mp, 0))
first_bno += mp->m_sb.sb_logblocks;
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index a7705b6a1fd3..9bbbca6ac4ed 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -62,25 +62,10 @@ xfs_ialloc_log_agi(
struct xfs_buf *bp, /* allocation group header buffer */
uint32_t fields); /* bitmask of fields to log */
-/*
- * Read in the allocation group header (inode allocation section)
- */
-int /* error */
-xfs_ialloc_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp); /* allocation group hdr buf */
-
-/*
- * Read in the allocation group header to initialise the per-ag data
- * in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno); /* allocation group number */
+int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
+int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
/*
* Lookup a record by ino in the btree given by cur.
@@ -102,8 +87,6 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_agblock_t length, unsigned int gen);
-int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, struct xfs_buf **bpp);
union xfs_btree_rec;
void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index b2ad2fdc40f5..8c83e265770c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -683,10 +683,10 @@ xfs_inobt_rec_check_count(
static xfs_extlen_t
xfs_inobt_max_size(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
- xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno);
+ struct xfs_mount *mp = pag->pag_mount;
+ xfs_agblock_t agblocks = pag->block_count;
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (M_IGEO(mp)->inobt_mxr[0] == 0)
@@ -697,8 +697,7 @@ xfs_inobt_max_size(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
@@ -722,7 +721,7 @@ xfs_inobt_cur(
ASSERT(*agi_bpp == NULL);
ASSERT(*curpp == NULL);
- error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
if (error)
return error;
@@ -757,16 +756,15 @@ xfs_inobt_count_blocks(
/* Read finobt block count from AGI header. */
static int
xfs_finobt_read_blocks(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_trans *tp,
xfs_extlen_t *tree_blocks)
{
struct xfs_buf *agbp;
struct xfs_agi *agi;
int error;
- error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error)
return error;
@@ -794,14 +792,14 @@ xfs_finobt_calc_reserves(
return 0;
if (xfs_has_inobtcounts(mp))
- error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len);
+ error = xfs_finobt_read_blocks(pag, tp, &tree_len);
else
error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
&tree_len);
if (error)
return error;
- *ask += xfs_inobt_max_size(mp, pag->pag_agno);
+ *ask += xfs_inobt_max_size(pag);
*used += tree_len;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 3b1b63f9d886..758aacd8166b 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -10,6 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_ag.h"
#include "xfs_inode.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
@@ -41,14 +42,12 @@ xfs_inode_buf_verify(
bool readahead)
{
struct xfs_mount *mp = bp->b_mount;
- xfs_agnumber_t agno;
int i;
int ni;
/*
* Validate the magic number and version of every inode in the buffer
*/
- agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
struct xfs_dinode *dip;
@@ -59,7 +58,7 @@ xfs_inode_buf_verify(
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- xfs_verify_agino_or_null(mp, agno, unlinked_ino);
+ xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -178,7 +177,6 @@ xfs_inode_from_disk(
xfs_failaddr_t fa;
ASSERT(ip->i_cowfp == NULL);
- ASSERT(ip->i_afp == NULL);
fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from);
if (fa) {
@@ -230,7 +228,8 @@ xfs_inode_from_disk(
ip->i_nblocks = be64_to_cpu(from->di_nblocks);
ip->i_extsize = be32_to_cpu(from->di_extsize);
ip->i_forkoff = from->di_forkoff;
- ip->i_diflags = be16_to_cpu(from->di_flags);
+ ip->i_diflags = be16_to_cpu(from->di_flags);
+ ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked);
if (from->di_dmevmask || from->di_dmstate)
xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS);
@@ -286,7 +285,7 @@ xfs_inode_to_disk_iext_counters(
{
if (xfs_inode_has_large_extent_counts(ip)) {
to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
- to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp));
+ to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af));
/*
* We might be upgrading the inode to use larger extent counters
* than was previously used. Hence zero the unused field.
@@ -294,7 +293,7 @@ xfs_inode_to_disk_iext_counters(
to->di_nrext64_pad = cpu_to_be16(0);
} else {
to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
- to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af));
}
}
@@ -326,7 +325,7 @@ xfs_inode_to_disk(
to->di_nblocks = cpu_to_be64(ip->i_nblocks);
to->di_extsize = cpu_to_be32(ip->i_extsize);
to->di_forkoff = ip->i_forkoff;
- to->di_aformat = xfs_ifork_format(ip->i_afp);
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
to->di_flags = cpu_to_be16(ip->i_diflags);
if (xfs_has_v3inodes(ip->i_mount)) {
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 1a4cdf550f6d..9327a4f39206 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -35,7 +35,7 @@ xfs_init_local_fork(
const void *data,
int64_t size)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
int mem_size = size;
bool zero_terminate;
@@ -102,7 +102,7 @@ xfs_iformat_extents(
int whichfork)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
int state = xfs_bmap_fork_to_state(whichfork);
xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
@@ -173,7 +173,7 @@ xfs_iformat_btree(
int size;
int level;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
size = XFS_BMAP_BROOT_SPACE(mp, dfp);
nrecs = be16_to_cpu(dfp->bb_numrecs);
@@ -276,17 +276,23 @@ xfs_dfork_attr_shortform_size(
return be16_to_cpu(atp->hdr.totsize);
}
-struct xfs_ifork *
-xfs_ifork_alloc(
+void
+xfs_ifork_init_attr(
+ struct xfs_inode *ip,
enum xfs_dinode_fmt format,
xfs_extnum_t nextents)
{
- struct xfs_ifork *ifp;
+ ip->i_af.if_format = format;
+ ip->i_af.if_nextents = nextents;
+}
- ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL);
- ifp->if_format = format;
- ifp->if_nextents = nextents;
- return ifp;
+void
+xfs_ifork_zap_attr(
+ struct xfs_inode *ip)
+{
+ xfs_idestroy_fork(&ip->i_af);
+ memset(&ip->i_af, 0, sizeof(struct xfs_ifork));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
}
int
@@ -301,9 +307,9 @@ xfs_iformat_attr_fork(
* Initialize the extent count early, as the per-format routines may
* depend on it.
*/
- ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents);
+ xfs_ifork_init_attr(ip, dip->di_aformat, naextents);
- switch (ip->i_afp->if_format) {
+ switch (ip->i_af.if_format) {
case XFS_DINODE_FMT_LOCAL:
error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK,
xfs_dfork_attr_shortform_size(dip));
@@ -323,10 +329,8 @@ xfs_iformat_attr_fork(
break;
}
- if (error) {
- kmem_cache_free(xfs_ifork_cache, ip->i_afp);
- ip->i_afp = NULL;
- }
+ if (error)
+ xfs_ifork_zap_attr(ip);
return error;
}
@@ -370,7 +374,7 @@ xfs_iroot_realloc(
return;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
if (rec_diff > 0) {
/*
* If there wasn't any memory allocated before, just
@@ -400,7 +404,7 @@ xfs_iroot_realloc(
(int)new_size);
ifp->if_broot_bytes = (int)new_size;
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
return;
}
@@ -454,7 +458,7 @@ xfs_iroot_realloc(
ifp->if_broot_bytes = (int)new_size;
if (ifp->if_broot)
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
return;
}
@@ -480,11 +484,11 @@ xfs_idata_realloc(
int64_t byte_diff,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
int64_t new_size = ifp->if_bytes + byte_diff;
ASSERT(new_size >= 0);
- ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork));
+ ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
if (byte_diff == 0)
return;
@@ -539,7 +543,7 @@ xfs_iextents_copy(
int whichfork)
{
int state = xfs_bmap_fork_to_state(whichfork);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec rec;
int64_t copied = 0;
@@ -591,7 +595,7 @@ xfs_iflush_fork(
if (!iip)
return;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
/*
* This can happen if we gave up in iformat in an error path,
* for the attribute fork.
@@ -607,7 +611,7 @@ xfs_iflush_fork(
if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
ASSERT(ifp->if_u1.if_data != NULL);
- ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+ ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
}
break;
@@ -626,7 +630,7 @@ xfs_iflush_fork(
(ifp->if_broot_bytes > 0)) {
ASSERT(ifp->if_broot != NULL);
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
(xfs_bmdr_block_t *)cp,
XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -656,7 +660,7 @@ xfs_iext_state_to_fork(
if (state & BMAP_COWFORK)
return ip->i_cowfp;
else if (state & BMAP_ATTRFORK)
- return ip->i_afp;
+ return &ip->i_af;
return &ip->i_df;
}
@@ -707,18 +711,17 @@ int
xfs_ifork_verify_local_attr(
struct xfs_inode *ip)
{
- struct xfs_ifork *ifp = ip->i_afp;
+ struct xfs_ifork *ifp = &ip->i_af;
xfs_failaddr_t fa;
- if (!ifp)
+ if (!xfs_inode_has_attr_fork(ip))
fa = __this_address;
else
fa = xfs_attr_shortform_verify(ip);
if (fa) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp ? ifp->if_u1.if_data : NULL,
- ifp ? ifp->if_bytes : 0, fa);
+ ifp->if_u1.if_data, ifp->if_bytes, fa);
return -EFSCORRUPTED;
}
@@ -731,7 +734,7 @@ xfs_iext_count_may_overflow(
int whichfork,
int nr_to_add)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
uint64_t max_exts;
uint64_t nr_exts;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 4f68c1f20beb..d3943d6ad0b9 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -77,28 +77,8 @@ struct xfs_ifork {
/*
* Fork handling.
*/
-
-#define XFS_IFORK_Q(ip) ((ip)->i_forkoff != 0)
-#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- &(ip)->i_df : \
- ((w) == XFS_ATTR_FORK ? \
- (ip)->i_afp : \
- (ip)->i_cowfp))
-#define XFS_IFORK_DSIZE(ip) \
- (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount))
-#define XFS_IFORK_ASIZE(ip) \
- (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0)
-#define XFS_IFORK_SIZE(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- XFS_IFORK_DSIZE(ip) : \
- ((w) == XFS_ATTR_FORK ? \
- XFS_IFORK_ASIZE(ip) : \
- 0))
#define XFS_IFORK_MAXEXT(ip, w) \
- (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+ (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t))
static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp)
{
@@ -179,8 +159,9 @@ xfs_dfork_nextents(
return 0;
}
-struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format,
- xfs_extnum_t nextents);
+void xfs_ifork_zap_attr(struct xfs_inode *ip);
+void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format,
+ xfs_extnum_t nextents);
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 97e9e6020596..64b910caafaa 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -111,7 +111,7 @@ xfs_refcount_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
xfs_agblock_t realstart;
@@ -121,8 +121,6 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
-
- agno = cur->bc_ag.pag->pag_agno;
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
goto out_bad_rec;
@@ -137,22 +135,23 @@ xfs_refcount_get_rec(
}
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, realstart))
+ if (!xfs_verify_agbno(pag, realstart))
goto out_bad_rec;
if (realstart > realstart + irec->rc_blockcount)
goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1))
+ if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1))
goto out_bad_rec;
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
goto out_bad_rec;
- trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+ trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec);
return 0;
out_bad_rec:
xfs_warn(mp,
- "Refcount BTree record corruption in AG %d detected!", agno);
+ "Refcount BTree record corruption in AG %d detected!",
+ pag->pag_agno);
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -1177,8 +1176,8 @@ xfs_refcount_finish_one(
*pcur = NULL;
}
if (rcur == NULL) {
- error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno,
- XFS_ALLOC_FLAG_FREEING, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING,
+ &agbp);
if (error)
goto out_drop;
@@ -1710,7 +1709,7 @@ xfs_refcount_recover_cow_leftovers(
if (error)
return error;
- error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
goto out_trans;
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index d14c1720b0fb..316c1ec0c3c2 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -493,7 +493,7 @@ xfs_refcountbt_calc_reserves(
if (!xfs_has_reflink(mp))
return 0;
- error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
@@ -507,8 +507,7 @@ xfs_refcountbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
*ask += xfs_refcountbt_max_size(mp, agblocks);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 2845019d31da..094dfc897ebc 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -215,7 +215,7 @@ xfs_rmap_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
@@ -235,12 +235,12 @@ xfs_rmap_get_rec(
goto out_bad_rec;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, irec->rm_startblock))
+ if (!xfs_verify_agbno(pag, irec->rm_startblock))
goto out_bad_rec;
if (irec->rm_startblock >
irec->rm_startblock + irec->rm_blockcount)
goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno,
+ if (!xfs_verify_agbno(pag,
irec->rm_startblock + irec->rm_blockcount - 1))
goto out_bad_rec;
}
@@ -254,7 +254,7 @@ xfs_rmap_get_rec(
out_bad_rec:
xfs_warn(mp,
"Reverse Mapping BTree record corruption in AG %d detected!",
- agno);
+ pag->pag_agno);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 69e104d0277f..7f83f62e51e0 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -90,7 +90,7 @@ xfs_rmapbt_alloc_block(
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
+ error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
&bno, 1);
if (error)
return error;
@@ -129,7 +129,7 @@ xfs_rmapbt_free_block(
bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
if (error)
return error;
@@ -652,7 +652,7 @@ xfs_rmapbt_calc_reserves(
if (!xfs_has_rmapbt(mp))
return 0;
- error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
@@ -666,8 +666,7 @@ xfs_rmapbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
/* Reserve 1% of the AG or enough for 1 block per record. */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 8b9bd178a487..bdc777b9ec4a 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -204,7 +204,7 @@ xfs_failaddr_t
xfs_symlink_shortform_verify(
struct xfs_inode *ip)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
char *sfp = (char *)ifp->if_u1.if_data;
int size = ifp->if_bytes;
char *endp = sfp + size;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index e9913c2c5a24..2c4ad6e4bb14 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -515,7 +515,7 @@ xfs_calc_remove_reservation(
{
return XFS_DQUOT_LOGRES(mp) +
xfs_calc_iunlink_add_reservation(mp) +
- max((xfs_calc_inode_res(mp, 1) +
+ max((xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index e810d23f2d97..5c2765934732 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -13,25 +13,13 @@
#include "xfs_mount.h"
#include "xfs_ag.h"
-/* Find the size of the AG, in blocks. */
-inline xfs_agblock_t
-xfs_ag_block_count(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- ASSERT(agno < mp->m_sb.sb_agcount);
-
- if (agno < mp->m_sb.sb_agcount - 1)
- return mp->m_sb.sb_agblocks;
- return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks);
-}
/*
* Verify that an AG block number pointer neither points outside the AG
* nor points at static metadata.
*/
-inline bool
-xfs_verify_agbno(
+static inline bool
+xfs_verify_agno_agbno(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agblock_t agbno)
@@ -59,7 +47,7 @@ xfs_verify_fsbno(
if (agno >= mp->m_sb.sb_agcount)
return false;
- return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+ return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
}
/*
@@ -85,40 +73,12 @@ xfs_verify_fsbext(
XFS_FSB_TO_AGNO(mp, fsbno + len - 1);
}
-/* Calculate the first and last possible inode number in an AG. */
-inline void
-xfs_agino_range(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t *first,
- xfs_agino_t *last)
-{
- xfs_agblock_t bno;
- xfs_agblock_t eoag;
-
- eoag = xfs_ag_block_count(mp, agno);
-
- /*
- * Calculate the first inode, which will be in the first
- * cluster-aligned block after the AGFL.
- */
- bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
- *first = XFS_AGB_TO_AGINO(mp, bno);
-
- /*
- * Calculate the last inode, which will be at the end of the
- * last (aligned) cluster that can be allocated in the AG.
- */
- bno = round_down(eoag, M_IGEO(mp)->cluster_align);
- *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
-}
-
/*
* Verify that an AG inode number pointer neither points outside the AG
* nor points at static metadata.
*/
-inline bool
-xfs_verify_agino(
+static inline bool
+xfs_verify_agno_agino(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agino_t agino)
@@ -131,19 +91,6 @@ xfs_verify_agino(
}
/*
- * Verify that an AG inode number pointer neither points outside the AG
- * nor points at static metadata, or is NULLAGINO.
- */
-bool
-xfs_verify_agino_or_null(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t agino)
-{
- return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
-}
-
-/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
@@ -159,7 +106,7 @@ xfs_verify_ino(
return false;
if (XFS_AGINO_TO_INO(mp, agno, agino) != ino)
return false;
- return xfs_verify_agino(mp, agno, agino);
+ return xfs_verify_agno_agino(mp, agno, agino);
}
/* Is this an internal inode number? */
@@ -229,12 +176,8 @@ xfs_icount_range(
/* root, rtbitmap, rtsum all live in the first chunk */
*min = XFS_INODES_PER_CHUNK;
- for_each_perag(mp, agno, pag) {
- xfs_agino_t first, last;
-
- xfs_agino_range(mp, agno, &first, &last);
- nr_inos += last - first + 1;
- }
+ for_each_perag(mp, agno, pag)
+ nr_inos += pag->agino_max - pag->agino_min + 1;
*max = nr_inos;
}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 373f64a492a4..a6b7d98cf68f 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -179,19 +179,10 @@ enum xfs_ag_resv_type {
*/
struct xfs_mount;
-xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
-bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno);
bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
xfs_fsblock_t len);
-void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t *first, xfs_agino_t *last);
-bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t agino);
-bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90aebfe9dc5f..b7b838bd4ba4 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -541,16 +541,16 @@ xchk_agf(
/* Check the AG length */
eoag = be32_to_cpu(agf->agf_length);
- if (eoag != xfs_ag_block_count(mp, agno))
+ if (eoag != pag->block_count)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Check the AGF btree roots and levels */
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
@@ -563,7 +563,7 @@ xchk_agf(
if (xfs_has_rmapbt(mp)) {
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
@@ -573,7 +573,7 @@ xchk_agf(
if (xfs_has_reflink(mp)) {
agbno = be32_to_cpu(agf->agf_refcount_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_refcount_level);
@@ -639,9 +639,8 @@ xchk_agfl_block(
{
struct xchk_agfl_info *sai = priv;
struct xfs_scrub *sc = sai->sc;
- xfs_agnumber_t agno = sc->sa.pag->pag_agno;
- if (xfs_verify_agbno(mp, agno, agbno) &&
+ if (xfs_verify_agbno(sc->sa.pag, agbno) &&
sai->nr_entries < sai->sz_entries)
sai->entries[sai->nr_entries++] = agbno;
else
@@ -871,12 +870,12 @@ xchk_agi(
/* Check the AG length */
eoag = be32_to_cpu(agi->agi_length);
- if (eoag != xfs_ag_block_count(mp, agno))
+ if (eoag != pag->block_count)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check btree roots and levels */
agbno = be32_to_cpu(agi->agi_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_level);
@@ -885,7 +884,7 @@ xchk_agi(
if (xfs_has_finobt(mp)) {
agbno = be32_to_cpu(agi->agi_free_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_free_level);
@@ -902,17 +901,17 @@ xchk_agi(
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 6da7f2ca77de..1b0b4e243f77 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -106,7 +106,7 @@ xrep_agf_check_agfl_block(
{
struct xfs_scrub *sc = priv;
- if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno))
+ if (!xfs_verify_agbno(sc->sa.pag, agbno))
return -EFSCORRUPTED;
return 0;
}
@@ -130,10 +130,7 @@ xrep_check_btree_root(
struct xfs_scrub *sc,
struct xrep_find_ag_btree *fab)
{
- struct xfs_mount *mp = sc->mp;
- xfs_agnumber_t agno = sc->sm->sm_agno;
-
- return xfs_verify_agbno(mp, agno, fab->root) &&
+ return xfs_verify_agbno(sc->sa.pag, fab->root) &&
fab->height <= fab->maxlevels;
}
@@ -201,8 +198,7 @@ xrep_agf_init_header(
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
- agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp,
- sc->sa.pag->pag_agno));
+ agf->agf_length = cpu_to_be32(sc->sa.pag->block_count);
agf->agf_flfirst = old_agf->agf_flfirst;
agf->agf_fllast = old_agf->agf_fllast;
agf->agf_flcount = old_agf->agf_flcount;
@@ -405,7 +401,7 @@ xrep_agf(
* btrees rooted in the AGF. If the AGFL contents are obviously bad
* then we'll bail out.
*/
- error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp);
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
if (error)
return error;
@@ -666,8 +662,7 @@ xrep_agfl(
* nothing wrong with the AGF, but all the AG header repair functions
* have this chicken-and-egg problem.
*/
- error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0,
- &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
@@ -742,8 +737,7 @@ xrep_agi_find_btrees(
int error;
/* Read the AGF. */
- error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0,
- &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
@@ -782,8 +776,7 @@ xrep_agi_init_header(
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
- agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp,
- sc->sa.pag->pag_agno));
+ agi->agi_length = cpu_to_be32(sc->sa.pag->block_count);
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_has_crc(mp))
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 87518e1292f8..ab427b4d7fe0 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -93,8 +93,7 @@ xchk_allocbt_rec(
struct xchk_btree *bs,
const union xfs_btree_rec *rec)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
xfs_agblock_t bno;
xfs_extlen_t len;
@@ -102,8 +101,8 @@ xchk_allocbt_rec(
len = be32_to_cpu(rec->alloc.ar_blockcount);
if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+ !xfs_verify_agbno(pag, bno) ||
+ !xfs_verify_agbno(pag, bno + len - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_allocbt_xref(bs->sc, bno, len);
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 285995ba3947..f0b9cb6506fd 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -377,7 +377,7 @@ xchk_bmapbt_rec(
struct xfs_inode *ip = bs->cur->bc_ino.ip;
struct xfs_buf *bp = NULL;
struct xfs_btree_block *block;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, info->whichfork);
uint64_t owner;
int i;
@@ -426,7 +426,7 @@ xchk_bmap_btree(
struct xchk_bmap_info *info)
{
struct xfs_owner_info oinfo;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
struct xfs_btree_cur *cur;
@@ -478,7 +478,7 @@ xchk_bmap_check_rmap(
return 0;
/* Now look up the bmbt record. */
- ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork);
+ ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork);
if (!ifp) {
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
@@ -540,7 +540,7 @@ xchk_bmap_check_ag_rmaps(
struct xfs_buf *agf;
int error;
- error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf);
if (error)
return error;
@@ -563,7 +563,7 @@ xchk_bmap_check_rmaps(
struct xfs_scrub *sc,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
struct xfs_perag *pag;
xfs_agnumber_t agno;
bool zero_size;
@@ -578,7 +578,7 @@ xchk_bmap_check_rmaps(
if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
return 0;
- ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL);
+ ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
/*
* Only do this for complex maps that are in btree format, or for
@@ -624,7 +624,7 @@ xchk_bmap(
struct xchk_bmap_info info = { NULL };
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t endoff;
struct xfs_iext_cursor icur;
int error = 0;
@@ -689,7 +689,7 @@ xchk_bmap(
/* Scrub extent records. */
info.lastoff = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 39dd46f038fe..2f4519590dc1 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -462,7 +462,7 @@ xchk_btree_check_iroot_minrecs(
*/
if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
- XFS_IFORK_Q(bs->sc->ip))
+ xfs_inode_has_attr_fork(bs->sc->ip))
return false;
return true;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 97b54ac3075f..9bbbf20f401b 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -416,15 +416,15 @@ xchk_ag_read_headers(
if (!sa->pag)
return -ENOENT;
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
+ error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
return error;
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
+ error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
return error;
- error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
+ error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
return error;
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index b962cfbbd92b..84fe3d33d699 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -482,7 +482,7 @@ xchk_da_btree(
int error;
/* Skip short format data structures; no btree to scan. */
- if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork)))
+ if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork)))
return 0;
/* Set up initial da state. */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 38897adde7b5..5abb5fdb71d9 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -667,7 +667,7 @@ xchk_directory_blocks(
{
struct xfs_bmbt_irec got;
struct xfs_da_args args;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
xfs_fileoff_t leaf_lblk;
xfs_fileoff_t free_lblk;
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 48a6cbdf95d0..6a6f8fe7f87c 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -78,10 +78,10 @@ xchk_fscount_warmup(
continue;
/* Lock both AG headers. */
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
if (error)
break;
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
if (error)
break;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 2e61df3bca83..aa65ec88a0c0 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -8,6 +8,8 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 00848ee542fb..e1026e07bf94 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -104,13 +104,13 @@ xchk_iallocbt_chunk(
xfs_extlen_t len)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
xfs_agblock_t bno;
bno = XFS_AGINO_TO_AGBNO(mp, agino);
if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+ !xfs_verify_agbno(pag, bno) ||
+ !xfs_verify_agbno(pag, bno + len - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
@@ -421,10 +421,10 @@ xchk_iallocbt_rec(
const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agino_t agino;
xfs_extlen_t len;
int holecount;
@@ -446,8 +446,8 @@ xchk_iallocbt_rec(
agino = irec.ir_startino;
/* Record has to be properly aligned within the AG. */
- if (!xfs_verify_agino(mp, agno, agino) ||
- !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) {
+ if (!xfs_verify_agino(pag, agino) ||
+ !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 3c7506c7553c..21b4c9006859 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -185,7 +185,7 @@ xchk_quota_data_fork(
/* Check for data fork problems that apply only to quota files. */
max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
- ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
break;
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 2744eecdbaf0..c68b767dc08f 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -13,6 +13,8 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_ag.h"
/*
@@ -332,9 +334,8 @@ xchk_refcountbt_rec(
struct xchk_btree *bs,
const union xfs_btree_rec *rec)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
xfs_agblock_t *cow_blocks = bs->private;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
xfs_agblock_t bno;
xfs_extlen_t len;
xfs_nlink_t refcount;
@@ -354,8 +355,8 @@ xchk_refcountbt_rec(
/* Check the extent. */
bno &= ~XFS_REFC_COW_START;
if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+ !xfs_verify_agbno(pag, bno) ||
+ !xfs_verify_agbno(pag, bno + len - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
if (refcount == 0)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1e7b6b209ee8..c18bd039fce9 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -199,7 +199,7 @@ xrep_calc_ag_resblks(
icount = pag->pagi_count;
} else {
/* Try to get the actual counters from disk. */
- error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+ error = xfs_ialloc_read_agi(pag, NULL, &bp);
if (!error) {
icount = pag->pagi_count;
xfs_buf_relse(bp);
@@ -207,9 +207,9 @@ xrep_calc_ag_resblks(
}
/* Now grab the block counters from the AGF. */
- error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
if (error) {
- aglen = xfs_ag_block_count(mp, sm->sm_agno);
+ aglen = pag->block_count;
freelen = aglen;
usedlen = aglen;
} else {
@@ -220,25 +220,22 @@ xrep_calc_ag_resblks(
usedlen = aglen - freelen;
xfs_buf_relse(bp);
}
- xfs_perag_put(pag);
/* If the icount is impossible, make some worst-case assumptions. */
if (icount == NULLAGINO ||
- !xfs_verify_agino(mp, sm->sm_agno, icount)) {
- xfs_agino_t first, last;
-
- xfs_agino_range(mp, sm->sm_agno, &first, &last);
- icount = last - first + 1;
+ !xfs_verify_agino(pag, icount)) {
+ icount = pag->agino_max - pag->agino_min + 1;
}
/* If the block counts are impossible, make worst-case assumptions. */
if (aglen == NULLAGBLOCK ||
- aglen != xfs_ag_block_count(mp, sm->sm_agno) ||
+ aglen != pag->block_count ||
freelen >= aglen) {
- aglen = xfs_ag_block_count(mp, sm->sm_agno);
+ aglen = pag->block_count;
freelen = aglen;
usedlen = aglen;
}
+ xfs_perag_put(pag);
trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
freelen, usedlen);
@@ -300,13 +297,13 @@ xrep_alloc_ag_block(
switch (resv) {
case XFS_AG_RESV_AGFL:
case XFS_AG_RESV_RMAPBT:
- error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+ error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
+ sc->sa.agf_bp, &bno, 1);
if (error)
return error;
if (bno == NULLAGBLOCK)
return -ENOSPC;
- xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno,
- 1, false);
+ xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
if (resv == XFS_AG_RESV_RMAPBT)
xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
@@ -457,16 +454,19 @@ xrep_invalidate_blocks(
* assume it's owned by someone else.
*/
for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
+ int error;
+
/* Skip AG headers and post-EOFS blocks */
if (!xfs_verify_fsbno(sc->mp, fsbno))
continue;
- bp = xfs_buf_incore(sc->mp->m_ddev_targp,
+ error = xfs_buf_incore(sc->mp->m_ddev_targp,
XFS_FSB_TO_DADDR(sc->mp, fsbno),
- XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
- if (bp) {
- xfs_trans_bjoin(sc->tp, bp);
- xfs_trans_binval(sc->tp, bp);
- }
+ XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
+ if (error)
+ continue;
+
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
}
return 0;
@@ -516,8 +516,8 @@ xrep_put_freelist(
return error;
/* Put the block on the AGFL. */
- error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
- agbno, 0);
+ error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
+ sc->sa.agfl_bp, agbno, 0);
if (error)
return error;
xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
@@ -536,13 +536,12 @@ xrep_reap_block(
{
struct xfs_btree_cur *cur;
struct xfs_buf *agf_bp = NULL;
- xfs_agnumber_t agno;
xfs_agblock_t agbno;
bool has_other_rmap;
int error;
- agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
/*
* If we are repairing per-inode metadata, we need to read in the AGF
@@ -550,7 +549,7 @@ xrep_reap_block(
* the AGF buffer that the setup functions already grabbed.
*/
if (sc->ip) {
- error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
} else {
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8dae0345c7df..229826b2e1c0 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -92,7 +92,7 @@ xchk_rmapbt_rec(
{
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_rmap_irec irec;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
bool non_inode;
bool is_unwritten;
bool is_bmbt;
@@ -121,8 +121,8 @@ xchk_rmapbt_rec(
* Otherwise we must point somewhere past the static metadata
* but before the end of the FS. Run the regular check.
*/
- if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) ||
- !xfs_verify_agbno(mp, agno, irec.rm_startblock +
+ if (!xfs_verify_agbno(pag, irec.rm_startblock) ||
+ !xfs_verify_agbno(pag, irec.rm_startblock +
irec.rm_blockcount - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
}
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 599ee277bba2..75311f8daeeb 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -41,7 +41,7 @@ xchk_symlink(
if (!S_ISLNK(VFS_I(ip)->i_mode))
return -ENOENT;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
len = ip->i_disk_size;
/* Plausible size? */
@@ -52,8 +52,8 @@ xchk_symlink(
/* Inline symlink? */
if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
- if (len > XFS_IFORK_DSIZE(ip) ||
- len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip)))
+ if (len > xfs_inode_data_fork_size(ip) ||
+ len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip)))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out;
}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 27265771f247..5db87b34fb6e 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -158,6 +158,7 @@ xfs_attr3_node_inactive(
}
child_fsb = be32_to_cpu(ichdr.btree[0].before);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ bp = NULL;
/*
* If this is the node level just above the leaves, simply loop
@@ -211,12 +212,8 @@ xfs_attr3_node_inactive(
&child_bp);
if (error)
return error;
- error = bp->b_error;
- if (error) {
- xfs_trans_brelse(*trans, child_bp);
- return error;
- }
xfs_trans_binval(*trans, child_bp);
+ child_bp = NULL;
/*
* If we're not done, re-read the parent to get the next
@@ -233,6 +230,7 @@ xfs_attr3_node_inactive(
bp->b_addr);
child_fsb = be32_to_cpu(phdr.btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
+ bp = NULL;
}
/*
* Atomically commit the whole invalidate stuff.
@@ -338,7 +336,7 @@ xfs_attr_inactive(
ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
xfs_ilock(dp, lock_mode);
- if (!XFS_IFORK_Q(dp))
+ if (!xfs_inode_has_attr_fork(dp))
goto out_destroy_fork;
xfs_iunlock(dp, lock_mode);
@@ -351,7 +349,7 @@ xfs_attr_inactive(
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(dp, lock_mode);
- if (!XFS_IFORK_Q(dp))
+ if (!xfs_inode_has_attr_fork(dp))
goto out_cancel;
/*
@@ -362,12 +360,11 @@ xfs_attr_inactive(
/*
* Invalidate and truncate the attribute fork extents. Make sure the
- * fork actually has attributes as otherwise the invalidation has no
+ * fork actually has xattr blocks as otherwise the invalidation has no
* blocks to read and returns an error. In this case, just do the fork
* removal below.
*/
- if (xfs_inode_hasattr(dp) &&
- dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_af.if_nextents > 0) {
error = xfs_attr3_root_inactive(&trans, dp);
if (error)
goto out_cancel;
@@ -388,11 +385,7 @@ out_cancel:
xfs_trans_cancel(trans);
out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
- if (dp->i_afp) {
- xfs_idestroy_fork(dp->i_afp);
- kmem_cache_free(xfs_ifork_cache, dp->i_afp);
- dp->i_afp = NULL;
- }
+ xfs_ifork_zap_attr(dp);
if (lock_mode)
xfs_iunlock(dp, lock_mode);
return error;
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 90a14e85e76d..99bbbe1a0e44 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -61,8 +61,7 @@ xfs_attr_shortform_list(
int sbsize, nsbuf, count, i;
int error = 0;
- ASSERT(dp->i_afp != NULL);
- sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
return 0;
@@ -80,7 +79,7 @@ xfs_attr_shortform_list(
*/
if (context->bufsize == 0 ||
(XFS_ISRESET_CURSOR(cursor) &&
- (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+ (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sfe->nameval,
@@ -121,7 +120,7 @@ xfs_attr_shortform_list(
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
if (unlikely(
((char *)sfe < (char *)sf) ||
- ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+ ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe,
@@ -513,7 +512,7 @@ xfs_attr_list_ilocked(
*/
if (!xfs_inode_hasattr(dp))
return 0;
- if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_list(context);
if (xfs_attr_is_leaf(dp))
return xfs_attr_leaf_list(context);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 85e1a26c92e8..04d0c2bff67c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -256,7 +256,7 @@ xfs_bmap_count_blocks(
xfs_filblks_t *count)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
xfs_extlen_t btblocks = 0;
int error;
@@ -439,29 +439,28 @@ xfs_getbmap(
whichfork = XFS_COW_FORK;
else
whichfork = XFS_DATA_FORK;
- ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
switch (whichfork) {
case XFS_ATTR_FORK:
- if (!XFS_IFORK_Q(ip))
- goto out_unlock_iolock;
+ lock = xfs_ilock_attr_map_shared(ip);
+ if (!xfs_inode_has_attr_fork(ip))
+ goto out_unlock_ilock;
max_len = 1LL << 32;
- lock = xfs_ilock_attr_map_shared(ip);
break;
case XFS_COW_FORK:
+ lock = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lock);
+
/* No CoW fork? Just return */
- if (!ifp)
- goto out_unlock_iolock;
+ if (!xfs_ifork_ptr(ip, whichfork))
+ goto out_unlock_ilock;
if (xfs_get_cowextsz_hint(ip))
max_len = mp->m_super->s_maxbytes;
else
max_len = XFS_ISIZE(ip);
-
- lock = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lock);
break;
case XFS_DATA_FORK:
if (!(iflags & BMV_IF_DELALLOC) &&
@@ -491,6 +490,8 @@ xfs_getbmap(
break;
}
+ ifp = xfs_ifork_ptr(ip, whichfork);
+
switch (ifp->if_format) {
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -1320,8 +1321,8 @@ xfs_swap_extents_check_format(
* extent format...
*/
if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_Q(ip) &&
- XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip))
+ if (xfs_inode_has_attr_fork(ip) &&
+ XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
return -EINVAL;
if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return -EINVAL;
@@ -1329,8 +1330,8 @@ xfs_swap_extents_check_format(
/* Reciprocal target->temp btree format checks */
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_Q(tip) &&
- XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+ if (xfs_inode_has_attr_fork(tip) &&
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
return -EINVAL;
if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return -EINVAL;
@@ -1506,15 +1507,15 @@ xfs_swap_extent_forks(
/*
* Count the number of extended attribute blocks
*/
- if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 &&
- ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+ if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
&aforkblks);
if (error)
return error;
}
- if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 &&
- tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+ if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
+ tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
&taforkblks);
if (error)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 5e8f40d8c052..dde346450952 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -5,6 +5,7 @@
*/
#include "xfs.h"
#include <linux/backing-dev.h>
+#include <linux/dax.h>
#include "xfs_shared.h"
#include "xfs_format.h"
@@ -21,7 +22,7 @@
#include "xfs_error.h"
#include "xfs_ag.h"
-static struct kmem_cache *xfs_buf_cache;
+struct kmem_cache *xfs_buf_cache;
/*
* Locking orders
@@ -295,6 +296,16 @@ xfs_buf_free_pages(
}
static void
+xfs_buf_free_callback(
+ struct callback_head *cb)
+{
+ struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
+
+ xfs_buf_free_maps(bp);
+ kmem_cache_free(xfs_buf_cache, bp);
+}
+
+static void
xfs_buf_free(
struct xfs_buf *bp)
{
@@ -307,8 +318,7 @@ xfs_buf_free(
else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
- xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_cache, bp);
+ call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
static int
@@ -503,100 +513,45 @@ xfs_buf_hash_destroy(
rhashtable_destroy(&pag->pag_buf_hash);
}
-/*
- * Look up a buffer in the buffer cache and return it referenced and locked
- * in @found_bp.
- *
- * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
- * cache.
- *
- * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
- * -EAGAIN if we fail to lock it.
- *
- * Return values are:
- * -EFSCORRUPTED if have been supplied with an invalid address
- * -EAGAIN on trylock failure
- * -ENOENT if we fail to find a match and @new_bp was NULL
- * 0, with @found_bp:
- * - @new_bp if we inserted it into the cache
- * - the buffer we found and locked.
- */
static int
-xfs_buf_find(
+xfs_buf_map_verify(
struct xfs_buftarg *btp,
- struct xfs_buf_map *map,
- int nmaps,
- xfs_buf_flags_t flags,
- struct xfs_buf *new_bp,
- struct xfs_buf **found_bp)
+ struct xfs_buf_map *map)
{
- struct xfs_perag *pag;
- struct xfs_buf *bp;
- struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs;
- int i;
-
- *found_bp = NULL;
-
- for (i = 0; i < nmaps; i++)
- cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
- ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
+ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
+ if (map->bm_bn < 0 || map->bm_bn >= eofs) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
- __func__, cmap.bm_bn, eofs);
+ __func__, map->bm_bn, eofs);
WARN_ON(1);
return -EFSCORRUPTED;
}
-
- pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
-
- spin_lock(&pag->pag_buf_lock);
- bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
- xfs_buf_hash_params);
- if (bp) {
- atomic_inc(&bp->b_hold);
- goto found;
- }
-
- /* No match found */
- if (!new_bp) {
- XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
- return -ENOENT;
- }
-
- /* the buffer keeps the perag reference until it is freed */
- new_bp->b_pag = pag;
- rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- *found_bp = new_bp;
return 0;
+}
-found:
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
-
- if (!xfs_buf_trylock(bp)) {
- if (flags & XBF_TRYLOCK) {
- xfs_buf_rele(bp);
- XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
+static int
+xfs_buf_find_lock(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ if (flags & XBF_TRYLOCK) {
+ if (!xfs_buf_trylock(bp)) {
+ XFS_STATS_INC(bp->b_mount, xb_busy_locked);
return -EAGAIN;
}
+ } else {
xfs_buf_lock(bp);
- XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
+ XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
}
/*
@@ -609,57 +564,59 @@ found:
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
}
-
- trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(btp->bt_mount, xb_get_locked);
- *found_bp = bp;
return 0;
}
-struct xfs_buf *
-xfs_buf_incore(
- struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
+static inline int
+xfs_buf_lookup(
+ struct xfs_perag *pag,
+ struct xfs_buf_map *map,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
- struct xfs_buf *bp;
+ struct xfs_buf *bp;
int error;
- DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
- if (error)
- return NULL;
- return bp;
+ rcu_read_lock();
+ bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+ if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ error = xfs_buf_find_lock(bp, flags);
+ if (error) {
+ xfs_buf_rele(bp);
+ return error;
+ }
+
+ trace_xfs_buf_find(bp, flags, _RET_IP_);
+ *bpp = bp;
+ return 0;
}
/*
- * Assembles a buffer covering the specified range. The code is optimised for
- * cache hits, as metadata intensive workloads will see 3 orders of magnitude
- * more hits than misses.
+ * Insert the new_bp into the hash table. This consumes the perag reference
+ * taken for the lookup regardless of the result of the insert.
*/
-int
-xfs_buf_get_map(
- struct xfs_buftarg *target,
+static int
+xfs_buf_find_insert(
+ struct xfs_buftarg *btp,
+ struct xfs_perag *pag,
+ struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- struct xfs_buf *bp;
struct xfs_buf *new_bp;
+ struct xfs_buf *bp;
int error;
- *bpp = NULL;
- error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
- if (!error)
- goto found;
- if (error != -ENOENT)
- return error;
-
- error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
+ error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
if (error)
- return error;
+ goto out_drop_pag;
/*
* For buffers that fit entirely within a single page, first attempt to
@@ -674,18 +631,94 @@ xfs_buf_get_map(
goto out_free_buf;
}
- error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
- if (error)
+ spin_lock(&pag->pag_buf_lock);
+ bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+ &new_bp->b_rhash_head, xfs_buf_hash_params);
+ if (IS_ERR(bp)) {
+ error = PTR_ERR(bp);
+ spin_unlock(&pag->pag_buf_lock);
goto out_free_buf;
+ }
+ if (bp) {
+ /* found an existing buffer */
+ atomic_inc(&bp->b_hold);
+ spin_unlock(&pag->pag_buf_lock);
+ error = xfs_buf_find_lock(bp, flags);
+ if (error)
+ xfs_buf_rele(bp);
+ else
+ *bpp = bp;
+ goto out_free_buf;
+ }
+
+ /* The new buffer keeps the perag reference until it is freed. */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
+ *bpp = new_bp;
+ return 0;
- if (bp != new_bp)
- xfs_buf_free(new_bp);
+out_free_buf:
+ xfs_buf_free(new_bp);
+out_drop_pag:
+ xfs_perag_put(pag);
+ return error;
+}
-found:
+/*
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
+ */
+int
+xfs_buf_get_map(
+ struct xfs_buftarg *btp,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_perag *pag;
+ struct xfs_buf *bp = NULL;
+ struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
+ int error;
+ int i;
+
+ for (i = 0; i < nmaps; i++)
+ cmap.bm_len += map[i].bm_len;
+
+ error = xfs_buf_map_verify(btp, &cmap);
+ if (error)
+ return error;
+
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+
+ error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+ if (error && error != -ENOENT)
+ goto out_put_perag;
+
+ /* cache hits always outnumber misses by at least 10:1 */
+ if (unlikely(!bp)) {
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
+
+ if (flags & XBF_INCORE)
+ goto out_put_perag;
+
+ /* xfs_buf_find_insert() consumes the perag reference. */
+ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+ flags, &bp);
+ if (error)
+ return error;
+ } else {
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
+ xfs_perag_put(pag);
+ }
+
+ /* We do not hold a perag reference anymore. */
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
- xfs_warn_ratelimited(target->bt_mount,
+ xfs_warn_ratelimited(btp->bt_mount,
"%s: failed to map %u pages", __func__,
bp->b_page_count);
xfs_buf_relse(bp);
@@ -700,12 +733,13 @@ found:
if (!(flags & XBF_READ))
xfs_buf_ioerror(bp, 0);
- XFS_STATS_INC(target->bt_mount, xb_get);
+ XFS_STATS_INC(btp->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
*bpp = bp;
return 0;
-out_free_buf:
- xfs_buf_free(new_bp);
+
+out_put_perag:
+ xfs_perag_put(pag);
return error;
}
@@ -1911,7 +1945,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev);
- fs_put_dax(btp->bt_daxdev);
+ fs_put_dax(btp->bt_daxdev, btp->bt_mount);
kmem_free(btp);
}
@@ -1958,13 +1992,18 @@ xfs_alloc_buftarg(
struct block_device *bdev)
{
xfs_buftarg_t *btp;
+ const struct dax_holder_operations *ops = NULL;
+#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
+ ops = &xfs_dax_holder_operations;
+#endif
btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+ mp, ops);
/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
@@ -1986,7 +2025,8 @@ xfs_alloc_buftarg(
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
- if (register_shrinker(&btp->bt_shrinker))
+ if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
+ mp->m_super->s_id))
goto error_pcpu;
return btp;
@@ -2275,29 +2315,6 @@ xfs_buf_delwri_pushbuf(
return error;
}
-int __init
-xfs_buf_init(void)
-{
- xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
- SLAB_HWCACHE_ALIGN |
- SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD,
- NULL);
- if (!xfs_buf_cache)
- goto out;
-
- return 0;
-
- out:
- return -ENOMEM;
-}
-
-void
-xfs_buf_terminate(void)
-{
- kmem_cache_destroy(xfs_buf_cache);
-}
-
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
/*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 1ee3056ff9cf..549c60942208 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -15,6 +15,8 @@
#include <linux/uio.h>
#include <linux/list_lru.h>
+extern struct kmem_cache *xfs_buf_cache;
+
/*
* Base types
*/
@@ -42,9 +44,11 @@ struct xfs_buf;
#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
/* flags used only as arguments to access routines */
+#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
+
typedef unsigned int xfs_buf_flags_t;
#define XFS_BUF_FLAGS \
@@ -63,6 +67,7 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
+ { XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, \
{ XBF_UNMAPPED, "UNMAPPED" }
@@ -193,13 +198,10 @@ struct xfs_buf {
int b_last_error;
const struct xfs_buf_ops *b_ops;
+ struct rcu_head b_rcu;
};
/* Finding and Reading Buffers */
-struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
- xfs_daddr_t blkno, size_t numblks,
- xfs_buf_flags_t flags);
-
int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
@@ -210,6 +212,19 @@ void xfs_buf_readahead_map(struct xfs_buftarg *target,
const struct xfs_buf_ops *ops);
static inline int
+xfs_buf_incore(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+ return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp);
+}
+
+static inline int
xfs_buf_get(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
@@ -294,10 +309,6 @@ extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
-/* Buffer Daemon Setup Routines */
-extern int xfs_buf_init(void);
-extern void xfs_buf_terminate(void);
-
static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
{
return bp->b_maps[0].bm_bn;
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index a7174a5b3203..e295fc8062d8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -248,7 +248,7 @@ xfs_dir2_leaf_readbuf(
struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL;
struct xfs_da_geometry *geo = args->geo;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
struct xfs_bmbt_irec map;
struct blk_plug plug;
xfs_dir2_off_t new_off;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index c6fe3f6ebb6b..bfc829c07f03 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -45,7 +45,7 @@ xfs_trim_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
if (error)
goto out_put_perag;
agf = agbp->b_addr;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 5a6c3c3c4de2..8fb90da89787 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -549,7 +549,7 @@ xfs_dquot_check_type(
* at the same time. The non-user quota file can be switched between
* group and project quota uses depending on the mount options, which
* means that we can encounter the other type when we try to load quota
- * defaults. Quotacheck will soon reset the the entire quota file
+ * defaults. Quotacheck will soon reset the entire quota file
* (including the root dquot) anyway, but don't log scary corruption
* reports to dmesg.
*/
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 765be054dffe..27ccfcd82f04 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -11,6 +11,7 @@
#include "xfs_bit.h"
#include "xfs_shared.h"
#include "xfs_mount.h"
+#include "xfs_ag.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -187,12 +188,12 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
{
xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
uint i;
- uint len = sizeof(xfs_efi_log_format_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
- uint len32 = sizeof(xfs_efi_log_format_32_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
- uint len64 = sizeof(xfs_efi_log_format_64_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
+ uint len = sizeof(xfs_efi_log_format_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
+ uint len32 = sizeof(xfs_efi_log_format_32_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
+ uint len64 = sizeof(xfs_efi_log_format_64_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
if (buf->i_len == len) {
memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
@@ -551,6 +552,7 @@ xfs_agfl_free_finish_item(
xfs_agnumber_t agno;
xfs_agblock_t agbno;
uint next_extent;
+ struct xfs_perag *pag;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
ASSERT(free->xefi_blockcount == 1);
@@ -560,9 +562,11 @@ xfs_agfl_free_finish_item(
trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (!error)
error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
+ xfs_perag_put(pag);
/*
* Mark the transaction dirty, even on error. This ensures the
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8d9b14d2b912..c6c80265c0b2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,6 +25,7 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
@@ -142,7 +143,7 @@ xfs_file_fsync(
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
- int error = 0;
+ int error, err2;
int log_flushed = 0;
trace_xfs_file_fsync(ip);
@@ -163,18 +164,21 @@ xfs_file_fsync(
* inode size in case of an extending write.
*/
if (XFS_IS_REALTIME_INODE(ip))
- blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
+ error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
- blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
* Any inode that has dirty modifications in the log is pinned. The
- * racy check here for a pinned inode while not catch modifications
+ * racy check here for a pinned inode will not catch modifications
* that happen concurrently to the fsync call, but fsync semantics
* only require to sync previously completed I/O.
*/
- if (xfs_ipincount(ip))
- error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+ if (xfs_ipincount(ip)) {
+ err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+ if (err2 && !error)
+ error = err2;
+ }
/*
* If we only have a single device, and the log force about was
@@ -184,8 +188,11 @@ xfs_file_fsync(
* commit.
*/
if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
- mp->m_logdev_targp == mp->m_ddev_targp)
- blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ mp->m_logdev_targp == mp->m_ddev_targp) {
+ err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ if (err2 && !error)
+ error = err2;
+ }
return error;
}
@@ -669,7 +676,7 @@ xfs_file_dax_write(
pos = iocb->ki_pos;
trace_xfs_file_dax_write(iocb, from);
- ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
@@ -806,7 +813,7 @@ xfs_wait_dax_page(
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}
-static int
+int
xfs_break_dax_layouts(
struct inode *inode,
bool *retry)
@@ -1253,6 +1260,31 @@ xfs_file_llseek(
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
+#ifdef CONFIG_FS_DAX
+static int
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+}
+#else
+static int
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return 0;
+}
+#endif
+
/*
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
@@ -1284,10 +1316,7 @@ __xfs_filemap_fault(
pfn_t pfn;
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
- (write_fault && !vmf->cow_page) ?
- &xfs_direct_write_iomap_ops :
- &xfs_read_iomap_ops);
+ ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index be9bcf8a1f99..34b21a29c39b 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -126,7 +126,7 @@ xfs_filestream_pick_ag(
pag = xfs_perag_get(mp, ag);
if (!pag->pagf_init) {
- err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+ err = xfs_alloc_read_agf(pag, NULL, trylock, NULL);
if (err) {
if (err != -EAGAIN) {
xfs_perag_put(pag);
@@ -181,7 +181,7 @@ next_ag:
if (ag != startag)
continue;
- /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
+ /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */
if (trylock != 0) {
trylock = 0;
continue;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index bb23199f65c3..d8337274c74d 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -642,8 +642,7 @@ __xfs_getfsmap_datadev(
info->agf_bp = NULL;
}
- error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0,
- &info->agf_bp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &info->agf_bp);
if (error)
break;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index d4a77c53f94b..13851c0d640b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -41,6 +41,7 @@ xfs_resizefs_init_new_ags(
xfs_agnumber_t oagcount,
xfs_agnumber_t nagcount,
xfs_rfsblock_t delta,
+ struct xfs_perag *last_pag,
bool *lastag_extended)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -73,7 +74,7 @@ xfs_resizefs_init_new_ags(
if (delta) {
*lastag_extended = true;
- error = xfs_ag_extend_space(mp, tp, id, delta);
+ error = xfs_ag_extend_space(last_pag, tp, delta);
}
return error;
}
@@ -96,6 +97,7 @@ xfs_growfs_data_private(
xfs_agnumber_t oagcount;
struct xfs_trans *tp;
struct aghdr_init_data id = {};
+ struct xfs_perag *last_pag;
nb = in->newblocks;
error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
@@ -128,10 +130,9 @@ xfs_growfs_data_private(
return -EINVAL;
oagcount = mp->m_sb.sb_agcount;
-
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
- error = xfs_initialize_perag(mp, nagcount, &nagimax);
+ error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
if (error)
return error;
} else if (nagcount < oagcount) {
@@ -145,15 +146,17 @@ xfs_growfs_data_private(
if (error)
return error;
+ last_pag = xfs_perag_get(mp, oagcount - 1);
if (delta > 0) {
error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
- delta, &lastag_extended);
+ delta, last_pag, &lastag_extended);
} else {
xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
"EXPERIMENTAL online shrink feature in use. Use at your own risk!");
- error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta);
+ error = xfs_ag_shrink_space(last_pag, &tp, -delta);
}
+ xfs_perag_put(last_pag);
if (error)
goto out_trans_cancel;
@@ -528,6 +531,9 @@ xfs_do_force_shutdown(
} else if (flags & SHUTDOWN_CORRUPT_INCORE) {
tag = XFS_PTAG_SHUTDOWN_CORRUPT;
why = "Corruption of in-memory data";
+ } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of on-disk metadata";
} else {
tag = XFS_PTAG_SHUTDOWN_IOERROR;
why = "Metadata I/O Error";
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 2609825d53ee..2bbe7916a998 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -98,8 +98,9 @@ xfs_inode_alloc(
ip->i_ino = ino;
ip->i_mount = mp;
memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
- ip->i_afp = NULL;
ip->i_cowfp = NULL;
+ memset(&ip->i_af, 0, sizeof(ip->i_af));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
memset(&ip->i_df, 0, sizeof(ip->i_df));
ip->i_flags = 0;
ip->i_delayed_blks = 0;
@@ -111,6 +112,8 @@ xfs_inode_alloc(
INIT_WORK(&ip->i_ioend_work, xfs_end_io);
INIT_LIST_HEAD(&ip->i_ioend_list);
spin_lock_init(&ip->i_ioend_lock);
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
return ip;
}
@@ -130,10 +133,8 @@ xfs_inode_free_callback(
break;
}
- if (ip->i_afp) {
- xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_cache, ip->i_afp);
- }
+ xfs_ifork_zap_attr(ip);
+
if (ip->i_cowfp) {
xfs_idestroy_fork(ip->i_cowfp);
kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
@@ -912,6 +913,7 @@ reclaim:
ip->i_checked = 0;
spin_unlock(&ip->i_flags_lock);
+ ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
@@ -1774,7 +1776,7 @@ xfs_check_delalloc(
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
@@ -2219,5 +2221,5 @@ xfs_inodegc_register_shrinker(
shrink->flags = SHRINKER_NONSLAB;
shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
- return register_shrinker(shrink);
+ return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3e1c62ffa4f7..28493c8e9bb2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
+#include "xfs_iunlink_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -125,7 +126,7 @@ xfs_ilock_attr_map_shared(
{
uint lock_mode = XFS_ILOCK_SHARED;
- if (ip->i_afp && xfs_need_iread_extents(ip->i_afp))
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lock_mode);
return lock_mode;
@@ -635,7 +636,7 @@ xfs_ip2xflags(
flags |= FS_XFLAG_COWEXTSIZE;
}
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
flags |= FS_XFLAG_HASATTR;
return flags;
}
@@ -893,7 +894,7 @@ xfs_init_new_inode(
*/
if (init_xattrs && xfs_has_attr(mp)) {
ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
- ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
}
/*
@@ -1293,8 +1294,8 @@ xfs_itruncate_clear_reflink_flags(
if (!xfs_is_reflink_inode(ip))
return;
- dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
if (cfork->if_bytes == 0)
@@ -1643,7 +1644,7 @@ xfs_inode_needs_inactive(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
/*
* If the inode is already free, then there can be nothing
@@ -1762,13 +1763,12 @@ xfs_inactive(
* now. The code calls a routine that recursively deconstructs the
* attribute fork. If also blows away the in-core attribute fork.
*/
- if (XFS_IFORK_Q(ip)) {
+ if (xfs_inode_has_attr_fork(ip)) {
error = xfs_attr_inactive(ip);
if (error)
goto out;
}
- ASSERT(!ip->i_afp);
ASSERT(ip->i_forkoff == 0);
/*
@@ -1801,195 +1801,69 @@ out:
* because we must walk that list to find the inode that points to the inode
* being removed from the unlinked hash bucket list.
*
- * What if we modelled the unlinked list as a collection of records capturing
- * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
- * have a fast way to look up unlinked list predecessors, which avoids the
- * slow list walk. That's exactly what we do here (in-core) with a per-AG
- * rhashtable.
+ * Hence we keep an in-memory double linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
+ * based lists would require having 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
*
- * Because this is a backref cache, we ignore operational failures since the
- * iunlink code can fall back to the slow bucket walk. The only errors that
- * should bubble out are for obviously incorrect situations.
- *
- * All users of the backref cache MUST hold the AGI buffer lock to serialize
- * access or have otherwise provided for concurrency control.
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the double linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
*/
-/* Capture a "X.next_unlinked = Y" relationship. */
-struct xfs_iunlink {
- struct rhash_head iu_rhash_head;
- xfs_agino_t iu_agino; /* X */
- xfs_agino_t iu_next_unlinked; /* Y */
-};
-
-/* Unlinked list predecessor lookup hashtable construction */
-static int
-xfs_iunlink_obj_cmpfn(
- struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const xfs_agino_t *key = arg->key;
- const struct xfs_iunlink *iu = obj;
-
- if (iu->iu_next_unlinked != *key)
- return 1;
- return 0;
-}
-
-static const struct rhashtable_params xfs_iunlink_hash_params = {
- .min_size = XFS_AGI_UNLINKED_BUCKETS,
- .key_len = sizeof(xfs_agino_t),
- .key_offset = offsetof(struct xfs_iunlink,
- iu_next_unlinked),
- .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
- .automatic_shrinking = true,
- .obj_cmpfn = xfs_iunlink_obj_cmpfn,
-};
-
/*
- * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
- * relation is found.
+ * Find an inode on the unlinked list. This does not take references to the
+ * inode as we have existence guarantees by holding the AGI buffer lock and that
+ * only unlinked, referenced inodes can be on the unlinked inode list. If we
+ * don't find the inode in cache, then let the caller handle the situation.
*/
-static xfs_agino_t
-xfs_iunlink_lookup_backref(
+static struct xfs_inode *
+xfs_iunlink_lookup(
struct xfs_perag *pag,
xfs_agino_t agino)
{
- struct xfs_iunlink *iu;
-
- iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
- xfs_iunlink_hash_params);
- return iu ? iu->iu_agino : NULLAGINO;
-}
+ struct xfs_inode *ip;
-/*
- * Take ownership of an iunlink cache entry and insert it into the hash table.
- * If successful, the entry will be owned by the cache; if not, it is freed.
- * Either way, the caller does not own @iu after this call.
- */
-static int
-xfs_iunlink_insert_backref(
- struct xfs_perag *pag,
- struct xfs_iunlink *iu)
-{
- int error;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
- error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
- &iu->iu_rhash_head, xfs_iunlink_hash_params);
/*
- * Fail loudly if there already was an entry because that's a sign of
- * corruption of in-memory data. Also fail loudly if we see an error
- * code we didn't anticipate from the rhashtable code. Currently we
- * only anticipate ENOMEM.
+ * Inode not in memory or in RCU freeing limbo should not happen.
+ * Warn about this and let the caller handle the failure.
*/
- if (error) {
- WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
- kmem_free(iu);
+ if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ rcu_read_unlock();
+ return NULL;
}
- /*
- * Absorb any runtime errors that aren't a result of corruption because
- * this is a cache and we can always fall back to bucket list scanning.
- */
- if (error != 0 && error != -EEXIST)
- error = 0;
- return error;
+ ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
+ rcu_read_unlock();
+ return ip;
}
-/* Remember that @prev_agino.next_unlinked = @this_agino. */
+/* Update the prev pointer of the next agino. */
static int
-xfs_iunlink_add_backref(
+xfs_iunlink_update_backref(
struct xfs_perag *pag,
xfs_agino_t prev_agino,
- xfs_agino_t this_agino)
-{
- struct xfs_iunlink *iu;
-
- if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
- return 0;
-
- iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
- iu->iu_agino = prev_agino;
- iu->iu_next_unlinked = this_agino;
-
- return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/*
- * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
- * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
- * wasn't any such entry then we don't bother.
- */
-static int
-xfs_iunlink_change_backref(
- struct xfs_perag *pag,
- xfs_agino_t agino,
- xfs_agino_t next_unlinked)
+ xfs_agino_t next_agino)
{
- struct xfs_iunlink *iu;
- int error;
-
- /* Look up the old entry; if there wasn't one then exit. */
- iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
- xfs_iunlink_hash_params);
- if (!iu)
- return 0;
-
- /*
- * Remove the entry. This shouldn't ever return an error, but if we
- * couldn't remove the old entry we don't want to add it again to the
- * hash table, and if the entry disappeared on us then someone's
- * violated the locking rules and we need to fail loudly. Either way
- * we cannot remove the inode because internal state is or would have
- * been corrupt.
- */
- error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
- &iu->iu_rhash_head, xfs_iunlink_hash_params);
- if (error)
- return error;
+ struct xfs_inode *ip;
- /* If there is no new next entry just free our item and return. */
- if (next_unlinked == NULLAGINO) {
- kmem_free(iu);
+ /* No update necessary if we are at the end of the list. */
+ if (next_agino == NULLAGINO)
return 0;
- }
- /* Update the entry and re-add it to the hash table. */
- iu->iu_next_unlinked = next_unlinked;
- return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/* Set up the in-core predecessor structures. */
-int
-xfs_iunlink_init(
- struct xfs_perag *pag)
-{
- return rhashtable_init(&pag->pagi_unlinked_hash,
- &xfs_iunlink_hash_params);
-}
-
-/* Free the in-core predecessor structures. */
-static void
-xfs_iunlink_free_item(
- void *ptr,
- void *arg)
-{
- struct xfs_iunlink *iu = ptr;
- bool *freed_anything = arg;
-
- *freed_anything = true;
- kmem_free(iu);
-}
-
-void
-xfs_iunlink_destroy(
- struct xfs_perag *pag)
-{
- bool freed_anything = false;
-
- rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
- xfs_iunlink_free_item, &freed_anything);
-
- ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount));
+ ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!ip)
+ return -EFSCORRUPTED;
+ ip->i_prev_unlinked = prev_agino;
+ return 0;
}
/*
@@ -2008,7 +1882,7 @@ xfs_iunlink_update_bucket(
xfs_agino_t old_value;
int offset;
- ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino));
+ ASSERT(xfs_verify_agino_or_null(pag, new_agino));
old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
@@ -2031,88 +1905,53 @@ xfs_iunlink_update_bucket(
return 0;
}
-/* Set an on-disk inode's next_unlinked pointer. */
-STATIC void
-xfs_iunlink_update_dinode(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_agino_t agino,
- struct xfs_buf *ibp,
- struct xfs_dinode *dip,
- struct xfs_imap *imap,
- xfs_agino_t next_agino)
-{
- struct xfs_mount *mp = tp->t_mountp;
- int offset;
-
- ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
-
- trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino,
- be32_to_cpu(dip->di_next_unlinked), next_agino);
-
- dip->di_next_unlinked = cpu_to_be32(next_agino);
- offset = imap->im_boffset +
- offsetof(struct xfs_dinode, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
-}
-
-/* Set an in-core inode's unlinked pointer and return the old value. */
-STATIC int
-xfs_iunlink_update_inode(
+static int
+xfs_iunlink_insert_inode(
struct xfs_trans *tp,
- struct xfs_inode *ip,
struct xfs_perag *pag,
- xfs_agino_t next_agino,
- xfs_agino_t *old_next_agino)
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_dinode *dip;
- struct xfs_buf *ibp;
- xfs_agino_t old_value;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t next_agino;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
- ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
-
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
- if (error)
- return error;
- dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
-
- /* Make sure the old pointer isn't garbage. */
- old_value = be32_to_cpu(dip->di_next_unlinked);
- if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
- sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- goto out;
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
+ */
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(pag, next_agino)) {
+ xfs_buf_mark_corrupt(agibp);
+ return -EFSCORRUPTED;
}
/*
- * Since we're updating a linked list, we should never find that the
- * current pointer is the same as the new value, unless we're
- * terminating the list.
+ * Update the prev pointer in the next inode to point back to this
+ * inode.
*/
- *old_next_agino = old_value;
- if (old_value == next_agino) {
- if (next_agino != NULLAGINO) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
- dip, sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- }
- goto out;
+ error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error)
+ return error;
+
+ if (next_agino != NULLAGINO) {
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
+ if (error)
+ return error;
+ ip->i_next_unlinked = next_agino;
}
- /* Ok, update the new pointer. */
- xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
- ibp, dip, &ip->i_imap, next_agino);
- return 0;
-out:
- xfs_trans_brelse(tp, ibp);
- return error;
+ /* Point the head of the list to point to this inode. */
+ return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
}
/*
@@ -2129,11 +1968,7 @@ xfs_iunlink(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_perag *pag;
- struct xfs_agi *agi;
struct xfs_buf *agibp;
- xfs_agino_t next_agino;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2143,202 +1978,38 @@ xfs_iunlink(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
goto out;
- agi = agibp->b_addr;
-
- /*
- * Get the index into the agi hash table for the list this inode will
- * go on. Make sure the pointer isn't garbage and that this inode
- * isn't already on the list.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (next_agino == agino ||
- !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) {
- xfs_buf_mark_corrupt(agibp);
- error = -EFSCORRUPTED;
- goto out;
- }
-
- if (next_agino != NULLAGINO) {
- xfs_agino_t old_agino;
-
- /*
- * There is already another inode in the bucket, so point this
- * inode to the current head of the list.
- */
- error = xfs_iunlink_update_inode(tp, ip, pag, next_agino,
- &old_agino);
- if (error)
- goto out;
- ASSERT(old_agino == NULLAGINO);
- /*
- * agino has been unlinked, add a backref from the next inode
- * back to agino.
- */
- error = xfs_iunlink_add_backref(pag, agino, next_agino);
- if (error)
- goto out;
- }
-
- /* Point the head of the list to point to this inode. */
- error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+ error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
out:
xfs_perag_put(pag);
return error;
}
-/* Return the imap, dinode pointer, and buffer for an inode. */
-STATIC int
-xfs_iunlink_map_ino(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agino_t agino,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = tp->t_mountp;
- int error;
-
- imap->im_blkno = 0;
- error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
-
- error = xfs_imap_to_bp(mp, tp, imap, bpp);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
-
- *dipp = xfs_buf_offset(*bpp, imap->im_boffset);
- return 0;
-}
-
-/*
- * Walk the unlinked chain from @head_agino until we find the inode that
- * points to @target_agino. Return the inode number, map, dinode pointer,
- * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
- *
- * @tp, @pag, @head_agino, and @target_agino are input parameters.
- * @agino, @imap, @dipp, and @bpp are all output parameters.
- *
- * Do not call this function if @target_agino is the head of the list.
- */
-STATIC int
-xfs_iunlink_map_prev(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_agino_t head_agino,
- xfs_agino_t target_agino,
- xfs_agino_t *agino,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agino_t next_agino;
- int error;
-
- ASSERT(head_agino != target_agino);
- *bpp = NULL;
-
- /* See if our backref cache can find it faster. */
- *agino = xfs_iunlink_lookup_backref(pag, target_agino);
- if (*agino != NULLAGINO) {
- error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap,
- dipp, bpp);
- if (error)
- return error;
-
- if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
- return 0;
-
- /*
- * If we get here the cache contents were corrupt, so drop the
- * buffer and fall back to walking the bucket list.
- */
- xfs_trans_brelse(tp, *bpp);
- *bpp = NULL;
- WARN_ON_ONCE(1);
- }
-
- trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno);
-
- /* Otherwise, walk the entire bucket until we find it. */
- next_agino = head_agino;
- while (next_agino != target_agino) {
- xfs_agino_t unlinked_agino;
-
- if (*bpp)
- xfs_trans_brelse(tp, *bpp);
-
- *agino = next_agino;
- error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap,
- dipp, bpp);
- if (error)
- return error;
-
- unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
- /*
- * Make sure this pointer is valid and isn't an obvious
- * infinite loop.
- */
- if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) ||
- next_agino == unlinked_agino) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- *dipp, sizeof(**dipp));
- error = -EFSCORRUPTED;
- return error;
- }
- next_agino = unlinked_agino;
- }
-
- return 0;
-}
-
-/*
- * Pull the on-disk inode from the AGI unlinked list.
- */
-STATIC int
-xfs_iunlink_remove(
+static int
+xfs_iunlink_remove_inode(
struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_buf *agibp,
struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi;
- struct xfs_buf *agibp;
- struct xfs_buf *last_ibp;
- struct xfs_dinode *last_dip = NULL;
+ struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- xfs_agino_t next_agino;
xfs_agino_t head_agino;
short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
trace_xfs_iunlink_remove(ip);
- /* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
- if (error)
- return error;
- agi = agibp->b_addr;
-
/*
* Get the index into the agi hash table for the list this inode will
* go on. Make sure the head pointer isn't garbage.
*/
head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) {
+ if (!xfs_verify_agino(pag, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
@@ -2349,52 +2020,60 @@ xfs_iunlink_remove(
* the old pointer value so that we can update whatever was previous
* to us in the list to point to whatever was next in the list.
*/
- error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino);
+ error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
if (error)
return error;
/*
- * If there was a backref pointing from the next inode back to this
- * one, remove it because we've removed this inode from the list.
- *
- * Later, if this inode was in the middle of the list we'll update
- * this inode's backref to point from the next inode.
+ * Update the prev pointer in the next inode to point back to previous
+ * inode in the chain.
*/
- if (next_agino != NULLAGINO) {
- error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO);
- if (error)
- return error;
- }
+ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
+ if (error)
+ return error;
if (head_agino != agino) {
- struct xfs_imap imap;
- xfs_agino_t prev_agino;
+ struct xfs_inode *prev_ip;
- /* We need to search the list for the inode being freed. */
- error = xfs_iunlink_map_prev(tp, pag, head_agino, agino,
- &prev_agino, &imap, &last_dip, &last_ibp);
- if (error)
- return error;
+ prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+ if (!prev_ip)
+ return -EFSCORRUPTED;
- /* Point the previous inode on the list to the next inode. */
- xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp,
- last_dip, &imap, next_agino);
-
- /*
- * Now we deal with the backref for this inode. If this inode
- * pointed at a real inode, change the backref that pointed to
- * us to point to our old next. If this inode was the end of
- * the list, delete the backref that pointed to us. Note that
- * change_backref takes care of deleting the backref if
- * next_agino is NULLAGINO.
- */
- return xfs_iunlink_change_backref(agibp->b_pag, agino,
- next_agino);
+ error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+ ip->i_next_unlinked);
+ prev_ip->i_next_unlinked = ip->i_next_unlinked;
+ } else {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+ ip->i_next_unlinked);
}
- /* Point the head of the list to the next unlinked inode. */
- return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
- next_agino);
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
+ return error;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+STATIC int
+xfs_iunlink_remove(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *agibp;
+ int error;
+
+ trace_xfs_iunlink_remove(ip);
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ return error;
+
+ return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
}
/*
@@ -3032,10 +2711,12 @@ out_trans_abort:
static int
xfs_rename_alloc_whiteout(
struct user_namespace *mnt_userns,
+ struct xfs_name *src_name,
struct xfs_inode *dp,
struct xfs_inode **wip)
{
struct xfs_inode *tmpfile;
+ struct qstr name;
int error;
error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
@@ -3043,6 +2724,15 @@ xfs_rename_alloc_whiteout(
if (error)
return error;
+ name.name = src_name->name;
+ name.len = src_name->len;
+ error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
+ if (error) {
+ xfs_finish_inode_setup(tmpfile);
+ xfs_irele(tmpfile);
+ return error;
+ }
+
/*
* Prepare the tmpfile inode as if it were created through the VFS.
* Complete the inode setup and flag it as linkable. nlink is already
@@ -3093,7 +2783,8 @@ xfs_rename(
* appropriately.
*/
if (flags & RENAME_WHITEOUT) {
- error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip);
+ error = xfs_rename_alloc_whiteout(mnt_userns, src_name,
+ target_dp, &wip);
if (error)
return error;
@@ -3229,11 +2920,13 @@ retry:
if (inodes[i] == wip ||
(inodes[i] == target_ip &&
(VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
- struct xfs_buf *bp;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+ struct xfs_buf *bp;
- agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
- error = xfs_read_agi(mp, tp, agno, &bp);
+ pag = xfs_perag_get(mp,
+ XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
+ error = xfs_read_agi(pag, tp, &bp);
+ xfs_perag_put(pag);
if (error)
goto out_trans_cancel;
}
@@ -3452,13 +3145,13 @@ xfs_iflush(
goto flush_out;
}
}
- if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
+ if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %llu, "
"total extents = %llu nblocks = %lld, ptr "PTR_FMT,
__func__, ip->i_ino,
- ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
+ ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
ip->i_nblocks, ip);
goto flush_out;
}
@@ -3488,7 +3181,8 @@ xfs_iflush(
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
xfs_ifork_verify_local_data(ip))
goto flush_out;
- if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
+ if (xfs_inode_has_attr_fork(ip) &&
+ ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
xfs_ifork_verify_local_attr(ip))
goto flush_out;
@@ -3506,7 +3200,7 @@ xfs_iflush(
}
xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
/*
@@ -3753,6 +3447,50 @@ retry:
return 0;
}
+static int
+xfs_mmaplock_two_inodes_and_break_dax_layout(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int error;
+ bool retry;
+ struct page *page;
+
+ if (ip1->i_ino > ip2->i_ino)
+ swap(ip1, ip2);
+
+again:
+ retry = false;
+ /* Lock the first inode */
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+ error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
+ if (error || retry) {
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ if (error == 0 && retry)
+ goto again;
+ return error;
+ }
+
+ if (ip1 == ip2)
+ return 0;
+
+ /* Nested lock the second inode */
+ xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
+ /*
+ * We cannot use xfs_break_dax_layouts() directly here because it may
+ * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
+ * for this nested lock case.
+ */
+ page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
+ if (page && page_ref_count(page) != 1) {
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ goto again;
+ }
+
+ return 0;
+}
+
/*
* Lock two inodes so that userspace cannot initiate I/O via file syscalls or
* mmap activity.
@@ -3767,8 +3505,19 @@ xfs_ilock2_io_mmap(
ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
if (ret)
return ret;
- filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
- VFS_I(ip2)->i_mapping);
+
+ if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
+ ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
+ if (ret) {
+ inode_unlock(VFS_I(ip2));
+ if (ip1 != ip2)
+ inode_unlock(VFS_I(ip1));
+ return ret;
+ }
+ } else
+ filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
+
return 0;
}
@@ -3778,8 +3527,14 @@ xfs_iunlock2_io_mmap(
struct xfs_inode *ip1,
struct xfs_inode *ip2)
{
- filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
- VFS_I(ip2)->i_mapping);
+ if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ if (ip1 != ip2)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ } else
+ filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
+
inode_unlock(VFS_I(ip2));
if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7be6f8e705ab..fa780f08dc89 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -33,9 +33,9 @@ typedef struct xfs_inode {
struct xfs_imap i_imap; /* location for xfs_imap() */
/* Extent information. */
- struct xfs_ifork *i_afp; /* attribute fork pointer */
struct xfs_ifork *i_cowfp; /* copy on write extents */
struct xfs_ifork i_df; /* data fork */
+ struct xfs_ifork i_af; /* attribute fork */
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
@@ -68,6 +68,10 @@ typedef struct xfs_inode {
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
+ /* unlinked list pointers */
+ xfs_agino_t i_next_unlinked;
+ xfs_agino_t i_prev_unlinked;
+
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
@@ -77,6 +81,66 @@ typedef struct xfs_inode {
struct list_head i_ioend_list;
} xfs_inode_t;
+static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
+{
+ return ip->i_forkoff > 0;
+}
+
+static inline struct xfs_ifork *
+xfs_ifork_ptr(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return &ip->i_df;
+ case XFS_ATTR_FORK:
+ if (!xfs_inode_has_attr_fork(ip))
+ return NULL;
+ return &ip->i_af;
+ case XFS_COW_FORK:
+ return ip->i_cowfp;
+ default:
+ ASSERT(0);
+ return NULL;
+ }
+}
+
+static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip)
+{
+ return ip->i_forkoff << 3;
+}
+
+static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip)
+{
+ if (xfs_inode_has_attr_fork(ip))
+ return xfs_inode_fork_boff(ip);
+
+ return XFS_LITINO(ip->i_mount);
+}
+
+static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip)
+{
+ if (xfs_inode_has_attr_fork(ip))
+ return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip);
+ return 0;
+}
+
+static inline unsigned int
+xfs_inode_fork_size(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_inode_data_fork_size(ip);
+ case XFS_ATTR_FORK:
+ return xfs_inode_attr_fork_size(ip);
+ default:
+ return 0;
+ }
+}
+
/* Convert from vfs inode to xfs inode */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
@@ -467,6 +531,7 @@ xfs_itruncate_extents(
}
/* from xfs_file.c */
+int xfs_break_dax_layouts(struct inode *inode, bool *retry);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
@@ -505,9 +570,6 @@ extern struct kmem_cache *xfs_inode_cache;
bool xfs_inode_needs_inactive(struct xfs_inode *ip);
-int xfs_iunlink_init(struct xfs_perag *pag);
-void xfs_iunlink_destroy(struct xfs_perag *pag);
-
void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 721def0639fd..6e19ece916bf 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -57,7 +57,7 @@ xfs_inode_item_data_fork_size(
ip->i_df.if_nextents > 0 &&
ip->i_df.if_bytes > 0) {
/* worst case, doesn't subtract delalloc extents */
- *nbytes += XFS_IFORK_DSIZE(ip);
+ *nbytes += xfs_inode_data_fork_size(ip);
*nvecs += 1;
}
break;
@@ -92,27 +92,27 @@ xfs_inode_item_attr_fork_size(
{
struct xfs_inode *ip = iip->ili_inode;
- switch (ip->i_afp->if_format) {
+ switch (ip->i_af.if_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
- ip->i_afp->if_nextents > 0 &&
- ip->i_afp->if_bytes > 0) {
+ ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_bytes > 0) {
/* worst case, doesn't subtract unused space */
- *nbytes += XFS_IFORK_ASIZE(ip);
+ *nbytes += xfs_inode_attr_fork_size(ip);
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
- ip->i_afp->if_broot_bytes > 0) {
- *nbytes += ip->i_afp->if_broot_bytes;
+ ip->i_af.if_broot_bytes > 0) {
+ *nbytes += ip->i_af.if_broot_bytes;
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
- ip->i_afp->if_bytes > 0) {
- *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes);
+ ip->i_af.if_bytes > 0) {
+ *nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes);
*nvecs += 1;
}
break;
@@ -143,7 +143,7 @@ xfs_inode_item_size(
xfs_log_dinode_size(ip->i_mount);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
}
@@ -237,18 +237,18 @@ xfs_inode_item_format_attr_fork(
struct xfs_inode *ip = iip->ili_inode;
size_t data_bytes;
- switch (ip->i_afp->if_format) {
+ switch (ip->i_af.if_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
- ip->i_afp->if_nextents > 0 &&
- ip->i_afp->if_bytes > 0) {
+ ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_bytes > 0) {
struct xfs_bmbt_rec *p;
- ASSERT(xfs_iext_count(ip->i_afp) ==
- ip->i_afp->if_nextents);
+ ASSERT(xfs_iext_count(&ip->i_af) ==
+ ip->i_af.if_nextents);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
@@ -265,13 +265,13 @@ xfs_inode_item_format_attr_fork(
~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
- ip->i_afp->if_broot_bytes > 0) {
- ASSERT(ip->i_afp->if_broot != NULL);
+ ip->i_af.if_broot_bytes > 0) {
+ ASSERT(ip->i_af.if_broot != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
- ip->i_afp->if_broot,
- ip->i_afp->if_broot_bytes);
- ilf->ilf_asize = ip->i_afp->if_broot_bytes;
+ ip->i_af.if_broot,
+ ip->i_af.if_broot_bytes);
+ ilf->ilf_asize = ip->i_af.if_broot_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ABROOT;
@@ -282,12 +282,12 @@ xfs_inode_item_format_attr_fork(
~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
- ip->i_afp->if_bytes > 0) {
- ASSERT(ip->i_afp->if_u1.if_data != NULL);
+ ip->i_af.if_bytes > 0) {
+ ASSERT(ip->i_af.if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
- ip->i_afp->if_u1.if_data,
- ip->i_afp->if_bytes);
- ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes;
+ ip->i_af.if_u1.if_data,
+ ip->i_af.if_bytes);
+ ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
@@ -355,11 +355,11 @@ xfs_inode_to_log_dinode_iext_counters(
{
if (xfs_inode_has_large_extent_counts(ip)) {
to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
- to->di_big_anextents = xfs_ifork_nextents(ip->i_afp);
+ to->di_big_anextents = xfs_ifork_nextents(&ip->i_af);
to->di_nrext64_pad = 0;
} else {
to->di_nextents = xfs_ifork_nextents(&ip->i_df);
- to->di_anextents = xfs_ifork_nextents(ip->i_afp);
+ to->di_anextents = xfs_ifork_nextents(&ip->i_af);
}
}
@@ -390,7 +390,7 @@ xfs_inode_to_log_dinode(
to->di_nblocks = ip->i_nblocks;
to->di_extsize = ip->i_extsize;
to->di_forkoff = ip->i_forkoff;
- to->di_aformat = xfs_ifork_format(ip->i_afp);
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
to->di_flags = ip->i_diflags;
xfs_copy_dm_fields_to_log_dinode(ip, to);
@@ -480,7 +480,7 @@ xfs_inode_item_format(
xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
- if (XFS_IFORK_Q(ip)) {
+ if (xfs_inode_has_attr_fork(ip)) {
xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
} else {
iip->ili_fields &=
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0d67ff8a8961..1f783e979629 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -955,6 +955,7 @@ xfs_ioc_ag_geometry(
struct xfs_mount *mp,
void __user *arg)
{
+ struct xfs_perag *pag;
struct xfs_ag_geometry ageo;
int error;
@@ -965,7 +966,12 @@ xfs_ioc_ag_geometry(
if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
return -EINVAL;
- error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
+ pag = xfs_perag_get(mp, ageo.ag_number);
+ if (!pag)
+ return -EINVAL;
+
+ error = xfs_ag_get_geometry(pag, &ageo);
+ xfs_perag_put(pag);
if (error)
return error;
@@ -985,7 +991,7 @@ xfs_fill_fsxattr(
struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
fileattr_fill_xflags(fa, xfs_ip2xflags(ip));
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5d50fed291b4..07da03976ec1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -159,7 +159,7 @@ xfs_iomap_eof_align_last_fsb(
struct xfs_inode *ip,
xfs_fileoff_t end_fsb)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
xfs_extlen_t align = xfs_eof_alignment(ip);
struct xfs_bmbt_irec irec;
@@ -370,7 +370,7 @@ xfs_iomap_prealloc_size(
struct xfs_iext_cursor ncur = *icur;
struct xfs_bmbt_irec prev, got;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
int64_t freesp;
xfs_fsblock_t qblocks;
@@ -773,7 +773,8 @@ xfs_direct_write_iomap_begin(
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode, flags & IOMAP_DIRECT);
+ &lockmode,
+ (flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
if (shared)
@@ -868,6 +869,33 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
};
static int
+xfs_dax_write_iomap_end(
+ struct inode *inode,
+ loff_t pos,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (!xfs_is_cow_inode(ip))
+ return 0;
+
+ if (!written) {
+ xfs_reflink_cancel_cow_range(ip, pos, length, true);
+ return 0;
+ }
+
+ return xfs_reflink_end_cow(ip, pos, written);
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+ .iomap_end = xfs_dax_write_iomap_end,
+};
+
+static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -1310,12 +1338,12 @@ xfs_xattr_iomap_begin(
lockmode = xfs_ilock_attr_map_shared(ip);
/* if there are no attribute fork or extents, return ENOENT */
- if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) {
+ if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) {
error = -ENOENT;
goto out_unlock;
}
- ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL);
+ ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e88dc162c785..c782e8c0479c 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -51,5 +51,6 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a7402f6ea510..45518b8c613c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -75,9 +75,8 @@ xfs_initxattrs(
* these attrs can be journalled at inode creation time (along with the
* inode, of course, such that log replay can't cause these to be lost).
*/
-
-STATIC int
-xfs_init_security(
+int
+xfs_inode_init_security(
struct inode *inode,
struct inode *dir,
const struct qstr *qstr)
@@ -122,7 +121,7 @@ xfs_cleanup_inode(
/* Oh, the horror.
* If we can't add the ACL or we fail in
- * xfs_init_security we must back out.
+ * xfs_inode_init_security we must back out.
* ENOSPC can hit here, among other things.
*/
xfs_dentry_to_name(&teardown, dentry);
@@ -208,7 +207,7 @@ xfs_generic_create(
inode = VFS_I(ip);
- error = xfs_init_security(inode, dir, &dentry->d_name);
+ error = xfs_inode_init_security(inode, dir, &dentry->d_name);
if (unlikely(error))
goto out_cleanup_inode;
@@ -424,7 +423,7 @@ xfs_vn_symlink(
inode = VFS_I(cip);
- error = xfs_init_security(inode, dir, &dentry->d_name);
+ error = xfs_inode_init_security(inode, dir, &dentry->d_name);
if (unlikely(error))
goto out_cleanup_inode;
@@ -1282,7 +1281,7 @@ xfs_setup_inode(
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
- if (!XFS_IFORK_Q(ip)) {
+ if (!xfs_inode_has_attr_fork(ip)) {
inode_has_no_xattr(inode);
cache_no_acl(inode);
}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 278949056048..cb5fc68c9ea0 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -17,4 +17,7 @@ extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *vap);
+int xfs_inode_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr);
+
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f74c9fff72bb..36312b00b164 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -111,8 +111,8 @@ xfs_bulkstat_one_int(
buf->bs_extents64 = nextents;
xfs_bulkstat_health(ip, buf);
- buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
- buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+ buf->bs_aextents = xfs_ifork_nextents(&ip->i_af);
+ buf->bs_forkoff = xfs_inode_fork_boff(ip);
buf->bs_version = XFS_BULKSTAT_VERSION_V5;
if (xfs_has_v3inodes(mp)) {
diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c
new file mode 100644
index 000000000000..43005ce8bd48
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_ag.h"
+#include "xfs_iunlink_item.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+struct kmem_cache *xfs_iunlink_cache;
+
+static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_iunlink_item, item);
+}
+
+static void
+xfs_iunlink_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_iunlink_item *iup = IUL_ITEM(lip);
+
+ xfs_perag_put(iup->pag);
+ kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip));
+}
+
+
+static uint64_t
+xfs_iunlink_item_sort(
+ struct xfs_log_item *lip)
+{
+ return IUL_ITEM(lip)->ip->i_ino;
+}
+
+/*
+ * Look up the inode cluster buffer and log the on-disk unlinked inode change
+ * we need to make.
+ */
+static int
+xfs_iunlink_log_dinode(
+ struct xfs_trans *tp,
+ struct xfs_iunlink_item *iup)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip = iup->ip;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ int offset;
+ int error;
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+ if (error)
+ return error;
+ /*
+ * Don't log the unlinked field on stale buffers as this may be the
+ * transaction that frees the inode cluster and relogging the buffer
+ * here will incorrectly remove the stale state.
+ */
+ if (ibp->b_flags & XBF_STALE)
+ goto out;
+
+ dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+
+ /* Make sure the old pointer isn't garbage. */
+ if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno,
+ XFS_INO_TO_AGINO(mp, ip->i_ino),
+ be32_to_cpu(dip->di_next_unlinked), iup->next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(iup->next_agino);
+ offset = ip->i_imap.im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * On precommit, we grab the inode cluster buffer for the inode number we were
+ * passed, then update the next unlinked field for that inode in the buffer and
+ * log the buffer. This ensures that the inode cluster buffer was logged in the
+ * correct order w.r.t. other inode cluster buffers. We can then remove the
+ * iunlink item from the transaction and release it as it is has now served it's
+ * purpose.
+ */
+static int
+xfs_iunlink_item_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_iunlink_item *iup = IUL_ITEM(lip);
+ int error;
+
+ error = xfs_iunlink_log_dinode(tp, iup);
+ list_del(&lip->li_trans);
+ xfs_iunlink_item_release(lip);
+ return error;
+}
+
+static const struct xfs_item_ops xfs_iunlink_item_ops = {
+ .iop_release = xfs_iunlink_item_release,
+ .iop_sort = xfs_iunlink_item_sort,
+ .iop_precommit = xfs_iunlink_item_precommit,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the iunlink item.
+ */
+int
+xfs_iunlink_log_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_iunlink_item *iup;
+
+ ASSERT(xfs_verify_agino_or_null(pag, next_agino));
+ ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked));
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ if (ip->i_next_unlinked == next_agino) {
+ if (next_agino != NULLAGINO)
+ return -EFSCORRUPTED;
+ return 0;
+ }
+
+ iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK,
+ &xfs_iunlink_item_ops);
+
+ iup->ip = ip;
+ iup->next_agino = next_agino;
+ iup->old_agino = ip->i_next_unlinked;
+
+ atomic_inc(&pag->pag_ref);
+ iup->pag = pag;
+
+ xfs_trans_add_item(tp, &iup->item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &iup->item.li_flags);
+ return 0;
+}
+
diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h
new file mode 100644
index 000000000000..c793cdcaccde
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef XFS_IUNLINK_ITEM_H
+#define XFS_IUNLINK_ITEM_H 1
+
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_perag;
+
+/* in memory log item structure */
+struct xfs_iunlink_item {
+ struct xfs_log_item item;
+ struct xfs_inode *ip;
+ struct xfs_perag *pag;
+ xfs_agino_t next_agino;
+ xfs_agino_t old_agino;
+};
+
+extern struct kmem_cache *xfs_iunlink_cache;
+
+int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_perag *pag, xfs_agino_t next_agino);
+
+#endif /* XFS_IUNLINK_ITEM_H */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ae904b21e9cc..386b0307aed8 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -57,7 +57,8 @@ xlog_grant_push_ail(
STATIC void
xlog_sync(
struct xlog *log,
- struct xlog_in_core *iclog);
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
#if defined(DEBUG)
STATIC void
xlog_verify_grant_tail(
@@ -567,7 +568,8 @@ xlog_state_shutdown_callbacks(
int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
{
xfs_lsn_t tail_lsn;
bool last_ref;
@@ -614,7 +616,7 @@ xlog_state_release_iclog(
trace_xlog_iclog_syncing(iclog, _RET_IP_);
spin_unlock(&log->l_icloglock);
- xlog_sync(log, iclog);
+ xlog_sync(log, iclog, ticket);
spin_lock(&log->l_icloglock);
return 0;
}
@@ -881,7 +883,7 @@ xlog_force_iclog(
iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
if (iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
- return xlog_state_release_iclog(iclog->ic_log, iclog);
+ return xlog_state_release_iclog(iclog->ic_log, iclog, NULL);
}
/*
@@ -944,6 +946,8 @@ xlog_write_unmount_record(
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
+ LIST_HEAD(lv_chain);
+ list_add(&vec.lv_list, &lv_chain);
BUILD_BUG_ON((sizeof(struct xlog_op_header) +
sizeof(struct xfs_unmount_log_format)) !=
@@ -952,7 +956,7 @@ xlog_write_unmount_record(
/* account for space used by record data */
ticket->t_curr_res -= sizeof(unmount_rec);
- return xlog_write(log, NULL, &vec, ticket, reg.i_len);
+ return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len);
}
/*
@@ -1921,9 +1925,17 @@ xlog_write_iclog(
* device cache first to ensure all metadata writeback covered
* by the LSN in this iclog is on stable storage. This is slow,
* but it *must* complete before we issue the external log IO.
+ *
+ * If the flush fails, we cannot conclude that past metadata
+ * writeback from the log succeeded. Repeating the flush is
+ * not possible, hence we must shut down with log IO error to
+ * avoid shutdown re-entering this path and erroring out again.
*/
- if (log->l_targ != log->l_mp->m_ddev_targp)
- blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev);
+ if (log->l_targ != log->l_mp->m_ddev_targp &&
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
}
if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
iclog->ic_bio.bi_opf |= REQ_FUA;
@@ -2000,7 +2012,7 @@ xlog_calc_iclog_size(
}
/*
- * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
+ * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
* fashion. Previously, we should have moved the current iclog
* ptr in the log to point to the next available iclog. This allows further
* write to continue while this code syncs out an iclog ready to go.
@@ -2025,7 +2037,8 @@ xlog_calc_iclog_size(
STATIC void
xlog_sync(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
{
unsigned int count; /* byte count of bwrite */
unsigned int roundoff; /* roundoff to BB or stripe */
@@ -2037,12 +2050,20 @@ xlog_sync(
count = xlog_calc_iclog_size(log, iclog, &roundoff);
- /* move grant heads by roundoff in sync */
- xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
- xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ /*
+ * If we have a ticket, account for the roundoff via the ticket
+ * reservation to avoid touching the hot grant heads needlessly.
+ * Otherwise, we have to move grant heads directly.
+ */
+ if (ticket) {
+ ticket->t_curr_res -= roundoff;
+ } else {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+ xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ }
/* put cycle number in every block */
- xlog_pack_data(log, iclog, roundoff);
+ xlog_pack_data(log, iclog, roundoff);
/* real byte length */
size = iclog->ic_offset;
@@ -2275,7 +2296,7 @@ xlog_write_get_more_iclog_space(
spin_lock(&log->l_icloglock);
ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -2471,13 +2492,13 @@ int
xlog_write(
struct xlog *log,
struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *log_vector,
+ struct list_head *lv_chain,
struct xlog_ticket *ticket,
uint32_t len)
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_vec *lv = log_vector;
+ struct xfs_log_vec *lv;
uint32_t record_cnt = 0;
uint32_t data_cnt = 0;
int error = 0;
@@ -2505,7 +2526,7 @@ xlog_write(
if (ctx)
xlog_cil_set_ctx_write_state(ctx, iclog);
- while (lv) {
+ list_for_each_entry(lv, lv_chain, lv_list) {
/*
* If the entire log vec does not fit in the iclog, punt it to
* the partial copy loop which can handle this case.
@@ -2526,7 +2547,6 @@ xlog_write(
xlog_write_full(lv, ticket, iclog, &log_offset,
&len, &record_cnt, &data_cnt);
}
- lv = lv->lv_next;
}
ASSERT(len == 0);
@@ -2538,7 +2558,7 @@ xlog_write(
*/
spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, 0);
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
return error;
@@ -2958,7 +2978,7 @@ restart:
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -3406,7 +3426,8 @@ xfs_log_ticket_get(
static int
xlog_calc_unit_res(
struct xlog *log,
- int unit_bytes)
+ int unit_bytes,
+ int *niclogs)
{
int iclog_space;
uint num_headers;
@@ -3486,6 +3507,8 @@ xlog_calc_unit_res(
/* roundoff padding for transaction data and one for commit record */
unit_bytes += 2 * log->l_iclog_roundoff;
+ if (niclogs)
+ *niclogs = num_headers;
return unit_bytes;
}
@@ -3494,7 +3517,7 @@ xfs_log_calc_unit_res(
struct xfs_mount *mp,
int unit_bytes)
{
- return xlog_calc_unit_res(mp->m_log, unit_bytes);
+ return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL);
}
/*
@@ -3512,7 +3535,7 @@ xlog_ticket_alloc(
tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
- unit_res = xlog_calc_unit_res(log, unit_bytes);
+ unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
atomic_set(&tic->t_ref, 1);
tic->t_task = current;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index f3ce046a7d45..2728886c2963 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -9,7 +9,8 @@
struct xfs_cil_ctx;
struct xfs_log_vec {
- struct xfs_log_vec *lv_next; /* next lv in build list */
+ struct list_head lv_list; /* CIL lv chain ptrs */
+ uint32_t lv_order_id; /* chain ordering info */
int lv_niovecs; /* number of iovecs in lv */
struct xfs_log_iovec *lv_iovecp; /* iovec array */
struct xfs_log_item *lv_item; /* owner */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index db6cb7800251..eccbfb99e894 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -44,9 +44,20 @@ xlog_cil_ticket_alloc(
* transaction overhead reservation from the first transaction commit.
*/
tic->t_curr_res = 0;
+ tic->t_iclog_hdrs = 0;
return tic;
}
+static inline void
+xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil)
+{
+ struct xlog *log = cil->xc_log;
+
+ atomic_set(&cil->xc_iclog_hdrs,
+ (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) /
+ (log->l_iclog_size - log->l_iclog_hsize)));
+}
+
/*
* Check if the current log item was first committed in this sequence.
* We can't rely on just the log item being in the CIL, we have to check
@@ -61,7 +72,7 @@ xlog_item_in_current_chkpt(
struct xfs_cil *cil,
struct xfs_log_item *lip)
{
- if (list_empty(&lip->li_cil))
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
return false;
/*
@@ -93,15 +104,88 @@ xlog_cil_ctx_alloc(void)
ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
+ INIT_LIST_HEAD(&ctx->log_items);
+ INIT_LIST_HEAD(&ctx->lv_chain);
INIT_WORK(&ctx->push_work, xlog_cil_push_work);
return ctx;
}
+/*
+ * Aggregate the CIL per cpu structures into global counts, lists, etc and
+ * clear the percpu state ready for the next context to use. This is called
+ * from the push code with the context lock held exclusively, hence nothing else
+ * will be accessing or modifying the per-cpu counters.
+ */
+static void
+xlog_cil_push_pcp_aggregate(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+
+ ctx->ticket->t_curr_res += cilpcp->space_reserved;
+ cilpcp->space_reserved = 0;
+
+ if (!list_empty(&cilpcp->busy_extents)) {
+ list_splice_init(&cilpcp->busy_extents,
+ &ctx->busy_extents);
+ }
+ if (!list_empty(&cilpcp->log_items))
+ list_splice_init(&cilpcp->log_items, &ctx->log_items);
+
+ /*
+ * We're in the middle of switching cil contexts. Reset the
+ * counter we use to detect when the current context is nearing
+ * full.
+ */
+ cilpcp->space_used = 0;
+ }
+}
+
+/*
+ * Aggregate the CIL per-cpu space used counters into the global atomic value.
+ * This is called when the per-cpu counter aggregation will first pass the soft
+ * limit threshold so we can switch to atomic counter aggregation for accurate
+ * detection of hard limit traversal.
+ */
+static void
+xlog_cil_insert_pcp_aggregate(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+ int count = 0;
+
+ /* Trigger atomic updates then aggregate only for the first caller */
+ if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
+ return;
+
+ for_each_online_cpu(cpu) {
+ int old, prev;
+
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ do {
+ old = cilpcp->space_used;
+ prev = cmpxchg(&cilpcp->space_used, old, 0);
+ } while (old != prev);
+ count += old;
+ }
+ atomic_add(count, &ctx->space_used);
+}
+
static void
xlog_cil_ctx_switch(
struct xfs_cil *cil,
struct xfs_cil_ctx *ctx)
{
+ xlog_cil_set_iclog_hdr_count(cil);
+ set_bit(XLOG_CIL_EMPTY, &cil->xc_flags);
+ set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags);
ctx->sequence = ++cil->xc_current_sequence;
ctx->cil = cil;
cil->xc_ctx = ctx;
@@ -123,6 +207,7 @@ xlog_cil_init_post_recovery(
{
log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
log->l_cilp->xc_ctx->sequence = 1;
+ xlog_cil_set_iclog_hdr_count(log->l_cilp);
}
static inline int
@@ -254,6 +339,7 @@ xlog_cil_alloc_shadow_bufs(
memset(lv, 0, xlog_cil_iovec_space(niovecs));
+ INIT_LIST_HEAD(&lv->lv_list);
lv->lv_item = lip;
lv->lv_size = buf_size;
if (ordered)
@@ -269,7 +355,6 @@ xlog_cil_alloc_shadow_bufs(
else
lv->lv_buf_len = 0;
lv->lv_bytes = 0;
- lv->lv_next = NULL;
}
/* Ensure the lv is set up according to ->iop_size */
@@ -396,7 +481,6 @@ xlog_cil_insert_format_items(
if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
/* same or smaller, optimise common overwrite case */
lv = lip->li_lv;
- lv->lv_next = NULL;
if (ordered)
goto insert;
@@ -434,6 +518,23 @@ insert:
}
/*
+ * The use of lockless waitqueue_active() requires that the caller has
+ * serialised itself against the wakeup call in xlog_cil_push_work(). That
+ * can be done by either holding the push lock or the context lock.
+ */
+static inline bool
+xlog_cil_over_hard_limit(
+ struct xlog *log,
+ int32_t space_used)
+{
+ if (waitqueue_active(&log->l_cilp->xc_push_wait))
+ return true;
+ if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ return true;
+ return false;
+}
+
+/*
* Insert the log items into the CIL and calculate the difference in space
* consumed by the item. Add the space to the checkpoint ticket and calculate
* if the change requires additional log metadata. If it does, take that space
@@ -450,8 +551,10 @@ xlog_cil_insert_items(
struct xfs_cil_ctx *ctx = cil->xc_ctx;
struct xfs_log_item *lip;
int len = 0;
- int iclog_space;
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
+ int space_used;
+ int order;
+ struct xlog_cil_pcp *cilpcp;
ASSERT(tp);
@@ -461,93 +564,135 @@ xlog_cil_insert_items(
*/
xlog_cil_insert_format_items(log, tp, &len);
- spin_lock(&cil->xc_cil_lock);
+ /*
+ * Subtract the space released by intent cancelation from the space we
+ * consumed so that we remove it from the CIL space and add it back to
+ * the current transaction reservation context.
+ */
+ len -= released_space;
- /* attach the transaction to the CIL if it has any busy extents */
- if (!list_empty(&tp->t_busy))
- list_splice_init(&tp->t_busy, &ctx->busy_extents);
+ /*
+ * Grab the per-cpu pointer for the CIL before we start any accounting.
+ * That ensures that we are running with pre-emption disabled and so we
+ * can't be scheduled away between split sample/update operations that
+ * are done without outside locking to serialise them.
+ */
+ cilpcp = get_cpu_ptr(cil->xc_pcp);
/*
- * Now transfer enough transaction reservation to the context ticket
- * for the checkpoint. The context ticket is special - the unit
- * reservation has to grow as well as the current reservation as we
- * steal from tickets so we can correctly determine the space used
- * during the transaction commit.
+ * We need to take the CIL checkpoint unit reservation on the first
+ * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't
+ * unnecessarily do an atomic op in the fast path here. We can clear the
+ * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that
+ * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit.
*/
- if (ctx->ticket->t_curr_res == 0) {
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) &&
+ test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
ctx_res = ctx->ticket->t_unit_res;
- ctx->ticket->t_curr_res = ctx_res;
- tp->t_ticket->t_curr_res -= ctx_res;
- }
- /* do we need space for more log record headers? */
- iclog_space = log->l_iclog_size - log->l_iclog_hsize;
- if (len > 0 && (ctx->space_used / iclog_space !=
- (ctx->space_used + len) / iclog_space)) {
- split_res = (len + iclog_space - 1) / iclog_space;
- /* need to take into account split region headers, too */
- split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
- ctx->ticket->t_unit_res += split_res;
- ctx->ticket->t_curr_res += split_res;
- tp->t_ticket->t_curr_res -= split_res;
- ASSERT(tp->t_ticket->t_curr_res >= len);
+ /*
+ * Check if we need to steal iclog headers. atomic_read() is not a
+ * locked atomic operation, so we can check the value before we do any
+ * real atomic ops in the fast path. If we've already taken the CIL unit
+ * reservation from this commit, we've already got one iclog header
+ * space reserved so we have to account for that otherwise we risk
+ * overrunning the reservation on this ticket.
+ *
+ * If the CIL is already at the hard limit, we might need more header
+ * space that originally reserved. So steal more header space from every
+ * commit that occurs once we are over the hard limit to ensure the CIL
+ * push won't run out of reservation space.
+ *
+ * This can steal more than we need, but that's OK.
+ *
+ * The cil->xc_ctx_lock provides the serialisation necessary for safely
+ * calling xlog_cil_over_hard_limit() in this context.
+ */
+ space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len;
+ if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
+ xlog_cil_over_hard_limit(log, space_used)) {
+ split_res = log->l_iclog_hsize +
+ sizeof(struct xlog_op_header);
+ if (ctx_res)
+ ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1);
+ else
+ ctx_res = split_res * tp->t_ticket->t_iclog_hdrs;
+ atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
}
- tp->t_ticket->t_curr_res -= len;
- tp->t_ticket->t_curr_res += released_space;
- ctx->space_used += len;
- ctx->space_used -= released_space;
+ cilpcp->space_reserved += ctx_res;
/*
- * If we've overrun the reservation, dump the tx details before we move
- * the log items. Shutdown is imminent...
+ * Accurately account when over the soft limit, otherwise fold the
+ * percpu count into the global count if over the per-cpu threshold.
*/
- if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
- xfs_warn(log->l_mp, "Transaction log reservation overrun:");
- xfs_warn(log->l_mp,
- " log items: %d bytes (iov hdrs: %d bytes)",
- len, iovhdr_res);
- xfs_warn(log->l_mp, " split region headers: %d bytes",
- split_res);
- xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res);
- xlog_print_trans(tp);
+ if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) {
+ atomic_add(len, &ctx->space_used);
+ } else if (cilpcp->space_used + len >
+ (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) {
+ space_used = atomic_add_return(cilpcp->space_used + len,
+ &ctx->space_used);
+ cilpcp->space_used = 0;
+
+ /*
+ * If we just transitioned over the soft limit, we need to
+ * transition to the global atomic counter.
+ */
+ if (space_used >= XLOG_CIL_SPACE_LIMIT(log))
+ xlog_cil_insert_pcp_aggregate(cil, ctx);
+ } else {
+ cilpcp->space_used += len;
}
+ /* attach the transaction to the CIL if it has any busy extents */
+ if (!list_empty(&tp->t_busy))
+ list_splice_init(&tp->t_busy, &cilpcp->busy_extents);
/*
- * Now (re-)position everything modified at the tail of the CIL.
+ * Now update the order of everything modified in the transaction
+ * and insert items into the CIL if they aren't already there.
* We do this here so we only need to take the CIL lock once during
* the transaction commit.
*/
+ order = atomic_inc_return(&ctx->order_id);
list_for_each_entry(lip, &tp->t_items, li_trans) {
-
/* Skip items which aren't dirty in this transaction. */
if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
continue;
- /*
- * Only move the item if it isn't already at the tail. This is
- * to prevent a transient list_empty() state when reinserting
- * an item that is already the only item in the CIL.
- */
- if (!list_is_last(&lip->li_cil, &cil->xc_cil))
- list_move_tail(&lip->li_cil, &cil->xc_cil);
+ lip->li_order_id = order;
+ if (!list_empty(&lip->li_cil))
+ continue;
+ list_add_tail(&lip->li_cil, &cilpcp->log_items);
}
+ put_cpu_ptr(cilpcp);
- spin_unlock(&cil->xc_cil_lock);
-
- if (tp->t_ticket->t_curr_res < 0)
+ /*
+ * If we've overrun the reservation, dump the tx details before we move
+ * the log items. Shutdown is imminent...
+ */
+ tp->t_ticket->t_curr_res -= ctx_res + len;
+ if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
+ xfs_warn(log->l_mp, "Transaction log reservation overrun:");
+ xfs_warn(log->l_mp,
+ " log items: %d bytes (iov hdrs: %d bytes)",
+ len, iovhdr_res);
+ xfs_warn(log->l_mp, " split region headers: %d bytes",
+ split_res);
+ xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res);
+ xlog_print_trans(tp);
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
}
static void
xlog_cil_free_logvec(
- struct xfs_log_vec *log_vector)
+ struct list_head *lv_chain)
{
struct xfs_log_vec *lv;
- for (lv = log_vector; lv; ) {
- struct xfs_log_vec *next = lv->lv_next;
+ while (!list_empty(lv_chain)) {
+ lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
+ list_del_init(&lv->lv_list);
kmem_free(lv);
- lv = next;
}
}
@@ -647,7 +792,7 @@ xlog_cil_committed(
spin_unlock(&ctx->cil->xc_push_lock);
}
- xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+ xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
ctx->start_lsn, abort);
xfs_extent_busy_sort(&ctx->busy_extents);
@@ -658,7 +803,7 @@ xlog_cil_committed(
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_push_lock);
- xlog_cil_free_logvec(ctx->lv_chain);
+ xlog_cil_free_logvec(&ctx->lv_chain);
if (!list_empty(&ctx->busy_extents))
xlog_discard_busy_extents(mp, ctx);
@@ -817,7 +962,6 @@ restart:
static int
xlog_cil_write_chain(
struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *chain,
uint32_t chain_len)
{
struct xlog *log = ctx->cil->xc_log;
@@ -826,7 +970,7 @@ xlog_cil_write_chain(
error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
if (error)
return error;
- return xlog_write(log, ctx, chain, ctx->ticket, chain_len);
+ return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len);
}
/*
@@ -855,6 +999,8 @@ xlog_cil_write_commit_record(
.lv_iovecp = &reg,
};
int error;
+ LIST_HEAD(lv_chain);
+ list_add(&vec.lv_list, &lv_chain);
if (xlog_is_shutdown(log))
return -EIO;
@@ -865,7 +1011,7 @@ xlog_cil_write_commit_record(
/* account for space used by record data */
ctx->ticket->t_curr_res -= reg.i_len;
- error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len);
+ error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len);
if (error)
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
@@ -931,12 +1077,31 @@ xlog_cil_build_trans_hdr(
lvhdr->lv_niovecs = 2;
lvhdr->lv_iovecp = &hdr->lhdr[0];
lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
- lvhdr->lv_next = ctx->lv_chain;
tic->t_curr_res -= lvhdr->lv_bytes;
}
/*
+ * CIL item reordering compare function. We want to order in ascending ID order,
+ * but we want to leave items with the same ID in the order they were added to
+ * the list. This is important for operations like reflink where we log 4 order
+ * dependent intents in a single transaction when we overwrite an existing
+ * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop),
+ * CUI (inc), BUI(remap)...
+ */
+static int
+xlog_cil_order_cmp(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list);
+ struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list);
+
+ return l1->lv_order_id > l2->lv_order_id;
+}
+
+/*
* Pull all the log vectors off the items in the CIL, and remove the items from
* the CIL. We don't need the CIL lock here because it's only needed on the
* transaction commit side which is currently locked out by the flush lock.
@@ -947,18 +1112,16 @@ xlog_cil_build_trans_hdr(
*/
static void
xlog_cil_build_lv_chain(
- struct xfs_cil *cil,
struct xfs_cil_ctx *ctx,
struct list_head *whiteouts,
uint32_t *num_iovecs,
uint32_t *num_bytes)
{
- struct xfs_log_vec *lv = NULL;
-
- while (!list_empty(&cil->xc_cil)) {
+ while (!list_empty(&ctx->log_items)) {
struct xfs_log_item *item;
+ struct xfs_log_vec *lv;
- item = list_first_entry(&cil->xc_cil,
+ item = list_first_entry(&ctx->log_items,
struct xfs_log_item, li_cil);
if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
@@ -967,18 +1130,18 @@ xlog_cil_build_lv_chain(
continue;
}
- list_del_init(&item->li_cil);
- if (!ctx->lv_chain)
- ctx->lv_chain = item->li_lv;
- else
- lv->lv_next = item->li_lv;
lv = item->li_lv;
- item->li_lv = NULL;
- *num_iovecs += lv->lv_niovecs;
+ lv->lv_order_id = item->li_order_id;
/* we don't write ordered log vectors */
if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
*num_bytes += lv->lv_bytes;
+ *num_iovecs += lv->lv_niovecs;
+ list_add_tail(&lv->lv_list, &ctx->lv_chain);
+
+ list_del_init(&item->li_cil);
+ item->li_order_id = 0;
+ item->li_lv = NULL;
}
}
@@ -1022,10 +1185,11 @@ xlog_cil_push_work(
int num_bytes = 0;
int error = 0;
struct xlog_cil_trans_hdr thdr;
- struct xfs_log_vec lvhdr = { NULL };
+ struct xfs_log_vec lvhdr = {};
xfs_csn_t push_seq;
bool push_commit_stable;
LIST_HEAD (whiteouts);
+ struct xlog_ticket *ticket;
new_ctx = xlog_cil_ctx_alloc();
new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -1049,12 +1213,14 @@ xlog_cil_push_work(
if (waitqueue_active(&cil->xc_push_wait))
wake_up_all(&cil->xc_push_wait);
+ xlog_cil_push_pcp_aggregate(cil, ctx);
+
/*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
* this sequence again later.
*/
- if (list_empty(&cil->xc_cil)) {
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
cil->xc_push_seq = 0;
spin_unlock(&cil->xc_push_lock);
goto out_skip;
@@ -1094,7 +1260,7 @@ xlog_cil_push_work(
list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock);
- xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes);
+ xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes);
/*
* Switch the contexts so we can drop the context lock and move out
@@ -1127,14 +1293,30 @@ xlog_cil_push_work(
up_write(&cil->xc_ctx_lock);
/*
+ * Sort the log vector chain before we add the transaction headers.
+ * This ensures we always have the transaction headers at the start
+ * of the chain.
+ */
+ list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp);
+
+ /*
* Build a checkpoint transaction header and write it to the log to
* begin the transaction. We need to account for the space used by the
* transaction header here as it is not accounted for in xlog_write().
+ * Add the lvhdr to the head of the lv chain we pass to xlog_write() so
+ * it gets written into the iclog first.
*/
xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
num_bytes += lvhdr.lv_bytes;
+ list_add(&lvhdr.lv_list, &ctx->lv_chain);
- error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes);
+ /*
+ * Take the lvhdr back off the lv_chain immediately after calling
+ * xlog_cil_write_chain() as it should not be passed to log IO
+ * completion.
+ */
+ error = xlog_cil_write_chain(ctx, num_bytes);
+ list_del(&lvhdr.lv_list);
if (error)
goto out_abort_free_ticket;
@@ -1142,7 +1324,14 @@ xlog_cil_push_work(
if (error)
goto out_abort_free_ticket;
- xfs_log_ticket_ungrant(log, ctx->ticket);
+ /*
+ * Grab the ticket from the ctx so we can ungrant it after releasing the
+ * commit_iclog. The ctx may be freed by the time we return from
+ * releasing the commit_iclog (i.e. checkpoint has been completed and
+ * callback run) so we can't reference the ctx after the call to
+ * xlog_state_release_iclog().
+ */
+ ticket = ctx->ticket;
/*
* If the checkpoint spans multiple iclogs, wait for all previous iclogs
@@ -1192,12 +1381,14 @@ xlog_cil_push_work(
if (push_commit_stable &&
ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
- xlog_state_release_iclog(log, ctx->commit_iclog);
+ ticket = ctx->ticket;
+ xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
xlog_cil_cleanup_whiteouts(&whiteouts);
+ xfs_log_ticket_ungrant(log, ticket);
return;
out_skip:
@@ -1207,17 +1398,19 @@ out_skip:
return;
out_abort_free_ticket:
- xfs_log_ticket_ungrant(log, ctx->ticket);
ASSERT(xlog_is_shutdown(log));
xlog_cil_cleanup_whiteouts(&whiteouts);
if (!ctx->commit_iclog) {
+ xfs_log_ticket_ungrant(log, ctx->ticket);
xlog_cil_committed(ctx);
return;
}
spin_lock(&log->l_icloglock);
- xlog_state_release_iclog(log, ctx->commit_iclog);
+ ticket = ctx->ticket;
+ xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
+ xfs_log_ticket_ungrant(log, ticket);
}
/*
@@ -1232,18 +1425,27 @@ xlog_cil_push_background(
struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
+ int space_used = atomic_read(&cil->xc_ctx->space_used);
/*
* The cil won't be empty because we are called while holding the
- * context lock so whatever we added to the CIL will still be there
+ * context lock so whatever we added to the CIL will still be there.
*/
- ASSERT(!list_empty(&cil->xc_cil));
+ ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
/*
- * Don't do a background push if we haven't used up all the
- * space available yet.
+ * We are done if:
+ * - we haven't used up all the space available yet; or
+ * - we've already queued up a push; and
+ * - we're not over the hard limit; and
+ * - nothing has been over the hard limit.
+ *
+ * If so, we don't need to take the push lock as there's nothing to do.
*/
- if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+ if (space_used < XLOG_CIL_SPACE_LIMIT(log) ||
+ (cil->xc_push_seq == cil->xc_current_sequence &&
+ space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) &&
+ !waitqueue_active(&cil->xc_push_wait))) {
up_read(&cil->xc_ctx_lock);
return;
}
@@ -1270,12 +1472,11 @@ xlog_cil_push_background(
* dipping back down under the hard limit.
*
* The ctx->xc_push_lock provides the serialisation necessary for safely
- * using the lockless waitqueue_active() check in this context.
+ * calling xlog_cil_over_hard_limit() in this context.
*/
- if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
- waitqueue_active(&cil->xc_push_wait)) {
+ if (xlog_cil_over_hard_limit(log, space_used)) {
trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
- ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+ ASSERT(space_used < log->l_logsize);
xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
return;
}
@@ -1334,7 +1535,8 @@ xlog_cil_push_now(
* If the CIL is empty or we've already pushed the sequence then
* there's no more work that we need to do.
*/
- if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) ||
+ push_seq <= cil->xc_push_seq) {
spin_unlock(&cil->xc_push_lock);
return;
}
@@ -1352,7 +1554,7 @@ xlog_cil_empty(
bool empty = false;
spin_lock(&cil->xc_push_lock);
- if (list_empty(&cil->xc_cil))
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
empty = true;
spin_unlock(&cil->xc_push_lock);
return empty;
@@ -1483,7 +1685,7 @@ xlog_cil_flush(
* If the CIL is empty, make sure that any previous checkpoint that may
* still be in an active iclog is pushed to stable storage.
*/
- if (list_empty(&log->l_cilp->xc_cil))
+ if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags))
xfs_log_force(log->l_mp, 0);
}
@@ -1568,7 +1770,7 @@ restart:
* we would have found the context on the committing list.
*/
if (sequence == cil->xc_current_sequence &&
- !list_empty(&cil->xc_cil)) {
+ !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
spin_unlock(&cil->xc_push_lock);
goto restart;
}
@@ -1589,14 +1791,48 @@ out_shutdown:
}
/*
+ * Move dead percpu state to the relevant CIL context structures.
+ *
+ * We have to lock the CIL context here to ensure that nothing is modifying
+ * the percpu state, either addition or removal. Both of these are done under
+ * the CIL context lock, so grabbing that exclusively here will ensure we can
+ * safely drain the cilpcp for the CPU that is dying.
+ */
+void
+xlog_cil_pcp_dead(
+ struct xlog *log,
+ unsigned int cpu)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ struct xfs_cil_ctx *ctx;
+
+ down_write(&cil->xc_ctx_lock);
+ ctx = cil->xc_ctx;
+ if (ctx->ticket)
+ ctx->ticket->t_curr_res += cilpcp->space_reserved;
+ cilpcp->space_reserved = 0;
+
+ if (!list_empty(&cilpcp->log_items))
+ list_splice_init(&cilpcp->log_items, &ctx->log_items);
+ if (!list_empty(&cilpcp->busy_extents))
+ list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
+ atomic_add(cilpcp->space_used, &ctx->space_used);
+ cilpcp->space_used = 0;
+ up_write(&cil->xc_ctx_lock);
+}
+
+/*
* Perform initial CIL structure initialisation.
*/
int
xlog_cil_init(
- struct xlog *log)
+ struct xlog *log)
{
- struct xfs_cil *cil;
- struct xfs_cil_ctx *ctx;
+ struct xfs_cil *cil;
+ struct xfs_cil_ctx *ctx;
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
if (!cil)
@@ -1611,22 +1847,31 @@ xlog_cil_init(
if (!cil->xc_push_wq)
goto out_destroy_cil;
- INIT_LIST_HEAD(&cil->xc_cil);
+ cil->xc_log = log;
+ cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp);
+ if (!cil->xc_pcp)
+ goto out_destroy_wq;
+
+ for_each_possible_cpu(cpu) {
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ INIT_LIST_HEAD(&cilpcp->busy_extents);
+ INIT_LIST_HEAD(&cilpcp->log_items);
+ }
+
INIT_LIST_HEAD(&cil->xc_committing);
- spin_lock_init(&cil->xc_cil_lock);
spin_lock_init(&cil->xc_push_lock);
init_waitqueue_head(&cil->xc_push_wait);
init_rwsem(&cil->xc_ctx_lock);
init_waitqueue_head(&cil->xc_start_wait);
init_waitqueue_head(&cil->xc_commit_wait);
- cil->xc_log = log;
log->l_cilp = cil;
ctx = xlog_cil_ctx_alloc();
xlog_cil_ctx_switch(cil, ctx);
-
return 0;
+out_destroy_wq:
+ destroy_workqueue(cil->xc_push_wq);
out_destroy_cil:
kmem_free(cil);
return -ENOMEM;
@@ -1636,14 +1881,17 @@ void
xlog_cil_destroy(
struct xlog *log)
{
- if (log->l_cilp->xc_ctx) {
- if (log->l_cilp->xc_ctx->ticket)
- xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
- kmem_free(log->l_cilp->xc_ctx);
+ struct xfs_cil *cil = log->l_cilp;
+
+ if (cil->xc_ctx) {
+ if (cil->xc_ctx->ticket)
+ xfs_log_ticket_put(cil->xc_ctx->ticket);
+ kmem_free(cil->xc_ctx);
}
- ASSERT(list_empty(&log->l_cilp->xc_cil));
- destroy_workqueue(log->l_cilp->xc_push_wq);
- kmem_free(log->l_cilp);
+ ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
+ free_percpu(cil->xc_pcp);
+ destroy_workqueue(cil->xc_push_wq);
+ kmem_free(cil);
}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 686c01eb3661..1bd2963e8fbd 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -143,15 +143,16 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
typedef struct xlog_ticket {
- struct list_head t_queue; /* reserve/write queue */
- struct task_struct *t_task; /* task that owns this ticket */
- xlog_tid_t t_tid; /* transaction identifier : 4 */
- atomic_t t_ref; /* ticket reference count : 4 */
- int t_curr_res; /* current reservation in bytes : 4 */
- int t_unit_res; /* unit reservation in bytes : 4 */
- char t_ocnt; /* original count : 1 */
- char t_cnt; /* current count : 1 */
- uint8_t t_flags; /* properties of reservation : 1 */
+ struct list_head t_queue; /* reserve/write queue */
+ struct task_struct *t_task; /* task that owns this ticket */
+ xlog_tid_t t_tid; /* transaction identifier */
+ atomic_t t_ref; /* ticket reference count */
+ int t_curr_res; /* current reservation */
+ int t_unit_res; /* unit reservation */
+ char t_ocnt; /* original unit count */
+ char t_cnt; /* current unit count */
+ uint8_t t_flags; /* properties of reservation */
+ int t_iclog_hdrs; /* iclog hdrs in t_curr_res */
} xlog_ticket_t;
/*
@@ -221,13 +222,25 @@ struct xfs_cil_ctx {
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
- int space_used; /* aggregate size of regions */
+ atomic_t space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
- struct xfs_log_vec *lv_chain; /* logvecs being pushed */
+ struct list_head log_items; /* log items in chkpt */
+ struct list_head lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work;
struct work_struct push_work;
+ atomic_t order_id;
+};
+
+/*
+ * Per-cpu CIL tracking items
+ */
+struct xlog_cil_pcp {
+ int32_t space_used;
+ uint32_t space_reserved;
+ struct list_head busy_extents;
+ struct list_head log_items;
};
/*
@@ -248,8 +261,8 @@ struct xfs_cil_ctx {
*/
struct xfs_cil {
struct xlog *xc_log;
- struct list_head xc_cil;
- spinlock_t xc_cil_lock;
+ unsigned long xc_flags;
+ atomic_t xc_iclog_hdrs;
struct workqueue_struct *xc_push_wq;
struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
@@ -263,8 +276,17 @@ struct xfs_cil {
wait_queue_head_t xc_start_wait;
xfs_csn_t xc_current_sequence;
wait_queue_head_t xc_push_wait; /* background push throttle */
+
+ void __percpu *xc_pcp; /* percpu CIL structures */
+#ifdef CONFIG_HOTPLUG_CPU
+ struct list_head xc_pcp_list;
+#endif
} ____cacheline_aligned_in_smp;
+/* xc_flags bit values */
+#define XLOG_CIL_EMPTY 1
+#define XLOG_CIL_PCP_SPACE 2
+
/*
* The amount of log space we allow the CIL to aggregate is difficult to size.
* Whatever we choose, we have to make sure we can get a reservation for the
@@ -486,14 +508,15 @@ struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
+ struct list_head *lv_chain, struct xlog_ticket *tic,
uint32_t len);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
int eventual_size);
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
@@ -682,4 +705,9 @@ xlog_kvmalloc(
return p;
}
+/*
+ * CIL CPU dead notifier
+ */
+void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 940c8107cbd4..17e923b9c5fa 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2629,21 +2629,21 @@ xlog_recover_cancel_intents(
*/
STATIC void
xlog_recover_clear_agi_bucket(
- xfs_mount_t *mp,
- xfs_agnumber_t agno,
- int bucket)
+ struct xfs_perag *pag,
+ int bucket)
{
- xfs_trans_t *tp;
- xfs_agi_t *agi;
- struct xfs_buf *agibp;
- int offset;
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_trans *tp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ int offset;
+ int error;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
if (error)
goto out_error;
- error = xfs_read_agi(mp, tp, agno, &agibp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
goto out_abort;
@@ -2662,60 +2662,62 @@ xlog_recover_clear_agi_bucket(
out_abort:
xfs_trans_cancel(tp);
out_error:
- xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
+ xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
+ pag->pag_agno);
return;
}
-STATIC xfs_agino_t
-xlog_recover_process_one_iunlink(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t agino,
- int bucket)
+static int
+xlog_recover_iunlink_bucket(
+ struct xfs_perag *pag,
+ struct xfs_agi *agi,
+ int bucket)
{
- struct xfs_buf *ibp;
- struct xfs_dinode *dip;
- struct xfs_inode *ip;
- xfs_ino_t ino;
- int error;
-
- ino = XFS_AGINO_TO_INO(mp, agno, agino);
- error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
- if (error)
- goto fail;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *prev_ip = NULL;
+ struct xfs_inode *ip;
+ xfs_agino_t prev_agino, agino;
+ int error = 0;
- /*
- * Get the on disk inode to find the next inode in the bucket.
- */
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &ibp);
- if (error)
- goto fail_iput;
- dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+ agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (agino != NULLAGINO) {
+ error = xfs_iget(mp, NULL,
+ XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
+ 0, 0, &ip);
+ if (error)
+ break;
- xfs_iflags_clear(ip, XFS_IRECOVERY);
- ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(VFS_I(ip)->i_mode != 0);
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ xfs_iflags_clear(ip, XFS_IRECOVERY);
+ agino = ip->i_next_unlinked;
- /* setup for the next pass */
- agino = be32_to_cpu(dip->di_next_unlinked);
- xfs_buf_relse(ibp);
+ if (prev_ip) {
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
- xfs_irele(ip);
- return agino;
+ /*
+ * Ensure the inode is removed from the unlinked list
+ * before we continue so that it won't race with
+ * building the in-memory list here. This could be
+ * serialised with the agibp lock, but that just
+ * serialises via lockstepping and it's much simpler
+ * just to flush the inodegc queue and wait for it to
+ * complete.
+ */
+ xfs_inodegc_flush(mp);
+ }
- fail_iput:
- xfs_irele(ip);
- fail:
- /*
- * We can't read in the inode this bucket points to, or this inode
- * is messed up. Just ditch this bucket of inodes. We will lose
- * some inodes and space, but at least we won't hang.
- *
- * Call xlog_recover_clear_agi_bucket() to perform a transaction to
- * clear the inode pointer in the bucket.
- */
- xlog_recover_clear_agi_bucket(mp, agno, bucket);
- return NULLAGINO;
+ prev_agino = agino;
+ prev_ip = ip;
+ }
+
+ if (prev_ip) {
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
+ }
+ xfs_inodegc_flush(mp);
+ return error;
}
/*
@@ -2741,59 +2743,70 @@ xlog_recover_process_one_iunlink(
* scheduled on this CPU to ensure other scheduled work can run without undue
* latency.
*/
-STATIC void
-xlog_recover_process_iunlinks(
- struct xlog *log)
+static void
+xlog_recover_iunlink_ag(
+ struct xfs_perag *pag)
{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
struct xfs_agi *agi;
struct xfs_buf *agibp;
- xfs_agino_t agino;
int bucket;
int error;
- for_each_perag(mp, agno, pag) {
- error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp);
+ error = xfs_read_agi(pag, NULL, &agibp);
+ if (error) {
+ /*
+ * AGI is b0rked. Don't process it.
+ *
+ * We should probably mark the filesystem as corrupt after we've
+ * recovered all the ag's we can....
+ */
+ return;
+ }
+
+ /*
+ * Unlock the buffer so that it can be acquired in the normal course of
+ * the transaction to truncate and free each inode. Because we are not
+ * racing with anyone else here for the AGI buffer, we don't even need
+ * to hold it locked to read the initial unlinked bucket entries out of
+ * the buffer. We keep buffer reference though, so that it stays pinned
+ * in memory while we need the buffer.
+ */
+ agi = agibp->b_addr;
+ xfs_buf_unlock(agibp);
+
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+ error = xlog_recover_iunlink_bucket(pag, agi, bucket);
if (error) {
/*
- * AGI is b0rked. Don't process it.
- *
- * We should probably mark the filesystem as corrupt
- * after we've recovered all the ag's we can....
+ * Bucket is unrecoverable, so only a repair scan can
+ * free the remaining unlinked inodes. Just empty the
+ * bucket and remaining inodes on it unreferenced and
+ * unfreeable.
*/
- continue;
+ xfs_inodegc_flush(pag->pag_mount);
+ xlog_recover_clear_agi_bucket(pag, bucket);
}
- /*
- * Unlock the buffer so that it can be acquired in the normal
- * course of the transaction to truncate and free each inode.
- * Because we are not racing with anyone else here for the AGI
- * buffer, we don't even need to hold it locked to read the
- * initial unlinked bucket entries out of the buffer. We keep
- * buffer reference though, so that it stays pinned in memory
- * while we need the buffer.
- */
- agi = agibp->b_addr;
- xfs_buf_unlock(agibp);
-
- for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
- agino = be32_to_cpu(agi->agi_unlinked[bucket]);
- while (agino != NULLAGINO) {
- agino = xlog_recover_process_one_iunlink(mp,
- pag->pag_agno, agino, bucket);
- cond_resched();
- }
- }
- xfs_buf_rele(agibp);
}
+ xfs_buf_rele(agibp);
+}
+
+static void
+xlog_recover_process_iunlinks(
+ struct xlog *log)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for_each_perag(log->l_mp, agno, pag)
+ xlog_recover_iunlink_ag(pag);
+
/*
* Flush the pending unlinked inodes to ensure that the inactivations
* are fully completed on disk and the incore inodes can be reclaimed
* before we signal that recovery is complete.
*/
- xfs_inodegc_flush(mp);
+ xfs_inodegc_flush(log->l_mp);
}
STATIC void
@@ -3313,7 +3326,8 @@ xlog_do_recover(
/* re-initialise in-core superblock and geometry structures */
mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
- error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
+ &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
return error;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index daa8d29c46b4..f10c88cee116 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -778,7 +778,8 @@ xfs_mountfs(
/*
* Allocate and initialize the per-ag data.
*/
- error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks,
+ &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
goto out_free_dir;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d2eaebd85abf..8aca2cc173ac 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -454,6 +454,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname,
#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */
#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */
#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */
+#define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */
#define XFS_SHUTDOWN_STRINGS \
{ SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
new file mode 100644
index 000000000000..69d9c83ea4b2
--- /dev/null
+++ b/fs/xfs/xfs_notify_failure.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Fujitsu. All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+
+#include <linux/mm.h>
+#include <linux/dax.h>
+
+struct failure_info {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+ int mf_flags;
+};
+
+static pgoff_t
+xfs_failure_pgoff(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rec,
+ const struct failure_info *notify)
+{
+ loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);
+
+ if (notify->startblock > rec->rm_startblock)
+ pos += XFS_FSB_TO_B(mp,
+ notify->startblock - rec->rm_startblock);
+ return pos >> PAGE_SHIFT;
+}
+
+static unsigned long
+xfs_failure_pgcnt(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rec,
+ const struct failure_info *notify)
+{
+ xfs_agblock_t end_rec;
+ xfs_agblock_t end_notify;
+ xfs_agblock_t start_cross;
+ xfs_agblock_t end_cross;
+
+ start_cross = max(rec->rm_startblock, notify->startblock);
+
+ end_rec = rec->rm_startblock + rec->rm_blockcount;
+ end_notify = notify->startblock + notify->blockcount;
+ end_cross = min(end_rec, end_notify);
+
+ return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
+}
+
+static int
+xfs_dax_failure_fn(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *data)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip;
+ struct failure_info *notify = data;
+ int error = 0;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
+ (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+
+ /* Get files that incore, filter out others that are not in use. */
+ error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
+ 0, &ip);
+ /* Continue the rmap query if the inode isn't incore */
+ if (error == -ENODATA)
+ return 0;
+ if (error)
+ return error;
+
+ error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
+ xfs_failure_pgoff(mp, rec, notify),
+ xfs_failure_pgcnt(mp, rec, notify),
+ notify->mf_flags);
+ xfs_irele(ip);
+ return error;
+}
+
+static int
+xfs_dax_notify_ddev_failure(
+ struct xfs_mount *mp,
+ xfs_daddr_t daddr,
+ xfs_daddr_t bblen,
+ int mf_flags)
+{
+ struct xfs_trans *tp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_buf *agf_bp = NULL;
+ int error = 0;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+ xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
+ xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ for (; agno <= end_agno; agno++) {
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct failure_info notify;
+ struct xfs_agf *agf;
+ xfs_agblock_t agend;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+ if (error) {
+ xfs_perag_put(pag);
+ break;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+
+ /*
+ * Set the rmap range from ri_low to ri_high, which represents
+ * a [start, end] where we looking for the files or metadata.
+ */
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+ ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
+ if (agno == end_agno)
+ ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
+
+ agf = agf_bp->b_addr;
+ agend = min(be32_to_cpu(agf->agf_length),
+ ri_high.rm_startblock);
+ notify.startblock = ri_low.rm_startblock;
+ notify.blockcount = agend - ri_low.rm_startblock;
+
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_dax_failure_fn, &notify);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agf_bp);
+ xfs_perag_put(pag);
+ if (error)
+ break;
+
+ fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
+ }
+
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+static int
+xfs_dax_notify_failure(
+ struct dax_device *dax_dev,
+ u64 offset,
+ u64 len,
+ int mf_flags)
+{
+ struct xfs_mount *mp = dax_holder(dax_dev);
+ u64 ddev_start;
+ u64 ddev_end;
+
+ if (!(mp->m_sb.sb_flags & SB_BORN)) {
+ xfs_warn(mp, "filesystem is not ready for notify_failure()!");
+ return -EIO;
+ }
+
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
+ xfs_warn(mp,
+ "notify_failure() not supported on realtime device!");
+ return -EOPNOTSUPP;
+ }
+
+ if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
+ mp->m_logdev_targp != mp->m_ddev_targp) {
+ xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_has_rmapbt(mp)) {
+ xfs_warn(mp, "notify_failure() needs rmapbt enabled!");
+ return -EOPNOTSUPP;
+ }
+
+ ddev_start = mp->m_ddev_targp->bt_dax_part_off;
+ ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+
+ /* Ignore the range out of filesystem area */
+ if (offset + len < ddev_start)
+ return -ENXIO;
+ if (offset > ddev_end)
+ return -ENXIO;
+
+ /* Calculate the real range when it touches the boundary */
+ if (offset > ddev_start)
+ offset -= ddev_start;
+ else {
+ len -= ddev_start - offset;
+ offset = 0;
+ }
+ if (offset + len > ddev_end)
+ len -= ddev_end - offset;
+
+ return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
+ mf_flags);
+}
+
+const struct dax_holder_operations xfs_dax_holder_operations = {
+ .notify_failure = xfs_dax_notify_failure,
+};
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index abf08bbf34a9..18bb4ec4d7c9 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -677,7 +677,8 @@ xfs_qm_init_quotainfo(
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
- error = register_shrinker(&qinf->qi_shrinker);
+ error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
+ mp->m_super->s_id);
if (error)
goto out_free_inos;
@@ -1154,7 +1155,7 @@ xfs_qm_dqusage_adjust(
ASSERT(ip->i_delayed_blks == 0);
if (XFS_IS_REALTIME_INODE(ip)) {
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
if (error)
@@ -1229,10 +1230,14 @@ xfs_qm_flush_one(
*/
if (!xfs_dqflock_nowait(dqp)) {
/* buf is pinned in-core by delwri list */
- bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0);
- if (!bp) {
- error = -EINVAL;
+ error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ goto out_unlock;
+
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ error = -EAGAIN;
+ xfs_buf_relse(bp);
goto out_unlock;
}
xfs_buf_unlock(bp);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e7a7c00d93be..251f20ddd368 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -125,11 +125,10 @@
* shared blocks. If there are no shared extents, fbno and flen will
* be set to NULLAGBLOCK and 0, respectively.
*/
-int
+static int
xfs_reflink_find_shared(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_extlen_t aglen,
xfs_agblock_t *fbno,
@@ -140,11 +139,11 @@ xfs_reflink_find_shared(
struct xfs_btree_cur *cur;
int error;
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag);
+ cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
find_end_of_shared);
@@ -171,7 +170,8 @@ xfs_reflink_trim_around_shared(
struct xfs_bmbt_irec *irec,
bool *shared)
{
- xfs_agnumber_t agno;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
xfs_agblock_t agbno;
xfs_extlen_t aglen;
xfs_agblock_t fbno;
@@ -186,12 +186,13 @@ xfs_reflink_trim_around_shared(
trace_xfs_reflink_trim_around_shared(ip, irec);
- agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
- agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+ agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
aglen = irec->br_blockcount;
- error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
- aglen, &fbno, &flen, true);
+ error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
+ true);
+ xfs_perag_put(pag);
if (error)
return error;
@@ -340,9 +341,41 @@ xfs_find_trim_cow_extent(
return 0;
}
-/* Allocate all CoW reservations covering a range of blocks in a file. */
-int
-xfs_reflink_allocate_cow(
+static int
+xfs_reflink_convert_unwritten(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool convert_now)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ int error;
+
+ /*
+ * cmap might larger than imap due to cowextsize hint.
+ */
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
+
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
+ return 0;
+
+ trace_xfs_reflink_convert_cow(ip, cmap);
+
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+
+ return error;
+}
+
+static int
+xfs_reflink_fill_cow_hole(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
@@ -351,25 +384,12 @@ xfs_reflink_allocate_cow(
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = imap->br_startoff;
- xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_trans *tp;
- int nimaps, error = 0;
- bool found;
xfs_filblks_t resaligned;
- xfs_extlen_t resblks = 0;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
-
- error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
- if (error || !*shared)
- return error;
- if (found)
- goto convert;
+ xfs_extlen_t resblks;
+ int nimaps;
+ int error;
+ bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
@@ -385,17 +405,17 @@ xfs_reflink_allocate_cow(
*lockmode = XFS_ILOCK_EXCL;
- /*
- * Check for an overlapping extent again now that we dropped the ilock.
- */
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
+
if (found) {
xfs_trans_cancel(tp);
goto convert;
}
+ ASSERT(cmap->br_startoff > imap->br_startoff);
+
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
@@ -415,26 +435,135 @@ xfs_reflink_allocate_cow(
*/
if (nimaps == 0)
return -ENOSPC;
+
convert:
- xfs_trim_extent(cmap, offset_fsb, count_fsb);
- /*
- * COW fork extents are supposed to remain unwritten until we're ready
- * to initiate a disk write. For direct I/O we are going to write the
- * data and need the conversion, but for buffered writes we're done.
- */
- if (!convert_now || cmap->br_state == XFS_EXT_NORM)
- return 0;
- trace_xfs_reflink_convert_cow(ip, cmap);
- error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
- if (!error)
- cmap->br_state = XFS_EXT_NORM;
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
+}
+
+static int
+xfs_reflink_fill_delalloc(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int nimaps;
+ int error;
+ bool found;
+
+ do {
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ *lockmode = XFS_ILOCK_EXCL;
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
+ &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+
+ if (found) {
+ xfs_trans_cancel(tp);
+ break;
+ }
+
+ ASSERT(isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK);
+
+ /*
+ * Replace delalloc reservation with an unwritten extent.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
+ cmap->br_blockcount,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
+ cmap, &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /*
+ * Allocation succeeded but the requested range was not even
+ * partially satisfied? Bail out!
+ */
+ if (nimaps == 0)
+ return -ENOSPC;
+ } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
+
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+int
+xfs_reflink_allocate_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ int error;
+ bool found;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+ if (error || !*shared)
+ return error;
+
+ /* CoW fork has a real extent */
+ if (found)
+ return xfs_reflink_convert_unwritten(ip, imap, cmap,
+ convert_now);
+
+ /*
+ * CoW fork does not have an extent and data extent is shared.
+ * Allocate a real extent in the CoW fork.
+ */
+ if (cmap->br_startoff > imap->br_startoff)
+ return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /*
+ * CoW fork has a delalloc reservation. Replace it with a real extent.
+ * There may or may not be a data fork mapping.
+ */
+ if (isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK)
+ return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /* Shouldn't get here. */
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
/*
* Cancel CoW reservations for some block range of an inode.
*
@@ -452,7 +581,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_fileoff_t end_fsb,
bool cancel_real)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
int error = 0;
@@ -593,7 +722,7 @@ xfs_reflink_end_cow_extent(
struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
unsigned int resblks;
int nmaps;
int error;
@@ -1363,12 +1492,16 @@ xfs_reflink_remap_prep(
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
- /* Don't share DAX file data for now. */
- if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ /* Don't share DAX file data with non-DAX file. */
+ if (IS_DAX(inode_in) != IS_DAX(inode_out))
goto out_unlock;
- ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
+ if (!IS_DAX(inode_in))
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags);
+ else
+ ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, &xfs_read_iomap_ops);
if (ret || *len == 0)
goto out_unlock;
@@ -1420,16 +1553,11 @@ xfs_reflink_inode_has_shared_extents(
struct xfs_bmbt_irec got;
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
struct xfs_iext_cursor icur;
bool found;
int error;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
if (error)
return error;
@@ -1437,17 +1565,25 @@ xfs_reflink_inode_has_shared_extents(
*has_shared = false;
found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
while (found) {
+ struct xfs_perag *pag;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t rbno;
+ xfs_extlen_t rlen;
+
if (isnullstartblock(got.br_startblock) ||
got.br_state != XFS_EXT_NORM)
goto next;
- agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
aglen = got.br_blockcount;
-
- error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
+ error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
&rbno, &rlen, false);
+ xfs_perag_put(pag);
if (error)
return error;
+
/* Is there still a shared block here? */
if (rbno != NULLAGBLOCK) {
*has_shared = true;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index bea65f2fe657..65c5dfe17ecf 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -16,9 +16,6 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}
-extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
- xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index aa977c7ea370..9ac59814bbb6 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -40,6 +40,7 @@
#include "xfs_defer.h"
#include "xfs_attr_item.h"
#include "xfs_xattr.h"
+#include "xfs_iunlink_item.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -350,8 +351,10 @@ xfs_setup_dax_always(
goto disable_dax;
}
- if (xfs_has_reflink(mp)) {
- xfs_alert(mp, "DAX and reflink cannot be used together!");
+ if (xfs_has_reflink(mp) &&
+ bdev_is_partition(mp->m_ddev_targp->bt_bdev)) {
+ xfs_alert(mp,
+ "DAX and reflink cannot work with multi-partitions!");
return -EINVAL;
}
@@ -1966,11 +1969,19 @@ xfs_init_caches(void)
{
int error;
+ xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!xfs_buf_cache)
+ goto out;
+
xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
sizeof(struct xlog_ticket),
0, 0, NULL);
if (!xfs_log_ticket_cache)
- goto out;
+ goto out_destroy_buf_cache;
error = xfs_btree_init_cur_caches();
if (error)
@@ -2096,8 +2107,16 @@ xfs_init_caches(void)
if (!xfs_attri_cache)
goto out_destroy_attrd_cache;
+ xfs_iunlink_cache = kmem_cache_create("xfs_iul_item",
+ sizeof(struct xfs_iunlink_item),
+ 0, 0, NULL);
+ if (!xfs_iunlink_cache)
+ goto out_destroy_attri_cache;
+
return 0;
+ out_destroy_attri_cache:
+ kmem_cache_destroy(xfs_attri_cache);
out_destroy_attrd_cache:
kmem_cache_destroy(xfs_attrd_cache);
out_destroy_bui_cache:
@@ -2136,6 +2155,8 @@ xfs_init_caches(void)
xfs_btree_destroy_cur_caches();
out_destroy_log_ticket_cache:
kmem_cache_destroy(xfs_log_ticket_cache);
+ out_destroy_buf_cache:
+ kmem_cache_destroy(xfs_buf_cache);
out:
return -ENOMEM;
}
@@ -2148,6 +2169,7 @@ xfs_destroy_caches(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_cache_destroy(xfs_iunlink_cache);
kmem_cache_destroy(xfs_attri_cache);
kmem_cache_destroy(xfs_attrd_cache);
kmem_cache_destroy(xfs_bui_cache);
@@ -2168,6 +2190,7 @@ xfs_destroy_caches(void)
xfs_defer_destroy_item_caches();
xfs_btree_destroy_cur_caches();
kmem_cache_destroy(xfs_log_ticket_cache);
+ kmem_cache_destroy(xfs_buf_cache);
}
STATIC int __init
@@ -2213,6 +2236,7 @@ xfs_cpu_dead(
list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
spin_unlock(&xfs_mount_list_lock);
xfs_inodegc_cpu_dead(mp, cpu);
+ xlog_cil_pcp_dead(mp->m_log, cpu);
spin_lock(&xfs_mount_list_lock);
}
spin_unlock(&xfs_mount_list_lock);
@@ -2272,13 +2296,9 @@ init_xfs_fs(void)
if (error)
goto out_destroy_wq;
- error = xfs_buf_init();
- if (error)
- goto out_mru_cache_uninit;
-
error = xfs_init_procfs();
if (error)
- goto out_buf_terminate;
+ goto out_mru_cache_uninit;
error = xfs_sysctl_register();
if (error)
@@ -2335,8 +2355,6 @@ init_xfs_fs(void)
xfs_sysctl_unregister();
out_cleanup_procfs:
xfs_cleanup_procfs();
- out_buf_terminate:
- xfs_buf_terminate();
out_mru_cache_uninit:
xfs_mru_cache_uninit();
out_destroy_wq:
@@ -2362,7 +2380,6 @@ exit_xfs_fs(void)
kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
- xfs_buf_terminate();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_caches();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 3cd5a51bace1..364e2c2648a8 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -92,6 +92,7 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
extern const struct export_operations xfs_export_operations;
extern const struct quotactl_ops xfs_quotactl_operations;
+extern const struct dax_holder_operations xfs_dax_holder_operations;
extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 4145ba872547..8389f3ef88ef 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -256,7 +256,7 @@ xfs_symlink(
/*
* If the symlink will fit into the inode, write it inline.
*/
- if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+ if (pathlen <= xfs_inode_data_fork_size(ip)) {
xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
ip->i_disk_size = pathlen;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0fa1b7a2918c..f9057af6e0c8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2171,7 +2171,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->format = ip->i_df.if_format;
__entry->nex = ip->i_df.if_nextents;
__entry->broot_size = ip->i_df.if_broot_bytes;
- __entry->fork_off = XFS_IFORK_BOFF(ip);
+ __entry->fork_off = xfs_inode_fork_boff(ip);
),
TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
"broot size %d, forkoff 0x%x",
@@ -3672,7 +3672,6 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \
TP_ARGS(ip))
DEFINE_AGINODE_EVENT(xfs_iunlink);
DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
-DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
TP_PROTO(struct xfs_mount *mp, unsigned int flags),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 82cf0189c0db..7bd16fbff534 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -760,7 +760,7 @@ xfs_log_item_batch_insert(
void
xfs_trans_committed_bulk(
struct xfs_ail *ailp,
- struct xfs_log_vec *log_vector,
+ struct list_head *lv_chain,
xfs_lsn_t commit_lsn,
bool aborted)
{
@@ -775,7 +775,7 @@ xfs_trans_committed_bulk(
spin_unlock(&ailp->ail_lock);
/* unpin all the log items */
- for (lv = log_vector; lv; lv = lv->lv_next ) {
+ list_for_each_entry(lv, lv_chain, lv_list) {
struct xfs_log_item *lip = lv->lv_item;
xfs_lsn_t item_lsn;
@@ -845,6 +845,90 @@ xfs_trans_committed_bulk(
}
/*
+ * Sort transaction items prior to running precommit operations. This will
+ * attempt to order the items such that they will always be locked in the same
+ * order. Items that have no sort function are moved to the end of the list
+ * and so are locked last.
+ *
+ * This may need refinement as different types of objects add sort functions.
+ *
+ * Function is more complex than it needs to be because we are comparing 64 bit
+ * values and the function only returns 32 bit values.
+ */
+static int
+xfs_trans_precommit_sort(
+ void *unused_arg,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_log_item *lia = container_of(a,
+ struct xfs_log_item, li_trans);
+ struct xfs_log_item *lib = container_of(b,
+ struct xfs_log_item, li_trans);
+ int64_t diff;
+
+ /*
+ * If both items are non-sortable, leave them alone. If only one is
+ * sortable, move the non-sortable item towards the end of the list.
+ */
+ if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
+ return 0;
+ if (!lia->li_ops->iop_sort)
+ return 1;
+ if (!lib->li_ops->iop_sort)
+ return -1;
+
+ diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib);
+ if (diff < 0)
+ return -1;
+ if (diff > 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Run transaction precommit functions.
+ *
+ * If there is an error in any of the callouts, then stop immediately and
+ * trigger a shutdown to abort the transaction. There is no recovery possible
+ * from errors at this point as the transaction is dirty....
+ */
+static int
+xfs_trans_run_precommits(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_log_item *lip, *n;
+ int error = 0;
+
+ /*
+ * Sort the item list to avoid ABBA deadlocks with other transactions
+ * running precommit operations that lock multiple shared items such as
+ * inode cluster buffers.
+ */
+ list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort);
+
+ /*
+ * Precommit operations can remove the log item from the transaction
+ * if the log item exists purely to delay modifications until they
+ * can be ordered against other operations. Hence we have to use
+ * list_for_each_entry_safe() here.
+ */
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ continue;
+ if (lip->li_ops->iop_precommit) {
+ error = lip->li_ops->iop_precommit(tp, lip);
+ if (error)
+ break;
+ }
+ }
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+}
+
+/*
* Commit the given transaction to the log.
*
* XFS disk error handling mechanism is not based on a typical
@@ -869,6 +953,13 @@ __xfs_trans_commit(
trace_xfs_trans_commit(tp, _RET_IP_);
+ error = xfs_trans_run_precommits(tp);
+ if (error) {
+ if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+ xfs_defer_cancel(tp);
+ goto out_unreserve;
+ }
+
/*
* Finish deferred items on final commit. Only permanent transactions
* should ever have deferred ops.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9561f193e7e1..55819785941c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -45,6 +45,7 @@ struct xfs_log_item {
struct xfs_log_vec *li_lv; /* active log vector */
struct xfs_log_vec *li_lv_shadow; /* standby vector */
xfs_csn_t li_seq; /* CIL commit seq */
+ uint32_t li_order_id; /* CIL commit order */
};
/*
@@ -71,10 +72,12 @@ struct xfs_item_ops {
void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
void (*iop_pin)(struct xfs_log_item *);
void (*iop_unpin)(struct xfs_log_item *, int remove);
- uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+ uint64_t (*iop_sort)(struct xfs_log_item *lip);
+ int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip);
void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
- void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+ uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+ void (*iop_release)(struct xfs_log_item *);
int (*iop_recover)(struct xfs_log_item *lip,
struct list_head *capture_list);
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index f0d79a9050ba..d5400150358e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -19,7 +19,8 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
-void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+void xfs_trans_committed_bulk(struct xfs_ail *ailp,
+ struct list_head *lv_chain,
xfs_lsn_t commit_lsn, bool aborted);
/*
* AIL traversal cursor.
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 511bb9fa3750..860f0b1032c6 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -231,13 +231,6 @@ static const struct iomap_writeback_ops zonefs_writeback_ops = {
.map_blocks = zonefs_write_map_blocks,
};
-static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct iomap_writepage_ctx wpc = { };
-
- return iomap_writepage(page, wbc, &wpc, &zonefs_writeback_ops);
-}
-
static int zonefs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -265,7 +258,6 @@ static int zonefs_swap_activate(struct swap_info_struct *sis,
static const struct address_space_operations zonefs_file_aops = {
.read_folio = zonefs_read_folio,
.readahead = zonefs_readahead,
- .writepage = zonefs_writepage,
.writepages = zonefs_writepages,
.dirty_folio = filemap_dirty_folio,
.release_folio = iomap_release_folio,