Merge 916d636e0a ("Merge tag 'vfs-5.15-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux") into android-mainline
Steps on the way to 5.15-rc1.

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I44aeb7207e79c266f8ffb79ae2d69c40463be0a8
@@ -24,6 +24,10 @@ Available fault injection capabilities

  injects futex deadlock and uaddr fault errors.

- fail_sunrpc

  injects kernel RPC client and server failures.

- fail_make_request

  injects disk IO errors on devices permitted by setting
@@ -151,6 +155,20 @@ configuration of fault-injection capabilities.
  default is 'N', setting it to 'Y' will disable failure injections
  when dealing with private (address space) futexes.

- /sys/kernel/debug/fail_sunrpc/ignore-client-disconnect:

  Format: { 'Y' | 'N' }

  default is 'N', setting it to 'Y' will disable disconnect
  injection on the RPC client.

- /sys/kernel/debug/fail_sunrpc/ignore-server-disconnect:

  Format: { 'Y' | 'N' }

  default is 'N', setting it to 'Y' will disable disconnect
  injection on the RPC server.
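
As a quick illustration, the new knobs are driven like any other
fault-injection attributes. A minimal user-space sketch, assuming debugfs is
mounted at /sys/kernel/debug and the kernel has CONFIG_FAIL_SUNRPC enabled
(the helper name and the chosen values are arbitrary, not from the patch):

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  /* write a string value into one fault-injection debugfs file */
  static int write_knob(const char *path, const char *val)
  {
          int fd = open(path, O_WRONLY);
          ssize_t n;

          if (fd < 0) {
                  perror(path);
                  return -1;
          }
          n = write(fd, val, strlen(val));
          close(fd);
          return n < 0 ? -1 : 0;
  }

  int main(void)
  {
          /* fail roughly 10% of RPCs, at most 100 times */
          write_knob("/sys/kernel/debug/fail_sunrpc/probability", "10");
          write_knob("/sys/kernel/debug/fail_sunrpc/times", "100");
          /* 'Y' suppresses disconnect injection on the server side */
          write_knob("/sys/kernel/debug/fail_sunrpc/ignore-server-disconnect",
                     "Y");
          return 0;
  }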

- /sys/kernel/debug/fail_function/inject:

  Format: { 'function-name' | '!function-name' | '' }

10  Documentation/filesystems/cifs/index.rst  Normal file
@@ -0,0 +1,10 @@
===============================
CIFS
===============================


.. toctree::
   :maxdepth: 1

   ksmbd
   cifsroot
165  Documentation/filesystems/cifs/ksmbd.rst  Normal file
@@ -0,0 +1,165 @@
.. SPDX-License-Identifier: GPL-2.0

==========================
KSMBD - SMB3 Kernel Server
==========================

KSMBD is a Linux kernel server which implements the SMB3 protocol in kernel
space for sharing files over a network.

KSMBD architecture
==================

The subset of operations that are performance-critical belongs in kernel
space, while the operations that are not really performance-related belong in
user space. So DCE/RPC management, which has historically been the source of
a number of buffer overflow issues and dangerous security bugs, and user
account management are implemented in user space as ksmbd.mountd. File
operations that are performance-critical (open/read/write/close, etc.) are
handled in kernel space (ksmbd). This also allows for easier integration with
the VFS interface for all file operations.

ksmbd (kernel daemon)
---------------------

When the server daemon is started, it starts up a forker thread
(ksmbd/interface name) at initialization time and opens a dedicated port,
445, for listening to SMB requests. Whenever a new client makes a request,
the forker thread accepts the client connection and forks a new thread as a
dedicated communication channel between the client and the server. This
allows for parallel processing of SMB requests (commands) from clients as
well as allowing new clients to make new connections. Each instance is named
ksmbd/1~n (port number) to indicate connected clients. Depending on the SMB
request type, each new thread can decide to pass the command through to user
space (ksmbd.mountd); currently DCE/RPC commands are the ones identified to
be handled through user space. To further utilize the Linux kernel, the
commands are processed as work items executed in the handlers of the ksmbd-io
kworker threads. This allows for multiplexing of the handlers, as the kernel
takes care of initiating extra worker threads if the load increases, and of
destroying the extra worker threads when the load decreases. So, after a
connection is established with a client, the dedicated ksmbd/1..n (port
number) thread takes complete ownership of receiving and parsing SMB
commands. Each received command is processed in parallel, i.e. there can be
multiple client commands being processed at the same time. After each command
is received, a separate kernel work item is prepared for it and queued to be
handled by the ksmbd-io kworkers. Thus each SMB work item is queued to the
kworkers, which lets load sharing be managed optimally by the kernel defaults
and optimizes client performance by handling client commands in parallel.

ksmbd.mountd (user space daemon)
--------------------------------

ksmbd.mountd is a user-space process which transfers the user accounts and
passwords registered using ksmbd.adduser (part of the user-space tools), and
passes the share information parameters parsed from smb.conf to ksmbd in the
kernel. For execution it runs a daemon which is continuously connected to the
kernel interface over a netlink socket, waiting for requests (DCE/RPC and
share/user info). It handles the RPC calls (at a minimum a few dozen) that
are most important for a file server, such as NetShareEnum and
NetServerGetInfo. The complete DCE/RPC response is prepared in user space and
passed over to the associated kernel thread for the client.


KSMBD Feature Status
====================

============================== =================================================
Feature name                   Status
============================== =================================================
Dialects                       Supported. SMB2.1, SMB3.0, SMB3.1.1 dialects
                               (intentionally excludes the security-vulnerable
                               SMB1 dialect).
Auto Negotiation               Supported.
Compound Request               Supported.
Oplock Cache Mechanism         Supported.
SMB2 leases(v1 lease)          Supported.
Directory leases(v2 lease)     Planned for the future.
Multi-credits                  Supported.
NTLM/NTLMv2                    Supported.
HMAC-SHA256 Signing            Supported.
Secure negotiate               Supported.
Signing Update                 Supported.
Pre-authentication integrity   Supported.
SMB3 encryption(CCM, GCM)      Supported. (CCM and GCM128 supported, GCM256 in
                               progress)
SMB direct(RDMA)               Partially supported. SMB3 Multi-channel is
                               required to connect to a Windows client.
SMB3 Multi-channel             Partially supported. Replay/retry mechanisms
                               are planned for the future.
SMB3.1.1 POSIX extension       Supported.
ACLs                           Partially supported. Only DACLs are available;
                               SACLs (auditing) are planned for the future. For
                               ownership (SIDs), ksmbd generates random subauth
                               values (then stores them to disk) and uses the
                               uid/gid from the inode as the RID for the local
                               domain SID. The current ACL implementation is
                               limited to a standalone server, not a domain
                               member. Integration with Samba tools is being
                               worked on to allow future support for running as
                               a domain member.
Kerberos                       Supported.
Durable handle v1,v2           Planned for the future.
Persistent handle              Planned for the future.
SMB2 notify                    Planned for the future.
Sparse file support            Supported.
DCE/RPC support                Partially supported. A few calls
                               (NetShareEnumAll, NetServerGetInfo, SAMR,
                               LSARPC) that are needed for a file server are
                               handled via the netlink interface from
                               ksmbd.mountd. Additional integration with Samba
                               tools and libraries via upcall is being
                               investigated to allow support for additional
                               DCE/RPC management calls (and, in the future,
                               e.g. the Witness protocol).
ksmbd/nfsd interoperability    Planned for the future. The features that ksmbd
                               supports are Leases, Notify, ACLs and Share
                               modes.
============================== =================================================


How to run
==========

1. Download ksmbd-tools and compile them.

   - https://github.com/cifsd-team/ksmbd-tools

2. Create a user/password for the SMB share.

   # mkdir /etc/ksmbd/
   # ksmbd.adduser -a <Enter USERNAME for SMB share access>

3. Create the /etc/ksmbd/smb.conf file and add an SMB share to it.

   - Refer to smb.conf.example and
     https://github.com/cifsd-team/ksmbd-tools/blob/master/Documentation/configuration.txt

4. Insert the ksmbd.ko module.

   # insmod ksmbd.ko

5. Start the ksmbd user space daemon.

   # ksmbd.mountd

6. Access the share from Windows or Linux using CIFS.

Shutdown KSMBD
==============

1. Kill the user and kernel space daemons

   # sudo ksmbd.control -s

How to turn debug print on
==========================

Debug prints for each layer are controlled through
/sys/class/ksmbd-control/debug

1. Enable all component prints

   # sudo ksmbd.control -d "all"

2. Enable one of the components (smb, auth, vfs, oplock, ipc, conn, rdma)

   # sudo ksmbd.control -d "smb"

3. Show which prints are enabled

   # cat /sys/class/ksmbd-control/debug
   [smb] auth vfs oplock ipc conn [rdma]

4. Disable prints:

   If you pass the selected component once more, it is disabled and shown
   without brackets.
@@ -1063,11 +1063,6 @@ astute users may notice some differences in behavior:

- DAX (Direct Access) is not supported on encrypted files.

- The st_size of an encrypted symlink will not necessarily give the
  length of the symlink target as required by POSIX.  It will actually
  give the length of the ciphertext, which will be slightly longer
  than the plaintext due to NUL-padding and an extra 2-byte overhead.

- The maximum length of an encrypted symlink is 2 bytes shorter than
  the maximum length of an unencrypted symlink.  For example, on an
  EXT4 filesystem with a 4K block size, unencrypted symlinks can be up
@@ -1235,12 +1230,12 @@ the user-supplied name to get the ciphertext.
|
||||
|
||||
Lookups without the key are more complicated. The raw ciphertext may
|
||||
contain the ``\0`` and ``/`` characters, which are illegal in
|
||||
filenames. Therefore, readdir() must base64-encode the ciphertext for
|
||||
presentation. For most filenames, this works fine; on ->lookup(), the
|
||||
filesystem just base64-decodes the user-supplied name to get back to
|
||||
the raw ciphertext.
|
||||
filenames. Therefore, readdir() must base64url-encode the ciphertext
|
||||
for presentation. For most filenames, this works fine; on ->lookup(),
|
||||
the filesystem just base64url-decodes the user-supplied name to get
|
||||
back to the raw ciphertext.
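
For illustration, here is a sketch of the unpadded base64url encoding the new
text describes (RFC 4648 base64url alphabet, with '-' and '_' replacing '+'
and '/'). The kernel keeps its own encoder in fs/crypto/fname.c, so this is
not the exact implementation, only the idea:

  #include <linux/types.h>

  static const char base64url_table[65] =
          "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

  /* Encode src[0..srclen) into dst; returns the encoded length. */
  static int base64url_encode(const u8 *src, int srclen, char *dst)
  {
          u32 ac = 0;
          int bits = 0, i;
          char *cp = dst;

          for (i = 0; i < srclen; i++) {
                  ac = (ac << 8) | src[i];
                  bits += 8;
                  do {
                          bits -= 6;
                          *cp++ = base64url_table[(ac >> bits) & 0x3f];
                  } while (bits >= 6);
          }
          /* emit any remaining partial group, zero-padded on the right */
          if (bits)
                  *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f];
          return cp - dst;
  }

Because the alphabet avoids '/', the encoded name is always a legal filename
component, which is exactly why it is usable for no-key directory listings.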

However, for very long filenames, base64 encoding would cause the
However, for very long filenames, base64url encoding would cause the
filename length to exceed NAME_MAX.  To prevent this, readdir()
actually presents long filenames in an abbreviated form which encodes
a strong "hash" of the ciphertext filename, along with the optional
@@ -72,7 +72,7 @@ Documentation for filesystem implementations.
   befs
   bfs
   btrfs
   cifs/cifsroot
   cifs/index
   ceph
   coda
   configfs

14  MAINTAINERS
@@ -4620,7 +4620,7 @@ F: include/linux/clk/
F:	include/linux/of_clk.h
X:	drivers/clk/clkdev.c

COMMON INTERNET FILE SYSTEM (CIFS)
COMMON INTERNET FILE SYSTEM CLIENT (CIFS)
M:	Steve French <sfrench@samba.org>
L:	linux-cifs@vger.kernel.org
L:	samba-technical@lists.samba.org (moderated for non-subscribers)
@@ -4629,6 +4629,7 @@ W: http://linux-cifs.samba.org/
T:	git git://git.samba.org/sfrench/cifs-2.6.git
F:	Documentation/admin-guide/cifs/
F:	fs/cifs/
F:	fs/cifs_common/

COMPACTPCI HOTPLUG CORE
M:	Scott Murray <scott@spiteful.org>
@@ -10120,6 +10121,17 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git
F:	Documentation/dev-tools/kselftest*
F:	tools/testing/selftests/

KERNEL SMB3 SERVER (KSMBD)
M:	Namjae Jeon <linkinjeon@kernel.org>
M:	Sergey Senozhatsky <senozhatsky@chromium.org>
M:	Steve French <sfrench@samba.org>
M:	Hyunchul Lee <hyc.lee@gmail.com>
L:	linux-cifs@vger.kernel.org
S:	Maintained
T:	git git://git.samba.org/ksmbd.git
F:	fs/cifs_common/
F:	fs/ksmbd/

KERNEL UNIT TESTING FRAMEWORK (KUnit)
M:	Brendan Higgins <brendanhiggins@google.com>
L:	linux-kselftest@vger.kernel.org

175  block/bio.c
@@ -25,6 +25,11 @@
#include "blk.h"
#include "blk-rq-qos.h"

struct bio_alloc_cache {
        struct bio_list free_list;
        unsigned int nr;
};

static struct biovec_slab {
        int nr_vecs;
        char *name;
@@ -246,12 +251,40 @@ static void bio_free(struct bio *bio)
void bio_init(struct bio *bio, struct bio_vec *table,
              unsigned short max_vecs)
{
        memset(bio, 0, sizeof(*bio));
        bio->bi_next = NULL;
        bio->bi_bdev = NULL;
        bio->bi_opf = 0;
        bio->bi_flags = 0;
        bio->bi_ioprio = 0;
        bio->bi_write_hint = 0;
        bio->bi_status = 0;
        bio->bi_iter.bi_sector = 0;
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_idx = 0;
        bio->bi_iter.bi_bvec_done = 0;
        bio->bi_end_io = NULL;
        bio->bi_private = NULL;
#ifdef CONFIG_BLK_CGROUP
        bio->bi_blkg = NULL;
        bio->bi_issue.value = 0;
#ifdef CONFIG_BLK_CGROUP_IOCOST
        bio->bi_iocost_cost = 0;
#endif
#endif
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        bio->bi_crypt_context = NULL;
#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
        bio->bi_integrity = NULL;
#endif
        bio->bi_vcnt = 0;

        atomic_set(&bio->__bi_remaining, 1);
        atomic_set(&bio->__bi_cnt, 1);

        bio->bi_io_vec = table;
        bio->bi_max_vecs = max_vecs;
        bio->bi_io_vec = table;
        bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);

@@ -586,6 +619,53 @@ void guard_bio_eod(struct bio *bio)
        bio_truncate(bio, maxsector << 9);
}

#define ALLOC_CACHE_MAX         512
#define ALLOC_CACHE_SLACK        64

static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
                                  unsigned int nr)
{
        unsigned int i = 0;
        struct bio *bio;

        while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
                cache->nr--;
                bio_free(bio);
                if (++i == nr)
                        break;
        }
}

static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
        struct bio_set *bs;

        bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
        if (bs->cache) {
                struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);

                bio_alloc_cache_prune(cache, -1U);
        }
        return 0;
}

static void bio_alloc_cache_destroy(struct bio_set *bs)
{
        int cpu;

        if (!bs->cache)
                return;

        cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
        for_each_possible_cpu(cpu) {
                struct bio_alloc_cache *cache;

                cache = per_cpu_ptr(bs->cache, cpu);
                bio_alloc_cache_prune(cache, -1U);
        }
        free_percpu(bs->cache);
}

/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
@@ -596,15 +676,22 @@ void guard_bio_eod(struct bio *bio)
 **/
void bio_put(struct bio *bio)
{
        if (!bio_flagged(bio, BIO_REFFED))
                bio_free(bio);
        else {
        if (unlikely(bio_flagged(bio, BIO_REFFED))) {
                BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
                if (!atomic_dec_and_test(&bio->__bi_cnt))
                        return;
        }

        /*
         * last put frees it
         */
        if (atomic_dec_and_test(&bio->__bi_cnt))
        if (bio_flagged(bio, BIO_PERCPU_CACHE)) {
                struct bio_alloc_cache *cache;

                bio_uninit(bio);
                cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
                bio_list_add_head(&cache->free_list, bio);
                if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
                        bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
                put_cpu();
        } else {
                bio_free(bio);
        }
}
@@ -1457,12 +1544,15 @@ EXPORT_SYMBOL(bio_split);
 * @bio:    bio to trim
 * @offset: number of sectors to trim from the front of @bio
 * @size:   size we want to trim @bio to, in sectors
 *
 * This function is typically used for bios that are cloned and submitted
 * to the underlying device in parts.
 */
void bio_trim(struct bio *bio, int offset, int size)
void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
        /* 'bio' is a cloned bio which we need to trim to match
         * the given offset and size.
         */
        if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
                         offset + size > bio->bi_iter.bi_size))
                return;

        size <<= 9;
        if (offset == 0 && size == bio->bi_iter.bi_size)
@@ -1473,7 +1563,6 @@ void bio_trim(struct bio *bio, int offset, int size)

        if (bio_integrity(bio))
                bio_integrity_trim(bio);

}
EXPORT_SYMBOL_GPL(bio_trim);

@@ -1496,6 +1585,7 @@ int biovec_init_pool(mempool_t *pool, int pool_entries)
 */
void bioset_exit(struct bio_set *bs)
{
        bio_alloc_cache_destroy(bs);
        if (bs->rescue_workqueue)
                destroy_workqueue(bs->rescue_workqueue);
        bs->rescue_workqueue = NULL;
@@ -1557,12 +1647,18 @@ int bioset_init(struct bio_set *bs,
            biovec_init_pool(&bs->bvec_pool, pool_size))
                goto bad;

        if (!(flags & BIOSET_NEED_RESCUER))
                return 0;

        bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
        if (flags & BIOSET_NEED_RESCUER) {
                bs->rescue_workqueue = alloc_workqueue("bioset",
                                                       WQ_MEM_RECLAIM, 0);
                if (!bs->rescue_workqueue)
                        goto bad;
        }
        if (flags & BIOSET_PERCPU_CACHE) {
                bs->cache = alloc_percpu(struct bio_alloc_cache);
                if (!bs->cache)
                        goto bad;
                cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
        }

        return 0;
bad:
@@ -1589,6 +1685,46 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
}
EXPORT_SYMBOL(bioset_init_from_src);

/**
 * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
 * @kiocb:     kiocb describing the IO
 * @nr_iovecs: number of iovecs to pre-allocate
 * @bs:        bio_set to allocate from
 *
 * Description:
 *    Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only
 *    used to check if we should dip into the per-cpu bio_set allocation
 *    cache. The allocation uses GFP_KERNEL internally. On return, the
 *    bio is marked BIO_PERCPU_CACHEABLE, and the final put of the bio
 *    MUST be done from process context, not hard/soft IRQ.
 *
 */
struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
                            struct bio_set *bs)
{
        struct bio_alloc_cache *cache;
        struct bio *bio;

        if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
                return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);

        cache = per_cpu_ptr(bs->cache, get_cpu());
        bio = bio_list_pop(&cache->free_list);
        if (bio) {
                cache->nr--;
                put_cpu();
                bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
                bio->bi_pool = bs;
                bio_set_flag(bio, BIO_PERCPU_CACHE);
                return bio;
        }
        put_cpu();
        bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
        bio_set_flag(bio, BIO_PERCPU_CACHE);
        return bio;
}
EXPORT_SYMBOL_GPL(bio_alloc_kiocb);
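
A minimal sketch of how a caller might opt in to this per-cpu cache, using
only the APIs added above (the bio_set and function names here are
hypothetical, not part of the patch):

/*
 * Hypothetical driver-side sketch: create a bio_set with the per-cpu
 * cache enabled, then allocate through bio_alloc_kiocb() so kiocbs
 * flagged IOCB_ALLOC_CACHE can recycle bios without hitting the slab
 * allocator on every allocation.
 */
#include <linux/bio.h>

static struct bio_set example_bio_set;  /* hypothetical name */

static int __init example_init(void)
{
        return bioset_init(&example_bio_set, BIO_POOL_SIZE, 0,
                           BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);
}

static struct bio *example_alloc(struct kiocb *iocb, unsigned short nr_vecs)
{
        /* falls back to bio_alloc_bioset() when the cache can't be used */
        return bio_alloc_kiocb(iocb, nr_vecs, &example_bio_set);
}

This mirrors what the blkdev_dio_pool change later in this diff does for
block-device direct IO.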

static int __init init_bio(void)
{
        int i;
@@ -1603,6 +1739,9 @@ static int __init init_bio(void)
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
        }

        cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
                                bio_cpu_dead);

        if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
                panic("bio: can't allocate bios\n");

@@ -821,7 +821,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
        }

        if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                bio->bi_opf &= ~REQ_HIPRI;
                bio_clear_hipri(bio);

        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:

@@ -285,7 +285,7 @@ split:
         * iopoll in direct IO routine. Given performance gain of iopoll for
         * big IO can be trivial, disable iopoll when split needed.
         */
        bio->bi_opf &= ~REQ_HIPRI;
        bio_clear_hipri(bio);

        return bio_split(bio, sectors, GFP_NOIO, bs);
}

@@ -366,4 +366,11 @@ extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;

static inline void bio_clear_hipri(struct bio *bio)
{
        /* can't support alloc cache if we turn off polling */
        bio_clear_flag(bio, BIO_PERCPU_CACHE);
        bio->bi_opf &= ~REQ_HIPRI;
}

#endif /* BLK_INTERNAL_H */

@@ -612,12 +612,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
        p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);

        inode = file_inode(vma->vm_file);

        if (!mapping_can_writeback(inode->i_mapping))
                wbc.nr_to_write = 0;

        might_sleep();
        sync_inode(inode, &wbc);
        filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
}

@@ -349,7 +349,15 @@ config NFS_V4_2_SSC_HELPER

source "net/sunrpc/Kconfig"
source "fs/ceph/Kconfig"

source "fs/cifs/Kconfig"
source "fs/ksmbd/Kconfig"

config CIFS_COMMON
	tristate
	default y if CIFS=y
	default m if CIFS=m

source "fs/coda/Kconfig"
source "fs/afs/Kconfig"
source "fs/9p/Kconfig"

@@ -96,7 +96,9 @@ obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS)		+= nls/
obj-$(CONFIG_UNICODE)		+= unicode/
obj-$(CONFIG_SYSV_FS)		+= sysv/
obj-$(CONFIG_CIFS_COMMON)	+= cifs_common/
obj-$(CONFIG_CIFS)		+= cifs/
obj-$(CONFIG_SMB_SERVER)	+= ksmbd/
obj-$(CONFIG_HPFS_FS)		+= hpfs/
obj-$(CONFIG_NTFS_FS)		+= ntfs/
obj-$(CONFIG_UFS_FS)		+= ufs/

@@ -386,7 +386,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
        bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);

        dio = container_of(bio, struct blkdev_dio, bio);
        dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@ -514,7 +514,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)

static __init int blkdev_init(void)
{
        return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
        return bioset_init(&blkdev_dio_pool, 4,
                           offsetof(struct blkdev_dio, bio),
                           BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
}
module_init(blkdev_init);

@@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o

btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
	tests/extent-buffer-tests.o tests/btrfs-tests.o \

@@ -53,6 +53,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
}

static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
                           struct user_namespace *mnt_userns,
                           struct inode *inode, struct posix_acl *acl, int type)
{
        int ret, size = 0;
@@ -114,12 +115,12 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
        umode_t old_mode = inode->i_mode;

        if (type == ACL_TYPE_ACCESS && acl) {
                ret = posix_acl_update_mode(&init_user_ns, inode,
                ret = posix_acl_update_mode(mnt_userns, inode,
                                            &inode->i_mode, &acl);
                if (ret)
                        return ret;
        }
        ret = __btrfs_set_acl(NULL, inode, acl, type);
        ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
        if (ret)
                inode->i_mode = old_mode;
        return ret;
@@ -140,14 +141,14 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
                return ret;

        if (default_acl) {
                ret = __btrfs_set_acl(trans, inode, default_acl,
                ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
                                      ACL_TYPE_DEFAULT);
                posix_acl_release(default_acl);
        }

        if (acl) {
                if (!ret)
                        ret = __btrfs_set_acl(trans, inode, acl,
                        ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
                                              ACL_TYPE_ACCESS);
                posix_acl_release(acl);
        }

@@ -1211,7 +1211,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
again:
        head = NULL;

        ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        BUG_ON(ret == 0);
@@ -1488,14 +1488,14 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                         struct btrfs_fs_info *fs_info, u64 bytenr,
                         u64 time_seq, struct ulist **roots,
                         bool ignore_offset, bool skip_commit_root_sem)
                         bool skip_commit_root_sem)
{
        int ret;

        if (!trans && !skip_commit_root_sem)
                down_read(&fs_info->commit_root_sem);
        ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
                                        time_seq, roots, ignore_offset);
                                        time_seq, roots, false);
        if (!trans && !skip_commit_root_sem)
                up_read(&fs_info->commit_root_sem);
        return ret;

@@ -47,7 +47,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                         const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                         struct btrfs_fs_info *fs_info, u64 bytenr,
                         u64 time_seq, struct ulist **roots, bool ignore_offset,
                         u64 time_seq, struct ulist **roots,
                         bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
                        u32 name_len, unsigned long name_off,

@@ -1561,7 +1561,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                                div64_u64(zone_unusable * 100, bg->length));
                trace_btrfs_reclaim_block_group(bg);
                ret = btrfs_relocate_chunk(fs_info, bg->start);
                if (ret)
                if (ret && ret != -EAGAIN)
                        btrfs_err(fs_info, "error relocating chunk %llu",
                                  bg->start);

@@ -2105,11 +2105,22 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
                bg->used = em->len;
                bg->flags = map->type;
                ret = btrfs_add_block_group_cache(fs_info, bg);
                /*
                 * We may have some valid block group cache added already, in
                 * that case we skip to the next one.
                 */
                if (ret == -EEXIST) {
                        ret = 0;
                        btrfs_put_block_group(bg);
                        continue;
                }

                if (ret) {
                        btrfs_remove_free_space_cache(bg);
                        btrfs_put_block_group(bg);
                        break;
                }

                btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
                                        0, 0, &space_info);
                bg->space_info = space_info;
@@ -2212,6 +2223,14 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
        ret = check_chunk_block_group_mappings(info);
error:
        btrfs_free_path(path);
        /*
         * We've hit some error while reading the extent tree, and have
         * rescue=ibadroots mount option.
         * Try to fill the tree using dummy block groups so that the user can
         * continue to mount and grab their data.
         */
        if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
                ret = fill_dummy_bgs(info);
        return ret;
}

@@ -2244,6 +2263,95 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
        return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}

static int insert_dev_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_device *device, u64 chunk_offset,
                             u64 start, u64 num_bytes)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_path *path;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        int ret;

        WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
        WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = start;
        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent,
                                            BTRFS_FIRST_CHUNK_TREE_OBJECTID);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * This function belongs to phase 2.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
static int insert_dev_extents(struct btrfs_trans_handle *trans,
                              u64 chunk_offset, u64 chunk_size)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_device *device;
        struct extent_map *em;
        struct map_lookup *map;
        u64 dev_offset;
        u64 stripe_size;
        int i;
        int ret = 0;

        em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
        if (IS_ERR(em))
                return PTR_ERR(em);

        map = em->map_lookup;
        stripe_size = em->orig_block_len;

        /*
         * Take the device list mutex to prevent races with the final phase of
         * a device replace operation that replaces the device object associated
         * with the map's stripes, because the device object's id can change
         * at any time during that final phase of the device replace operation
         * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
         * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
         * resulting in persisting a device extent item with such ID.
         */
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        for (i = 0; i < map->num_stripes; i++) {
                device = map->stripes[i].dev;
                dev_offset = map->stripes[i].physical;

                ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
                                        stripe_size);
                if (ret)
                        break;
        }
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        free_extent_map(em);
        return ret;
}

/*
 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
 * chunk allocation.
@@ -2278,7 +2386,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
                if (ret)
                        btrfs_abort_transaction(trans, ret);
        }
        ret = btrfs_finish_chunk_alloc(trans, block_group->start,
        ret = insert_dev_extents(trans, block_group->start,
                                 block_group->length);
        if (ret)
                btrfs_abort_transaction(trans, ret);

@@ -51,6 +51,13 @@ enum {
         * the file range, inode's io_tree).
         */
        BTRFS_INODE_NO_DELALLOC_FLUSH,
        /*
         * Set when we are working on enabling verity for a file. Computing and
         * writing the whole Merkle tree can take a while so we want to prevent
         * races where two separate tasks attempt to simultaneously start verity
         * on the same file.
         */
        BTRFS_INODE_VERITY_IN_PROGRESS,
};

/* in memory btrfs inode */
@@ -189,8 +196,10 @@ struct btrfs_inode {
         */
        u64 csum_bytes;

        /* flags field from the on disk inode */
        /* Backwards incompatible flags, lower half of inode_item::flags */
        u32 flags;
        /* Read-only compatibility flags, upper half of inode_item::flags */
        u32 ro_flags;

        /*
         * Counters to keep track of the number of extent item's we may use due
@@ -348,6 +357,22 @@ struct btrfs_dio_private {
        u8 csums[];
};

/*
 * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
 * separate u32s. These two functions convert between the two representations.
 */
static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
{
        return (flags | ((u64)ro_flags << 32));
}

static inline void btrfs_inode_split_flags(u64 inode_item_flags,
                                           u32 *flags, u32 *ro_flags)
{
        *flags = (u32)inode_item_flags;
        *ro_flags = (u32)(inode_item_flags >> 32);
}
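
As a quick sanity check of the two conversion helpers (an illustrative
sketch, not code from the patch; the function name is hypothetical):

static void example_flags_roundtrip(void)
{
        u32 flags, ro_flags;
        u64 combined;

        /* pack the two u32 halves into the on-disk u64 representation */
        combined = btrfs_inode_combine_flags(BTRFS_INODE_NODATASUM,
                                             BTRFS_INODE_RO_VERITY);
        /* and unpack them again */
        btrfs_inode_split_flags(combined, &flags, &ro_flags);
        /* flags == BTRFS_INODE_NODATASUM, ro_flags == BTRFS_INODE_RO_VERITY */
}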

/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT				"0x%*phN"
#define CSUM_FMT_VALUE(size, bytes)		size, bytes

@@ -243,47 +243,6 @@ struct btrfsic_state {
        u32 datablock_size;
};

static void btrfsic_block_init(struct btrfsic_block *b);
static struct btrfsic_block *btrfsic_block_alloc(void);
static void btrfsic_block_free(struct btrfsic_block *b);
static void btrfsic_block_link_init(struct btrfsic_block_link *n);
static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
static void btrfsic_block_link_free(struct btrfsic_block_link *n);
static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
                                        struct btrfsic_block_hashtable *h);
static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
static struct btrfsic_block *btrfsic_block_hashtable_lookup(
                struct block_device *bdev,
                u64 dev_bytenr,
                struct btrfsic_block_hashtable *h);
static void btrfsic_block_link_hashtable_init(
                struct btrfsic_block_link_hashtable *h);
static void btrfsic_block_link_hashtable_add(
                struct btrfsic_block_link *l,
                struct btrfsic_block_link_hashtable *h);
static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
                struct block_device *bdev_ref_to,
                u64 dev_bytenr_ref_to,
                struct block_device *bdev_ref_from,
                u64 dev_bytenr_ref_from,
                struct btrfsic_block_link_hashtable *h);
static void btrfsic_dev_state_hashtable_init(
                struct btrfsic_dev_state_hashtable *h);
static void btrfsic_dev_state_hashtable_add(
                struct btrfsic_dev_state *ds,
                struct btrfsic_dev_state_hashtable *h);
static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
                struct btrfsic_dev_state_hashtable *h);
static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
static int btrfsic_process_superblock(struct btrfsic_state *state,
                                      struct btrfs_fs_devices *fs_devices);
static int btrfsic_process_metablock(struct btrfsic_state *state,
                                     struct btrfsic_block *block,
                                     struct btrfsic_block_data_ctx *block_ctx,
@@ -313,14 +272,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
                              struct btrfsic_block_data_ctx *block_ctx);
static void btrfsic_dump_database(struct btrfsic_state *state);
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
                                     char **datav, unsigned int num_pages);
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                          u64 dev_bytenr, char **mapped_datav,
                                          unsigned int num_pages,
                                          struct bio *bio, int *bio_is_patched,
                                          int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
                struct btrfsic_state *state,
                struct btrfsic_block *const block,
@@ -1558,10 +1509,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
                /* Pages must be unmapped in reverse order */
                while (num_pages > 0) {
                        num_pages--;
                        if (block_ctx->datav[num_pages]) {
                                kunmap_local(block_ctx->datav[num_pages]);
                        if (block_ctx->datav[num_pages])
                                block_ctx->datav[num_pages] = NULL;
                        }
                        if (block_ctx->pagev[num_pages]) {
                                __free_page(block_ctx->pagev[num_pages]);
                                block_ctx->pagev[num_pages] = NULL;
@@ -1638,7 +1587,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
                i = j;
        }
        for (i = 0; i < num_pages; i++)
                block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
                block_ctx->datav[i] = page_address(block_ctx->pagev[i]);

        return block_ctx->len;
}
@@ -2703,7 +2652,7 @@ static void __btrfsic_submit_bio(struct bio *bio)

                bio_for_each_segment(bvec, bio, iter) {
                        BUG_ON(bvec.bv_len != PAGE_SIZE);
                        mapped_datav[i] = kmap_local_page(bvec.bv_page);
                        mapped_datav[i] = page_address(bvec.bv_page);
                        i++;

                        if (dev_state->state->print_mask &
@@ -2716,9 +2665,6 @@ static void __btrfsic_submit_bio(struct bio *bio)
                                              mapped_datav, segs,
                                              bio, &bio_is_patched,
                                              bio->bi_opf);
                /* Unmap in reverse order */
                for (--i; i >= 0; i--)
                        kunmap_local(mapped_datav[i]);
                kfree(mapped_datav);
        } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
                if (dev_state->state->print_mask &

@@ -172,10 +172,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
        /* Hash through the page sector by sector */
        for (pg_offset = 0; pg_offset < bytes_left;
             pg_offset += sectorsize) {
                kaddr = kmap_atomic(page);
                kaddr = page_address(page);
                crypto_shash_digest(shash, kaddr + pg_offset,
                                    sectorsize, csum);
                kunmap_atomic(kaddr);

                if (memcmp(&csum, cb_sum, csum_size) != 0) {
                        btrfs_print_data_csum_error(inode, disk_start,
@@ -565,6 +564,16 @@ static noinline int add_ra_bio_pages(struct inode *inode,
        if (isize == 0)
                return 0;

        /*
         * For current subpage support, we only support 64K page size,
         * which means maximum compressed extent size (128K) is just 2x page
         * size.
         * This makes readahead less effective, so here disable readahead for
         * subpage for now, until full compressed write is supported.
         */
        if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
                return 0;

        end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

        while (last_offset < compressed_end) {
@@ -673,6 +682,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        struct page *page;
        struct bio *comp_bio;
        u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
        u64 file_offset;
        u64 em_len;
        u64 em_start;
        struct extent_map *em;
@@ -682,15 +692,17 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,

        em_tree = &BTRFS_I(inode)->extent_tree;

        file_offset = bio_first_bvec_all(bio)->bv_offset +
                      page_offset(bio_first_page_all(bio));

        /* we need the actual starting offset of this extent in the file */
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree,
                                   page_offset(bio_first_page_all(bio)),
                                   fs_info->sectorsize);
        em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
        read_unlock(&em_tree->lock);
        if (!em)
                return BLK_STS_IOERR;

        ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
        compressed_len = em->block_len;
        cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
        if (!cb)
@@ -721,8 +733,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                goto fail1;

        for (pg_index = 0; pg_index < nr_pages; pg_index++) {
                cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
                                                            __GFP_HIGHMEM);
                cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
                if (!cb->compressed_pages[pg_index]) {
                        faili = pg_index - 1;
                        ret = BLK_STS_RESOURCE;
@@ -1261,96 +1272,82 @@ void __cold btrfs_exit_compress(void)
}

/*
 * Copy uncompressed data from working buffer to pages.
 * Copy decompressed data from working buffer to pages.
 *
 * buf_start is the byte offset we're of the start of our workspace buffer.
 * @buf:          The decompressed data buffer
 * @buf_len:      The decompressed data length
 * @decompressed: Number of bytes that are already decompressed inside the
 *                compressed extent
 * @cb:           The compressed extent descriptor
 * @orig_bio:     The original bio that the caller wants to read for
 *
 * total_out is the last byte of the buffer
 * An easier to understand graph is like below:
 *
 *              |<- orig_bio ->|     |<- orig_bio->|
 *      |<-------      full decompressed extent      ----->|
 *      |<-----------    @cb range   ---->|
 *      |               |<-- @buf_len -->|
 *      |<--- @decompressed --->|
 *
 * Note that, @cb can be a subpage of the full decompressed extent, but
 * @cb->start always has the same as the orig_file_offset value of the full
 * decompressed extent.
 *
 * When reading compressed extent, we have to read the full compressed extent,
 * while @orig_bio may only want part of the range.
 * Thus this function will ensure only data covered by @orig_bio will be copied
 * to.
 *
 * Return 0 if we have copied all needed contents for @orig_bio.
 * Return >0 if we need continue decompress.
 */
int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
                              unsigned long total_out, u64 disk_start,
                              struct bio *bio)
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
                              struct compressed_bio *cb, u32 decompressed)
{
        unsigned long buf_offset;
        unsigned long current_buf_start;
        unsigned long start_byte;
        unsigned long prev_start_byte;
        unsigned long working_bytes = total_out - buf_start;
        unsigned long bytes;
        struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);
        struct bio *orig_bio = cb->orig_bio;
        /* Offset inside the full decompressed extent */
        u32 cur_offset;

        cur_offset = decompressed;
        /* The main loop to do the copy */
        while (cur_offset < decompressed + buf_len) {
                struct bio_vec bvec;
                size_t copy_len;
                u32 copy_start;
                /* Offset inside the full decompressed extent */
                u32 bvec_offset;

                bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
                /*
                 * start byte is the first byte of the page we're currently
                 * copying into relative to the start of the compressed data.
                 * cb->start may underflow, but subtracting that value can still
                 * give us correct offset inside the full decompressed extent.
                 */
                start_byte = page_offset(bvec.bv_page) - disk_start;
                bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;

                /* we haven't yet hit data corresponding to this page */
                if (total_out <= start_byte)
                /* Haven't reached the bvec range, exit */
                if (decompressed + buf_len <= bvec_offset)
                        return 1;

                copy_start = max(cur_offset, bvec_offset);
                copy_len = min(bvec_offset + bvec.bv_len,
                               decompressed + buf_len) - copy_start;
                ASSERT(copy_len);

                /*
                 * the start of the data we care about is offset into
                 * the middle of our working buffer
                 * Extra range check to ensure we didn't go beyond
                 * @buf + @buf_len.
                 */
                if (total_out > start_byte && buf_start < start_byte) {
                        buf_offset = start_byte - buf_start;
                        working_bytes -= buf_offset;
                } else {
                        buf_offset = 0;
                }
                current_buf_start = buf_start;

                /* copy bytes from the working buffer into the pages */
                while (working_bytes > 0) {
                        bytes = min_t(unsigned long, bvec.bv_len,
                                      PAGE_SIZE - (buf_offset % PAGE_SIZE));
                        bytes = min(bytes, working_bytes);

                        memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset,
                                       bytes);
                ASSERT(copy_start - decompressed < buf_len);
                memcpy_to_page(bvec.bv_page, bvec.bv_offset,
                               buf + copy_start - decompressed, copy_len);
                flush_dcache_page(bvec.bv_page);
                cur_offset += copy_len;

                        buf_offset += bytes;
                        working_bytes -= bytes;
                        current_buf_start += bytes;

                        /* check if we need to pick another page */
                        bio_advance(bio, bytes);
                        if (!bio->bi_iter.bi_size)
                bio_advance(orig_bio, copy_len);
                /* Finished the bio */
                if (!orig_bio->bi_iter.bi_size)
                        return 0;
                        bvec = bio_iter_iovec(bio, bio->bi_iter);
                        prev_start_byte = start_byte;
                        start_byte = page_offset(bvec.bv_page) - disk_start;

                        /*
                         * We need to make sure we're only adjusting
                         * our offset into compression working buffer when
                         * we're switching pages. Otherwise we can incorrectly
                         * keep copying when we were actually done.
                         */
                        if (start_byte != prev_start_byte) {
                                /*
                                 * make sure our new page is covered by this
                                 * working buffer
                                 */
                                if (total_out <= start_byte)
                                        return 1;

                                /*
                                 * the next page in the biovec might not be adjacent
                                 * to the last page, but it might still be found
                                 * inside this working buffer. bump our offset pointer
                                 */
                                if (total_out > start_byte &&
                                    current_buf_start < start_byte) {
                                        buf_offset = start_byte - buf_start;
                                        working_bytes = total_out - start_byte;
                                        current_buf_start = buf_start + buf_offset;
                                }
                        }
                }
        }

        return 1;
}

@@ -86,9 +86,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
                         unsigned long *total_out);
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
                     unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
                              unsigned long total_out, u64 disk_start,
                              struct bio *bio);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
                              struct compressed_bio *cb, u32 decompressed);

blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
                                           unsigned int len, u64 disk_start,

@@ -726,21 +726,21 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,

/*
 * search for key in the extent_buffer. The items start at offset p,
 * and they are item_size apart. There are 'max' items in p.
 * and they are item_size apart.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 * Slot may point to total number of items if the key is bigger than
 * all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
                                       unsigned long p, int item_size,
                                       const struct btrfs_key *key,
                                       int max, int *slot)
                                       const struct btrfs_key *key, int *slot)
{
        int low = 0;
        int high = max;
        int high = btrfs_header_nritems(eb);
        int ret;
        const int key_size = sizeof(struct btrfs_disk_key);

@@ -799,15 +799,11 @@ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
        if (btrfs_header_level(eb) == 0)
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_leaf, items),
                                          sizeof(struct btrfs_item),
                                          key, btrfs_header_nritems(eb),
                                          slot);
                                          sizeof(struct btrfs_item), key, slot);
        else
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_node, ptrs),
                                          sizeof(struct btrfs_key_ptr),
                                          key, btrfs_header_nritems(eb),
                                          slot);
                                          sizeof(struct btrfs_key_ptr), key, slot);
}

static void root_add_used(struct btrfs_root *root, u32 size)
@@ -1237,7 +1233,6 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
        u64 target;
        u64 nread = 0;
        u64 nread_max;
        struct extent_buffer *eb;
        u32 nr;
        u32 blocksize;
        u32 nscan = 0;
@@ -1266,11 +1261,15 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,

        search = btrfs_node_blockptr(node, slot);
        blocksize = fs_info->nodesize;
        if (path->reada != READA_FORWARD_ALWAYS) {
                struct extent_buffer *eb;

                eb = find_extent_buffer(fs_info, search);
                if (eb) {
                        free_extent_buffer(eb);
                        return;
                }
        }

        target = search;

@@ -2102,6 +2101,27 @@ again:
        return 0;
}

/*
 * Execute search and call btrfs_previous_item to traverse backwards if the item
 * was not found.
 *
 * Return 0 if found, 1 if not found and < 0 if error.
 */
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
                           struct btrfs_path *path)
{
        int ret;

        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret > 0)
                ret = btrfs_previous_item(root, path, key->objectid, key->type);

        if (ret == 0)
                btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

        return ret;
}
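
A hypothetical caller sketch for the new helper, under the assumption that
one wants the last item of a given type at or before a search key (the
wrapper name is made up, not a kernel function):

static int last_dev_extent_key(struct btrfs_root *root, u64 devid,
                               struct btrfs_path *path, struct btrfs_key *key)
{
        key->objectid = devid;
        key->type = BTRFS_DEV_EXTENT_KEY;
        key->offset = (u64)-1;

        /* on success (0), *key is updated to the key of the item found */
        return btrfs_search_backwards(root, key, path);
}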

/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
@@ -4358,16 +4378,6 @@ next:
        return 1;
}

/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
        return btrfs_next_old_leaf(root, path, 0);
}

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq)
{

@@ -281,7 +281,8 @@ struct btrfs_super_block {

#define BTRFS_FEATURE_COMPAT_RO_SUPP			\
	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
	 BTRFS_FEATURE_COMPAT_RO_VERITY)

#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
@@ -1012,8 +1013,6 @@ struct btrfs_fs_info {
		u64 zoned;
	};

	/* Max size to emit ZONE_APPEND write command */
	u64 max_zone_append_size;
	struct mutex zoned_meta_io_lock;
	spinlock_t treelog_bg_lock;
	u64 treelog_bg;
@@ -1484,20 +1483,20 @@ do { \
/*
 * Inode flags
 */
#define BTRFS_INODE_NODATASUM		(1 << 0)
#define BTRFS_INODE_NODATACOW		(1 << 1)
#define BTRFS_INODE_READONLY		(1 << 2)
#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
#define BTRFS_INODE_PREALLOC		(1 << 4)
#define BTRFS_INODE_SYNC		(1 << 5)
#define BTRFS_INODE_IMMUTABLE		(1 << 6)
#define BTRFS_INODE_APPEND		(1 << 7)
#define BTRFS_INODE_NODUMP		(1 << 8)
#define BTRFS_INODE_NOATIME		(1 << 9)
#define BTRFS_INODE_DIRSYNC		(1 << 10)
#define BTRFS_INODE_COMPRESS		(1 << 11)
#define BTRFS_INODE_NODATASUM		(1U << 0)
#define BTRFS_INODE_NODATACOW		(1U << 1)
#define BTRFS_INODE_READONLY		(1U << 2)
#define BTRFS_INODE_NOCOMPRESS		(1U << 3)
#define BTRFS_INODE_PREALLOC		(1U << 4)
#define BTRFS_INODE_SYNC		(1U << 5)
#define BTRFS_INODE_IMMUTABLE		(1U << 6)
#define BTRFS_INODE_APPEND		(1U << 7)
#define BTRFS_INODE_NODUMP		(1U << 8)
#define BTRFS_INODE_NOATIME		(1U << 9)
#define BTRFS_INODE_DIRSYNC		(1U << 10)
#define BTRFS_INODE_COMPRESS		(1U << 11)

#define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
#define BTRFS_INODE_ROOT_ITEM_INIT	(1U << 31)

#define BTRFS_INODE_FLAG_MASK						\
	(BTRFS_INODE_NODATASUM |					\
@@ -1514,6 +1513,10 @@ do { \
	 BTRFS_INODE_COMPRESS |						\
	 BTRFS_INODE_ROOT_ITEM_INIT)

#define BTRFS_INODE_RO_VERITY		(1U << 0)

#define BTRFS_INODE_RO_FLAG_MASK	(BTRFS_INODE_RO_VERITY)

struct btrfs_map_token {
	struct extent_buffer *eb;
	char *kaddr;
@@ -2781,10 +2784,11 @@ enum btrfs_flush_state {
	FLUSH_DELAYED_REFS	=	4,
	FLUSH_DELALLOC		=	5,
	FLUSH_DELALLOC_WAIT	=	6,
	ALLOC_CHUNK		=	7,
	ALLOC_CHUNK_FORCE	=	8,
	RUN_DELAYED_IPUTS	=	9,
	COMMIT_TRANS		=	10,
	FLUSH_DELALLOC_FULL	=	7,
	ALLOC_CHUNK		=	8,
	ALLOC_CHUNK_FORCE	=	9,
	RUN_DELAYED_IPUTS	=	10,
	COMMIT_TRANS		=	11,
};

int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@ -2901,10 +2905,13 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
        return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
}

int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq);

int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
                           struct btrfs_path *path);

static inline int btrfs_next_old_item(struct btrfs_root *root,
                                      struct btrfs_path *p, u64 time_seq)
{
@@ -2913,6 +2920,18 @@ static inline int btrfs_next_old_item(struct btrfs_root *root,
                return btrfs_next_old_leaf(root, p, time_seq);
        return 0;
}

/*
 * Search the tree again to find a leaf with greater keys.
 *
 * Returns 0 if it found something or 1 if there are no greater leaves.
 * Returns < 0 on error.
 */
static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
        return btrfs_next_old_leaf(root, path, 0);
}

static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
        return btrfs_next_old_item(root, p, 0);
@@ -3145,7 +3164,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
struct btrfs_root *parent_root);
struct btrfs_root *parent_root,
struct user_namespace *mnt_userns);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3194,10 +3214,10 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, int uptodate);
u64 end, bool uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
extern const struct iomap_dio_ops btrfs_dio_ops;
@@ -3779,6 +3799,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}

/* verity.c */
#ifdef CONFIG_FS_VERITY

extern const struct fsverity_operations btrfs_verityops;
int btrfs_drop_verity_items(struct btrfs_inode *inode);

BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
encryption, 8);
BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
size, 64);
BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
struct btrfs_verity_descriptor_item, encryption, 8);
BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
struct btrfs_verity_descriptor_item, size, 64);

#else

static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
{
return 0;
}

#endif

/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);

@@ -6,7 +6,6 @@

#include <linux/slab.h>
#include <linux/iversion.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
@@ -672,176 +671,119 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
}

/*
* This helper will insert some continuous items into the same leaf according
* to the free space of the leaf.
*/
static int btrfs_batch_insert_items(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
struct btrfs_delayed_item *curr, *next;
int free_space;
int total_size = 0;
struct extent_buffer *leaf;
char *data_ptr;
struct btrfs_key *keys;
u32 *data_size;
struct list_head head;
int slot;
int nitems;
int i;
int ret = 0;

BUG_ON(!path->nodes[0]);

leaf = path->nodes[0];
free_space = btrfs_leaf_free_space(leaf);
INIT_LIST_HEAD(&head);

next = item;
nitems = 0;

/*
* count the number of the continuous items that we can insert in batch
*/
while (total_size + next->data_len + sizeof(struct btrfs_item) <=
free_space) {
total_size += next->data_len + sizeof(struct btrfs_item);
list_add_tail(&next->tree_list, &head);
nitems++;

curr = next;
next = __btrfs_next_delayed_item(curr);
if (!next)
break;

if (!btrfs_is_continuous_delayed_item(curr, next))
break;
}

if (!nitems) {
ret = 0;
goto out;
}

keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
if (!keys) {
ret = -ENOMEM;
goto out;
}

data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
if (!data_size) {
ret = -ENOMEM;
goto error;
}

/* get keys of all the delayed items */
i = 0;
list_for_each_entry(next, &head, tree_list) {
keys[i] = next->key;
data_size[i] = next->data_len;
i++;
}

/* insert the keys of the items */
setup_items_for_insert(root, path, keys, data_size, nitems);

/* insert the dir index items */
slot = path->slots[0];
list_for_each_entry_safe(curr, next, &head, tree_list) {
data_ptr = btrfs_item_ptr(leaf, slot, char);
write_extent_buffer(leaf, &curr->data,
(unsigned long)data_ptr,
curr->data_len);
slot++;

btrfs_delayed_item_release_metadata(root, curr);

list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}

error:
kfree(data_size);
kfree(keys);
out:
return ret;
}

/*
* This helper can just do simple insertion that needn't extend item for new
* data, such as directory name index insertion, inode insertion.
* Insert a single delayed item or a batch of delayed items that have consecutive
* keys if they exist.
*/
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_delayed_item *delayed_item)
struct btrfs_delayed_item *first_item)
{
struct extent_buffer *leaf;
unsigned int nofs_flag;
char *ptr;
LIST_HEAD(batch);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
int total_size;
int nitems;
char *ins_data = NULL;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
int ret;

nofs_flag = memalloc_nofs_save();
ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
delayed_item->data_len);
memalloc_nofs_restore(nofs_flag);
if (ret < 0 && ret != -EEXIST)
list_add_tail(&first_item->tree_list, &batch);
nitems = 1;
total_size = first_item->data_len + sizeof(struct btrfs_item);
curr = first_item;

while (true) {
int next_size;

next = __btrfs_next_delayed_item(curr);
if (!next || !btrfs_is_continuous_delayed_item(curr, next))
break;

next_size = next->data_len + sizeof(struct btrfs_item);
if (total_size + next_size > max_size)
break;

list_add_tail(&next->tree_list, &batch);
nitems++;
total_size += next_size;
curr = next;
}

if (nitems == 1) {
ins_keys = &first_item->key;
ins_sizes = &first_item->data_len;
} else {
int i = 0;

ins_data = kmalloc(nitems * sizeof(u32) +
nitems * sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data) {
ret = -ENOMEM;
goto out;
}
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
list_for_each_entry(curr, &batch, tree_list) {
ins_keys[i] = curr->key;
ins_sizes[i] = curr->data_len;
i++;
}
}

ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
nitems);
if (ret)
goto out;

list_for_each_entry(curr, &batch, tree_list) {
char *data_ptr;

data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
write_extent_buffer(path->nodes[0], &curr->data,
(unsigned long)data_ptr, curr->data_len);
path->slots[0]++;
}

/*
* Now release our path before releasing the delayed items and their
* metadata reservations, so that we don't block other tasks for more
* time than needed.
*/
btrfs_release_path(path);

list_for_each_entry_safe(curr, next, &batch, tree_list) {
list_del(&curr->tree_list);
btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
}
out:
kfree(ins_data);
return ret;

leaf = path->nodes[0];

ptr = btrfs_item_ptr(leaf, path->slots[0], char);

write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
delayed_item->data_len);
btrfs_mark_buffer_dirty(leaf);

btrfs_delayed_item_release_metadata(root, delayed_item);
return 0;
}

/*
* we insert an item first, then if there are some continuous items, we try
* to insert those items into the same leaf.
*/
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_item *curr, *prev;
int ret = 0;

do_again:
while (ret == 0) {
struct btrfs_delayed_item *curr;

mutex_lock(&node->mutex);
curr = __btrfs_first_delayed_insertion_item(node);
if (!curr)
goto insert_end;

if (!curr) {
mutex_unlock(&node->mutex);
break;
}
ret = btrfs_insert_delayed_item(trans, root, path, curr);
if (ret < 0) {
btrfs_release_path(path);
goto insert_end;
mutex_unlock(&node->mutex);
}

prev = curr;
curr = __btrfs_next_delayed_item(prev);
if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
/* insert the continuous items into the same leaf */
path->slots[0]++;
btrfs_batch_insert_items(root, path, curr);
}
btrfs_release_delayed_item(prev);
btrfs_mark_buffer_dirty(path->nodes[0]);

btrfs_release_path(path);
mutex_unlock(&node->mutex);
goto do_again;

insert_end:
mutex_unlock(&node->mutex);
return ret;
}

@@ -914,7 +856,6 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_item *curr, *prev;
unsigned int nofs_flag;
int ret = 0;

do_again:
@@ -923,9 +864,7 @@ do_again:
if (!curr)
goto delete_fail;

nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto delete_fail;
else if (ret > 0) {
@@ -994,7 +933,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
unsigned int nofs_flag;
int mod;
int ret;

@@ -1007,9 +945,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
else
mod = 1;

nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
memalloc_nofs_restore(nofs_flag);
if (ret > 0)
ret = -ENOENT;
if (ret < 0)
@@ -1066,9 +1002,7 @@ search:
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;

nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto err_out;
ASSERT(ret);
@@ -1711,6 +1645,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
struct inode *inode)
{
u64 flags;

btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
@@ -1723,7 +1659,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
inode_peek_iversion(inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
btrfs_set_stack_inode_flags(inode_item, flags);
btrfs_set_stack_inode_block_group(inode_item, 0);

btrfs_set_stack_timespec_sec(&inode_item->atime,
@@ -1781,7 +1719,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
btrfs_stack_inode_sequence(inode_item));
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);

@@ -170,6 +170,25 @@ out_free:
return 0;
}

static struct btrfs_dir_item *btrfs_lookup_match_dir(
struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, const char *name,
int name_len, int mod)
{
const int ins_len = (mod < 0 ? -1 : 0);
const int cow = (mod != 0);
int ret;

ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
return ERR_PTR(-ENOENT);

return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
}

/*
* lookup a directory item based on name. 'dir' is the objectid
* we're searching in, and 'mod' tells us if you plan on deleting the
@@ -181,23 +200,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
const char *name, int name_len,
int mod)
{
int ret;
struct btrfs_key key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
struct btrfs_dir_item *di;

key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;

key.offset = btrfs_name_hash(name, name_len);

ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;

return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
return di;
}

int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -211,7 +225,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int slot;
struct btrfs_path *path;


path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -220,20 +233,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

/* return back any errors */
if (ret < 0)
goto out;

/* nothing found, we're safe */
if (ret > 0) {
di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
if (ret == -ENOENT) {
ret = 0;
goto out;
}

if (ret < 0)
goto out;
}

/* we found an item, look for our name in the item */
di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
if (di) {
/* our exact name was found */
ret = -EEXIST;
@@ -274,21 +287,13 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
u64 objectid, const char *name, int name_len,
int mod)
{
int ret;
struct btrfs_key key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;

key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = objectid;

ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
return ERR_PTR(-ENOENT);
return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
}

struct btrfs_dir_item *
@@ -345,21 +350,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod)
{
int ret;
struct btrfs_key key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
struct btrfs_dir_item *di;

key.objectid = dir;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)

di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;

return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
return di;
}

/*

@@ -3392,11 +3392,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}

/* For 4K sector size support, it's only read-only */
if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
if (sectorsize != PAGE_SIZE) {
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
}
if (sectorsize != PAGE_SIZE) {
if (btrfs_super_incompat_flags(fs_info->super_copy) &
BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
"subpage sectorsize %u only supported read-only for page size %lu",
"RAID56 is not yet supported for sector size %u with page size %lu",
sectorsize, PAGE_SIZE);
err = -EINVAL;
goto fail_alloc;

@@ -153,7 +153,7 @@ search_again:
else
key.type = BTRFS_EXTENT_ITEM_KEY;

ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out_free;

@@ -5950,9 +5950,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_block_group *cache = NULL;
struct btrfs_device *device;
struct list_head *devices;
u64 group_trimmed;
u64 range_end = U64_MAX;
u64 start;
@@ -6016,9 +6016,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
btrfs_warn(fs_info,
"failed to trim %llu block group(s), last error %d",
bg_failed, bg_ret);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {

mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
continue;

@@ -6031,7 +6031,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)

trimmed += group_trimmed;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&fs_devices->device_list_mutex);

if (dev_failed)
btrfs_warn(fs_info,

@@ -13,6 +13,7 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
@@ -172,6 +173,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num,

bio->bi_private = NULL;

/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
if (is_data_inode(tree->private_data))
ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
bio_flags);
@@ -2245,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
return bitset;
}

/*
* helper function to set a given page up to date if all the
* extents in the tree for that page are up to date
*/
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
}

int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec)
@@ -2688,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
start + len <= page_offset(page) + PAGE_SIZE);

if (uptodate) {
if (fsverity_active(page->mapping->host) &&
!PageError(page) &&
!PageUptodate(page) &&
start < i_size_read(page->mapping->host) &&
!fsverity_verify_page(page)) {
btrfs_page_set_error(fs_info, page, start, len);
} else {
btrfs_page_set_uptodate(fs_info, page, start, len);
}
} else {
btrfs_page_clear_uptodate(fs_info, page, start, len);
btrfs_page_set_error(fs_info, page, start, len);
@@ -2779,7 +2778,7 @@ next:
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
struct btrfs_inode *inode;
int uptodate = (err == 0);
const bool uptodate = (err == 0);
int ret = 0;

ASSERT(page && page->mapping);
@@ -2787,8 +2786,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);

if (!uptodate) {
ClearPageUptodate(page);
SetPageError(page);
const struct btrfs_fs_info *fs_info = inode->root->fs_info;
u32 len;

ASSERT(end + 1 - start <= U32_MAX);
len = end + 1 - start;

btrfs_page_clear_uptodate(fs_info, page, start, len);
btrfs_page_set_error(fs_info, page, start, len);
ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
@@ -3097,7 +3102,7 @@ readpage_ok:
/* Update page status and unlock */
end_page_read(page, uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
start, end, uptodate);
start, end, PageUptodate(page));
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3153,11 +3158,13 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
return bio;
}

struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
struct btrfs_io_bio *btrfs_bio;

ASSERT(offset <= UINT_MAX && size <= UINT_MAX);

/* this will never fail when it's backed by a bioset */
bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
@@ -3181,13 +3188,14 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
* @size: portion of page that we want to write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
* @return: true if page was added, false otherwise
*
* Attempt to add a page to bio considering stripe alignment etc.
*
* Return true if successfully page added. Otherwise, return false.
* Return >= 0 for the number of bytes added to the bio.
* Can return 0 if the current bio is already at stripe/zone boundary.
* Return <0 for error.
*/
static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
struct page *page,
u64 disk_bytenr, unsigned int size,
unsigned int pg_offset,
@@ -3195,6 +3203,7 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
{
struct bio *bio = bio_ctrl->bio;
u32 bio_size = bio->bi_iter.bi_size;
u32 real_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
bool contig;
int ret;
@@ -3203,29 +3212,36 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
/* The limit should be calculated when bio_ctrl->bio is allocated */
ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
if (bio_ctrl->bio_flags != bio_flags)
return false;
return 0;

if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
if (!contig)
return false;
return 0;

if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
bio_size + size > bio_ctrl->len_to_stripe_boundary)
return false;
real_size = min(bio_ctrl->len_to_oe_boundary,
bio_ctrl->len_to_stripe_boundary) - bio_size;
real_size = min(real_size, size);

/*
* If real_size is 0, never call bio_add_*_page(), as even size is 0,
* bio will still execute its endio function on the page!
*/
if (real_size == 0)
return 0;

if (bio_op(bio) == REQ_OP_ZONE_APPEND)
ret = bio_add_zone_append_page(bio, page, size, pg_offset);
ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
else
ret = bio_add_page(bio, page, size, pg_offset);
ret = bio_add_page(bio, page, real_size, pg_offset);

return ret == size;
return ret;
}

static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
struct btrfs_inode *inode)
struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_io_geometry geom;
@@ -3266,9 +3282,8 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
return 0;
}

ASSERT(fs_info->max_zone_append_size > 0);
/* Ordered extent not yet created, so we're good */
ordered = btrfs_lookup_ordered_extent(inode, logical);
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
if (!ordered) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
return 0;
@@ -3280,6 +3295,62 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
return 0;
}

static int alloc_new_bio(struct btrfs_inode *inode,
struct btrfs_bio_ctrl *bio_ctrl,
struct writeback_control *wbc,
unsigned int opf,
bio_end_io_t end_io_func,
u64 disk_bytenr, u32 offset, u64 file_offset,
unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
int ret;

/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED)
bio = btrfs_bio_alloc(disk_bytenr);
else
bio = btrfs_bio_alloc(disk_bytenr + offset);
bio_ctrl->bio = bio;
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
bio->bi_private = &inode->io_tree;
bio->bi_write_hint = inode->vfs_inode.i_write_hint;
bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
if (wbc) {
struct block_device *bdev;

bdev = fs_info->fs_devices->latest_bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
}
if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct btrfs_device *device;

device = btrfs_zoned_get_device(fs_info, disk_bytenr,
fs_info->sectorsize);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}

btrfs_io_bio(bio)->device = device;
}
return 0;
error:
bio_ctrl->bio = NULL;
bio->bi_status = errno_to_blk_status(ret);
bio_endio(bio);
return ret;
}

/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
@@ -3305,61 +3376,67 @@ static int submit_extent_page(unsigned int opf,
bool force_bio_submit)
{
int ret = 0;
struct bio *bio;
size_t io_size = min_t(size_t, size, PAGE_SIZE);
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned int cur = pg_offset;

ASSERT(bio_ctrl);

ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
if (bio_ctrl->bio) {
bio = bio_ctrl->bio;
if (force_bio_submit ||
!btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
pg_offset, bio_flags)) {
ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
if (force_bio_submit && bio_ctrl->bio) {
ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
bio_ctrl->bio = NULL;
if (ret < 0)
return ret;
} else {
if (wbc)
wbc_account_cgroup_owner(wbc, page, io_size);
return 0;
}
}

bio = btrfs_bio_alloc(disk_bytenr);
bio_add_page(bio, page, io_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
bio->bi_opf = opf;
if (wbc) {
struct block_device *bdev;

bdev = fs_info->fs_devices->latest_bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
wbc_account_cgroup_owner(wbc, page, io_size);
}
if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct btrfs_device *device;

device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
if (IS_ERR(device))
return PTR_ERR(device);

btrfs_io_bio(bio)->device = device;
}

bio_ctrl->bio = bio;
bio_ctrl->bio_flags = bio_flags;
ret = calc_bio_boundaries(bio_ctrl, inode);
while (cur < pg_offset + size) {
u32 offset = cur - pg_offset;
int added;

/* Allocate new bio if needed */
if (!bio_ctrl->bio) {
ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
end_io_func, disk_bytenr, offset,
page_offset(page) + cur,
bio_flags);
if (ret < 0)
return ret;
}
/*
* We must go through btrfs_bio_add_page() to ensure each
* page range won't cross various boundaries.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED)
added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
size - offset, pg_offset + offset,
bio_flags);
else
added = btrfs_bio_add_page(bio_ctrl, page,
disk_bytenr + offset, size - offset,
pg_offset + offset, bio_flags);

/* Metadata page range should never be split */
if (!is_data_inode(&inode->vfs_inode))
ASSERT(added == 0 || added == size - offset);

/* At least we added some page, update the account */
if (wbc && added)
wbc_account_cgroup_owner(wbc, page, added);

/* We have reached boundary, submit right now */
if (added < size - offset) {
/* The bio should contain some page(s) */
ASSERT(bio_ctrl->bio->bi_iter.bi_size);
ret = submit_one_bio(bio_ctrl->bio, mirror_num,
bio_ctrl->bio_flags);
bio_ctrl->bio = NULL;
if (ret < 0)
return ret;
}
cur += added;
}
return 0;
}

static int attach_extent_buffer_page(struct extent_buffer *eb,
@@ -3488,7 +3565,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;

ret = set_page_extent_mapped(page);
@@ -3519,6 +3595,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
begin_page_read(fs_info, page);
while (cur <= end) {
unsigned long this_bio_flag = 0;
bool force_bio_submit = false;
u64 disk_bytenr;

@@ -3627,7 +3704,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* the get_extent function already copied into the page */
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
@@ -3722,14 +3798,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
delalloc_end, &page_started, nr_written, wbc);
if (ret) {
SetPageError(page);
/*
* btrfs_run_delalloc_range should return < 0 for error
* but just in case, we use > 0 here meaning the IO is
* started, so we don't want to return > 0 unless
* things are going well.
*/
return ret < 0 ? ret : -EIO;
btrfs_page_set_error(inode->root->fs_info, page,
page_offset(page), PAGE_SIZE);
return ret;
}
/*
* delalloc_end is already one less than the total length, so
@@ -3829,9 +3900,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 cur = page_offset(page);
u64 end = cur + PAGE_SIZE - 1;
u64 extent_offset;
u64 block_start;
struct extent_map *em;
@@ -3841,7 +3911,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;

ret = btrfs_writepage_cow_fixup(page, start, end);
ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
@@ -3865,7 +3935,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,

if (cur >= i_size) {
btrfs_writepage_endio_finish_ordered(inode, page, cur,
end, 1);
end, true);
/*
* This range is beyond i_size, thus we don't need to
* bother writing back.
* But we still need to clear the dirty subpage bit, or
* the next time the page gets dirtied, we will try to
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
break;
}

@@ -3915,7 +3994,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
else
btrfs_writepage_endio_finish_ordered(inode,
page, cur, cur + iosize - 1, 1);
page, cur, cur + iosize - 1, true);
btrfs_page_clear_dirty(fs_info, page, cur, iosize);
cur += iosize;
continue;
}
@@ -3951,6 +4031,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
cur += iosize;
nr++;
}
/*
* If we finish without problem, we should not only clear page dirty,
* but also empty subpage dirty bits
*/
if (!ret)
btrfs_page_assert_not_dirty(fs_info, page);
*nr_ret = nr;
return ret;
}
@@ -3981,7 +4067,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,

WARN_ON(!PageLocked(page));

ClearPageError(page);
btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
page_offset(page), PAGE_SIZE);

pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
@@ -4022,10 +4109,39 @@ done:
set_page_writeback(page);
end_page_writeback(page);
}
if (PageError(page)) {
ret = ret < 0 ? ret : -EIO;
/*
* Here we used to have a check for PageError() and then set @ret and
* call end_extent_writepage().
*
* But in fact setting @ret here will cause different error paths
* between subpage and regular sectorsize.
*
* For regular page size, we never submit current page, but only add
* current page to current bio.
* The bio submission can only happen in next page.
* Thus if we hit the PageError() branch, @ret is already set to
* non-zero value and will not get updated for regular sectorsize.
*
* But for subpage case, it's possible we submit part of current page,
* thus can get PageError() set by submitted bio of the same page,
* while our @ret is still 0.
*
* So here we unify the behavior and don't set @ret.
* Error can still be properly passed to higher layer as page will
* be set error, here we just don't handle the IO failure.
*
* NOTE: This is just a hotfix for subpage.
* The root fix will be properly ending ordered extent when we hit
* an error during writeback.
*
* But that needs a bigger refactoring, as we not only need to grab the
* submitted OE, but also need to know exactly at which bytenr we hit
* the error.
* Currently the full page based __extent_writepage_io() is not
* capable of that.
*/
if (PageError(page))
end_extent_writepage(page, ret, start, page_end);
}
unlock_page(page);
ASSERT(ret <= 0);
return ret;
@@ -4984,7 +5100,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
page, start, start + PAGE_SIZE - 1, 1);
page, start, start + PAGE_SIZE - 1, true);
unlock_page(page);
}
put_page(page);

@@ -280,7 +280,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);

int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,

@@ -233,7 +233,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 objectid,
u64 offset, int mod)
{
int ret;
struct btrfs_key file_key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
@@ -241,8 +240,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
file_key.objectid = objectid;
file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
return ret;

return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}

/*

@@ -16,6 +16,7 @@
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -1340,7 +1341,18 @@ static int prepare_uptodate_page(struct inode *inode,
unlock_page(page);
return -EIO;
}
if (page->mapping != inode->i_mapping) {

/*
* Since btrfs_readpage() will unlock the page before it
* returns, there is a window where btrfs_releasepage() can be
* called to release the page. Here we check both inode
* mapping and PagePrivate() to make sure the page was not
* released.
*
* The private flag check is essential for subpage as we need
* to store extra bitmap using page->private.
*/
if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
unlock_page(page);
return -EAGAIN;
}
@@ -3604,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)

static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;

filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;

ret = fsverity_file_open(inode, filp);
if (ret)
return ret;
return generic_file_open(inode, filp);
}

@@ -3633,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;

if (fsverity_active(inode))
return 0;

if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
return 0;

@@ -344,19 +344,13 @@ fail:

static void readahead_cache(struct inode *inode)
{
struct file_ra_state *ra;
struct file_ra_state ra;
unsigned long last_index;

ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
return;

file_ra_state_init(ra, inode->i_mapping);
file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);

kfree(ra);
page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}

static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
@@ -2544,6 +2538,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);

spin_lock(&ctl->tree_lock);
if (!used)
@@ -2573,9 +2568,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (block_group->zone_unusable >=
div_factor_fine(block_group->length,
fs_info->bg_reclaim_threshold)) {
} else if (bg_reclaim_threshold &&
block_group->zone_unusable >=
div_factor_fine(block_group->length, bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}

@@ -2652,8 +2647,11 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group,
* btrfs_pin_extent_for_log_replay() when replaying the log.
* Advance the pointer not to overwrite the tree-log nodes.
*/
if (block_group->alloc_offset < offset + bytes)
block_group->alloc_offset = offset + bytes;
if (block_group->start + block_group->alloc_offset <
offset + bytes) {
block_group->alloc_offset =
offset + bytes - block_group->start;
}
return 0;
}

291
fs/btrfs/inode.c
@@ -32,6 +32,7 @@
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -286,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);

kaddr = kmap_atomic(cpage);
kaddr = page_address(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
kunmap_atomic(kaddr);

i++;
ptr += cur_size;
@@ -490,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow,
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
/* Subpage doesn't support compression yet */
if (inode->root->fs_info->sectorsize < PAGE_SIZE)
return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
@@ -682,7 +685,11 @@ again:
}
}
cont:
if (start == 0) {
/*
* Check cow_file_range() for why we don't even try to create inline
* extent for subpage case.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
@@ -973,7 +980,7 @@ retry:

p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(inode, p, start,
end, 0);
end, false);

p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1080,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,

inode_should_defrag(inode, start, end, num_bytes, SZ_64K);

if (start == 0) {
/*
* Due to the page size limit, for subpage we can only trigger the
* writeback for the dirty sectors of page, that means data writeback
* is doing more writeback than what we want.
*
* This is especially unexpected for some call sites like fallocate,
* where we only increase i_size after everything is done.
* This means we can trigger inline extent even if we didn't want to.
* So here we skip inline extent creation completely.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
@@ -1290,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;

/* atomic_sub_return implies a barrier */
if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);

/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1303,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);

/* atomic_sub_return implies a barrier */
if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);
}

static noinline void async_cow_free(struct btrfs_work *work)
@@ -1946,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
@@ -2285,7 +2303,6 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
struct extent_map *split_mid = NULL;
struct extent_map *split_post = NULL;
int ret = 0;
int modified;
unsigned long flags;

/* Sanity check */
@@ -2315,11 +2332,12 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
ASSERT(em->len == len);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
ASSERT(!list_empty(&em->list));

flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
modified = !list_empty(&em->list);

/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
@@ -2333,7 +2351,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;

replace_extent_mapping(em_tree, em, split_pre, modified);
replace_extent_mapping(em_tree, em, split_pre, 1);

/*
* Now we only have an extent_map at:
@@ -2353,7 +2371,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
split_mid->flags = flags;
split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
add_extent_mapping(em_tree, split_mid, modified);
add_extent_mapping(em_tree, split_mid, 1);
}

if (post) {
@@ -2367,7 +2385,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
split_post->flags = flags;
split_post->compress_type = em->compress_type;
split_post->generation = em->generation;
add_extent_mapping(em_tree, split_post, modified);
add_extent_mapping(em_tree, split_post, 1);
}

/* Once for us */
@@ -2770,7 +2788,7 @@ out_page:
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3171,7 +3189,7 @@ static void finish_ordered_fn(struct btrfs_work *work)

void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, int uptodate)
u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);

@@ -3257,25 +3275,44 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
return 0;
}

/*
* For subpage case, above PageChecked is not safe as it's not subpage
* compatible.
* But for now only cow fixup and compressed read utilize PageChecked
* flag, while in this context we can easily use io_bio->csum to
* determine if we really need to do csum verification.
*
* So for now, just exit if io_bio->csum is NULL, as it means it's
* compressed read, and its compressed data csum has already been
* verified.
*/
if (io_bio->csum == NULL)
return 0;

if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;

if (!root->fs_info->csum_root)
return 0;

if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
return 0;
}

ASSERT(page_offset(page) <= start &&
end <= page_offset(page) + PAGE_SIZE - 1);
for (pg_off = offset_in_page(start);
pg_off < offset_in_page(end);
pg_off += sectorsize, bio_offset += sectorsize) {
u64 file_offset = pg_off + page_offset(page);
int ret;

if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM, 1, NULL)) {
/* Skip the range without csum for data reloc inode */
clear_extent_bits(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM);
continue;
}
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
@@ -3520,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)

/*
* If we have an inode with links, there are a couple of
* possibilities. Old kernels (before v3.12) used to create an
* possibilities:
*
* 1. We were halfway through creating fsverity metadata for the
* file. In that case, the orphan item represents incomplete
* fsverity metadata which must be cleaned up with
* btrfs_drop_verity_items and deleting the orphan item.

* 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
@@ -3538,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
if (!ret)
if (!ret) {
ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
if (ret)
goto out;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3728,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
rdev = btrfs_inode_rdev(leaf, inode_item);

BTRFS_I(inode)->index_cnt = (u64)-1;
BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

cache_index:
/*
@@ -3859,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
|
||||
struct inode *inode)
|
||||
{
|
||||
struct btrfs_map_token token;
|
||||
u64 flags;
|
||||
|
||||
btrfs_init_map_token(&token, leaf);
|
||||
|
||||
@@ -3894,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
|
||||
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
|
||||
btrfs_set_token_inode_transid(&token, item, trans->transid);
|
||||
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
|
||||
btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
|
||||
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
|
||||
BTRFS_I(inode)->ro_flags);
|
||||
btrfs_set_token_inode_flags(&token, item, flags);
|
||||
btrfs_set_token_inode_block_group(&token, item, 0);
|
||||
}
|
||||
|
||||
@@ -5088,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Still need to make sure the inode looks like it's been updated so
|
||||
* that any holes get logged if we fsync.
|
||||
* If NO_HOLES is enabled, we don't need to do anything.
|
||||
* Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
|
||||
* or btrfs_update_inode() will be called, which guarantee that the next
|
||||
* fsync will know this inode was changed and needs to be logged.
|
||||
*/
|
||||
if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
|
||||
inode->last_trans = fs_info->generation;
|
||||
inode->last_sub_trans = root->log_transid;
|
||||
inode->last_log_commit = root->last_log_commit;
|
||||
if (btrfs_fs_incompat(fs_info, NO_HOLES))
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* 1 - for the one we're dropping
|
||||
@@ -5342,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
|
||||
if (btrfs_root_readonly(root))
|
||||
return -EROFS;
|
||||
|
||||
err = setattr_prepare(&init_user_ns, dentry, attr);
|
||||
err = setattr_prepare(mnt_userns, dentry, attr);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
@@ -5353,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
|
||||
}
|
||||
|
||||
if (attr->ia_valid) {
|
||||
setattr_copy(&init_user_ns, inode, attr);
|
||||
setattr_copy(mnt_userns, inode, attr);
|
||||
inode_inc_iversion(inode);
|
||||
err = btrfs_dirty_inode(inode);
|
||||
|
||||
if (!err && attr->ia_valid & ATTR_MODE)
|
||||
err = posix_acl_chmod(&init_user_ns, inode,
|
||||
inode->i_mode);
|
||||
err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
|
||||
}
|
||||
|
||||
return err;
|
||||
@@ -5522,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode)
|
||||
trace_btrfs_inode_evict(inode);
|
||||
|
||||
if (!root) {
|
||||
fsverity_cleanup_inode(inode);
|
||||
clear_inode(inode);
|
||||
return;
|
||||
}
|
||||
@@ -5604,6 +5654,7 @@ no_delete:
|
||||
* to retry these periodically in the future.
|
||||
*/
|
||||
btrfs_remove_delayed_node(BTRFS_I(inode));
|
||||
fsverity_cleanup_inode(inode);
|
||||
clear_inode(inode);
|
||||
}
|
||||
|
||||
@@ -6370,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
|
||||
|
||||
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *root,
|
||||
struct user_namespace *mnt_userns,
|
||||
struct inode *dir,
|
||||
const char *name, int name_len,
|
||||
u64 ref_objectid, u64 objectid,
|
||||
@@ -6479,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
|
||||
if (ret != 0)
|
||||
goto fail_unlock;
|
||||
|
||||
inode_init_owner(&init_user_ns, inode, dir, mode);
|
||||
inode_init_owner(mnt_userns, inode, dir, mode);
|
||||
inode_set_bytes(inode, 0);
|
||||
|
||||
inode->i_mtime = current_time(inode);
|
||||
@@ -6664,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
|
||||
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
||||
dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
|
||||
mode, &index);
|
||||
inode = btrfs_new_inode(trans, root, mnt_userns, dir,
|
||||
dentry->d_name.name, dentry->d_name.len,
|
||||
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
|
||||
if (IS_ERR(inode)) {
|
||||
err = PTR_ERR(inode);
|
||||
inode = NULL;
|
||||
@@ -6728,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
|
||||
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
||||
dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
|
||||
mode, &index);
|
||||
inode = btrfs_new_inode(trans, root, mnt_userns, dir,
|
||||
dentry->d_name.name, dentry->d_name.len,
|
||||
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
|
||||
if (IS_ERR(inode)) {
|
||||
err = PTR_ERR(inode);
|
||||
inode = NULL;
|
||||
@@ -6873,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
|
||||
if (err)
|
||||
goto out_fail;
|
||||
|
||||
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
||||
dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
|
||||
inode = btrfs_new_inode(trans, root, mnt_userns, dir,
|
||||
dentry->d_name.name, dentry->d_name.len,
|
||||
btrfs_ino(BTRFS_I(dir)), objectid,
|
||||
S_IFDIR | mode, &index);
|
||||
if (IS_ERR(inode)) {
|
||||
err = PTR_ERR(inode);
|
||||
@@ -8206,8 +8259,8 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
 	u64 start_sector;
 	int async_submit = 0;
 	u64 submit_len;
-	int clone_offset = 0;
-	int clone_len;
+	u64 clone_offset = 0;
+	u64 clone_len;
 	u64 logical;
 	int ret;
 	blk_status_t status;
@@ -8255,9 +8308,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
 			status = errno_to_blk_status(ret);
 			goto out_err_em;
 		}
-		ASSERT(geom.len <= INT_MAX);

-		clone_len = min_t(int, submit_len, geom.len);
+		clone_len = min(submit_len, geom.len);
+		ASSERT(clone_len <= UINT_MAX);
|
||||
/*
|
||||
* This will never fail as it's passing GPF_NOFS and
|
||||
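
Illustration (not part of the patch): the switch to u64 matters because min_t(int, ...) casts both operands to int, so a large u64 submit_len silently truncates, while min() keeps the full width and the new ASSERT documents the remaining bound. A small standalone demonstration with hypothetical values (the macros below are simplified stand-ins for the kernel ones):

#include <stdio.h>
#include <stdint.h>

#define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))
#define min_u64(a, b)     ((a) < (b) ? (a) : (b))

int main(void)
{
	uint64_t submit_len = 6ULL << 30;	/* 6 GiB of IO to split up */
	uint64_t geom_len   = 5ULL << 30;	/* 5 GiB of stripe geometry */

	int bad = min_t(int, submit_len, geom_len);	/* truncates both sides */
	uint64_t good = min_u64(submit_len, geom_len);

	printf("min_t(int, ...): %d\n", bad);		/* wrapped/negative */
	printf("min(u64, ...):   %llu\n", (unsigned long long)good);
	return 0;
}
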
@@ -8401,11 +8454,47 @@ static void btrfs_readahead(struct readahead_control *rac)
|
||||
extent_readahead(rac);
|
||||
}

/*
 * For releasepage() and invalidatepage() we have a race window where
 * end_page_writeback() is called but the subpage spinlock is not yet released.
 * If we continue to release/invalidate the page, we could cause use-after-free
 * for subpage spinlock. So this function is to spin and wait for subpage
 * spinlock.
 */
static void wait_subpage_spinlock(struct page *page)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->private);
	subpage = (struct btrfs_subpage *)page->private;

	/*
	 * This may look insane as we just acquire the spinlock and release it,
	 * without doing anything. But we just want to make sure no one is
	 * still holding the subpage spinlock.
	 * And since the page is not dirty nor writeback, and we have page
	 * locked, the only possible way to hold a spinlock is from the endio
	 * function to clear page writeback.
	 *
	 * Here we just acquire the spinlock so that all existing callers
	 * should exit and we're safe to release/invalidate the page.
	 */
	spin_lock_irq(&subpage->lock);
	spin_unlock_irq(&subpage->lock);
}

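Illustration (not part of the patch): the empty lock/unlock pair is a drain barrier. Since the acquire cannot succeed until every earlier holder has released the lock, nobody can still be inside the old critical section afterwards. A toy userspace analogue with pthreads; in btrfs it is the page lock that prevents new holders from appearing:

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t lock;

static void *endio_worker(void *arg)
{
	pthread_spin_lock(&lock);
	/* ... finish writeback bookkeeping under the lock ... */
	pthread_spin_unlock(&lock);
	return arg;
}

int main(void)
{
	pthread_t t;

	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	pthread_create(&t, NULL, endio_worker, NULL);

	/*
	 * Drain: once this lock/unlock pair succeeds, nobody who entered
	 * the critical section earlier is still inside it.  Something else
	 * must already exclude *new* lockers (in btrfs, the page lock).
	 */
	pthread_spin_lock(&lock);
	pthread_spin_unlock(&lock);

	pthread_join(t, NULL);
	puts("safe to release the structure");
	pthread_spin_destroy(&lock);
	return 0;
}

The pair by itself is only a rendezvous; the pattern works because new entries are blocked by other means.
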
 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	int ret = try_release_extent_mapping(page, gfp_flags);
-	if (ret == 1)
+
+	if (ret == 1) {
+		wait_subpage_spinlock(page);
 		clear_page_extent_mapped(page);
+	}
 	return ret;
 }
|
||||
|
||||
@@ -8469,6 +8558,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
|
||||
* do double ordered extent accounting on the same page.
|
||||
*/
|
||||
wait_on_page_writeback(page);
|
||||
wait_subpage_spinlock(page);
|
||||
|
||||
/*
|
||||
* For subpage case, we have call sites like
|
||||
@@ -8557,7 +8647,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
|
||||
spin_unlock_irq(&inode->ordered_tree.lock);
|
||||
|
||||
if (btrfs_dec_test_ordered_pending(inode, &ordered,
|
||||
cur, range_end + 1 - cur, 1)) {
|
||||
cur, range_end + 1 - cur)) {
|
||||
btrfs_finish_ordered_io(ordered);
|
||||
/*
|
||||
* The ordered extent has finished, now we're again
|
||||
@@ -8938,7 +9028,8 @@ out:
|
||||
*/
|
||||
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *new_root,
|
||||
struct btrfs_root *parent_root)
|
||||
struct btrfs_root *parent_root,
|
||||
struct user_namespace *mnt_userns)
|
||||
{
|
||||
struct inode *inode;
|
||||
int err;
|
||||
@@ -8949,7 +9040,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
|
||||
inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
|
||||
ino, ino,
|
||||
S_IFDIR | (~current_umask() & S_IRWXUGO),
|
||||
&index);
|
||||
if (IS_ERR(inode))
|
||||
@@ -8993,6 +9085,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
|
||||
ei->defrag_bytes = 0;
|
||||
ei->disk_i_size = 0;
|
||||
ei->flags = 0;
|
||||
ei->ro_flags = 0;
|
||||
ei->csum_bytes = 0;
|
||||
ei->index_cnt = (u64)-1;
|
||||
ei->dir_index = 0;
|
||||
@@ -9174,6 +9267,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
|
||||
struct inode *inode = d_inode(path->dentry);
|
||||
u32 blocksize = inode->i_sb->s_blocksize;
|
||||
u32 bi_flags = BTRFS_I(inode)->flags;
|
||||
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
|
||||
|
||||
stat->result_mask |= STATX_BTIME;
|
||||
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
|
||||
@@ -9186,13 +9280,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
|
||||
stat->attributes |= STATX_ATTR_IMMUTABLE;
|
||||
if (bi_flags & BTRFS_INODE_NODUMP)
|
||||
stat->attributes |= STATX_ATTR_NODUMP;
|
||||
if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
|
||||
stat->attributes |= STATX_ATTR_VERITY;
|
||||
|
||||
stat->attributes_mask |= (STATX_ATTR_APPEND |
|
||||
STATX_ATTR_COMPRESSED |
|
||||
STATX_ATTR_IMMUTABLE |
|
||||
STATX_ATTR_NODUMP);
|
||||
|
||||
generic_fillattr(&init_user_ns, inode, stat);
|
||||
generic_fillattr(mnt_userns, inode, stat);
|
||||
stat->dev = BTRFS_I(inode)->root->anon_dev;
|
||||
|
||||
spin_lock(&BTRFS_I(inode)->lock);
|
||||
@@ -9280,8 +9376,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
|
||||
/* force full log commit if subvolume involved. */
|
||||
btrfs_set_log_full_commit(trans);
|
||||
} else {
|
||||
btrfs_pin_log_trans(root);
|
||||
root_log_pinned = true;
|
||||
ret = btrfs_insert_inode_ref(trans, dest,
|
||||
new_dentry->d_name.name,
|
||||
new_dentry->d_name.len,
|
||||
@@ -9298,8 +9392,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
|
||||
/* force full log commit if subvolume involved. */
|
||||
btrfs_set_log_full_commit(trans);
|
||||
} else {
|
||||
btrfs_pin_log_trans(dest);
|
||||
dest_log_pinned = true;
|
||||
ret = btrfs_insert_inode_ref(trans, root,
|
||||
old_dentry->d_name.name,
|
||||
old_dentry->d_name.len,
|
||||
@@ -9330,6 +9422,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
|
||||
BTRFS_I(new_inode), 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Now pin the logs of the roots. We do it to ensure that no other task
|
||||
* can sync the logs while we are in progress with the rename, because
|
||||
* that could result in an inconsistency in case any of the inodes that
|
||||
* are part of this rename operation were logged before.
|
||||
*
|
||||
* We pin the logs even if at this precise moment none of the inodes was
|
||||
* logged before. This is because right after we checked for that, some
|
||||
* other task fsyncing some other inode not involved with this rename
|
||||
* operation could log that one of our inodes exists.
|
||||
*
|
||||
* We don't need to pin the logs before the above calls to
|
||||
* btrfs_insert_inode_ref(), since those don't ever need to change a log.
|
||||
*/
|
||||
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
|
||||
btrfs_pin_log_trans(root);
|
||||
root_log_pinned = true;
|
||||
}
|
||||
if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
|
||||
btrfs_pin_log_trans(dest);
|
||||
dest_log_pinned = true;
|
||||
}
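
Illustration (not part of the patch): pinning the log is a use-count protocol. While any rename holds a pin, a log sync has to wait, which is what makes "pin before the unlink, even if nothing was logged yet" safe against the fsync race described above. A toy pthread version of that relationship (the kernel side uses an atomic writer count and wait queues instead):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  c = PTHREAD_COND_INITIALIZER;
static int pins;

static void pin_log(void)
{
	pthread_mutex_lock(&m);
	pins++;
	pthread_mutex_unlock(&m);
}

static void unpin_log(void)
{
	pthread_mutex_lock(&m);
	if (--pins == 0)
		pthread_cond_signal(&c);
	pthread_mutex_unlock(&m);
}

/* A log sync cannot complete while any rename still holds a pin. */
static void sync_log(void)
{
	pthread_mutex_lock(&m);
	while (pins > 0)
		pthread_cond_wait(&c, &m);
	printf("log synced with no rename in flight\n");
	pthread_mutex_unlock(&m);
}

int main(void)
{
	pin_log();	/* rename starts */
	unpin_log();	/* rename done */
	sync_log();
	return 0;
}
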
|
||||
|
||||
/* src is a subvolume */
|
||||
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
|
||||
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
|
||||
@@ -9411,8 +9526,7 @@ out_fail:
|
||||
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
|
||||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
|
||||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
|
||||
(new_inode &&
|
||||
btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
|
||||
btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
|
||||
btrfs_set_log_full_commit(trans);
|
||||
|
||||
if (root_log_pinned) {
|
||||
@@ -9436,6 +9550,7 @@ out_notrans:
|
||||
|
||||
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *root,
|
||||
struct user_namespace *mnt_userns,
|
||||
struct inode *dir,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
@@ -9448,7 +9563,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
inode = btrfs_new_inode(trans, root, dir,
|
||||
inode = btrfs_new_inode(trans, root, mnt_userns, dir,
|
||||
dentry->d_name.name,
|
||||
dentry->d_name.len,
|
||||
btrfs_ino(BTRFS_I(dir)),
|
||||
@@ -9485,7 +9600,8 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
static int btrfs_rename(struct user_namespace *mnt_userns,
|
||||
struct inode *old_dir, struct dentry *old_dentry,
|
||||
struct inode *new_dir, struct dentry *new_dentry,
|
||||
unsigned int flags)
|
||||
{
|
||||
@@ -9582,8 +9698,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
/* force full log commit if subvolume involved. */
|
||||
btrfs_set_log_full_commit(trans);
|
||||
} else {
|
||||
btrfs_pin_log_trans(root);
|
||||
log_pinned = true;
|
||||
ret = btrfs_insert_inode_ref(trans, dest,
|
||||
new_dentry->d_name.name,
|
||||
new_dentry->d_name.len,
|
||||
@@ -9607,6 +9721,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
||||
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
|
||||
} else {
|
||||
/*
|
||||
* Now pin the log. We do it to ensure that no other task can
|
||||
* sync the log while we are in progress with the rename, as
|
||||
* that could result in an inconsistency in case any of the
|
||||
* inodes that are part of this rename operation were logged
|
||||
* before.
|
||||
*
|
||||
* We pin the log even if at this precise moment none of the
|
||||
* inodes was logged before. This is because right after we
|
||||
* checked for that, some other task fsyncing some other inode
|
||||
* not involved with this rename operation could log that one of
|
||||
* our inodes exists.
|
||||
*
|
||||
* We don't need to pin the logs before the above call to
|
||||
* btrfs_insert_inode_ref(), since that does not need to change
|
||||
* a log.
|
||||
*/
|
||||
btrfs_pin_log_trans(root);
|
||||
log_pinned = true;
|
||||
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
|
||||
BTRFS_I(d_inode(old_dentry)),
|
||||
old_dentry->d_name.name,
|
||||
@@ -9660,8 +9793,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
}
|
||||
|
||||
if (flags & RENAME_WHITEOUT) {
|
||||
ret = btrfs_whiteout_for_rename(trans, root, old_dir,
|
||||
old_dentry);
|
||||
ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
|
||||
old_dir, old_dentry);
|
||||
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
@@ -9711,7 +9844,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
|
||||
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
|
||||
new_dentry);
|
||||
|
||||
return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
|
||||
return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
|
||||
new_dentry, flags);
|
||||
}
|
||||
|
||||
struct btrfs_delalloc_work {
|
||||
@@ -9808,11 +9942,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
|
||||
btrfs_queue_work(root->fs_info->flush_workers,
|
||||
&work->work);
|
||||
} else {
|
||||
ret = sync_inode(inode, wbc);
|
||||
if (!ret &&
|
||||
test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
||||
&BTRFS_I(inode)->runtime_flags))
|
||||
ret = sync_inode(inode, wbc);
|
||||
ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
|
||||
btrfs_add_delayed_iput(inode);
|
||||
if (ret || wbc->nr_to_write <= 0)
|
||||
goto out;
|
||||
@@ -9947,9 +10077,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
|
||||
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
||||
dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
|
||||
objectid, S_IFLNK|S_IRWXUGO, &index);
|
||||
inode = btrfs_new_inode(trans, root, mnt_userns, dir,
|
||||
dentry->d_name.name, dentry->d_name.len,
|
||||
btrfs_ino(BTRFS_I(dir)), objectid,
|
||||
S_IFLNK | S_IRWXUGO, &index);
|
||||
if (IS_ERR(inode)) {
|
||||
err = PTR_ERR(inode);
|
||||
inode = NULL;
|
||||
@@ -10273,7 +10404,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
|
||||
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
|
||||
return -EACCES;
|
||||
}
|
||||
return generic_permission(&init_user_ns, inode, mask);
|
||||
return generic_permission(mnt_userns, inode, mask);
|
||||
}
|
||||
|
||||
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
|
||||
@@ -10298,7 +10429,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
inode = btrfs_new_inode(trans, root, dir, NULL, 0,
|
||||
inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
|
||||
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
|
||||
if (IS_ERR(inode)) {
|
||||
ret = PTR_ERR(inode);
|
||||
|
||||
176
fs/btrfs/ioctl.c
@@ -27,6 +27,7 @@
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/iversion.h>
|
||||
#include <linux/fileattr.h>
|
||||
#include <linux/fsverity.h>
|
||||
#include "ctree.h"
|
||||
#include "disk-io.h"
|
||||
#include "export.h"
|
||||
@@ -103,9 +104,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
|
||||
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
|
||||
* ioctl.
|
||||
*/
|
||||
static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
|
||||
static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
|
||||
{
|
||||
unsigned int iflags = 0;
|
||||
u32 flags = binode->flags;
|
||||
u32 ro_flags = binode->ro_flags;
|
||||
|
||||
if (flags & BTRFS_INODE_SYNC)
|
||||
iflags |= FS_SYNC_FL;
|
||||
@@ -121,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
|
||||
iflags |= FS_DIRSYNC_FL;
|
||||
if (flags & BTRFS_INODE_NODATACOW)
|
||||
iflags |= FS_NOCOW_FL;
|
||||
if (ro_flags & BTRFS_INODE_RO_VERITY)
|
||||
iflags |= FS_VERITY_FL;
|
||||
|
||||
if (flags & BTRFS_INODE_NOCOMPRESS)
|
||||
iflags |= FS_NOCOMP_FL;
|
||||
@@ -148,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
|
||||
new_fl |= S_NOATIME;
|
||||
if (binode->flags & BTRFS_INODE_DIRSYNC)
|
||||
new_fl |= S_DIRSYNC;
|
||||
if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
|
||||
new_fl |= S_VERITY;
|
||||
|
||||
set_mask_bits(&inode->i_flags,
|
||||
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
|
||||
new_fl);
|
||||
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
|
||||
S_VERITY, new_fl);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -200,7 +207,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
|
||||
{
|
||||
struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
|
||||
|
||||
fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags));
|
||||
fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -224,7 +231,7 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
|
||||
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
|
||||
old_fsflags = btrfs_inode_flags_to_fsflags(binode);
|
||||
ret = check_fsflags(old_fsflags, fsflags);
|
||||
if (ret)
|
||||
return ret;
|
||||
@@ -492,8 +499,8 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static noinline int create_subvol(struct inode *dir,
|
||||
struct dentry *dentry,
|
||||
static noinline int create_subvol(struct user_namespace *mnt_userns,
|
||||
struct inode *dir, struct dentry *dentry,
|
||||
const char *name, int namelen,
|
||||
struct btrfs_qgroup_inherit *inherit)
|
||||
{
|
||||
@@ -638,7 +645,7 @@ static noinline int create_subvol(struct inode *dir,
|
||||
goto fail;
|
||||
}
|
||||
|
||||
ret = btrfs_create_subvol_root(trans, new_root, root);
|
||||
ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
|
||||
btrfs_put_root(new_root);
|
||||
if (ret) {
|
||||
/* We potentially lose an unused inode item here */
|
||||
@@ -830,7 +837,8 @@ free_pending:
|
||||
* nfs_async_unlink().
|
||||
*/
|
||||
|
||||
static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
|
||||
static int btrfs_may_delete(struct user_namespace *mnt_userns,
|
||||
struct inode *dir, struct dentry *victim, int isdir)
|
||||
{
|
||||
int error;
|
||||
|
||||
@@ -840,12 +848,12 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
|
||||
BUG_ON(d_inode(victim->d_parent) != dir);
|
||||
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
|
||||
|
||||
error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
|
||||
error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
|
||||
if (error)
|
||||
return error;
|
||||
if (IS_APPEND(dir))
|
||||
return -EPERM;
|
||||
if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
|
||||
if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
|
||||
IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
|
||||
IS_SWAPFILE(d_inode(victim)))
|
||||
return -EPERM;
|
||||
@@ -864,13 +872,16 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
|
||||
}
|
||||
|
||||
/* copy of may_create in fs/namei.c() */
|
||||
static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
|
||||
static inline int btrfs_may_create(struct user_namespace *mnt_userns,
|
||||
struct inode *dir, struct dentry *child)
|
||||
{
|
||||
if (d_really_is_positive(child))
|
||||
return -EEXIST;
|
||||
if (IS_DEADDIR(dir))
|
||||
return -ENOENT;
|
||||
return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
|
||||
if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
|
||||
return -EOVERFLOW;
|
||||
return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -879,6 +890,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
|
||||
* inside this filesystem so it's quite a bit simpler.
|
||||
*/
|
||||
static noinline int btrfs_mksubvol(const struct path *parent,
|
||||
struct user_namespace *mnt_userns,
|
||||
const char *name, int namelen,
|
||||
struct btrfs_root *snap_src,
|
||||
bool readonly,
|
||||
@@ -893,12 +905,12 @@ static noinline int btrfs_mksubvol(const struct path *parent,
|
||||
if (error == -EINTR)
|
||||
return error;
|
||||
|
||||
dentry = lookup_one_len(name, parent->dentry, namelen);
|
||||
dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
|
||||
error = PTR_ERR(dentry);
|
||||
if (IS_ERR(dentry))
|
||||
goto out_unlock;
|
||||
|
||||
error = btrfs_may_create(dir, dentry);
|
||||
error = btrfs_may_create(mnt_userns, dir, dentry);
|
||||
if (error)
|
||||
goto out_dput;
|
||||
|
||||
@@ -920,7 +932,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
|
||||
if (snap_src)
|
||||
error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
|
||||
else
|
||||
error = create_subvol(dir, dentry, name, namelen, inherit);
|
||||
error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
|
||||
|
||||
if (!error)
|
||||
fsnotify_mkdir(dir, dentry);
|
||||
@@ -934,6 +946,7 @@ out_unlock:
|
||||
}
|
||||
|
||||
static noinline int btrfs_mksnapshot(const struct path *parent,
|
||||
struct user_namespace *mnt_userns,
|
||||
const char *name, int namelen,
|
||||
struct btrfs_root *root,
|
||||
bool readonly,
|
||||
@@ -963,7 +976,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
|
||||
|
||||
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
|
||||
|
||||
ret = btrfs_mksubvol(parent, name, namelen,
|
||||
ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
|
||||
root, readonly, inherit);
|
||||
out:
|
||||
if (snapshot_force_cow)
|
||||
@@ -1792,6 +1805,7 @@ out_drop:
|
||||
}
|
||||
|
||||
static noinline int __btrfs_ioctl_snap_create(struct file *file,
|
||||
struct user_namespace *mnt_userns,
|
||||
const char *name, unsigned long fd, int subvol,
|
||||
bool readonly,
|
||||
struct btrfs_qgroup_inherit *inherit)
|
||||
@@ -1819,8 +1833,8 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
|
||||
}
|
||||
|
||||
if (subvol) {
|
||||
ret = btrfs_mksubvol(&file->f_path, name, namelen,
|
||||
NULL, readonly, inherit);
|
||||
ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
|
||||
namelen, NULL, readonly, inherit);
|
||||
} else {
|
||||
struct fd src = fdget(fd);
|
||||
struct inode *src_inode;
|
||||
@@ -1834,14 +1848,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
|
||||
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
|
||||
"Snapshot src from another FS");
|
||||
ret = -EXDEV;
|
||||
} else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
|
||||
} else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
|
||||
/*
|
||||
* Subvolume creation is not restricted, but snapshots
|
||||
* are limited to own subvolumes only
|
||||
*/
|
||||
ret = -EPERM;
|
||||
} else {
|
||||
ret = btrfs_mksnapshot(&file->f_path, name, namelen,
|
||||
ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
|
||||
name, namelen,
|
||||
BTRFS_I(src_inode)->root,
|
||||
readonly, inherit);
|
||||
}
|
||||
@@ -1867,8 +1882,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
|
||||
return PTR_ERR(vol_args);
|
||||
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
|
||||
|
||||
ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
|
||||
subvol, false, NULL);
|
||||
ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
|
||||
vol_args->name, vol_args->fd, subvol,
|
||||
false, NULL);
|
||||
|
||||
kfree(vol_args);
|
||||
return ret;
|
||||
@@ -1926,8 +1942,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
|
||||
}
|
||||
}
|
||||
|
||||
ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
|
||||
subvol, readonly, inherit);
|
||||
ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
|
||||
vol_args->name, vol_args->fd, subvol,
|
||||
readonly, inherit);
|
||||
if (ret)
|
||||
goto free_inherit;
|
||||
free_inherit:
|
||||
@@ -1971,7 +1988,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
|
||||
u64 flags;
|
||||
int ret = 0;
|
||||
|
||||
if (!inode_owner_or_capable(&init_user_ns, inode))
|
||||
if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
|
||||
return -EPERM;
|
||||
|
||||
ret = mnt_want_write_file(file);
|
||||
@@ -2382,23 +2399,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	key.offset = (u64)-1;

 	while (1) {
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0)
-			goto out;
-		else if (ret > 0) {
-			ret = btrfs_previous_item(root, path, dirid,
-						  BTRFS_INODE_REF_KEY);
+		ret = btrfs_search_backwards(root, &key, path);
 		if (ret < 0)
 			goto out;
 		else if (ret > 0) {
 			ret = -ENOENT;
 			goto out;
 		}
-		}

 		l = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(l, &key, slot);

 		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
 		len = btrfs_inode_ref_name_len(l, iref);
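
Illustration (not part of the patch): btrfs_search_backwards() packages the recurring "search, and if we overshot, step back to the previous item" sequence; its effective contract is "position on the last item <= key". A rough sketch of that contract over a sorted array, whereas the real helper walks a btree path and returns 0 on success, >0 when nothing fits, or <0 on error:

#include <stdio.h>

/*
 * Return the index of the last element <= key, or -1 if none --
 * the array stands in for one sequence of btree keys.
 */
static int search_backwards(const int *items, int n, int key)
{
	int i;

	for (i = 0; i < n; i++)
		if (items[i] > key)
			break;
	return i - 1;	/* step back from the first greater item */
}

int main(void)
{
	int items[] = { 10, 20, 30, 40 };

	printf("%d\n", search_backwards(items, 4, 35));	/* 2 -> item 30 */
	printf("%d\n", search_backwards(items, 4, 5));	/* -1 -> ENOENT */
	return 0;
}
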
@@ -2429,7 +2439,8 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int btrfs_search_path_in_tree_user(struct inode *inode,
|
||||
static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
|
||||
struct inode *inode,
|
||||
struct btrfs_ioctl_ino_lookup_user_args *args)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
||||
@@ -2473,23 +2484,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
|
||||
key.type = BTRFS_INODE_REF_KEY;
|
||||
key.offset = (u64)-1;
|
||||
while (1) {
|
||||
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
||||
if (ret < 0) {
|
||||
ret = btrfs_search_backwards(root, &key, path);
|
||||
if (ret < 0)
|
||||
goto out_put;
|
||||
} else if (ret > 0) {
|
||||
ret = btrfs_previous_item(root, path, dirid,
|
||||
BTRFS_INODE_REF_KEY);
|
||||
if (ret < 0) {
|
||||
goto out_put;
|
||||
} else if (ret > 0) {
|
||||
else if (ret > 0) {
|
||||
ret = -ENOENT;
|
||||
goto out_put;
|
||||
}
|
||||
}
|
||||
|
||||
leaf = path->nodes[0];
|
||||
slot = path->slots[0];
|
||||
btrfs_item_key_to_cpu(leaf, &key, slot);
|
||||
|
||||
iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
|
||||
len = btrfs_inode_ref_name_len(leaf, iref);
|
||||
@@ -2527,7 +2531,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
|
||||
ret = PTR_ERR(temp_inode);
|
||||
goto out_put;
|
||||
}
|
||||
ret = inode_permission(&init_user_ns, temp_inode,
|
||||
ret = inode_permission(mnt_userns, temp_inode,
|
||||
MAY_READ | MAY_EXEC);
|
||||
iput(temp_inode);
|
||||
if (ret) {
|
||||
@@ -2669,7 +2673,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
ret = btrfs_search_path_in_tree_user(inode, args);
|
||||
ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
|
||||
|
||||
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
|
||||
ret = -EFAULT;
|
||||
@@ -2905,6 +2909,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
|
||||
struct btrfs_root *dest = NULL;
|
||||
struct btrfs_ioctl_vol_args *vol_args = NULL;
|
||||
struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
|
||||
struct user_namespace *mnt_userns = file_mnt_user_ns(file);
|
||||
char *subvol_name, *subvol_name_ptr = NULL;
|
||||
int subvol_namelen;
|
||||
int err = 0;
|
||||
@@ -2932,6 +2937,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
|
||||
if (err)
|
||||
goto out;
|
||||
} else {
|
||||
struct inode *old_dir;
|
||||
|
||||
if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
|
||||
err = -EINVAL;
|
||||
goto out;
|
||||
@@ -2968,6 +2975,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
|
||||
err = PTR_ERR(parent);
|
||||
goto out_drop_write;
|
||||
}
|
||||
old_dir = dir;
|
||||
dir = d_inode(parent);
|
||||
|
||||
/*
|
||||
@@ -2978,6 +2986,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
|
||||
*/
|
||||
destroy_parent = true;
|
||||
|
||||
/*
|
||||
* On idmapped mounts, deletion via subvolid is
|
||||
* restricted to subvolumes that are immediate
|
||||
* ancestors of the inode referenced by the file
|
||||
* descriptor in the ioctl. Otherwise the idmapping
|
||||
* could potentially be abused to delete subvolumes
|
||||
* anywhere in the filesystem the user wouldn't be able
|
||||
* to delete without an idmapped mount.
|
||||
*/
|
||||
if (old_dir != dir && mnt_userns != &init_user_ns) {
|
||||
err = -EOPNOTSUPP;
|
||||
goto free_parent;
|
||||
}
|
||||
|
||||
subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
|
||||
fs_info, vol_args2->subvolid);
|
||||
if (IS_ERR(subvol_name_ptr)) {
|
||||
@@ -3016,7 +3038,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
|
||||
err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
|
||||
if (err == -EINTR)
|
||||
goto free_subvol_name;
|
||||
dentry = lookup_one_len(subvol_name, parent, subvol_namelen);
|
||||
dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
|
||||
if (IS_ERR(dentry)) {
|
||||
err = PTR_ERR(dentry);
|
||||
goto out_unlock_dir;
|
||||
@@ -3058,14 +3080,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
|
||||
if (root == dest)
|
||||
goto out_dput;
|
||||
|
||||
err = inode_permission(&init_user_ns, inode,
|
||||
MAY_WRITE | MAY_EXEC);
|
||||
err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
/* check if subvolume may be deleted by a user */
|
||||
err = btrfs_may_delete(dir, dentry, 1);
|
||||
err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
@@ -3103,7 +3124,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct btrfs_root *root = BTRFS_I(inode)->root;
|
||||
struct btrfs_ioctl_defrag_range_args *range;
|
||||
struct btrfs_ioctl_defrag_range_args range = {0};
|
||||
int ret;
|
||||
|
||||
ret = mnt_want_write_file(file);
|
||||
@@ -3115,6 +3136,12 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Subpage defrag will be supported in later commits */
|
||||
if (root->fs_info->sectorsize < PAGE_SIZE) {
|
||||
ret = -ENOTTY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
switch (inode->i_mode & S_IFMT) {
|
||||
case S_IFDIR:
|
||||
if (!capable(CAP_SYS_ADMIN)) {
|
||||
@@ -3135,33 +3162,24 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
|
||||
goto out;
|
||||
}
|
||||
|
||||
range = kzalloc(sizeof(*range), GFP_KERNEL);
|
||||
if (!range) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (argp) {
|
||||
if (copy_from_user(range, argp,
|
||||
sizeof(*range))) {
|
||||
if (copy_from_user(&range, argp, sizeof(range))) {
|
||||
ret = -EFAULT;
|
||||
kfree(range);
|
||||
goto out;
|
||||
}
|
||||
/* compression requires us to start the IO */
|
||||
if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
|
||||
range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
|
||||
range->extent_thresh = (u32)-1;
|
||||
if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
|
||||
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
|
||||
range.extent_thresh = (u32)-1;
|
||||
}
|
||||
} else {
|
||||
/* the rest are all set to zero by kzalloc */
|
||||
range->len = (u64)-1;
|
||||
range.len = (u64)-1;
|
||||
}
|
||||
ret = btrfs_defrag_file(file_inode(file), file,
|
||||
range, BTRFS_OLDEST_GENERATION, 0);
|
||||
&range, BTRFS_OLDEST_GENERATION, 0);
|
||||
if (ret > 0)
|
||||
ret = 0;
|
||||
kfree(range);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
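
Illustration (not part of the patch): the defrag args move from kzalloc() to a zero-initialized stack variable. The struct is small and short-lived, and "= {0}" gives the same all-zero defaults kzalloc provided when userspace passes no argument. The shape of the pattern in plain C, with memcpy standing in for copy_from_user and an illustrative struct layout rather than the real ioctl ABI:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct defrag_range_args {	/* hypothetical stand-in, not the real struct */
	uint64_t start;
	uint64_t len;
	uint32_t flags;
	uint32_t extent_thresh;
};

static int handle_defrag(const void *user_arg)
{
	struct defrag_range_args range = {0};	/* all-zero defaults, no kfree needed */

	if (user_arg) {
		/* in the kernel this is copy_from_user(&range, argp, sizeof(range)) */
		memcpy(&range, user_arg, sizeof(range));
	} else {
		range.len = (uint64_t)-1;	/* no argument: defrag the whole file */
	}
	printf("defrag start=%llu len=%llu\n",
	       (unsigned long long)range.start,
	       (unsigned long long)range.len);
	return 0;
}

int main(void)
{
	struct defrag_range_args args = { .start = 4096, .len = 1 << 20 };

	handle_defrag(&args);
	handle_defrag(NULL);
	return 0;
}
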
@@ -4404,25 +4422,20 @@ drop_write:
|
||||
static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
|
||||
void __user *arg)
|
||||
{
|
||||
struct btrfs_ioctl_quota_rescan_args *qsa;
|
||||
struct btrfs_ioctl_quota_rescan_args qsa = {0};
|
||||
int ret = 0;
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
|
||||
if (!qsa)
|
||||
return -ENOMEM;
|
||||
|
||||
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
|
||||
qsa->flags = 1;
|
||||
qsa->progress = fs_info->qgroup_rescan_progress.objectid;
|
||||
qsa.flags = 1;
|
||||
qsa.progress = fs_info->qgroup_rescan_progress.objectid;
|
||||
}
|
||||
|
||||
if (copy_to_user(arg, qsa, sizeof(*qsa)))
|
||||
if (copy_to_user(arg, &qsa, sizeof(qsa)))
|
||||
ret = -EFAULT;
|
||||
|
||||
kfree(qsa);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -4436,6 +4449,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
|
||||
}
|
||||
|
||||
static long _btrfs_ioctl_set_received_subvol(struct file *file,
|
||||
struct user_namespace *mnt_userns,
|
||||
struct btrfs_ioctl_received_subvol_args *sa)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
@@ -4447,7 +4461,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
|
||||
int ret = 0;
|
||||
int received_uuid_changed;
|
||||
|
||||
if (!inode_owner_or_capable(&init_user_ns, inode))
|
||||
if (!inode_owner_or_capable(mnt_userns, inode))
|
||||
return -EPERM;
|
||||
|
||||
ret = mnt_want_write_file(file);
|
||||
@@ -4552,7 +4566,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
|
||||
args64->rtime.nsec = args32->rtime.nsec;
|
||||
args64->flags = args32->flags;
|
||||
|
||||
ret = _btrfs_ioctl_set_received_subvol(file, args64);
|
||||
ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -4586,7 +4600,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
|
||||
if (IS_ERR(sa))
|
||||
return PTR_ERR(sa);
|
||||
|
||||
ret = _btrfs_ioctl_set_received_subvol(file, sa);
|
||||
ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
@@ -5013,6 +5027,10 @@ long btrfs_ioctl(struct file *file, unsigned int
|
||||
return btrfs_ioctl_get_subvol_rootref(file, argp);
|
||||
case BTRFS_IOC_INO_LOOKUP_USER:
|
||||
return btrfs_ioctl_ino_lookup_user(file, argp);
|
||||
case FS_IOC_ENABLE_VERITY:
|
||||
return fsverity_ioctl_enable(file, (const void __user *)argp);
|
||||
case FS_IOC_MEASURE_VERITY:
|
||||
return fsverity_ioctl_measure(file, argp);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
|
||||
234
fs/btrfs/lzo.c
@@ -14,6 +14,7 @@
|
||||
#include <linux/lzo.h>
|
||||
#include <linux/refcount.h>
|
||||
#include "compression.h"
|
||||
#include "ctree.h"
|
||||
|
||||
#define LZO_LEN 4
|
||||
|
||||
@@ -140,18 +141,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
*total_in = 0;
|
||||
|
||||
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
|
||||
data_in = kmap(in_page);
|
||||
data_in = page_address(in_page);
|
||||
|
||||
/*
|
||||
* store the size of all chunks of compressed data in
|
||||
* the first 4 bytes
|
||||
*/
|
||||
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
out_page = alloc_page(GFP_NOFS);
|
||||
if (out_page == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
cpage_out = kmap(out_page);
|
||||
cpage_out = page_address(out_page);
|
||||
out_offset = LZO_LEN;
|
||||
tot_out = LZO_LEN;
|
||||
pages[0] = out_page;
|
||||
@@ -209,19 +210,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
if (out_len == 0 && tot_in >= len)
|
||||
break;
|
||||
|
||||
kunmap(out_page);
|
||||
if (nr_pages == nr_dest_pages) {
|
||||
out_page = NULL;
|
||||
ret = -E2BIG;
|
||||
goto out;
|
||||
}
|
||||
|
||||
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
out_page = alloc_page(GFP_NOFS);
|
||||
if (out_page == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
cpage_out = kmap(out_page);
|
||||
cpage_out = page_address(out_page);
|
||||
pages[nr_pages++] = out_page;
|
||||
|
||||
pg_bytes_left = PAGE_SIZE;
|
||||
@@ -243,12 +243,11 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
break;
|
||||
|
||||
bytes_left = len - tot_in;
|
||||
kunmap(in_page);
|
||||
put_page(in_page);
|
||||
|
||||
start += PAGE_SIZE;
|
||||
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
|
||||
data_in = kmap(in_page);
|
||||
data_in = page_address(in_page);
|
||||
in_len = min(bytes_left, PAGE_SIZE);
|
||||
}
|
||||
|
||||
@@ -258,164 +257,130 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
}
|
||||
|
||||
/* store the size of all chunks of compressed data */
|
||||
sizes_ptr = kmap_local_page(pages[0]);
|
||||
sizes_ptr = page_address(pages[0]);
|
||||
write_compress_length(sizes_ptr, tot_out);
|
||||
kunmap_local(sizes_ptr);
|
||||
|
||||
ret = 0;
|
||||
*total_out = tot_out;
|
||||
*total_in = tot_in;
|
||||
out:
|
||||
*out_pages = nr_pages;
|
||||
if (out_page)
|
||||
kunmap(out_page);
|
||||
|
||||
if (in_page) {
|
||||
kunmap(in_page);
|
||||
if (in_page)
|
||||
put_page(in_page);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Copy the compressed segment payload into @dest.
 *
 * For the payload there will be no padding, just need to do page switching.
 */
static void copy_compressed_segment(struct compressed_bio *cb,
				    char *dest, u32 len, u32 *cur_in)
{
	u32 orig_in = *cur_in;

	while (*cur_in < orig_in + len) {
		struct page *cur_page;
		u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
				     orig_in + len - *cur_in);

		ASSERT(copy_len);
		cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];

		memcpy(dest + *cur_in - orig_in,
		       page_address(cur_page) + offset_in_page(*cur_in),
		       copy_len);

		*cur_in += copy_len;
	}
}

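Illustration (not part of the patch): copy_compressed_segment() is pure cursor arithmetic, splitting each copy at page boundaries while advancing one linear offset. The same logic in standalone form, with plain buffers standing in for cb->compressed_pages and 4K pages assumed:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define PG_SIZE 4096u

/* Copy @len bytes starting at linear offset *cur from @pages into @dest. */
static void copy_segment(uint8_t **pages, uint8_t *dest, uint32_t len,
			 uint32_t *cur)
{
	uint32_t orig = *cur;

	while (*cur < orig + len) {
		uint32_t in_page = *cur % PG_SIZE;
		uint32_t copy_len = PG_SIZE - in_page;	/* to end of this page */

		if (copy_len > orig + len - *cur)
			copy_len = orig + len - *cur;	/* or to end of segment */

		memcpy(dest + (*cur - orig), pages[*cur / PG_SIZE] + in_page,
		       copy_len);
		*cur += copy_len;
	}
}

int main(void)
{
	static uint8_t p0[PG_SIZE], p1[PG_SIZE];
	uint8_t *pages[] = { p0, p1 };
	uint8_t out[64];
	uint32_t cur = PG_SIZE - 16;	/* segment straddles the page break */

	memset(p0, 'A', PG_SIZE);
	memset(p1, 'B', PG_SIZE);
	copy_segment(pages, out, 32, &cur);
	printf("%.32s\n", out);		/* 16 x 'A' then 16 x 'B' */
	return 0;
}
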
|
||||
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
{
|
||||
struct workspace *workspace = list_entry(ws, struct workspace, list);
|
||||
int ret = 0, ret2;
|
||||
char *data_in;
|
||||
unsigned long page_in_index = 0;
|
||||
size_t srclen = cb->compressed_len;
|
||||
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
|
||||
unsigned long buf_start;
|
||||
unsigned long buf_offset = 0;
|
||||
unsigned long bytes;
|
||||
unsigned long working_bytes;
|
||||
size_t in_len;
|
||||
size_t out_len;
|
||||
const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
|
||||
unsigned long in_offset;
|
||||
unsigned long in_page_bytes_left;
|
||||
unsigned long tot_in;
|
||||
unsigned long tot_out;
|
||||
unsigned long tot_len;
|
||||
char *buf;
|
||||
bool may_late_unmap, need_unmap;
|
||||
struct page **pages_in = cb->compressed_pages;
|
||||
u64 disk_start = cb->start;
|
||||
struct bio *orig_bio = cb->orig_bio;
|
||||
const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
|
||||
const u32 sectorsize = fs_info->sectorsize;
|
||||
int ret;
|
||||
/* Compressed data length, can be unaligned */
|
||||
u32 len_in;
|
||||
/* Offset inside the compressed data */
|
||||
u32 cur_in = 0;
|
||||
/* Bytes decompressed so far */
|
||||
u32 cur_out = 0;
|
||||
|
||||
data_in = kmap(pages_in[0]);
|
||||
tot_len = read_compress_length(data_in);
|
||||
/*
|
||||
* Compressed data header check.
|
||||
*
|
||||
* The real compressed size can't exceed the maximum extent length, and
|
||||
* all pages should be used (whole unused page with just the segment
|
||||
* header is not possible). If this happens it means the compressed
|
||||
* extent is corrupted.
|
||||
*/
|
||||
if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) ||
|
||||
tot_len < srclen - PAGE_SIZE) {
|
||||
ret = -EUCLEAN;
|
||||
goto done;
|
||||
}
|
||||
|
||||
tot_in = LZO_LEN;
|
||||
in_offset = LZO_LEN;
|
||||
in_page_bytes_left = PAGE_SIZE - LZO_LEN;
|
||||
|
||||
tot_out = 0;
|
||||
|
||||
while (tot_in < tot_len) {
|
||||
in_len = read_compress_length(data_in + in_offset);
|
||||
in_page_bytes_left -= LZO_LEN;
|
||||
in_offset += LZO_LEN;
|
||||
tot_in += LZO_LEN;
|
||||
len_in = read_compress_length(page_address(cb->compressed_pages[0]));
|
||||
cur_in += LZO_LEN;
|
||||
|
||||
/*
|
||||
* Segment header check.
|
||||
* LZO header length check
|
||||
*
|
||||
* The segment length must not exceed the maximum LZO
|
||||
* compression size, nor the total compressed size.
|
||||
* The total length should not exceed the maximum extent length,
|
||||
* and all sectors should be used.
|
||||
* If this happens, it means the compressed extent is corrupted.
|
||||
*/
|
||||
if (in_len > max_segment_len || tot_in + in_len > tot_len) {
|
||||
ret = -EUCLEAN;
|
||||
goto done;
|
||||
if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
|
||||
round_up(len_in, sectorsize) < cb->compressed_len) {
|
||||
btrfs_err(fs_info,
|
||||
"invalid lzo header, lzo len %u compressed len %u",
|
||||
len_in, cb->compressed_len);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
|
||||
tot_in += in_len;
|
||||
working_bytes = in_len;
|
||||
may_late_unmap = need_unmap = false;
|
||||
/* Go through each lzo segment */
|
||||
while (cur_in < len_in) {
|
||||
struct page *cur_page;
|
||||
/* Length of the compressed segment */
|
||||
u32 seg_len;
|
||||
u32 sector_bytes_left;
|
||||
size_t out_len = lzo1x_worst_compress(sectorsize);
|
||||
|
||||
/* fast path: avoid using the working buffer */
|
||||
if (in_page_bytes_left >= in_len) {
|
||||
buf = data_in + in_offset;
|
||||
bytes = in_len;
|
||||
may_late_unmap = true;
|
||||
goto cont;
|
||||
}
|
||||
/*
|
||||
* We should always have enough space for one segment header
|
||||
* inside current sector.
|
||||
*/
|
||||
ASSERT(cur_in / sectorsize ==
|
||||
(cur_in + LZO_LEN - 1) / sectorsize);
|
||||
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
|
||||
ASSERT(cur_page);
|
||||
seg_len = read_compress_length(page_address(cur_page) +
|
||||
offset_in_page(cur_in));
|
||||
cur_in += LZO_LEN;
|
||||
|
||||
/* copy bytes from the pages into the working buffer */
|
||||
buf = workspace->cbuf;
|
||||
buf_offset = 0;
|
||||
while (working_bytes) {
|
||||
bytes = min(working_bytes, in_page_bytes_left);
|
||||
/* Copy the compressed segment payload into workspace */
|
||||
copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
|
||||
|
||||
memcpy(buf + buf_offset, data_in + in_offset, bytes);
|
||||
buf_offset += bytes;
|
||||
cont:
|
||||
working_bytes -= bytes;
|
||||
in_page_bytes_left -= bytes;
|
||||
in_offset += bytes;
|
||||
|
||||
/* check if we need to pick another page */
|
||||
if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
|
||||
|| in_page_bytes_left == 0) {
|
||||
tot_in += in_page_bytes_left;
|
||||
|
||||
if (working_bytes == 0 && tot_in >= tot_len)
|
||||
break;
|
||||
|
||||
if (page_in_index + 1 >= total_pages_in) {
|
||||
ret = -EIO;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (may_late_unmap)
|
||||
need_unmap = true;
|
||||
else
|
||||
kunmap(pages_in[page_in_index]);
|
||||
|
||||
data_in = kmap(pages_in[++page_in_index]);
|
||||
|
||||
in_page_bytes_left = PAGE_SIZE;
|
||||
in_offset = 0;
|
||||
}
|
||||
}
|
||||
|
||||
out_len = max_segment_len;
|
||||
ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
|
||||
&out_len);
|
||||
if (need_unmap)
|
||||
kunmap(pages_in[page_in_index - 1]);
|
||||
/* Decompress the data */
|
||||
ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
|
||||
workspace->buf, &out_len);
|
||||
if (ret != LZO_E_OK) {
|
||||
pr_warn("BTRFS: decompress failed\n");
|
||||
btrfs_err(fs_info, "failed to decompress");
|
||||
ret = -EIO;
|
||||
break;
|
||||
goto out;
|
||||
}
|
||||
|
||||
buf_start = tot_out;
|
||||
tot_out += out_len;
|
||||
/* Copy the data into inode pages */
|
||||
ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out);
|
||||
cur_out += out_len;
|
||||
|
||||
ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
|
||||
tot_out, disk_start, orig_bio);
|
||||
if (ret2 == 0)
|
||||
break;
|
||||
/* All data read, exit */
|
||||
if (ret == 0)
|
||||
goto out;
|
||||
ret = 0;
|
||||
|
||||
/* Check if the sector has enough space for a segment header */
|
||||
sector_bytes_left = sectorsize - (cur_in % sectorsize);
|
||||
if (sector_bytes_left >= LZO_LEN)
|
||||
continue;
|
||||
|
||||
/* Skip the padding zeros */
|
||||
cur_in += sector_bytes_left;
|
||||
}
-done:
-	kunmap(pages_in[page_in_index]);
+out:
 	if (!ret)
-		zero_fill_bio(orig_bio);
+		zero_fill_bio(cb->orig_bio);
 	return ret;
 }
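
Illustration (not part of the patch): the rewrite also spells out the LZO framing btrfs uses on disk: a 4-byte little-endian total length first, then per-segment records of a 4-byte length plus payload, where a segment header may never straddle a sector boundary (the sector tail is zero-padded instead, as the "Skip the padding zeros" branch above shows). A userspace walker under those assumptions; the sector size is an example value and a little-endian host is assumed:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define LZO_LEN    4u
#define SECTORSIZE 4096u	/* assumed sector size */

static uint32_t read_len(const uint8_t *p)
{
	uint32_t v;

	memcpy(&v, p, LZO_LEN);	/* on-disk little-endian; host assumed LE */
	return v;
}

/* Walk the segment headers of one compressed extent held in @buf. */
static int walk_segments(const uint8_t *buf, uint32_t buf_len)
{
	uint32_t len_in = read_len(buf);	/* total compressed length */
	uint32_t cur_in = LZO_LEN;

	if (len_in > buf_len)
		return -1;	/* corrupted header */

	while (cur_in < len_in) {
		uint32_t seg_len = read_len(buf + cur_in);
		uint32_t sector_left;

		cur_in += LZO_LEN;
		printf("segment: %u compressed bytes at offset %u\n",
		       seg_len, cur_in);
		cur_in += seg_len;	/* payload may cross sectors freely */

		/* headers may not: skip padding if < 4 bytes remain */
		sector_left = SECTORSIZE - (cur_in % SECTORSIZE);
		if (sector_left < LZO_LEN)
			cur_in += sector_left;
	}
	return 0;
}

int main(void)
{
	uint8_t buf[64] = {0};
	uint32_t total = LZO_LEN + LZO_LEN + 10;	/* header + one segment */
	uint32_t seg = 10;

	memcpy(buf, &total, LZO_LEN);
	memcpy(buf + LZO_LEN, &seg, LZO_LEN);
	return walk_segments(buf, sizeof(buf));
}
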
|
||||
|
||||
@@ -466,7 +431,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
|
||||
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
|
||||
bytes = min_t(unsigned long, destlen, out_len - start_byte);
|
||||
|
||||
kaddr = kmap_local_page(dest_page);
|
||||
kaddr = page_address(dest_page);
|
||||
memcpy(kaddr, workspace->buf + start_byte, bytes);
|
||||
|
||||
/*
|
||||
@@ -476,7 +441,6 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
|
||||
*/
|
||||
if (bytes < destlen)
|
||||
memset(kaddr+bytes, 0, destlen-bytes);
|
||||
kunmap_local(kaddr);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -446,7 +446,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
|
||||
* Will be also used to store the finished ordered extent.
|
||||
* @file_offset: File offset for the finished IO
|
||||
* @io_size: Length of the finish IO range
|
||||
* @uptodate: If the IO finishes without problem
|
||||
*
|
||||
* Return true if the ordered extent is finished in the range, and update
|
||||
* @cached.
|
||||
@@ -457,7 +456,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
|
||||
*/
|
||||
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
|
||||
struct btrfs_ordered_extent **cached,
|
||||
u64 file_offset, u64 io_size, int uptodate)
|
||||
u64 file_offset, u64 io_size)
|
||||
{
|
||||
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
|
||||
struct rb_node *node;
|
||||
@@ -486,8 +485,6 @@ have_entry:
|
||||
entry->bytes_left, io_size);
|
||||
|
||||
entry->bytes_left -= io_size;
|
||||
if (!uptodate)
|
||||
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
|
||||
|
||||
if (entry->bytes_left == 0) {
|
||||
/*
|
||||
|
||||
@@ -177,7 +177,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
|
||||
bool uptodate);
|
||||
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
|
||||
struct btrfs_ordered_extent **cached,
|
||||
u64 file_offset, u64 io_size, int uptodate);
|
||||
u64 file_offset, u64 io_size);
|
||||
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
|
||||
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
|
||||
int type);
|
||||
|
||||
@@ -1733,7 +1733,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
|
||||
ASSERT(trans != NULL);
|
||||
|
||||
ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
|
||||
false, true);
|
||||
true);
|
||||
if (ret < 0) {
|
||||
trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
||||
btrfs_warn(trans->fs_info,
|
||||
@@ -2651,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
|
||||
/* Search commit root to find old_roots */
|
||||
ret = btrfs_find_all_roots(NULL, fs_info,
|
||||
record->bytenr, 0,
|
||||
&record->old_roots, false, false);
|
||||
&record->old_roots, false);
|
||||
if (ret < 0)
|
||||
goto cleanup;
|
||||
}
|
||||
@@ -2667,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
|
||||
* current root. It's safe inside commit_transaction().
|
||||
*/
|
||||
ret = btrfs_find_all_roots(trans, fs_info,
|
||||
record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false);
|
||||
record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
|
||||
if (ret < 0)
|
||||
goto cleanup;
|
||||
if (qgroup_to_skip) {
|
||||
@@ -3201,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
|
||||
num_bytes = found.offset;
|
||||
|
||||
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
|
||||
&roots, false, false);
|
||||
&roots, false);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
/* For rescan, just pass old_roots as NULL */
|
||||
|
||||
@@ -1035,7 +1035,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
|
||||
for (i = 0; i < rbio->nr_pages; i++) {
|
||||
if (rbio->stripe_pages[i])
|
||||
continue;
|
||||
page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
page = alloc_page(GFP_NOFS);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
rbio->stripe_pages[i] = page;
|
||||
@@ -1054,7 +1054,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
|
||||
for (; i < rbio->nr_pages; i++) {
|
||||
if (rbio->stripe_pages[i])
|
||||
continue;
|
||||
page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
page = alloc_page(GFP_NOFS);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
rbio->stripe_pages[i] = page;
|
||||
@@ -1636,9 +1636,9 @@ struct btrfs_plug_cb {
|
||||
static int plug_cmp(void *priv, const struct list_head *a,
|
||||
const struct list_head *b)
|
||||
{
|
||||
struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
|
||||
const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
|
||||
plug_list);
|
||||
struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
|
||||
const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
|
||||
plug_list);
|
||||
u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
|
||||
u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
|
||||
@@ -2300,7 +2300,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
|
||||
if (rbio->stripe_pages[index])
|
||||
continue;
|
||||
|
||||
page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
page = alloc_page(GFP_NOFS);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
rbio->stripe_pages[index] = page;
|
||||
@@ -2350,14 +2350,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
|
||||
if (!need_check)
|
||||
goto writeback;
|
||||
|
||||
p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
p_page = alloc_page(GFP_NOFS);
|
||||
if (!p_page)
|
||||
goto cleanup;
|
||||
SetPageUptodate(p_page);
|
||||
|
||||
if (has_qstripe) {
|
||||
/* RAID6, allocate and map temp space for the Q stripe */
|
||||
q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
q_page = alloc_page(GFP_NOFS);
|
||||
if (!q_page) {
|
||||
__free_page(p_page);
|
||||
goto cleanup;
|
||||
|
||||
@@ -264,8 +264,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
|
||||
struct block_entry *be = NULL, *exist;
|
||||
struct root_entry *re = NULL;
|
||||
|
||||
re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
|
||||
be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
|
||||
re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
|
||||
be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
|
||||
if (!be || !re) {
|
||||
kfree(re);
|
||||
kfree(be);
|
||||
@@ -313,7 +313,7 @@ static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
|
||||
struct root_entry *re;
|
||||
struct ref_entry *ref = NULL, *exist;
|
||||
|
||||
ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
|
||||
ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
|
||||
if (!ref)
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -358,7 +358,7 @@ static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
|
||||
struct block_entry *be;
|
||||
struct ref_entry *ref;
|
||||
|
||||
ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
|
||||
ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
|
||||
if (!ref)
|
||||
return -ENOMEM;
|
||||
be = add_block_entry(fs_info, bytenr, num_bytes, 0);
|
||||
@@ -393,7 +393,7 @@ static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
|
||||
u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
|
||||
u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
|
||||
|
||||
ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
|
||||
ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
|
||||
if (!ref)
|
||||
return -ENOMEM;
|
||||
be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
|
||||
|
||||
@@ -24,6 +24,7 @@
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"

/*
 * Relocation overview
@@ -2781,10 +2782,70 @@ static noinline_for_stack int prealloc_file_extent_cluster(
	u64 num_bytes;
	int nr;
	int ret = 0;
	u64 i_size = i_size_read(&inode->vfs_inode);
	u64 prealloc_start = cluster->start - offset;
	u64 prealloc_end = cluster->end - offset;
	u64 cur_offset = prealloc_start;

	/*
	 * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
	 * This means the range [i_size, PAGE_END + 1) is filled with zeros by
	 * btrfs_do_readpage() call of previously relocated file cluster.
	 *
	 * If the current cluster starts in the above range, btrfs_do_readpage()
	 * will skip the read, and relocate_one_page() will later writeback
	 * the padding zeros as new data, causing data corruption.
	 *
	 * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
	 */
	if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
		struct address_space *mapping = inode->vfs_inode.i_mapping;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		const u32 sectorsize = fs_info->sectorsize;
		struct page *page;

		ASSERT(sectorsize < PAGE_SIZE);
		ASSERT(IS_ALIGNED(i_size, sectorsize));

		/*
		 * Subpage can't handle page with DIRTY but without UPTODATE
		 * bit as it can lead to the following deadlock:
		 *
		 * btrfs_readpage()
		 * | Page already *locked*
		 * |- btrfs_lock_and_flush_ordered_range()
		 *    |- btrfs_start_ordered_extent()
		 *       |- extent_write_cache_pages()
		 *          |- lock_page()
		 *             We try to lock the page we already hold.
		 *
		 * Here we just writeback the whole data reloc inode, so that
		 * we will be ensured to have no dirty range in the page, and
		 * are safe to clear the uptodate bits.
		 *
		 * This shouldn't cause too much overhead, as we need to write
		 * the data back anyway.
		 */
		ret = filemap_write_and_wait(mapping);
		if (ret < 0)
			return ret;

		clear_extent_bits(&inode->io_tree, i_size,
				  round_up(i_size, PAGE_SIZE) - 1,
				  EXTENT_UPTODATE);
		page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
		/*
		 * If page is freed we don't need to do anything then, as we
		 * will re-read the whole page anyway.
		 */
		if (page) {
			btrfs_subpage_clear_uptodate(fs_info, page, i_size,
					round_up(i_size, PAGE_SIZE) - i_size);
			unlock_page(page);
			put_page(page);
		}
	}
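The invalidation above targets exactly the zero-padded tail [i_size, round_up(i_size, PAGE_SIZE)). A standalone sketch of that arithmetic with illustrative numbers (the 64K page size is an assumption matching the subpage case):

/* Standalone sketch: for an unaligned i_size, the bytes from i_size to
 * the end of its page are padding zeros that must not survive as
 * "uptodate" data. */
#include <stdio.h>

#define PAGE_SIZE 65536ULL	/* assumed: 64K pages, sub-page sectors */

static unsigned long long round_up_ull(unsigned long long x,
				       unsigned long long a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	unsigned long long i_size = 192512;	/* 188K, not page aligned */
	unsigned long long start = i_size;
	unsigned long long end = round_up_ull(i_size, PAGE_SIZE) - 1;

	/* Mirrors clear_extent_bits(io_tree, i_size,
	 * round_up(i_size, PAGE_SIZE) - 1, EXTENT_UPTODATE) above. */
	printf("invalidate [%llu, %llu], %llu bytes\n",
	       start, end, end + 1 - start);	/* 4096 bytes */
	return 0;
}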
	BUG_ON(cluster->start != cluster->boundary[0]);
	ret = btrfs_alloc_data_chunk_ondemand(inode,
					      prealloc_end + 1 - prealloc_start);
@@ -2886,19 +2947,149 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);

static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
				    int cluster_nr)
{
	/* Last extent, use cluster end directly */
	if (cluster_nr >= cluster->nr - 1)
		return cluster->end;

	/* Use next boundary start */
	return cluster->boundary[cluster_nr + 1] - 1;
}

static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
			     struct file_extent_cluster *cluster,
			     int *cluster_nr, unsigned long page_index)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 offset = BTRFS_I(inode)->index_cnt;
	const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	struct page *page;
	u64 page_start;
	u64 page_end;
	u64 cur;
	int ret;

	ASSERT(page_index <= last_index);
	page = find_lock_page(inode->i_mapping, page_index);
	if (!page) {
		page_cache_sync_readahead(inode->i_mapping, ra, NULL,
				page_index, last_index + 1 - page_index);
		page = find_or_create_page(inode->i_mapping, page_index, mask);
		if (!page)
			return -ENOMEM;
	}
	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto release_page;

	if (PageReadahead(page))
		page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
				page_index, last_index + 1 - page_index);

	if (!PageUptodate(page)) {
		btrfs_readpage(NULL, page);
		lock_page(page);
		if (!PageUptodate(page)) {
			ret = -EIO;
			goto release_page;
		}
	}

	page_start = page_offset(page);
	page_end = page_start + PAGE_SIZE - 1;

	/*
	 * Start from the cluster, as for subpage case, the cluster can start
	 * inside the page.
	 */
	cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
	while (cur <= page_end) {
		u64 extent_start = cluster->boundary[*cluster_nr] - offset;
		u64 extent_end = get_cluster_boundary_end(cluster,
						*cluster_nr) - offset;
		u64 clamped_start = max(page_start, extent_start);
		u64 clamped_end = min(page_end, extent_end);
		u32 clamped_len = clamped_end + 1 - clamped_start;

		/* Reserve metadata for this range */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
						      clamped_len);
		if (ret)
			goto release_page;

		/* Mark the range delalloc and dirty for later writeback */
		lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
						clamped_end, 0, NULL);
		if (ret) {
			clear_extent_bits(&BTRFS_I(inode)->io_tree,
					  clamped_start, clamped_end,
					  EXTENT_LOCKED | EXTENT_BOUNDARY);
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							clamped_len, true);
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       clamped_len);
			goto release_page;
		}
		btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);

		/*
		 * Set the boundary if it's inside the page.
		 * Data relocation requires the destination extents to have the
		 * same size as the source.
		 * EXTENT_BOUNDARY bit prevents current extent from being merged
		 * with previous extent.
		 */
		if (in_range(cluster->boundary[*cluster_nr] - offset,
			     page_start, PAGE_SIZE)) {
			u64 boundary_start = cluster->boundary[*cluster_nr] -
						offset;
			u64 boundary_end = boundary_start +
					   fs_info->sectorsize - 1;

			set_extent_bits(&BTRFS_I(inode)->io_tree,
					boundary_start, boundary_end,
					EXTENT_BOUNDARY);
		}
		unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
		btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
		cur += clamped_len;

		/* Crossed extent end, go to next extent */
		if (cur >= extent_end) {
			(*cluster_nr)++;
			/* Just finished the last extent of the cluster, exit. */
			if (*cluster_nr >= cluster->nr)
				break;
		}
	}
	unlock_page(page);
	put_page(page);

	balance_dirty_pages_ratelimited(inode->i_mapping);
	btrfs_throttle(fs_info);
	if (btrfs_should_cancel_balance(fs_info))
		ret = -ECANCELED;
	return ret;

release_page:
	unlock_page(page);
	put_page(page);
	return ret;
}
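relocate_one_page() dirties, per iteration, the intersection of the current extent with the page. A standalone sketch of just that clamping:

/* Standalone sketch: the dirty range is the intersection of the
 * current extent and the page boundaries. */
#include <stdio.h>

typedef unsigned long long u64;

static void clamp_to_page(u64 page_start, u64 page_end,
			  u64 extent_start, u64 extent_end,
			  u64 *clamped_start, u64 *clamped_end)
{
	*clamped_start = extent_start > page_start ? extent_start : page_start;
	*clamped_end = extent_end < page_end ? extent_end : page_end;
}

int main(void)
{
	u64 cs, ce;

	/* A 4K extent starting 1K into a 4K page: only 3K lands in it. */
	clamp_to_page(0, 4095, 1024, 5119, &cs, &ce);
	printf("[%llu, %llu] len=%llu\n", cs, ce, ce + 1 - cs); /* len=3072 */
	return 0;
}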
static int relocate_file_extent_cluster(struct inode *inode,
					struct file_extent_cluster *cluster)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 page_start;
	u64 page_end;
	u64 offset = BTRFS_I(inode)->index_cnt;
	unsigned long index;
	unsigned long last_index;
	struct page *page;
	struct file_ra_state *ra;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int nr = 0;
	int cluster_nr = 0;
	int ret = 0;

	if (!cluster->nr)
@@ -2919,109 +3110,14 @@ static int relocate_file_extent_cluster(struct inode *inode,
	if (ret)
		goto out;

	index = (cluster->start - offset) >> PAGE_SHIFT;
	last_index = (cluster->end - offset) >> PAGE_SHIFT;
	while (index <= last_index) {
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
						      PAGE_SIZE);
		if (ret)
			goto out;

		page = find_lock_page(inode->i_mapping, index);
		if (!page) {
			page_cache_sync_readahead(inode->i_mapping,
						  ra, NULL, index,
						  last_index + 1 - index);
			page = find_or_create_page(inode->i_mapping, index,
						   mask);
			if (!page) {
				btrfs_delalloc_release_metadata(BTRFS_I(inode),
							PAGE_SIZE, true);
				btrfs_delalloc_release_extents(BTRFS_I(inode),
							PAGE_SIZE);
				ret = -ENOMEM;
				goto out;
			}
		}
		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							PAGE_SIZE, true);
			btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
			unlock_page(page);
			put_page(page);
			goto out;
		}

		if (PageReadahead(page)) {
			page_cache_async_readahead(inode->i_mapping,
						   ra, NULL, page, index,
						   last_index + 1 - index);
		}

		if (!PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (!PageUptodate(page)) {
				unlock_page(page);
				put_page(page);
				btrfs_delalloc_release_metadata(BTRFS_I(inode),
							PAGE_SIZE, true);
				btrfs_delalloc_release_extents(BTRFS_I(inode),
							       PAGE_SIZE);
				ret = -EIO;
				goto out;
			}
		}

		page_start = page_offset(page);
		page_end = page_start + PAGE_SIZE - 1;

		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);

		if (nr < cluster->nr &&
		    page_start + offset == cluster->boundary[nr]) {
			set_extent_bits(&BTRFS_I(inode)->io_tree,
					page_start, page_end,
					EXTENT_BOUNDARY);
			nr++;
		}

		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
						page_end, 0, NULL);
		if (ret) {
			unlock_page(page);
			put_page(page);
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							PAGE_SIZE, true);
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       PAGE_SIZE);

			clear_extent_bits(&BTRFS_I(inode)->io_tree,
					  page_start, page_end,
					  EXTENT_LOCKED | EXTENT_BOUNDARY);
			goto out;

		}
		set_page_dirty(page);

		unlock_extent(&BTRFS_I(inode)->io_tree,
			      page_start, page_end);
		unlock_page(page);
		put_page(page);

		index++;
		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
		balance_dirty_pages_ratelimited(inode->i_mapping);
		btrfs_throttle(fs_info);
		if (btrfs_should_cancel_balance(fs_info)) {
			ret = -ECANCELED;
			goto out;
		}
	}
	WARN_ON(nr != cluster->nr);
	for (index = (cluster->start - offset) >> PAGE_SHIFT;
	     index <= last_index && !ret; index++)
		ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
	if (btrfs_is_zoned(fs_info) && !ret)
		ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret == 0)
		WARN_ON(cluster_nr != cluster->nr);
out:
	kfree(ra);
	return ret;
@@ -1198,7 +1198,7 @@ struct backref_ctx {
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
{
	u64 root = (u64)(uintptr_t)key;
	struct clone_root *cr = (struct clone_root *)elt;
	const struct clone_root *cr = elt;

	if (root < cr->root->root_key.objectid)
		return -1;
@@ -1209,8 +1209,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)

static int __clone_root_cmp_sort(const void *e1, const void *e2)
{
	struct clone_root *cr1 = (struct clone_root *)e1;
	struct clone_root *cr2 = (struct clone_root *)e2;
	const struct clone_root *cr1 = e1;
	const struct clone_root *cr2 = e2;

	if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
		return -1;
@@ -1307,7 +1307,7 @@ static int find_extent_clone(struct send_ctx *sctx,
	u64 flags = 0;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *eb = path->nodes[0];
	struct backref_ctx *backref_ctx = NULL;
	struct backref_ctx backref_ctx = {0};
	struct clone_root *cur_clone_root;
	struct btrfs_key found_key;
	struct btrfs_path *tmp_path;
@@ -1322,12 +1322,6 @@ static int find_extent_clone(struct send_ctx *sctx,
	/* We only use this path under the commit sem */
	tmp_path->need_commit_sem = 0;

	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
	if (!backref_ctx) {
		ret = -ENOMEM;
		goto out;
	}

	if (data_offset >= ino_size) {
		/*
		 * There may be extents that lie behind the file's size.
@@ -1392,12 +1386,12 @@ static int find_extent_clone(struct send_ctx *sctx,
		cur_clone_root->found_refs = 0;
	}

	backref_ctx->sctx = sctx;
	backref_ctx->found = 0;
	backref_ctx->cur_objectid = ino;
	backref_ctx->cur_offset = data_offset;
	backref_ctx->found_itself = 0;
	backref_ctx->extent_len = num_bytes;
	backref_ctx.sctx = sctx;
	backref_ctx.found = 0;
	backref_ctx.cur_objectid = ino;
	backref_ctx.cur_offset = data_offset;
	backref_ctx.found_itself = 0;
	backref_ctx.extent_len = num_bytes;

	/*
	 * The last extent of a file may be too large due to page alignment.
@@ -1405,7 +1399,7 @@ static int find_extent_clone(struct send_ctx *sctx,
	 * __iterate_backrefs work.
	 */
	if (data_offset + num_bytes >= ino_size)
		backref_ctx->extent_len = ino_size - data_offset;
		backref_ctx.extent_len = ino_size - data_offset;

	/*
	 * Now collect all backrefs.
@@ -1416,12 +1410,12 @@ static int find_extent_clone(struct send_ctx *sctx,
		extent_item_pos = 0;
	ret = iterate_extent_inodes(fs_info, found_key.objectid,
				    extent_item_pos, 1, __iterate_backrefs,
				    backref_ctx, false);
				    &backref_ctx, false);

	if (ret < 0)
		goto out;

	if (!backref_ctx->found_itself) {
	if (!backref_ctx.found_itself) {
		/* found a bug in backref code? */
		ret = -EIO;
		btrfs_err(fs_info,
@@ -1434,7 +1428,7 @@ static int find_extent_clone(struct send_ctx *sctx,
		    "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
		    data_offset, ino, num_bytes, logical);

	if (!backref_ctx->found)
	if (!backref_ctx.found)
		btrfs_debug(fs_info, "no clones found");

	cur_clone_root = NULL;
@@ -1458,7 +1452,6 @@ static int find_extent_clone(struct send_ctx *sctx,

out:
	btrfs_free_path(tmp_path);
	kfree(backref_ctx);
	return ret;
}

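The send.c change above replaces a kmalloc()'d backref_ctx with a stack object, dropping the allocation and its -ENOMEM error path. A standalone sketch of the pattern; the struct is an illustrative stand-in, not the kernel's layout:

/* Standalone sketch: a small context whose lifetime matches the
 * function can live on the stack instead of the heap. */
#include <stdio.h>

struct backref_ctx_model {	/* hypothetical fields for illustration */
	int found;
	int found_itself;
	unsigned long long cur_objectid;
};

static void iterate(void (*fn)(void *), void *ctx)
{
	fn(ctx);	/* stands in for iterate_extent_inodes() */
}

static void visit(void *arg)
{
	struct backref_ctx_model *ctx = arg;

	ctx->found = 1;
}

int main(void)
{
	struct backref_ctx_model ctx = {0};	/* was: kmalloc + NULL check */

	ctx.cur_objectid = 257;
	iterate(visit, &ctx);	/* was: pass the heap pointer */
	printf("found=%d\n", ctx.found);
	return 0;	/* no kfree() needed on any path */
}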
@@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
	long time_left;
	int loops;

	delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
	if (delalloc_bytes == 0 && ordered_bytes == 0)
		return;

	/* Calc the number of the pages we need flush for space reservation */
	if (to_reclaim == U64_MAX) {
		items = U64_MAX;
@@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
		/*
		 * to_reclaim is set to however much metadata we need to
		 * reclaim, but reclaiming that much data doesn't really track
		 * exactly, so increase the amount to reclaim by 2x in order to
		 * make sure we're flushing enough delalloc to hopefully reclaim
		 * some metadata reservations.
		 * exactly. What we really want to do is reclaim full inode's
		 * worth of reservations, however that's not available to us
		 * here. We will take a fraction of the delalloc bytes for our
		 * flushing loops and hope for the best. Delalloc will expand
		 * the amount we write to cover an entire dirty extent, which
		 * will reclaim the metadata reservation for that range. If
		 * it's not enough subsequent flush stages will be more
		 * aggressive.
		 */
		to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
		items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
		to_reclaim = items * EXTENT_SIZE_PER_ITEM;
	}

	trans = (struct btrfs_trans_handle *)current->journal_info;

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
	if (delalloc_bytes == 0 && ordered_bytes == 0)
		return;

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
@@ -528,9 +532,49 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
	while ((delalloc_bytes || ordered_bytes) && loops < 3) {
		u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
		long nr_pages = min_t(u64, temp, LONG_MAX);
		int async_pages;

		btrfs_start_delalloc_roots(fs_info, nr_pages, true);

		/*
		 * We need to make sure any outstanding async pages are now
		 * processed before we continue. This is because things like
		 * sync_inode() try to be smart and skip writing if the inode is
		 * marked clean. We don't use filemap_fdatawrite() for flushing
		 * because we want to control how many pages we write out at a
		 * time, thus this is the only safe way to make sure we've
		 * waited for outstanding compressed workers to have started
		 * their jobs and thus have ordered extents set up properly.
		 *
		 * This exists because we do not want to wait for each
		 * individual inode to finish its async work, we simply want to
		 * start the IO on everybody, and then come back here and wait
		 * for all of the async work to catch up. Once we're done with
		 * that we know we'll have ordered extents for everything and we
		 * can decide if we wait for that or not.
		 *
		 * If we choose to replace this in the future, make absolutely
		 * sure that the proper waiting is being done in the async case,
		 * as there have been bugs in that area before.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * We don't want to wait forever, if we wrote less pages in this
		 * loop than we have outstanding, only wait for that number of
		 * pages, otherwise we can wait for all async pages to finish
		 * before continuing.
		 */
		if (async_pages > nr_pages)
			async_pages -= nr_pages;
		else
			async_pages = 0;
		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   async_pages);
skip_async:
		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
@@ -595,8 +639,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
	case FLUSH_DELALLOC_FULL:
		if (state == FLUSH_DELALLOC_FULL)
			num_bytes = U64_MAX;
		shrink_delalloc(fs_info, space_info, num_bytes,
				state == FLUSH_DELALLOC_WAIT, for_preempt);
				state != FLUSH_DELALLOC, for_preempt);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
@@ -686,7 +733,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
{
	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
	u64 ordered, delalloc;
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
	u64 thresh = div_factor_fine(space_info->total_bytes, 90);
	u64 used;

	/* If we're just plain full then async reclaim just slows us down. */
@@ -694,6 +741,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
		       global_rsv_size) >= thresh)
		return false;

	used = space_info->bytes_may_use + space_info->bytes_pinned;

	/* The total flushable belongs to the global rsv, don't flush. */
	if (global_rsv_size >= used)
		return false;

	/*
	 * 128MiB is 1/4 of the maximum global rsv size. If we have less than
	 * that devoted to other reservations then there's no sense in flushing,
	 * we don't have a lot of things that need flushing.
	 */
	if (used - global_rsv_size <= SZ_128M)
		return false;

	/*
	 * We have tickets queued, bail so we don't compete with the async
	 * flushers.
@@ -824,6 +885,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;

	trace_btrfs_fail_all_tickets(fs_info, space_info);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
		__btrfs_dump_space_info(fs_info, space_info);
@@ -904,6 +967,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
			commit_cycles--;
		}

		/*
		 * We do not want to empty the system of delalloc unless we're
		 * under heavy pressure, so allow one trip through the flushing
		 * logic before we start doing a FLUSH_DELALLOC_FULL.
		 */
		if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
			flush_state++;

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space. Think of the case where we
@@ -1067,7 +1138,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
 * so if we now have space to allocate do the force chunk allocation.
 */
static const enum btrfs_flush_state data_flush_states[] = {
	FLUSH_DELALLOC_WAIT,
	FLUSH_DELALLOC_FULL,
	RUN_DELAYED_IPUTS,
	COMMIT_TRANS,
	ALLOC_CHUNK_FORCE,
@@ -1156,6 +1227,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	FLUSH_DELALLOC_FULL,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};
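The wait target above is deliberately partial: the loop waits only for the async pages it submitted itself, not for every outstanding page. A standalone sketch of that computation:

/* Standalone sketch of the async-page wait target. */
#include <stdio.h>

static long wait_target(long async_pages, long nr_pages)
{
	return async_pages > nr_pages ? async_pages - nr_pages : 0;
}

int main(void)
{
	/* 1000 async pages outstanding, this loop flushed 256: wait until
	 * the counter drops to 744 rather than all the way to zero. */
	printf("%ld\n", wait_target(1000, 256));	/* 744 */
	/* Fewer outstanding than we wrote: wait for all of them. */
	printf("%ld\n", wait_target(100, 256));		/* 0 */
	return 0;
}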
@@ -73,7 +73,7 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,	\
	}								\
	token->kaddr = page_address(token->eb->pages[idx]);		\
	token->offset = idx << PAGE_SHIFT;				\
	if (oip + size <= PAGE_SIZE)					\
	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
		return get_unaligned_le##bits(token->kaddr + oip);	\
									\
	memcpy(lebytes, token->kaddr + oip, part);			\
@@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
	u8 lebytes[sizeof(u##bits)];					\
									\
	ASSERT(check_setget_bounds(eb, ptr, off, size));		\
	if (oip + size <= PAGE_SIZE)					\
	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
		return get_unaligned_le##bits(kaddr + oip);		\
									\
	memcpy(lebytes, kaddr + oip, part);				\
@@ -124,7 +124,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token,		\
	}								\
	token->kaddr = page_address(token->eb->pages[idx]);		\
	token->offset = idx << PAGE_SHIFT;				\
	if (oip + size <= PAGE_SIZE) {					\
	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
		put_unaligned_le##bits(val, token->kaddr + oip);	\
		return;							\
	}								\
@@ -146,7 +146,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,	\
	u8 lebytes[sizeof(u##bits)];					\
									\
	ASSERT(check_setget_bounds(eb, ptr, off, size));		\
	if (oip + size <= PAGE_SIZE) {					\
	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
		put_unaligned_le##bits(val, kaddr + oip);		\
		return;							\
	}								\

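The macros above add a compile-time shortcut: when the extent buffer is backed by a single page (INLINE_EXTENT_BUFFER_PAGES == 1), a value can never straddle a page boundary, so the compiler can drop the slow path entirely. A standalone sketch of the two cases using a miniature page size:

/* Standalone sketch: a little-endian value either sits inside one
 * page (fast path) or straddles a page boundary and is reassembled
 * from two memcpy()s (slow path). */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define PAGE_SZ 8	/* tiny page for illustration */

static uint32_t get_le32(uint8_t pages[][PAGE_SZ], size_t off)
{
	size_t idx = off / PAGE_SZ, oip = off % PAGE_SZ;
	uint8_t lebytes[4];

	if (oip + 4 <= PAGE_SZ) {		/* fast path: one page */
		memcpy(lebytes, &pages[idx][oip], 4);
	} else {				/* slow path: split copy */
		size_t part = PAGE_SZ - oip;

		memcpy(lebytes, &pages[idx][oip], part);
		memcpy(lebytes + part, &pages[idx + 1][0], 4 - part);
	}
	return (uint32_t)lebytes[0] | ((uint32_t)lebytes[1] << 8) |
	       ((uint32_t)lebytes[2] << 16) | ((uint32_t)lebytes[3] << 24);
}

int main(void)
{
	uint8_t pages[2][PAGE_SZ] = { { 0, 0, 0, 0, 0, 0, 0x78, 0x56 },
				      { 0x34, 0x12 } };

	printf("0x%x\n", get_le32(pages, 6));	/* 0x12345678, straddling */
	return 0;
}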
@@ -435,8 +435,10 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0)
	if (subpage->writeback_bitmap == 0) {
		ASSERT(PageWriteback(page));
		end_page_writeback(page);
	}
	spin_unlock_irqrestore(&subpage->lock, flags);
}

@@ -559,3 +561,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);

/*
 * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
 * is cleared.
 */
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
				 struct page *page)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;

	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	ASSERT(!PageDirty(page));
	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->private);
	ASSERT(subpage->dirty_bitmap == 0);
}

@@ -126,4 +126,7 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len);

void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
				 struct page *page);

#endif
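btrfs_subpage_clear_writeback() above only ends page-level writeback once every sector's bit is gone. A standalone sketch of that bitmap bookkeeping (one bit per sector is the assumed encoding):

/* Standalone sketch: page-level writeback state ends only when the
 * last per-sector bit is cleared. */
#include <stdio.h>

static unsigned int writeback_bitmap = 0x0F; /* 4 sectors under writeback */

static void clear_writeback(unsigned int tmp)
{
	writeback_bitmap &= ~tmp;
	if (writeback_bitmap == 0)
		printf("end_page_writeback()\n"); /* only on the last clear */
}

int main(void)
{
	clear_writeback(0x3);	/* two sectors done, page still writeback */
	clear_writeback(0xC);	/* last two done, page writeback ends */
	return 0;
}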
@@ -1201,21 +1201,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
		key.type = BTRFS_ROOT_BACKREF_KEY;
		key.offset = (u64)-1;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			goto err;
		} else if (ret > 0) {
			ret = btrfs_previous_item(root, path, subvol_objectid,
						  BTRFS_ROOT_BACKREF_KEY);
		ret = btrfs_search_backwards(root, &key, path);
		if (ret < 0) {
			goto err;
		} else if (ret > 0) {
			ret = -ENOENT;
			goto err;
		}
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		subvol_objectid = key.offset;

		root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -1248,21 +1241,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
		key.type = BTRFS_INODE_REF_KEY;
		key.offset = (u64)-1;

		ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
		if (ret < 0) {
			goto err;
		} else if (ret > 0) {
			ret = btrfs_previous_item(fs_root, path, dirid,
						  BTRFS_INODE_REF_KEY);
		ret = btrfs_search_backwards(fs_root, &key, path);
		if (ret < 0) {
			goto err;
		} else if (ret > 0) {
			ret = -ENOENT;
			goto err;
		}
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		dirid = key.offset;

		inode_ref = btrfs_item_ptr(path->nodes[0],
@@ -1353,6 +1339,9 @@ static int btrfs_fill_super(struct super_block *sb,
	sb->s_op = &btrfs_super_ops;
	sb->s_d_op = &btrfs_dentry_operations;
	sb->s_export_op = &btrfs_export_ops;
#ifdef CONFIG_FS_VERITY
	sb->s_vop = &btrfs_verityops;
#endif
	sb->s_xattr = btrfs_xattr_handlers;
	sb->s_time_gran = 1;
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
@@ -2041,13 +2030,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
			ret = -EINVAL;
			goto restore;
		}
		if (fs_info->sectorsize < PAGE_SIZE) {
			btrfs_warn(fs_info,
	"read-write mount is not yet allowed for sectorsize %u page size %lu",
				   fs_info->sectorsize, PAGE_SIZE);
			ret = -EINVAL;
			goto restore;
		}

		/*
		 * NOTE: when remounting with a change that does writes, don't
@@ -2096,16 +2078,15 @@ restore:
}

/* Used to sort the devices by max_avail(descending sort) */
static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
					      const void *dev_info2)
static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
{
	if (((struct btrfs_device_info *)dev_info1)->max_avail >
	    ((struct btrfs_device_info *)dev_info2)->max_avail)
	const struct btrfs_device_info *dev_info1 = a;
	const struct btrfs_device_info *dev_info2 = b;

	if (dev_info1->max_avail > dev_info2->max_avail)
		return -1;
	else if (((struct btrfs_device_info *)dev_info1)->max_avail <
		 ((struct btrfs_device_info *)dev_info2)->max_avail)
	else if (dev_info1->max_avail < dev_info2->max_avail)
		return 1;
	else
		return 0;
}

@@ -2381,7 +2362,7 @@ static struct file_system_type btrfs_root_fs_type = {
	.name		= "btrfs",
	.mount		= btrfs_mount_root,
	.kill_sb	= btrfs_kill_super,
	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
};

MODULE_ALIAS_FS("btrfs");
@@ -2571,6 +2552,11 @@ static void __init btrfs_print_mod_info(void)
			", zoned=yes"
#else
			", zoned=no"
#endif
#ifdef CONFIG_FS_VERITY
			", fsverity=yes"
#else
			", fsverity=no"
#endif
			;
	pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);

108
fs/btrfs/sysfs.c
@@ -22,6 +22,26 @@
#include "block-group.h"
#include "qgroup.h"

/*
 * Structure name                        Path
 * --------------------------------------------------------------------------
 * btrfs_supported_static_feature_attrs  /sys/fs/btrfs/features
 * btrfs_supported_feature_attrs         /sys/fs/btrfs/features and
 *                                       /sys/fs/btrfs/<uuid>/features
 * btrfs_attrs                           /sys/fs/btrfs/<uuid>
 * devid_attrs                           /sys/fs/btrfs/<uuid>/devinfo/<devid>
 * allocation_attrs                      /sys/fs/btrfs/<uuid>/allocation
 * qgroup_attrs                          /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
 * space_info_attrs                      /sys/fs/btrfs/<uuid>/allocation/<bg-type>
 * raid_attrs                            /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
 *
 * When built with CONFIG_BTRFS_DEBUG:
 *
 * btrfs_debug_feature_attrs             /sys/fs/btrfs/debug
 * btrfs_debug_mount_attrs               /sys/fs/btrfs/<uuid>/debug
 * discard_debug_attrs                   /sys/fs/btrfs/<uuid>/debug/discard
 */

struct btrfs_feature_attr {
	struct kobj_attribute kobj_attr;
	enum btrfs_feature_set feature_set;
@@ -267,7 +287,17 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
#endif

/*
 * Features which depend on feature bits and may differ between each fs.
 *
 * /sys/fs/btrfs/features - all available features implemented by this version
 * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
 * can be changed on a mounted filesystem.
 */
static struct attribute *btrfs_supported_feature_attrs[] = {
	BTRFS_FEAT_ATTR_PTR(mixed_backref),
	BTRFS_FEAT_ATTR_PTR(default_subvol),
@@ -284,17 +314,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
	BTRFS_FEAT_ATTR_PTR(raid1c34),
#ifdef CONFIG_BTRFS_DEBUG
	BTRFS_FEAT_ATTR_PTR(zoned),
#endif
#ifdef CONFIG_FS_VERITY
	BTRFS_FEAT_ATTR_PTR(verity),
#endif
	NULL
};

/*
 * Features which depend on feature bits and may differ between each fs.
 *
 * /sys/fs/btrfs/features lists all available features of this kernel while
 * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or
 * can be changed online.
 */
static const struct attribute_group btrfs_feature_attr_group = {
	.name = "features",
	.is_visible = btrfs_feature_visible,
@@ -366,6 +392,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
	ssize_t ret = 0;

	/* 4K sector size is also supported with 64K page size */
	if (PAGE_SIZE == SZ_64K)
		ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);

	/* Only sectorsize == PAGE_SIZE is now supported */
	ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);

@@ -374,6 +404,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
BTRFS_ATTR(static_feature, supported_sectorsizes,
	   supported_sectorsizes_show);

/*
 * Features which only depend on kernel version.
 *
 * These are listed in /sys/fs/btrfs/features along with
 * btrfs_supported_feature_attrs.
 */
static struct attribute *btrfs_supported_static_feature_attrs[] = {
	BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
	BTRFS_ATTR_PTR(static_feature, supported_checksums),
@@ -383,12 +419,6 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
	NULL
};

/*
 * Features which only depend on kernel version.
 *
 * These are listed in /sys/fs/btrfs/features along with
 * btrfs_feature_attr_group
 */
static const struct attribute_group btrfs_static_feature_attr_group = {
	.name = "features",
	.attrs = btrfs_supported_static_feature_attrs,
@@ -547,6 +577,11 @@ static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
	      btrfs_discard_max_discard_size_store);

/*
 * Per-filesystem debugging of discard (when mounted with discard=async).
 *
 * Path: /sys/fs/btrfs/<uuid>/debug/discard/
 */
static const struct attribute *discard_debug_attrs[] = {
	BTRFS_ATTR_PTR(discard, discardable_bytes),
	BTRFS_ATTR_PTR(discard, discardable_extents),
@@ -560,15 +595,19 @@ static const struct attribute *discard_debug_attrs[] = {
};

/*
 * Runtime debugging exported via sysfs
 * Per-filesystem runtime debugging exported via sysfs.
 *
 * /sys/fs/btrfs/debug - applies to module or all filesystems
 * /sys/fs/btrfs/UUID - applies only to the given filesystem
 * Path: /sys/fs/btrfs/UUID/debug/
 */
static const struct attribute *btrfs_debug_mount_attrs[] = {
	NULL,
};

/*
 * Runtime debugging exported via sysfs, applies to all mounted filesystems.
 *
 * Path: /sys/fs/btrfs/debug
 */
static struct attribute *btrfs_debug_feature_attrs[] = {
	NULL
};
@@ -637,6 +676,11 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}

/*
 * Allocation information about block group profiles.
 *
 * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/
 */
static struct attribute *raid_attrs[] = {
	BTRFS_ATTR_PTR(raid, total_bytes),
	BTRFS_ATTR_PTR(raid, used_bytes),
@@ -676,6 +720,11 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);

/*
 * Allocation information about block group types.
 *
 * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/
 */
static struct attribute *space_info_attrs[] = {
	BTRFS_ATTR_PTR(space_info, flags),
	BTRFS_ATTR_PTR(space_info, total_bytes),
@@ -703,6 +752,11 @@ static struct kobj_type space_info_ktype = {
	.default_groups = space_info_groups,
};

/*
 * Allocation information about block groups.
 *
 * Path: /sys/fs/btrfs/<uuid>/allocation/
 */
static const struct attribute *allocation_attrs[] = {
	BTRFS_ATTR_PTR(allocation, global_rsv_reserved),
	BTRFS_ATTR_PTR(allocation, global_rsv_size),
@@ -974,7 +1028,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
	ssize_t ret;

	ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
	ret = scnprintf(buf, PAGE_SIZE, "%d\n",
			READ_ONCE(fs_info->bg_reclaim_threshold));

	return ret;
}
@@ -991,16 +1046,21 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
	if (ret)
		return ret;

	if (thresh <= 50 || thresh > 100)
	if (thresh != 0 && (thresh <= 50 || thresh > 100))
		return -EINVAL;

	fs_info->bg_reclaim_threshold = thresh;
	WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh);

	return len;
}
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
	      btrfs_bg_reclaim_threshold_store);

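The bg_reclaim_threshold hunk pairs READ_ONCE() with WRITE_ONCE() because the sysfs store and the reclaim code touch the int concurrently without a lock. A standalone userspace analogue using relaxed C11 atomics, which give the same "no tearing, no compiler folding" guarantee:

/* Standalone analogue of the READ_ONCE()/WRITE_ONCE() pairing above. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int threshold;	/* stand-in for bg_reclaim_threshold */

static int threshold_show(void)
{
	return atomic_load_explicit(&threshold, memory_order_relaxed);
}

static int threshold_store(int thresh)
{
	/* Mirrors the new validation: 0 disables, else (50, 100]. */
	if (thresh != 0 && (thresh <= 50 || thresh > 100))
		return -1;
	atomic_store_explicit(&threshold, thresh, memory_order_relaxed);
	return 0;
}

int main(void)
{
	printf("%d\n", threshold_store(30));	/* rejected: -1 */
	printf("%d\n", threshold_store(75));	/* accepted: 0 */
	printf("%d\n", threshold_show());	/* 75 */
	return 0;
}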
/*
 * Per-filesystem information and stats.
 *
 * Path: /sys/fs/btrfs/<uuid>/
 */
static const struct attribute *btrfs_attrs[] = {
	BTRFS_ATTR_PTR(, label),
	BTRFS_ATTR_PTR(, nodesize),
@@ -1510,6 +1570,11 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);

/*
 * Information about one device.
 *
 * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/
 */
static struct attribute *devid_attrs[] = {
	BTRFS_ATTR_PTR(devid, error_stats),
	BTRFS_ATTR_PTR(devid, in_fs_metadata),
@@ -1799,6 +1864,11 @@ QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);

/*
 * Qgroup information.
 *
 * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/
 */
static struct attribute *qgroup_attrs[] = {
	BTRFS_ATTR_PTR(qgroup, referenced),
	BTRFS_ATTR_PTR(qgroup, exclusive),

|
||||
* we can only call btrfs_qgroup_account_extent() directly to test
|
||||
* quota.
|
||||
*/
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
test_err("couldn't find old roots: %d", ret);
|
||||
@@ -236,8 +235,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
ulist_free(new_roots);
|
||||
@@ -260,8 +258,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
|
||||
old_roots = NULL;
|
||||
new_roots = NULL;
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
test_err("couldn't find old roots: %d", ret);
|
||||
@@ -272,8 +269,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
|
||||
if (ret)
|
||||
return -EINVAL;
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
ulist_free(new_roots);
|
||||
@@ -324,8 +320,7 @@ static int test_multiple_refs(struct btrfs_root *root,
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
test_err("couldn't find old roots: %d", ret);
|
||||
@@ -337,8 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
ulist_free(new_roots);
|
||||
@@ -359,8 +353,7 @@ static int test_multiple_refs(struct btrfs_root *root,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
test_err("couldn't find old roots: %d", ret);
|
||||
@@ -372,8 +365,7 @@ static int test_multiple_refs(struct btrfs_root *root,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
ulist_free(new_roots);
|
||||
@@ -400,8 +392,7 @@ static int test_multiple_refs(struct btrfs_root *root,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
test_err("couldn't find old roots: %d", ret);
|
||||
@@ -413,8 +404,7 @@ static int test_multiple_refs(struct btrfs_root *root,
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
|
||||
false, false);
|
||||
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
|
||||
if (ret) {
|
||||
ulist_free(old_roots);
|
||||
ulist_free(new_roots);
|
||||
|
||||
@@ -24,6 +24,7 @@
#include "compression.h"
#include "volumes.h"
#include "misc.h"
#include "btrfs_inode.h"

/*
 * Error message should follow the following format:
@@ -873,13 +874,22 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
		}
	}

	if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
		     (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
		     (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
		     (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
		     (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
	if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
		      sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
		     (type & BTRFS_BLOCK_GROUP_RAID1 &&
		      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
		     (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
		      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
		     (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
		      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
		     (type & BTRFS_BLOCK_GROUP_RAID5 &&
		      num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
		     (type & BTRFS_BLOCK_GROUP_RAID6 &&
		      num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
		     (type & BTRFS_BLOCK_GROUP_DUP &&
		      num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
		     ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
		      num_stripes != 1))) {
		      num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
		chunk_err(leaf, chunk, logical,
			  "invalid num_stripes:sub_stripes %u:%u for profile %llu",
			  num_stripes, sub_stripes,
@@ -999,6 +1009,8 @@ static int check_inode_item(struct extent_buffer *leaf,
	u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
	u32 mode;
	int ret;
	u32 flags;
	u32 ro_flags;

	ret = check_inode_key(leaf, key, slot);
	if (unlikely(ret < 0))
@@ -1054,11 +1066,17 @@ static int check_inode_item(struct extent_buffer *leaf,
			       btrfs_inode_nlink(leaf, iitem));
		return -EUCLEAN;
	}
	if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
	btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags);
	if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) {
		inode_item_err(leaf, slot,
			       "unknown flags detected: 0x%llx",
			       btrfs_inode_flags(leaf, iitem) &
			       ~BTRFS_INODE_FLAG_MASK);
			       "unknown incompat flags detected: 0x%x", flags);
		return -EUCLEAN;
	}
	if (unlikely(!sb_rdonly(fs_info->sb) &&
		     (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) {
		inode_item_err(leaf, slot,
			"unknown ro-compat flags detected on writeable mount: 0x%x",
			ro_flags);
		return -EUCLEAN;
	}
	return 0;
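check_inode_item() above now validates the two halves separately after btrfs_inode_split_flags(). A standalone sketch of the split/combine helpers; the low-32/high-32 layout matches what the combine call in the tree-log hunk below implies, but treat it as illustrative:

/* Standalone sketch: one on-disk u64 carrying incompat flags in the
 * low 32 bits and ro-compat flags in the high 32 bits. */
#include <stdio.h>
#include <stdint.h>

static void split_flags(uint64_t inode_item_flags,
			uint32_t *flags, uint32_t *ro_flags)
{
	*flags = (uint32_t)inode_item_flags;
	*ro_flags = (uint32_t)(inode_item_flags >> 32);
}

static uint64_t combine_flags(uint32_t flags, uint32_t ro_flags)
{
	return (uint64_t)ro_flags << 32 | flags;
}

int main(void)
{
	uint32_t flags, ro_flags;

	split_flags(combine_flags(0x10, 0x1), &flags, &ro_flags);
	printf("incompat=0x%x ro=0x%x\n", flags, ro_flags); /* 0x10, 0x1 */
	return 0;
}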
@@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						ins.offset);
			if (ret == 0) {
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
@@ -3039,8 +3041,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
		list_del_init(&ctx->list);
		ctx->log_ret = error;
	}

	INIT_LIST_HEAD(&root->log_ctxs[index]);
}

/*
@@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
		goto out_wake_log_root;
	}

	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
	/*
	 * We know there can only be one task here, since we have not yet set
	 * root->log_commit[index1] to 0 and any task attempting to sync the
	 * log must wait for the previous log transaction to commit if it's
	 * still in progress or wait for the current log transaction commit if
	 * someone else already started it. We use <= and not < because the
	 * first log transaction has an ID of 0.
	 */
	ASSERT(root->last_log_commit <= log_transid);
	root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	mutex_lock(&log_root_tree->log_mutex);
@@ -3417,14 +3423,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}

/*
 * Check if an inode was logged in the current transaction. We can't always rely
 * on an inode's logged_trans value, because it's an in-memory only field and
 * therefore not persisted. This means that its value is lost if the inode gets
 * evicted and loaded again from disk (in which case it has a value of 0, and
 * certainly it is smaller than any possible transaction ID), when that happens
 * the full_sync flag is set in the inode's runtime flags, so on that case we
 * assume eviction happened and ignore the logged_trans value, assuming the
 * worst case, that the inode was logged before in the current transaction.
 * Check if an inode was logged in the current transaction. This may often
 * return some false positives, because logged_trans is an in memory only field,
 * not persisted anywhere. This is meant to be used in contexts where a false
 * positive has no functional consequences.
 */
static bool inode_logged(struct btrfs_trans_handle *trans,
			 struct btrfs_inode *inode)
@@ -3432,8 +3434,17 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
	if (inode->logged_trans == trans->transid)
		return true;

	if (inode->last_trans == trans->transid &&
	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
	/*
	 * The inode's logged_trans is always 0 when we load it (because it is
	 * not persisted in the inode item or elsewhere). So if it is 0 and the
	 * inode was last modified in the current transaction, the inode may
	 * have been logged before in the current transaction, then evicted and
	 * loaded again in the current transaction - or may have never been
	 * logged in the current transaction, but since we can not be sure, we
	 * have to assume it was, otherwise our callers can leave an
	 * inconsistent log.
	 */
	if (inode->logged_trans == 0 &&
	    inode->last_trans == trans->transid &&
	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
		return true;

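A standalone sketch of the widened inode_logged() predicate above, showing why an evicted-and-reloaded inode (logged_trans == 0) is treated as possibly logged:

/* Standalone sketch: with logged_trans unreliable after eviction, the
 * check errs toward "was logged" whenever the inode changed in this
 * transaction. */
#include <stdbool.h>
#include <stdio.h>

struct inode_model {
	unsigned long long logged_trans;	/* 0 after a reload from disk */
	unsigned long long last_trans;
};

static bool inode_logged_model(const struct inode_model *inode,
			       unsigned long long transid, bool log_recovering)
{
	if (inode->logged_trans == transid)
		return true;
	/* Possibly logged, evicted, reloaded: assume the worst. */
	if (inode->logged_trans == 0 && inode->last_trans == transid &&
	    !log_recovering)
		return true;
	return false;
}

int main(void)
{
	struct inode_model evicted = { .logged_trans = 0, .last_trans = 42 };

	printf("%d\n", inode_logged_model(&evicted, 42, false)); /* 1 */
	return 0;
}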
@@ -3913,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
			    u64 logged_isize)
{
	struct btrfs_map_token token;
	u64 flags;

	btrfs_init_map_token(&token, leaf);

@@ -3962,20 +3974,49 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
	btrfs_set_token_inode_transid(&token, item, trans->transid);
	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
					  BTRFS_I(inode)->ro_flags);
	btrfs_set_token_inode_flags(&token, item, flags);
	btrfs_set_token_inode_block_group(&token, item, 0);
}

static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct btrfs_inode *inode)
			  struct btrfs_inode *inode, bool inode_item_dropped)
{
	struct btrfs_inode_item *inode_item;
	int ret;

	ret = btrfs_insert_empty_item(trans, log, path,
				      &inode->location, sizeof(*inode_item));
	if (ret && ret != -EEXIST)
	/*
	 * If we are doing a fast fsync and the inode was logged before in the
	 * current transaction, then we know the inode was previously logged and
	 * it exists in the log tree. For performance reasons, in this case use
	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
	 * contention in case there are concurrent fsyncs for other inodes of the
	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
	 * already exists can also result in unnecessarily splitting a leaf.
	 */
	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
		ASSERT(ret <= 0);
		if (ret > 0)
			ret = -ENOENT;
	} else {
		/*
		 * This means it is the first fsync in the current transaction,
		 * so the inode item is not in the log and we need to insert it.
		 * We can never get -EEXIST because we are only called for a fast
		 * fsync and in case an inode eviction happens after the inode was
		 * logged before in the current transaction, when we load again
		 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
		 * flags and set ->logged_trans to 0.
		 */
		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
					      sizeof(*inode_item));
		ASSERT(ret != -EEXIST);
	}
	if (ret)
		return ret;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
@@ -4160,7 +4201,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
static int extent_cmp(void *priv, const struct list_head *a,
		      const struct list_head *b)
{
	struct extent_map *em1, *em2;
	const struct extent_map *em1, *em2;

	em1 = list_entry(a, struct extent_map, list);
	em2 = list_entry(b, struct extent_map, list);
@@ -5053,8 +5094,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
			/*
			 * Check the inode's logged_trans only instead of
			 * btrfs_inode_in_log(). This is because the last_log_commit of
			 * the inode is not updated when we only log that it exists and
			 * it has the full sync bit set (see btrfs_log_inode()).
			 * the inode is not updated when we only log that it exists (see
			 * btrfs_log_inode()).
			 */
			if (BTRFS_I(inode)->logged_trans == trans->transid) {
				spin_unlock(&BTRFS_I(inode)->lock);
@@ -5299,6 +5340,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
	bool need_log_inode_item = true;
	bool xattrs_logged = false;
	bool recursive_logging = false;
	bool inode_item_dropped = true;

	path = btrfs_alloc_path();
	if (!path)
@@ -5433,6 +5475,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
	} else {
		if (inode_only == LOG_INODE_ALL)
			fast_search = true;
		inode_item_dropped = false;
		goto log_extents;
	}

@@ -5466,7 +5509,7 @@ log_extents:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);
	if (need_log_inode_item) {
		err = log_inode_item(trans, log, dst_path, inode);
		err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
		if (err)
			goto out_unlock;
		/*
@@ -5572,6 +5615,13 @@ out_unlock:
static bool need_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode)
{
	/*
	 * If a directory was not modified, no dentries added or removed, we can
	 * and should avoid logging it.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
		return false;

	/*
	 * If this inode does not have new/updated/deleted xattrs since the last
	 * time it was logged and is flagged as logged in the current transaction,
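log_inode_item() above now picks between a plain search (cheap overwrite of an existing log item) and a full insert. A standalone sketch of just that decision:

/* Standalone sketch of the added branch: re-logging an existing inode
 * item only needs a search (no leaf splits, no parent write locks),
 * while the first log in a transaction must insert. */
#include <stdbool.h>
#include <stdio.h>

static const char *log_inode_item_path(bool inode_item_dropped,
				       unsigned long long logged_trans,
				       unsigned long long transid)
{
	if (!inode_item_dropped && logged_trans == transid)
		return "search existing item";	/* cheap overwrite */
	return "insert empty item";	/* first fsync this transaction */
}

int main(void)
{
	printf("%s\n", log_inode_item_path(false, 42, 42));
	printf("%s\n", log_inode_item_path(true, 0, 42));
	return 0;
}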
811
fs/btrfs/verity.c
Normal file
@@ -0,0 +1,811 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"

/*
* Implementation of the interface defined in struct fsverity_operations.
*
* The main question is how and where to store the verity descriptor and the
* Merkle tree. We store both in dedicated btree items in the filesystem tree,
* together with the rest of the inode metadata. This means we'll need to do
* extra work to encrypt them once encryption is supported in btrfs, but btrfs
* has a lot of careful code around i_size and it seems better to make a new key
* type than try and adjust all of our expectations for i_size.
*
* Note that this differs from the implementation in ext4 and f2fs, where
* this data is stored as if it were in the file, but past EOF. However, btrfs
* does not have a widespread mechanism for caching opaque metadata pages, so we
* do pretend that the Merkle tree pages themselves are past EOF for the
* purposes of caching them (as opposed to creating a virtual inode).
*
* fs verity items are stored under two different key types on disk.
* The descriptor items:
* [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
*
* At offset 0, we store a btrfs_verity_descriptor_item which tracks the
* size of the descriptor item and some extra data for encryption.
* Starting at offset 1, these hold the generic fs verity descriptor.
* The latter are opaque to btrfs, we just read and write them as a blob for
* the higher level verity code. The most common descriptor size is 256 bytes.
*
* The merkle tree items:
* [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
*
* These also start at offset 0, and correspond to the merkle tree bytes.
* So when fsverity asks for page 0 of the merkle tree, we pull up one page
* starting at offset 0 for this key type. These are also opaque to btrfs,
* we're blindly storing whatever fsverity sends down.
*
* Another important consideration is the fact that the Merkle tree data scales
* linearly with the size of the file (with 4K pages/blocks and SHA-256, it's
* ~1/127th the size) so for large files, writing the tree can be a lengthy
* operation. For that reason, we guard the whole enable verity operation
* (between begin_enable_verity and end_enable_verity) with an orphan item.
* Again, because the data can be pretty large, it's quite possible that we
* could run out of space writing it, so we try our best to handle errors by
* stopping and rolling back rather than aborting the victim transaction.
*/
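
The fixed-size bookkeeping blob stored at descriptor offset 0 can be sketched as follows (an assumed shape based on the accessors and checks used later in this file; the canonical definition lives in the btrfs on-disk format headers):

    /* Assumed layout of the offset-0 descriptor item; fields are
     * little-endian on disk. The reserved words must stay zero, see
     * the -EUCLEAN checks in btrfs_get_verity_descriptor() below. */
    struct btrfs_verity_descriptor_item {
        __le64 size;        /* length of the fs-verity descriptor blob */
        __le64 reserved[2]; /* must be zero */
        __u8 encryption;    /* reserved for future encryption support */
    };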

#define MERKLE_START_ALIGN 65536

/*
* Compute the logical file offset where we cache the Merkle tree.
*
* @inode: inode of the verity file
*
* For the purposes of caching the Merkle tree pages, as required by
* fs-verity, it is convenient to do size computations in terms of a file
* offset, rather than in terms of page indices.
*
* Use 64K to be sure it's past the last page in the file, even with 64K pages.
* That rounding operation itself can overflow loff_t, so we do it in u64 and
* check.
*
* Returns the file offset on success, negative error code on failure.
*/
static loff_t merkle_file_pos(const struct inode *inode)
{
u64 sz = inode->i_size;
u64 rounded = round_up(sz, MERKLE_START_ALIGN);

if (rounded > inode->i_sb->s_maxbytes)
return -EFBIG;

return rounded;
}
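
A worked example of the rounding, with assumed numbers: for a 100 KiB file, round_up(102400, 65536) is 131072, so Merkle pages are cached from file offset 128 KiB on, past the last data page even on a 64K-page machine. A standalone sketch of the same computation:

    /* Mirror of the kernel's round_up() for a power-of-two alignment
     * (illustrative sketch only, not part of the patch). */
    static u64 merkle_pos_sketch(u64 i_size)
    {
        const u64 align = 65536; /* MERKLE_START_ALIGN */
        return (i_size + align - 1) & ~(align - 1);
    }
    /* merkle_pos_sketch(102400) == 131072 (128 KiB) */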

/*
* Drop all the items for this inode with this key_type.
*
* @inode: inode to drop items for
* @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or
* BTRFS_VERITY_MERKLE_ITEM)
*
* Before doing a verity enable we cleanup any existing verity items.
* This is also used to clean up if a verity enable failed half way through.
*
* Returns number of dropped items on success, negative error code on failure.
*/
static int drop_verity_items(struct btrfs_inode *inode, u8 key_type)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = inode->root;
struct btrfs_path *path;
struct btrfs_key key;
int count = 0;
int ret;

path = btrfs_alloc_path();
if (!path)
return -ENOMEM;

while (1) {
/* 1 for the item being dropped */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}

/*
* Walk backwards through all the items until we find one that
* isn't from our key type or objectid
*/
key.objectid = btrfs_ino(inode);
key.type = key_type;
key.offset = (u64)-1;

ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = 0;
/* No more keys of this type, we're done */
if (path->slots[0] == 0)
break;
path->slots[0]--;
} else if (ret < 0) {
btrfs_end_transaction(trans);
goto out;
}

btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

/* No more keys of this type, we're done */
if (key.objectid != btrfs_ino(inode) || key.type != key_type)
break;

/*
* This shouldn't be a performance sensitive function because
* it's not used as part of truncate. If it ever becomes
* perf sensitive, change this to walk forward and bulk delete
* items
*/
ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
if (ret) {
btrfs_end_transaction(trans);
goto out;
}
count++;
btrfs_release_path(path);
btrfs_end_transaction(trans);
}
ret = count;
btrfs_end_transaction(trans);
out:
btrfs_free_path(path);
return ret;
}

/*
* Drop all verity items
*
* @inode: inode to drop verity items for
*
* In most contexts where we are dropping verity items, we want to do it for all
* the types of verity items, not a particular one.
*
* Returns: 0 on success, negative error code on failure.
*/
int btrfs_drop_verity_items(struct btrfs_inode *inode)
{
int ret;

ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY);
if (ret < 0)
return ret;
ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY);
if (ret < 0)
return ret;

return 0;
}

/*
* Insert and write inode items with a given key type and offset.
*
* @inode: inode to insert for
* @key_type: key type to insert
* @offset: item offset to insert at
* @src: source data to write
* @len: length of source data to write
*
* Write len bytes from src into items of up to 2K length.
* The inserted items will have key (ino, key_type, offset + off) where off is
* consecutively increasing from 0 up to the last item ending at offset + len.
*
* Returns 0 on success and a negative error code on failure.
*/
static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
const char *src, u64 len)
{
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long copy_bytes;
unsigned long src_offset = 0;
void *data;
int ret = 0;

path = btrfs_alloc_path();
if (!path)
return -ENOMEM;

while (len > 0) {
/* 1 for the new item being inserted */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}

key.objectid = btrfs_ino(inode);
key.type = key_type;
key.offset = offset;

/*
* Insert 2K at a time mostly to be friendly for smaller leaf
* size filesystems
*/
copy_bytes = min_t(u64, len, 2048);

ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes);
if (ret) {
btrfs_end_transaction(trans);
break;
}

leaf = path->nodes[0];

data = btrfs_item_ptr(leaf, path->slots[0], void);
write_extent_buffer(leaf, src + src_offset,
(unsigned long)data, copy_bytes);
offset += copy_bytes;
src_offset += copy_bytes;
len -= copy_bytes;

btrfs_release_path(path);
btrfs_end_transaction(trans);
}

btrfs_free_path(path);
return ret;
}
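
For instance (illustrative numbers): writing a 5000-byte buffer at offset 0 produces three items keyed (ino, key_type, 0), (ino, key_type, 2048) and (ino, key_type, 4096), with sizes 2048, 2048 and 904. A sketch of the chunking the loop above performs:

    /* write_key_bytes() chunking with len = 5000: */
    u64 off = 0, len = 5000;
    while (len > 0) {
        u64 n = min_t(u64, len, 2048);
        /* insert item (btrfs_ino(inode), key_type, off) of size n */
        off += n;
        len -= n;
    }
    /* iterations: (0, 2048), (2048, 2048), (4096, 904) */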

/*
* Read inode items of the given key type and offset from the btree.
*
* @inode: inode to read items of
* @key_type: key type to read
* @offset: item offset to read from
* @dest: Buffer to read into. This parameter has slightly tricky
* semantics. If it is NULL, the function will not do any copying
* and will just return the size of all the items up to len bytes.
* If dest_page is passed, then the function will kmap_local the
* page and ignore dest, but it must still be non-NULL to avoid the
* counting-only behavior.
* @len: length in bytes to read
* @dest_page: copy into this page instead of the dest buffer
*
* Helper function to read items from the btree. This returns the number of
* bytes read or < 0 for errors. We can return short reads if the items don't
* exist on disk or aren't big enough to fill the desired length. Supports
* reading into a provided buffer (dest) or into the page cache
*
* Returns number of bytes read or a negative error code on failure.
*/
static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
char *dest, u64 len, struct page *dest_page)
{
struct btrfs_path *path;
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_key key;
u64 item_end;
u64 copy_end;
int copied = 0;
u32 copy_offset;
unsigned long copy_bytes;
unsigned long dest_offset = 0;
void *data;
char *kaddr = dest;
int ret;

path = btrfs_alloc_path();
if (!path)
return -ENOMEM;

if (dest_page)
path->reada = READA_FORWARD;

key.objectid = btrfs_ino(inode);
key.type = key_type;
key.offset = offset;

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = 0;
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
}

while (len > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

if (key.objectid != btrfs_ino(inode) || key.type != key_type)
break;

item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset;

if (copied > 0) {
/*
* Once we've copied something, we want all of the items
* to be sequential
*/
if (key.offset != offset)
break;
} else {
/*
* Our initial offset might be in the middle of an
* item. Make sure it all makes sense.
*/
if (key.offset > offset)
break;
if (item_end <= offset)
break;
}

/* desc = NULL to just sum all the item lengths */
if (!dest)
copy_end = item_end;
else
copy_end = min(offset + len, item_end);

/* Number of bytes in this item we want to copy */
copy_bytes = copy_end - offset;

/* Offset from the start of item for copying */
copy_offset = offset - key.offset;

if (dest) {
if (dest_page)
kaddr = kmap_local_page(dest_page);

data = btrfs_item_ptr(leaf, path->slots[0], void);
read_extent_buffer(leaf, kaddr + dest_offset,
(unsigned long)data + copy_offset,
copy_bytes);

if (dest_page)
kunmap_local(kaddr);
}

offset += copy_bytes;
dest_offset += copy_bytes;
len -= copy_bytes;
copied += copy_bytes;

path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
/*
* We've reached the last slot in this leaf and we need
* to go to the next leaf.
*/
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
break;
} else if (ret > 0) {
ret = 0;
break;
}
}
}
out:
btrfs_free_path(path);
if (!ret)
ret = copied;
return ret;
}
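
Per the parameter semantics documented above, the helper supports three calling modes. A summary sketch (only the buffer and page modes appear verbatim later in this file):

    /* 1) Size query: dest == NULL, nothing is copied, the summed item
     *    length (up to len) is returned. */
    ret = read_key_bytes(inode, key_type, 0, NULL, (u64)-1, NULL);

    /* 2) Copy into a plain buffer. */
    ret = read_key_bytes(inode, key_type, 1, buf, buf_size, NULL);

    /* 3) Copy into a page: dest must be non-NULL but is ignored; the
     *    page is kmap_local()ed internally. */
    ret = read_key_bytes(inode, key_type, off, page_address(page),
                         PAGE_SIZE, page);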

/*
* Delete an fsverity orphan
*
* @trans: transaction to do the delete in
* @inode: inode to orphan
*
* Capture verity orphan specific logic that is repeated in the couple places
* we delete verity orphans. Specifically, handling ENOENT and ignoring inodes
* with 0 links.
*
* Returns zero on success or a negative error code on failure.
*/
static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
int ret;

/*
* If the inode has no links, it is either already unlinked, or was
* created with O_TMPFILE. In either case, it should have an orphan from
* that other operation. Rather than reference count the orphans, we
* simply ignore them here, because we only invoke the verity path in
* the orphan logic when i_nlink is 1.
*/
if (!inode->vfs_inode.i_nlink)
return 0;

ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
if (ret == -ENOENT)
ret = 0;
return ret;
}

/*
* Rollback in-progress verity if we encounter an error.
*
* @inode: inode verity had an error for
*
* We try to handle recoverable errors while enabling verity by rolling it back
* and just failing the operation, rather than having an fs level error no
* matter what. However, any error in rollback is unrecoverable.
*
* Returns 0 on success, negative error code on failure.
*/
static int rollback_verity(struct btrfs_inode *inode)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = inode->root;
int ret;

ASSERT(inode_is_locked(&inode->vfs_inode));
truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
ret = btrfs_drop_verity_items(inode);
if (ret) {
btrfs_handle_fs_error(root->fs_info, ret,
"failed to drop verity items in rollback %llu",
(u64)inode->vfs_inode.i_ino);
goto out;
}

/*
* 1 for updating the inode flag
* 1 for deleting the orphan
*/
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_handle_fs_error(root->fs_info, ret,
"failed to start transaction in verity rollback %llu",
(u64)inode->vfs_inode.i_ino);
goto out;
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = del_orphan(trans, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_end_transaction(trans);
out:
return ret;
}

/*
* Finalize making the file a valid verity file
*
* @inode: inode to be marked as verity
* @desc: contents of the verity descriptor to write (not NULL)
* @desc_size: size of the verity descriptor
*
* Do the actual work of finalizing verity after successfully writing the Merkle
* tree:
*
* - write out the descriptor items
* - mark the inode with the verity flag
* - delete the orphan item
* - mark the ro compat bit
* - clear the in progress bit
*
* Returns 0 on success, negative error code on failure.
*/
static int finish_verity(struct btrfs_inode *inode, const void *desc,
size_t desc_size)
{
struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_verity_descriptor_item item;
int ret;

/* Write out the descriptor item */
memset(&item, 0, sizeof(item));
btrfs_set_stack_verity_descriptor_size(&item, desc_size);
ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0,
(const char *)&item, sizeof(item));
if (ret)
goto out;

/* Write out the descriptor itself */
ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1,
desc, desc_size);
if (ret)
goto out;

/*
* 1 for updating the inode flag
* 1 for deleting the orphan
*/
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
ret = btrfs_update_inode(trans, root, inode);
if (ret)
goto end_trans;
ret = del_orphan(trans, inode);
if (ret)
goto end_trans;
clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
btrfs_set_fs_compat_ro(root->fs_info, VERITY);
end_trans:
btrfs_end_transaction(trans);
out:
return ret;

}

/*
* fsverity op that begins enabling verity.
*
* @filp: file to enable verity on
*
* Begin enabling fsverity for the file. We drop any existing verity items, add
* an orphan and set the in progress bit.
*
* Returns 0 on success, negative error code on failure.
*/
static int btrfs_begin_enable_verity(struct file *filp)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
struct btrfs_root *root = inode->root;
struct btrfs_trans_handle *trans;
int ret;

ASSERT(inode_is_locked(file_inode(filp)));

if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
return -EBUSY;

/*
* This should almost never do anything, but theoretically, it's
* possible that we failed to enable verity on a file, then were
* interrupted or failed while rolling back, failed to cleanup the
* orphan, and finally attempt to enable verity again.
*/
ret = btrfs_drop_verity_items(inode);
if (ret)
return ret;

/* 1 for the orphan item */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);

ret = btrfs_orphan_add(trans, inode);
if (!ret)
set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
btrfs_end_transaction(trans);

return 0;
}

/*
* fsverity op that ends enabling verity.
*
* @filp: file we are finishing enabling verity on
* @desc: verity descriptor to write out (NULL in error conditions)
* @desc_size: size of the verity descriptor (variable with signatures)
* @merkle_tree_size: size of the merkle tree in bytes
*
* If desc is null, then VFS is signaling an error occurred during verity
* enable, and we should try to rollback. Otherwise, attempt to finish verity.
*
* Returns 0 on success, negative error code on error.
*/
static int btrfs_end_enable_verity(struct file *filp, const void *desc,
size_t desc_size, u64 merkle_tree_size)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(filp));
int ret = 0;
int rollback_ret;

ASSERT(inode_is_locked(file_inode(filp)));

if (desc == NULL)
goto rollback;

ret = finish_verity(inode, desc, desc_size);
if (ret)
goto rollback;
return ret;

rollback:
rollback_ret = rollback_verity(inode);
if (rollback_ret)
btrfs_err(inode->root->fs_info,
"failed to rollback verity items: %d", rollback_ret);
return ret;
}

/*
* fsverity op that gets the struct fsverity_descriptor.
*
* @inode: inode to get the descriptor of
* @buf: output buffer for the descriptor contents
* @buf_size: size of the output buffer. 0 to query the size
*
* fsverity does a two pass setup for reading the descriptor, in the first pass
* it calls with buf_size = 0 to query the size of the descriptor, and then in
* the second pass it actually reads the descriptor off disk.
*
* Returns the size on success or a negative error code on failure.
*/
static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
size_t buf_size)
{
u64 true_size;
int ret = 0;
struct btrfs_verity_descriptor_item item;

memset(&item, 0, sizeof(item));
ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0,
(char *)&item, sizeof(item), NULL);
if (ret < 0)
return ret;

if (item.reserved[0] != 0 || item.reserved[1] != 0)
return -EUCLEAN;

true_size = btrfs_stack_verity_descriptor_size(&item);
if (true_size > INT_MAX)
return -EUCLEAN;

if (buf_size == 0)
return true_size;
if (buf_size < true_size)
return -ERANGE;

ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1,
buf, buf_size, NULL);
if (ret < 0)
return ret;
if (ret != true_size)
return -EIO;

return true_size;
}
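
Seen from the caller (fsverity's generic code), the two passes look roughly like this (sketch only, error handling trimmed):

    /* Pass 1: buf_size == 0 queries the descriptor size. */
    int size = btrfs_get_verity_descriptor(inode, NULL, 0);

    /* Pass 2: read the descriptor into a buffer of that size. */
    void *buf = kmalloc(size, GFP_KERNEL);
    size = btrfs_get_verity_descriptor(inode, buf, size);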

/*
* fsverity op that reads and caches a merkle tree page.
*
* @inode: inode to read a merkle tree page for
* @index: page index relative to the start of the merkle tree
* @num_ra_pages: number of pages to readahead. Optional, we ignore it
*
* The Merkle tree is stored in the filesystem btree, but its pages are cached
* with a logical position past EOF in the inode's mapping.
*
* Returns the page we read, or an ERR_PTR on error.
*/
static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
struct page *page;
u64 off = (u64)index << PAGE_SHIFT;
loff_t merkle_pos = merkle_file_pos(inode);
int ret;

if (merkle_pos < 0)
return ERR_PTR(merkle_pos);
if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE)
return ERR_PTR(-EFBIG);
index += merkle_pos >> PAGE_SHIFT;
again:
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (page) {
if (PageUptodate(page))
return page;

lock_page(page);
/*
* We only insert uptodate pages, so !Uptodate has to be
* an error
*/
if (!PageUptodate(page)) {
unlock_page(page);
put_page(page);
return ERR_PTR(-EIO);
}
unlock_page(page);
return page;
}

page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
if (!page)
return ERR_PTR(-ENOMEM);

/*
* Merkle item keys are indexed from byte 0 in the merkle tree.
* They have the form:
*
* [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
*/
ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
page_address(page), PAGE_SIZE, page);
if (ret < 0) {
put_page(page);
return ERR_PTR(ret);
}
if (ret < PAGE_SIZE)
memzero_page(page, ret, PAGE_SIZE - ret);

SetPageUptodate(page);
ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);

if (!ret) {
/* Inserted and ready for fsverity */
unlock_page(page);
} else {
put_page(page);
/* Did someone race us into inserting this page? */
if (ret == -EEXIST)
goto again;
page = ERR_PTR(ret);
}
return page;
}
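
With assumed numbers, 4 KiB pages and a Merkle region starting at file offset 128 KiB, the index shift above places tree page 0 at page-cache index 32, tree page 1 at index 33, and so on:

    /* index mapping sketch: merkle_pos = 131072, PAGE_SHIFT = 12 */
    pgoff_t cache_index = (merkle_pos >> PAGE_SHIFT) + tree_page_index;
    /* tree page 0 -> cache index 32, tree page 1 -> 33, ... */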

/*
* fsverity op that writes a Merkle tree block into the btree.
*
* @inode: inode to write a Merkle tree block for
* @buf: Merkle tree data block to write
* @index: index of the block in the Merkle tree
* @log_blocksize: log base 2 of the Merkle tree block size
*
* Note that the block size could be different from the page size, so it is not
* safe to assume that index is a page index.
*
* Returns 0 on success or negative error code on failure
*/
static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf,
u64 index, int log_blocksize)
{
u64 off = index << log_blocksize;
u64 len = 1ULL << log_blocksize;
loff_t merkle_pos = merkle_file_pos(inode);

if (merkle_pos < 0)
return merkle_pos;
if (merkle_pos > inode->i_sb->s_maxbytes - off - len)
return -EFBIG;

return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY,
off, buf, len);
}

const struct fsverity_operations btrfs_verityops = {
.begin_enable_verity = btrfs_begin_enable_verity,
.end_enable_verity = btrfs_end_enable_verity,
.get_verity_descriptor = btrfs_get_verity_descriptor,
.read_merkle_tree_page = btrfs_read_merkle_tree_page,
.write_merkle_tree_block = btrfs_write_merkle_tree_block,
};
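
These ops are driven by the generic fs-verity ioctls rather than called directly. A minimal userspace sketch of enabling verity on a btrfs file (standard uapi; error handling omitted, and the file must be opened read-only with no other writers):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/fsverity.h>

    int fd = open("/mnt/btrfs/file", O_RDONLY);
    struct fsverity_enable_arg arg = {
        .version = 1,
        .hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
        .block_size = 4096,
    };
    /* Drives btrfs_begin_enable_verity(), the Merkle tree writes and
     * btrfs_end_enable_verity() above. */
    ioctl(fd, FS_IOC_ENABLE_VERITY, &arg);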
@@ -38,7 +38,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 2,
.dev_stripes = 1,
.devs_max = 0, /* 0 == as many as possible */
.devs_min = 4,
.devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
@@ -103,7 +103,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 2,
.devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
@@ -153,6 +153,32 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};

/*
* Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
* can be used as index to access btrfs_raid_array[].
*/
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_RAID10)
return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
return BTRFS_RAID_RAID1;
else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
return BTRFS_RAID_RAID1C3;
else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
return BTRFS_RAID_RAID1C4;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
return BTRFS_RAID_RAID0;
else if (flags & BTRFS_BLOCK_GROUP_RAID5)
return BTRFS_RAID_RAID5;
else if (flags & BTRFS_BLOCK_GROUP_RAID6)
return BTRFS_RAID_RAID6;

return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
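
Callers index the attribute table through this helper, the same pattern as when it was a volumes.h inline, e.g.:

    const int index = btrfs_bg_flags_to_raid_index(bg_flags);
    const int ncopies = btrfs_raid_array[index].ncopies;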

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -404,44 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void)
}
}

/*
* Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
* Returned struct is not linked onto any lists and must be destroyed using
* btrfs_free_device.
*/
static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev;

dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return ERR_PTR(-ENOMEM);

/*
* Preallocate a bio that's always going to be used for flushing device
* barriers and matches the device lifespan
*/
dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
if (!dev->flush_bio) {
kfree(dev);
return ERR_PTR(-ENOMEM);
}

INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);

atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(fs_info, &dev->alloc_state,
IO_TREE_DEVICE_ALLOC_STATE, NULL);

return dev;
}

static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
@@ -1130,6 +1118,9 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->rw_devices--;
}

if (device->devid == BTRFS_DEV_REPLACE_DEVID)
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
fs_devices->missing_devices--;

@@ -1228,7 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
static int devid_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
struct btrfs_device *dev1, *dev2;
const struct btrfs_device *dev1, *dev2;

dev1 = list_entry(a, struct btrfs_device, dev_list);
dev2 = list_entry(b, struct btrfs_device, dev_list);
@@ -1598,14 +1589,9 @@ again:
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
goto out;
}

while (1) {
l = path->nodes[0];
@@ -1759,48 +1745,6 @@ out:
return ret;
}

static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 chunk_offset, u64 start, u64 num_bytes)
{
int ret;
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;

WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;

key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
if (ret)
goto out;

leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
btrfs_set_dev_extent_chunk_tree(leaf, extent,
BTRFS_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
@@ -2003,12 +1947,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;

if (num_devices < btrfs_raid_array[i].devs_min) {
int ret = btrfs_raid_array[i].mindev_error;

if (ret)
return ret;
}
if (num_devices < btrfs_raid_array[i].devs_min)
return btrfs_raid_array[i].mindev_error;
}

return 0;
@@ -2137,7 +2077,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,

if (IS_ERR(device)) {
if (PTR_ERR(device) == -ENOENT &&
strcmp(device_path, "missing") == 0)
device_path && strcmp(device_path, "missing") == 0)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = PTR_ERR(device);
@@ -3622,10 +3562,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
const int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;

if (nparity)
return num_stripes - nparity;
else
return num_stripes / ncopies;
return (num_stripes - nparity) / ncopies;
}
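
The folded expression is equivalent to the removed branch because nparity is non-zero only for the parity profiles, and for those the change assumes the raid table keeps ncopies at 1. With illustrative numbers:

    /* RAID6, 6 stripes:  nparity = 2, ncopies = 1 -> (6 - 2) / 1 = 4
     * RAID10, 4 stripes: nparity = 0, ncopies = 2 -> (4 - 0) / 2 = 2 */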

/* [pstart, pend) */
@@ -4025,6 +3962,13 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;

if (fs_info->sectorsize < PAGE_SIZE &&
bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
btrfs_err(fs_info,
"RAID56 is not yet supported for sectorsize %u with page size %lu",
fs_info->sectorsize, PAGE_SIZE);
return false;
}
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@@ -5463,56 +5407,6 @@ out:
return block_group;
}

/*
* This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
*
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
* phases.
*/
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_device *device;
struct extent_map *em;
struct map_lookup *map;
u64 dev_offset;
u64 stripe_size;
int i;
int ret = 0;

em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
if (IS_ERR(em))
return PTR_ERR(em);

map = em->map_lookup;
stripe_size = em->orig_block_len;

/*
* Take the device list mutex to prevent races with the final phase of
* a device replace operation that replaces the device object associated
* with the map's stripes, because the device object's id can change
* at any time during that final phase of the device replace operation
* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
* replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
* resulting in persisting a device extent item with such ID.
*/
mutex_lock(&fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;

ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
dev_offset, stripe_size);
if (ret)
break;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);

free_extent_map(em);
return ret;
}

/*
* This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
* phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
@@ -6923,9 +6817,31 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);

dev = __alloc_device(fs_info);
if (IS_ERR(dev))
return dev;
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return ERR_PTR(-ENOMEM);

/*
* Preallocate a bio that's always going to be used for flushing device
* barriers and matches the device lifespan
*/
dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
if (!dev->flush_bio) {
kfree(dev);
return ERR_PTR(-ENOMEM);
}

INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);

atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(fs_info, &dev->alloc_state,
IO_TREE_DEVICE_ALLOC_STATE, NULL);

if (devid)
tmp = *devid;
@@ -6961,15 +6877,7 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,

static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
int index = btrfs_bg_flags_to_raid_index(type);
int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;
int data_stripes;

if (nparity)
data_stripes = num_stripes - nparity;
else
data_stripes = num_stripes / ncopies;
const int data_stripes = calc_data_stripes(type, num_stripes);

return div_u64(chunk_len, data_stripes);
}
@@ -8144,7 +8052,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
goto out;

if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_item(root, path);
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */

@@ -508,8 +508,6 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
@@ -568,32 +566,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
atomic_inc(&dev->dev_stats_ccnt);
}

/*
* Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
* can be used as index to access btrfs_raid_array[].
*/
static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_RAID10)
return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
return BTRFS_RAID_RAID1;
else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
return BTRFS_RAID_RAID1C3;
else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
return BTRFS_RAID_RAID1C4;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
return BTRFS_RAID_RAID0;
else if (flags & BTRFS_BLOCK_GROUP_RAID5)
return BTRFS_RAID_RAID5;
else if (flags & BTRFS_BLOCK_GROUP_RAID6)
return BTRFS_RAID_RAID6;

return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

void btrfs_commit_device_sizes(struct btrfs_transaction *trans);

struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
@@ -603,6 +575,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
struct block_device *bdev,
const char *device_path);

enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);

@@ -121,12 +121,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;

out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
out_page = alloc_page(GFP_NOFS);
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
cpage_out = page_address(out_page);
pages[0] = out_page;
nr_pages = 1;
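
This hunk and the remaining compression hunks apply one pattern: allocate the page without __GFP_HIGHMEM so it always has a kernel mapping, then read it through page_address() instead of a transient kmap()/kunmap() pair. In sketch form (not a literal hunk):

    /* before: possibly-highmem page, temporary mapping required */
    page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
    buf = kmap(page);
    /* ... use buf ... */
    kunmap(page);

    /* after: lowmem page, page_address() is valid for its lifetime */
    page = alloc_page(GFP_NOFS);
    buf = page_address(page);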
|
||||
|
||||
@@ -148,26 +148,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
int i;
|
||||
|
||||
for (i = 0; i < in_buf_pages; i++) {
|
||||
if (in_page) {
|
||||
kunmap(in_page);
|
||||
if (in_page)
|
||||
put_page(in_page);
|
||||
}
|
||||
in_page = find_get_page(mapping,
|
||||
start >> PAGE_SHIFT);
|
||||
data_in = kmap(in_page);
|
||||
data_in = page_address(in_page);
|
||||
memcpy(workspace->buf + i * PAGE_SIZE,
|
||||
data_in, PAGE_SIZE);
|
||||
start += PAGE_SIZE;
|
||||
}
|
||||
workspace->strm.next_in = workspace->buf;
|
||||
} else {
|
||||
if (in_page) {
|
||||
kunmap(in_page);
|
||||
if (in_page)
|
||||
put_page(in_page);
|
||||
}
|
||||
in_page = find_get_page(mapping,
|
||||
start >> PAGE_SHIFT);
|
||||
data_in = kmap(in_page);
|
||||
data_in = page_address(in_page);
|
||||
start += PAGE_SIZE;
|
||||
workspace->strm.next_in = data_in;
|
||||
}
|
||||
@@ -196,18 +192,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
* the stream end if required
|
||||
*/
|
||||
if (workspace->strm.avail_out == 0) {
|
||||
kunmap(out_page);
|
||||
if (nr_pages == nr_dest_pages) {
|
||||
out_page = NULL;
|
||||
ret = -E2BIG;
|
||||
goto out;
|
||||
}
|
||||
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
out_page = alloc_page(GFP_NOFS);
|
||||
if (out_page == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
cpage_out = kmap(out_page);
|
||||
cpage_out = page_address(out_page);
|
||||
pages[nr_pages] = out_page;
|
||||
nr_pages++;
|
||||
workspace->strm.avail_out = PAGE_SIZE;
|
||||
@@ -234,18 +229,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
goto out;
|
||||
} else if (workspace->strm.avail_out == 0) {
|
||||
/* get another page for the stream end */
|
||||
kunmap(out_page);
|
||||
if (nr_pages == nr_dest_pages) {
|
||||
out_page = NULL;
|
||||
ret = -E2BIG;
|
||||
goto out;
|
||||
}
|
||||
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
out_page = alloc_page(GFP_NOFS);
|
||||
if (out_page == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
cpage_out = kmap(out_page);
|
||||
cpage_out = page_address(out_page);
|
||||
pages[nr_pages] = out_page;
|
||||
nr_pages++;
|
||||
workspace->strm.avail_out = PAGE_SIZE;
|
||||
@@ -264,13 +258,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
*total_in = workspace->strm.total_in;
|
||||
out:
|
||||
*out_pages = nr_pages;
|
||||
if (out_page)
|
||||
kunmap(out_page);
|
||||
|
||||
if (in_page) {
|
||||
kunmap(in_page);
|
||||
if (in_page)
|
||||
put_page(in_page);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -286,10 +275,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
|
||||
unsigned long buf_start;
|
||||
struct page **pages_in = cb->compressed_pages;
|
||||
u64 disk_start = cb->start;
|
||||
struct bio *orig_bio = cb->orig_bio;
|
||||
|
||||
data_in = kmap(pages_in[page_in_index]);
|
||||
data_in = page_address(pages_in[page_in_index]);
|
||||
workspace->strm.next_in = data_in;
|
||||
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
|
||||
workspace->strm.total_in = 0;
|
||||
@@ -311,7 +298,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
|
||||
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
|
||||
pr_warn("BTRFS: inflateInit failed\n");
|
||||
kunmap(pages_in[page_in_index]);
|
||||
return -EIO;
|
||||
}
|
||||
while (workspace->strm.total_in < srclen) {
|
||||
@@ -326,9 +312,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
if (buf_start == total_out)
|
||||
break;
|
||||
|
||||
ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
|
||||
total_out, disk_start,
|
||||
orig_bio);
|
||||
ret2 = btrfs_decompress_buf2page(workspace->buf,
|
||||
total_out - buf_start, cb, buf_start);
|
||||
if (ret2 == 0) {
|
||||
ret = 0;
|
||||
goto done;
|
||||
@@ -339,17 +324,16 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
|
||||
if (workspace->strm.avail_in == 0) {
|
||||
unsigned long tmp;
|
||||
kunmap(pages_in[page_in_index]);
|
||||
|
||||
page_in_index++;
|
||||
if (page_in_index >= total_pages_in) {
|
||||
data_in = NULL;
|
||||
break;
|
||||
}
|
||||
data_in = kmap(pages_in[page_in_index]);
|
||||
data_in = page_address(pages_in[page_in_index]);
|
||||
workspace->strm.next_in = data_in;
|
||||
tmp = srclen - workspace->strm.total_in;
|
||||
workspace->strm.avail_in = min(tmp,
|
||||
PAGE_SIZE);
|
||||
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
|
||||
}
|
||||
}
|
||||
if (ret != Z_STREAM_END)
|
||||
@@ -358,10 +342,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
ret = 0;
|
||||
done:
|
||||
zlib_inflateEnd(&workspace->strm);
|
||||
if (data_in)
|
||||
kunmap(pages_in[page_in_index]);
|
||||
if (!ret)
|
||||
zero_fill_bio(orig_bio);
|
||||
zero_fill_bio(cb->orig_bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -245,7 +245,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
|
||||
goto out;
|
||||
|
||||
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
|
||||
ret = btrfs_next_item(root, path);
|
||||
ret = btrfs_next_leaf(root, path);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
/* No dev extents at all? Not good */
|
||||
@@ -296,7 +296,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
|
||||
struct btrfs_fs_info *fs_info = device->fs_info;
|
||||
struct btrfs_zoned_device_info *zone_info = NULL;
|
||||
struct block_device *bdev = device->bdev;
|
||||
struct request_queue *queue = bdev_get_queue(bdev);
|
||||
sector_t nr_sectors;
|
||||
sector_t sector = 0;
|
||||
struct blk_zone *zones = NULL;
|
||||
@@ -348,19 +347,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
|
||||
|
||||
nr_sectors = bdev_nr_sectors(bdev);
|
||||
zone_info->zone_size_shift = ilog2(zone_info->zone_size);
|
||||
zone_info->max_zone_append_size =
|
||||
(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
|
||||
zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
|
||||
if (!IS_ALIGNED(nr_sectors, zone_sectors))
|
||||
zone_info->nr_zones++;
|
||||
|
||||
if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
|
||||
btrfs_err(fs_info, "zoned: device %pg does not support zone append",
|
||||
bdev);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
|
||||
if (!zone_info->seq_zones) {
|
||||
ret = -ENOMEM;
|
||||
@@ -529,7 +519,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
|
||||
u64 zoned_devices = 0;
|
||||
u64 nr_devices = 0;
|
||||
u64 zone_size = 0;
|
||||
u64 max_zone_append_size = 0;
|
||||
const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
|
||||
int ret = 0;
|
||||
|
||||
@@ -565,11 +554,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
if (!max_zone_append_size ||
|
||||
(zone_info->max_zone_append_size &&
|
||||
zone_info->max_zone_append_size < max_zone_append_size))
|
||||
max_zone_append_size =
|
||||
zone_info->max_zone_append_size;
|
||||
}
|
||||
nr_devices++;
|
||||
}
|
||||
@@ -619,7 +603,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
|
||||
}
|
||||
|
||||
fs_info->zone_size = zone_size;
|
||||
fs_info->max_zone_append_size = max_zone_append_size;
|
||||
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
|
||||
|
||||
/*
|
||||
@@ -1318,9 +1301,6 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
|
||||
if (!btrfs_is_zoned(fs_info))
|
||||
return false;
|
||||
|
||||
if (!fs_info->max_zone_append_size)
|
||||
return false;
|
||||
|
||||
if (!is_data_inode(&inode->vfs_inode))
|
||||
return false;
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ struct btrfs_zoned_device_info {
|
||||
*/
|
||||
u64 zone_size;
|
||||
u8 zone_size_shift;
|
||||
u64 max_zone_append_size;
|
||||
u32 nr_zones;
|
||||
unsigned long *seq_zones;
|
||||
unsigned long *empty_zones;
|
||||
|
||||
@@ -399,19 +399,19 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
|
||||
/* map in the first page of input data */
|
||||
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
|
||||
workspace->in_buf.src = kmap(in_page);
|
||||
workspace->in_buf.src = page_address(in_page);
|
||||
workspace->in_buf.pos = 0;
|
||||
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
|
||||
|
||||
|
||||
/* Allocate and map in the output buffer */
|
||||
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
out_page = alloc_page(GFP_NOFS);
|
||||
if (out_page == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
pages[nr_pages++] = out_page;
|
||||
workspace->out_buf.dst = kmap(out_page);
|
||||
workspace->out_buf.dst = page_address(out_page);
|
||||
workspace->out_buf.pos = 0;
|
||||
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
|
||||
|
||||
@@ -446,19 +446,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
if (workspace->out_buf.pos == workspace->out_buf.size) {
|
||||
tot_out += PAGE_SIZE;
|
||||
max_out -= PAGE_SIZE;
|
||||
kunmap(out_page);
|
||||
if (nr_pages == nr_dest_pages) {
|
||||
out_page = NULL;
|
||||
ret = -E2BIG;
|
||||
goto out;
|
||||
}
|
||||
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
out_page = alloc_page(GFP_NOFS);
|
||||
if (out_page == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
pages[nr_pages++] = out_page;
|
||||
workspace->out_buf.dst = kmap(out_page);
|
||||
workspace->out_buf.dst = page_address(out_page);
|
||||
workspace->out_buf.pos = 0;
|
||||
workspace->out_buf.size = min_t(size_t, max_out,
|
||||
PAGE_SIZE);
|
||||
@@ -473,13 +472,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
/* Check if we need more input */
|
||||
if (workspace->in_buf.pos == workspace->in_buf.size) {
|
||||
tot_in += PAGE_SIZE;
|
||||
kunmap(in_page);
|
||||
put_page(in_page);
|
||||
|
||||
start += PAGE_SIZE;
|
||||
len -= PAGE_SIZE;
|
||||
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
|
||||
workspace->in_buf.src = kmap(in_page);
|
||||
workspace->in_buf.src = page_address(in_page);
|
||||
workspace->in_buf.pos = 0;
|
||||
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
|
||||
}
|
||||
@@ -506,19 +504,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
|
||||
tot_out += PAGE_SIZE;
|
||||
max_out -= PAGE_SIZE;
|
||||
kunmap(out_page);
|
||||
if (nr_pages == nr_dest_pages) {
|
||||
out_page = NULL;
|
||||
ret = -E2BIG;
|
||||
goto out;
|
||||
}
|
||||
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
|
||||
out_page = alloc_page(GFP_NOFS);
|
||||
if (out_page == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
pages[nr_pages++] = out_page;
|
||||
workspace->out_buf.dst = kmap(out_page);
|
||||
workspace->out_buf.dst = page_address(out_page);
|
||||
workspace->out_buf.pos = 0;
|
||||
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
|
||||
}
|
||||
@@ -534,12 +531,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
|
||||
out:
|
||||
*out_pages = nr_pages;
|
||||
/* Cleanup */
|
||||
if (in_page) {
|
||||
kunmap(in_page);
|
||||
if (in_page)
|
||||
put_page(in_page);
|
||||
}
|
||||
if (out_page)
|
||||
kunmap(out_page);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -547,8 +540,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
{
|
||||
struct workspace *workspace = list_entry(ws, struct workspace, list);
|
||||
struct page **pages_in = cb->compressed_pages;
|
||||
u64 disk_start = cb->start;
|
||||
struct bio *orig_bio = cb->orig_bio;
|
||||
size_t srclen = cb->compressed_len;
|
||||
ZSTD_DStream *stream;
|
||||
int ret = 0;
|
||||
@@ -565,7 +556,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
goto done;
|
||||
}
|
||||
|
||||
workspace->in_buf.src = kmap(pages_in[page_in_index]);
|
||||
workspace->in_buf.src = page_address(pages_in[page_in_index]);
|
||||
workspace->in_buf.pos = 0;
|
||||
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
|
||||
|
||||
@@ -589,7 +580,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
workspace->out_buf.pos = 0;
|
||||
|
||||
ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
|
||||
buf_start, total_out, disk_start, orig_bio);
|
||||
total_out - buf_start, cb, buf_start);
|
||||
if (ret == 0)
|
||||
break;
|
||||
|
||||
@@ -601,23 +592,21 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
|
||||
break;
|
||||
|
||||
if (workspace->in_buf.pos == workspace->in_buf.size) {
|
||||
kunmap(pages_in[page_in_index++]);
|
||||
page_in_index++;
|
||||
if (page_in_index >= total_pages_in) {
|
||||
workspace->in_buf.src = NULL;
|
||||
ret = -EIO;
|
||||
goto done;
|
||||
}
|
||||
srclen -= PAGE_SIZE;
|
||||
workspace->in_buf.src = kmap(pages_in[page_in_index]);
|
||||
workspace->in_buf.src = page_address(pages_in[page_in_index]);
|
||||
workspace->in_buf.pos = 0;
|
||||
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
|
||||
}
|
||||
}
|
||||
ret = 0;
|
||||
zero_fill_bio(orig_bio);
|
||||
zero_fill_bio(cb->orig_bio);
|
||||
done:
|
||||
if (workspace->in_buf.src)
|
||||
kunmap(pages_in[page_in_index]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -4,19 +4,16 @@ config CIFS
|
||||
depends on INET
|
||||
select NLS
|
||||
select CRYPTO
|
||||
select CRYPTO_MD4
|
||||
select CRYPTO_MD5
|
||||
select CRYPTO_SHA256
|
||||
select CRYPTO_SHA512
|
||||
select CRYPTO_CMAC
|
||||
select CRYPTO_HMAC
|
||||
select CRYPTO_LIB_ARC4
|
||||
select CRYPTO_AEAD2
|
||||
select CRYPTO_CCM
|
||||
select CRYPTO_GCM
|
||||
select CRYPTO_ECB
|
||||
select CRYPTO_AES
|
||||
select CRYPTO_LIB_DES
|
||||
select KEYS
|
||||
select DNS_RESOLVER
|
||||
select ASN1
|
||||
@@ -85,33 +82,6 @@ config CIFS_ALLOW_INSECURE_LEGACY
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config CIFS_WEAK_PW_HASH
|
||||
bool "Support legacy servers which use weaker LANMAN security"
|
||||
depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY
|
||||
help
|
||||
Modern CIFS servers including Samba and most Windows versions
|
||||
(since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
|
||||
security mechanisms. These hash the password more securely
|
||||
than the mechanisms used in the older LANMAN version of the
|
||||
SMB protocol but LANMAN based authentication is needed to
|
||||
establish sessions with some old SMB servers.
|
||||
|
||||
Enabling this option allows the cifs module to mount to older
|
||||
LANMAN based servers such as OS/2 and Windows 95, but such
|
||||
mounts may be less secure than mounts using NTLM or more recent
|
||||
security mechanisms if you are on a public network. Unless you
|
||||
have a need to access old SMB servers (and are on a private
|
||||
network) you probably want to say N. Even if this support
|
||||
is enabled in the kernel build, LANMAN authentication will not be
|
||||
used automatically. At runtime LANMAN mounts are disabled but
|
||||
can be set to required (or optional) either in
|
||||
/proc/fs/cifs (see Documentation/admin-guide/cifs/usage.rst for
|
||||
more detail) or via an option on the mount command. This support
|
||||
is disabled by default in order to reduce the possibility of a
|
||||
downgrade attack.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config CIFS_UPCALL
|
||||
bool "Kerberos/SPNEGO advanced session setup"
|
||||
depends on CIFS
|
||||
|
||||
@@ -250,9 +250,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
seq_printf(m, ",ALLOW_INSECURE_LEGACY");
#endif
#ifdef CONFIG_CIFS_WEAK_PW_HASH
seq_printf(m, ",WEAK_PW_HASH");
#endif
#ifdef CONFIG_CIFS_POSIX
seq_printf(m, ",CIFS_POSIX");
#endif
@@ -929,14 +926,6 @@ cifs_security_flags_handle_must_flags(unsigned int *flags)
*flags = CIFSSEC_MUST_NTLMSSP;
else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
*flags = CIFSSEC_MUST_NTLMV2;
else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
*flags = CIFSSEC_MUST_NTLM;
else if (CIFSSEC_MUST_LANMAN &&
(*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
*flags = CIFSSEC_MUST_LANMAN;
else if (CIFSSEC_MUST_PLNTXT &&
(*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
*flags = CIFSSEC_MUST_PLNTXT;

*flags |= signflags;
}

@@ -147,8 +147,6 @@ static int cifs_swn_send_register_message(struct cifs_swn_reg *swnreg)
goto nlmsg_fail;
}
break;
case LANMAN:
case NTLM:
case NTLMv2:
case RawNTLMSSP:
ret = cifs_swn_auth_info_ntlm(swnreg->tcon, skb);

@@ -360,12 +360,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
NO_MAP_UNI_RSVD);
} else {
len = strnlen(src, maxlen);
len++;
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return NULL;
strlcpy(dst, src, len);
dst = kstrndup(src, maxlen, GFP_KERNEL);
}

return dst;

@@ -22,7 +22,7 @@
#include <linux/random.h>
#include <linux/highmem.h>
#include <linux/fips.h>
#include <crypto/arc4.h>
#include "../cifs_common/arc4.h"
#include <crypto/aead.h>

int __cifs_calc_signature(struct smb_rqst *rqst,
@@ -250,87 +250,6 @@ int cifs_verify_signature(struct smb_rqst *rqst,

}

/* first calculate 24 bytes ntlm response and then 16 byte session key */
int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
int rc = 0;
unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
char temp_key[CIFS_SESS_KEY_SIZE];

if (!ses)
return -EINVAL;

ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
if (!ses->auth_key.response)
return -ENOMEM;

ses->auth_key.len = temp_len;

rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
if (rc) {
cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n",
__func__, rc);
return rc;
}

rc = E_md4hash(ses->password, temp_key, nls_cp);
if (rc) {
cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
__func__, rc);
return rc;
}

rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
if (rc)
cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n",
__func__, rc);

return rc;
}

#ifdef CONFIG_CIFS_WEAK_PW_HASH
int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
char *lnm_session_key)
{
int i, len;
int rc;
char password_with_pad[CIFS_ENCPWD_SIZE] = {0};

if (password) {
for (len = 0; len < CIFS_ENCPWD_SIZE; len++)
if (!password[len])
break;

memcpy(password_with_pad, password, len);
}

if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
memcpy(lnm_session_key, password_with_pad,
CIFS_ENCPWD_SIZE);
return 0;
}

/* calculate old style session key */
/* calling toupper is less broken than repeatedly
calling nls_toupper would be, since that will never
work for UTF8; but neither handles multibyte code pages.
The only alternative would be converting to UCS-16 (Unicode)
(using a routine something like UniStrupr), then
uppercasing, and then converting back from Unicode - which
would only be worth doing if we knew it were utf8. Basically
utf8 and other multibyte codepages each need their own strupper
function since a byte at a time will not work. */

for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
password_with_pad[i] = toupper(password_with_pad[i]);

rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key);

return rc;
}
#endif /* CIFS_WEAK_PW_HASH */

/* Build a proper attribute value/target info pairs blob.
* Fill in netbios and dns domain name and workstation name
* and client time (total five av pairs plus one end-of-fields indicator).
@@ -780,8 +699,8 @@ calc_seckey(struct cifs_ses *ses)
return -ENOMEM;
}

arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
cifs_arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
cifs_arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
CIFS_CPHTXT_SIZE);

/* make secondary_key/nonce as session key */

@@ -399,7 +399,6 @@ cifs_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
cifs_fscache_release_inode_cookie(inode);
}

static void
@@ -438,15 +437,9 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, ",sec=");

switch (ses->sectype) {
case LANMAN:
seq_puts(s, "lanman");
break;
case NTLMv2:
seq_puts(s, "ntlmv2");
break;
case NTLM:
seq_puts(s, "ntlm");
break;
case Kerberos:
seq_puts(s, "krb5");
break;
@@ -1755,7 +1748,6 @@ MODULE_DESCRIPTION
MODULE_VERSION(CIFS_VERSION);
MODULE_SOFTDEP("ecb");
MODULE_SOFTDEP("hmac");
MODULE_SOFTDEP("md4");
MODULE_SOFTDEP("md5");
MODULE_SOFTDEP("nls");
MODULE_SOFTDEP("aes");

@@ -114,8 +114,6 @@ enum statusEnum {

enum securityEnum {
Unspecified = 0, /* not specified */
LANMAN, /* Legacy LANMAN auth */
NTLM, /* Legacy NTLM012 auth with NTLM hash */
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
Kerberos, /* Kerberos via SPNEGO */
@@ -634,7 +632,6 @@ struct TCP_Server_Info {
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */
#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
char negflavor; /* NEGOTIATE response flavor */
@@ -1734,16 +1731,8 @@ static inline bool is_retryable_error(int error)

/* Security Flags: indicate type of session setup needed */
#define CIFSSEC_MAY_SIGN 0x00001
#define CIFSSEC_MAY_NTLM 0x00002
#define CIFSSEC_MAY_NTLMV2 0x00004
#define CIFSSEC_MAY_KRB5 0x00008
#ifdef CONFIG_CIFS_WEAK_PW_HASH
#define CIFSSEC_MAY_LANMAN 0x00010
#define CIFSSEC_MAY_PLNTXT 0x00020
#else
#define CIFSSEC_MAY_LANMAN 0
#define CIFSSEC_MAY_PLNTXT 0
#endif /* weak passwords */
#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */
#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */

@@ -1751,32 +1740,19 @@ static inline bool is_retryable_error(int error)
/* note that only one of the following can be set so the
result of setting MUST flags more than once will be to
require use of the stronger protocol */
#define CIFSSEC_MUST_NTLM 0x02002
#define CIFSSEC_MUST_NTLMV2 0x04004
#define CIFSSEC_MUST_KRB5 0x08008
#ifdef CONFIG_CIFS_WEAK_PW_HASH
#define CIFSSEC_MUST_LANMAN 0x10010
#define CIFSSEC_MUST_PLNTXT 0x20020
#ifdef CONFIG_CIFS_UPCALL
#define CIFSSEC_MASK 0xBF0BF /* allows weak security but also krb5 */
#else
#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
#endif /* UPCALL */
#else /* do not allow weak pw hash */
#define CIFSSEC_MUST_LANMAN 0
#define CIFSSEC_MUST_PLNTXT 0
#ifdef CONFIG_CIFS_UPCALL
#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
#else
#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */
#endif /* UPCALL */
#endif /* WEAK_PW_HASH */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */

#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
#define CIFSSEC_MAX (CIFSSEC_MUST_NTLMV2)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
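Worth noting about the flag values above: each MUST constant deliberately embeds its corresponding MAY bit (0x04004 = 0x04000 | CIFSSEC_MAY_NTLMV2), so masking with a MUST constant only matches when the mechanism is both required and allowed. A minimal userspace sketch of that check, using the constants as defined above (the helper name is hypothetical, not part of the kernel code):

#include <stdio.h>

#define CIFSSEC_MAY_NTLMV2  0x00004
#define CIFSSEC_MUST_NTLMV2 0x04004

/* hypothetical helper: a MUST flag is "fully set" only when both the
 * MUST bit and its embedded MAY bit survive in the flag word */
static int must_ntlmv2(unsigned int flags)
{
	return (flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2;
}

int main(void)
{
	printf("%d\n", must_ntlmv2(CIFSSEC_MUST_NTLMV2)); /* 1 */
	printf("%d\n", must_ntlmv2(CIFSSEC_MAY_NTLMV2));  /* 0: MAY alone */
	return 0;
}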
/*
*****************************************************************
* All constants go here
@@ -1940,10 +1916,6 @@ static inline char *get_security_type_str(enum securityEnum sectype)
return "Kerberos";
case NTLMv2:
return "NTLMv2";
case NTLM:
return "NTLM";
case LANMAN:
return "LANMAN";
default:
return "Unknown";
}

@@ -14,13 +14,7 @@
#include <asm/unaligned.h>
#include "smbfsctl.h"

#ifdef CONFIG_CIFS_WEAK_PW_HASH
#define LANMAN_PROT 0
#define LANMAN2_PROT 1
#define CIFS_PROT 2
#else
#define CIFS_PROT 0
#endif
#define POSIX_PROT (CIFS_PROT+1)
#define BAD_PROT 0xFFFF

@@ -505,30 +499,8 @@ typedef struct negotiate_req {
unsigned char DialectsArray[1];
} __attribute__((packed)) NEGOTIATE_REQ;

/* Dialect index is 13 for LANMAN */

#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */

typedef struct lanman_neg_rsp {
struct smb_hdr hdr; /* wct = 13 */
__le16 DialectIndex;
__le16 SecurityMode;
__le16 MaxBufSize;
__le16 MaxMpxCount;
__le16 MaxNumberVcs;
__le16 RawMode;
__le32 SessionKey;
struct {
__le16 Time;
__le16 Date;
} __attribute__((packed)) SrvTime;
__le16 ServerTimeZone;
__le16 EncryptionKeyLength;
__le16 Reserved;
__u16 ByteCount;
unsigned char EncryptionKey[1];
} __attribute__((packed)) LANMAN_NEG_RSP;

#define READ_RAW_ENABLE 1
#define WRITE_RAW_ENABLE 2
#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)

@@ -498,19 +498,12 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
extern int cifs_verify_signature(struct smb_rqst *rqst,
struct TCP_Server_Info *server,
__u32 expected_sequence_number);
extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
const struct nls_table *);
extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
extern void cifs_crypto_secmech_release(struct TCP_Server_Info *server);
extern int calc_seckey(struct cifs_ses *);
extern int generate_smb30signingkey(struct cifs_ses *);
extern int generate_smb311signingkey(struct cifs_ses *);

#ifdef CONFIG_CIFS_WEAK_PW_HASH
extern int calc_lanman_hash(const char *password, const char *cryptkey,
bool encrypt, char *lnm_session_key);
#endif /* CIFS_WEAK_PW_HASH */
extern int CIFSSMBCopy(unsigned int xid,
struct cifs_tcon *source_tcon,
const char *fromName,
@@ -547,11 +540,8 @@ extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
const unsigned char *path);
extern int mdfour(unsigned char *, unsigned char *, int);
extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
const struct nls_table *codepage);
extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);

extern int
cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname);

@@ -42,10 +42,6 @@ static struct {
int index;
char *name;
} protocols[] = {
#ifdef CONFIG_CIFS_WEAK_PW_HASH
{LANMAN_PROT, "\2LM1.2X002"},
{LANMAN2_PROT, "\2LANMAN2.1"},
#endif /* weak password hashing for legacy clients */
{CIFS_PROT, "\2NT LM 0.12"},
{POSIX_PROT, "\2POSIX 2"},
{BAD_PROT, "\2"}
@@ -55,10 +51,6 @@ static struct {
int index;
char *name;
} protocols[] = {
#ifdef CONFIG_CIFS_WEAK_PW_HASH
{LANMAN_PROT, "\2LM1.2X002"},
{LANMAN2_PROT, "\2LANMAN2.1"},
#endif /* weak password hashing for legacy clients */
{CIFS_PROT, "\2NT LM 0.12"},
{BAD_PROT, "\2"}
};
@@ -66,17 +58,9 @@ static struct {

/* define the number of elements in the cifs dialect array */
#ifdef CONFIG_CIFS_POSIX
#ifdef CONFIG_CIFS_WEAK_PW_HASH
#define CIFS_NUM_PROT 4
#else
#define CIFS_NUM_PROT 2
#endif /* CIFS_WEAK_PW_HASH */
#else /* not posix */
#ifdef CONFIG_CIFS_WEAK_PW_HASH
#define CIFS_NUM_PROT 3
#else
#define CIFS_NUM_PROT 1
#endif /* CONFIG_CIFS_WEAK_PW_HASH */
#endif /* CIFS_POSIX */

/*
@@ -475,89 +459,6 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
return 0;
}

#ifdef CONFIG_CIFS_WEAK_PW_HASH
static int
decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
{
__s16 tmp;
struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;

if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
return -EOPNOTSUPP;

server->sec_mode = le16_to_cpu(rsp->SecurityMode);
server->maxReq = min_t(unsigned int,
le16_to_cpu(rsp->MaxMpxCount),
cifs_max_pending);
set_credits(server, server->maxReq);
server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
/* set up max_read for readpages check */
server->max_read = server->maxBuf;
/* even though we do not use raw we might as well set this
accurately, in case we ever find a need for it */
if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
server->max_rw = 0xFF00;
server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
} else {
server->max_rw = 0;/* do not need to use raw anyway */
server->capabilities = CAP_MPX_MODE;
}
tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
if (tmp == -1) {
/* OS/2 often does not set timezone therefore
* we must use server time to calc time zone.
* Could deviate slightly from the right zone.
* Smallest defined timezone difference is 15 minutes
* (i.e. Nepal). Rounding up/down is done to match
* this requirement.
*/
int val, seconds, remain, result;
struct timespec64 ts;
time64_t utc = ktime_get_real_seconds();
ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
rsp->SrvTime.Time, 0);
cifs_dbg(FYI, "SrvTime %lld sec since 1970 (utc: %lld) diff: %lld\n",
ts.tv_sec, utc,
utc - ts.tv_sec);
val = (int)(utc - ts.tv_sec);
seconds = abs(val);
result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
remain = seconds % MIN_TZ_ADJ;
if (remain >= (MIN_TZ_ADJ / 2))
result += MIN_TZ_ADJ;
if (val < 0)
result = -result;
server->timeAdj = result;
} else {
server->timeAdj = (int)tmp;
server->timeAdj *= 60; /* also in seconds */
}
cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);


/* BB get server time for time conversions and add
code to use it and timezone since this is not UTC */

if (rsp->EncryptionKeyLength ==
cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
memcpy(server->cryptkey, rsp->EncryptionKey,
CIFS_CRYPTO_KEY_SIZE);
} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
return -EIO; /* need cryptkey unless plain text */
}

cifs_dbg(FYI, "LANMAN negotiated\n");
return 0;
}
#else
static inline int
decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
{
cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
return -EOPNOTSUPP;
}
#endif

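The timezone fallback in decode_lanman_negprot_rsp() above rounds the client/server clock skew to the nearest 15-minute grid step (MIN_TZ_ADJ). A standalone sketch of the same arithmetic, runnable in userspace for illustration:

#include <stdio.h>
#include <stdlib.h>

#define MIN_TZ_ADJ (15 * 60) /* smallest timezone grid, in seconds */

/* same rounding as above: round |val| down to the grid, then up
 * again if the remainder is at least half a step */
static int round_tz(int val)
{
	int seconds = abs(val);
	int result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;

	if (seconds % MIN_TZ_ADJ >= MIN_TZ_ADJ / 2)
		result += MIN_TZ_ADJ;
	return val < 0 ? -result : result;
}

int main(void)
{
	printf("%d\n", round_tz(20250)); /* 20700: 5.625h rounds up to 5.75h */
	printf("%d\n", round_tz(-449));  /* 0: under half a grid step */
	return 0;
}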
static bool
should_set_ext_sec_flag(enum securityEnum sectype)
{
@@ -626,16 +527,12 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
server->dialect = le16_to_cpu(pSMBr->DialectIndex);
cifs_dbg(FYI, "Dialect: %d\n", server->dialect);
/* Check wct = 1 error case */
if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
if ((pSMBr->hdr.WordCount <= 13) || (server->dialect == BAD_PROT)) {
/* core returns wct = 1, but we do not ask for core - otherwise
small wct just comes when dialect index is -1 indicating we
could not negotiate a common dialect */
rc = -EOPNOTSUPP;
goto neg_err_exit;
} else if (pSMBr->hdr.WordCount == 13) {
server->negflavor = CIFS_NEGFLAVOR_LANMAN;
rc = decode_lanman_negprot_rsp(server, pSMBr);
goto signing_check;
} else if (pSMBr->hdr.WordCount != 17) {
/* unknown wct */
rc = -EOPNOTSUPP;
@@ -677,7 +574,6 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
server->capabilities &= ~CAP_EXTENDED_SECURITY;
}

signing_check:
if (!rc)
rc = cifs_enable_signing(server, ses->sign);
neg_err_exit:
@@ -2101,6 +1997,7 @@ cifs_writev_complete(struct work_struct *work)
else if (wdata->result < 0)
SetPageError(page);
end_page_writeback(page);
cifs_readpage_to_fscache(inode, page);
put_page(page);
}
if (wdata->result != -EAGAIN)

@@ -3684,38 +3684,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
*bcc_ptr = 0; /* password is null byte */
bcc_ptr++; /* skip password */
/* already aligned so no need to do it below */
} else {
pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
/* BB FIXME add code to fail this if NTLMv2 or Kerberos
specified as required (when that support is added to
the vfs in the future) as only NTLM or the much
weaker LANMAN (which we do not send by default) is accepted
by Samba (not sure whether other servers allow
NTLMv2 password here) */
#ifdef CONFIG_CIFS_WEAK_PW_HASH
if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
(ses->sectype == LANMAN))
calc_lanman_hash(tcon->password, ses->server->cryptkey,
ses->server->sec_mode &
SECMODE_PW_ENCRYPT ? true : false,
bcc_ptr);
else
#endif /* CIFS_WEAK_PW_HASH */
rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
bcc_ptr, nls_codepage);
if (rc) {
cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n",
__func__, rc);
cifs_buf_release(smb_buffer);
return rc;
}

bcc_ptr += CIFS_AUTH_RESP_SIZE;
if (ses->capabilities & CAP_UNICODE) {
/* must align unicode strings */
*bcc_ptr = 0; /* null byte password */
bcc_ptr++;
}
}

if (ses->server->sign)

@@ -377,6 +377,8 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file)
struct cifsLockInfo *li, *tmp;
struct super_block *sb = inode->i_sb;

cifs_fscache_release_inode_cookie(inode);

/*
* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
@@ -882,8 +884,10 @@ int cifs_close(struct inode *inode, struct file *file)
if ((cinode->oplock == CIFS_CACHE_RHW_FLG) &&
cinode->lease_granted &&
dclose) {
if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags))
if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
inode->i_ctime = inode->i_mtime = current_time(inode);
cifs_fscache_update_inode_cookie(inode);
}
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
if (cfile->deferred_close_scheduled &&
@@ -4170,6 +4174,10 @@ static vm_fault_t
cifs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);

cifs_fscache_wait_on_page_write(inode, page);

lock_page(page);
return VM_FAULT_LOCKED;
@@ -4235,13 +4243,16 @@ cifs_readv_complete(struct work_struct *work)
(rdata->result == -EAGAIN && got_bytes)) {
flush_dcache_page(page);
SetPageUptodate(page);
}
} else
SetPageError(page);

unlock_page(page);

if (rdata->result == 0 ||
(rdata->result == -EAGAIN && got_bytes))
cifs_readpage_to_fscache(rdata->mapping->host, page);
else
cifs_fscache_uncache_page(rdata->mapping->host, page);

got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes);


@@ -57,12 +57,9 @@ static const match_table_t cifs_secflavor_tokens = {
{ Opt_sec_krb5p, "krb5p" },
{ Opt_sec_ntlmsspi, "ntlmsspi" },
{ Opt_sec_ntlmssp, "ntlmssp" },
{ Opt_ntlm, "ntlm" },
{ Opt_sec_ntlmi, "ntlmi" },
{ Opt_sec_ntlmv2, "nontlm" },
{ Opt_sec_ntlmv2, "ntlmv2" },
{ Opt_sec_ntlmv2i, "ntlmv2i" },
{ Opt_sec_lanman, "lanman" },
{ Opt_sec_none, "none" },

{ Opt_sec_err, NULL }
@@ -221,23 +218,12 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c
case Opt_sec_ntlmssp:
ctx->sectype = RawNTLMSSP;
break;
case Opt_sec_ntlmi:
ctx->sign = true;
fallthrough;
case Opt_ntlm:
ctx->sectype = NTLM;
break;
case Opt_sec_ntlmv2i:
ctx->sign = true;
fallthrough;
case Opt_sec_ntlmv2:
ctx->sectype = NTLMv2;
break;
#ifdef CONFIG_CIFS_WEAK_PW_HASH
case Opt_sec_lanman:
ctx->sectype = LANMAN;
break;
#endif
case Opt_sec_none:
ctx->nullauth = 1;
break;
@@ -1266,10 +1252,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->posix_paths = 1;
break;
case Opt_unix:
if (result.negated)
if (result.negated) {
if (ctx->linux_ext == 1)
pr_warn_once("conflicting posix mount options specified\n");
ctx->linux_ext = 0;
else
ctx->no_linux_ext = 1;
} else {
if (ctx->no_linux_ext == 1)
pr_warn_once("conflicting posix mount options specified\n");
ctx->linux_ext = 1;
ctx->no_linux_ext = 0;
}
break;
case Opt_nocase:
ctx->nocase = 1;

@@ -47,11 +47,8 @@ enum cifs_sec_param {
Opt_sec_krb5p,
Opt_sec_ntlmsspi,
Opt_sec_ntlmssp,
Opt_ntlm,
Opt_sec_ntlmi,
Opt_sec_ntlmv2,
Opt_sec_ntlmv2i,
Opt_sec_lanman,
Opt_sec_none,

Opt_sec_err

@@ -176,28 +176,33 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec;

cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
/* fscache_relinquish_cookie does not seem to update auxdata */
fscache_update_cookie(cifsi->fscache, &auxdata);
fscache_relinquish_cookie(cifsi->fscache, &auxdata, false);
cifsi->fscache = NULL;
}
}

static void cifs_fscache_disable_inode_cookie(struct inode *inode)
void cifs_fscache_update_inode_cookie(struct inode *inode)
{
struct cifs_fscache_inode_auxdata auxdata;
struct cifsInodeInfo *cifsi = CIFS_I(inode);

if (cifsi->fscache) {
memset(&auxdata, 0, sizeof(auxdata));
auxdata.eof = cifsi->server_eof;
auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec;
auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec;
auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec;
auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec;

cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
fscache_uncache_all_inode_pages(cifsi->fscache, inode);
fscache_relinquish_cookie(cifsi->fscache, NULL, true);
cifsi->fscache = NULL;
fscache_update_cookie(cifsi->fscache, &auxdata);
}
}

void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
{
if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
cifs_fscache_disable_inode_cookie(inode);
else
cifs_fscache_enable_inode_cookie(inode);
}

@@ -310,6 +315,8 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
int ret;

WARN_ON(!cifsi->fscache);

cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
__func__, cifsi->fscache, page, inode);
ret = fscache_write_page(cifsi->fscache, page,
@@ -334,3 +341,21 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
fscache_wait_on_page_write(cookie, page);
fscache_uncache_page(cookie, page);
}

void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct fscache_cookie *cookie = cifsi->fscache;

cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
fscache_wait_on_page_write(cookie, page);
}

void __cifs_fscache_uncache_page(struct inode *inode, struct page *page)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct fscache_cookie *cookie = cifsi->fscache;

cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
fscache_uncache_page(cookie, page);
}

@@ -55,10 +55,13 @@ extern void cifs_fscache_get_super_cookie(struct cifs_tcon *);
extern void cifs_fscache_release_super_cookie(struct cifs_tcon *);

extern void cifs_fscache_release_inode_cookie(struct inode *);
extern void cifs_fscache_update_inode_cookie(struct inode *inode);
extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *);
extern void cifs_fscache_reset_inode_cookie(struct inode *);

extern void __cifs_fscache_invalidate_page(struct page *, struct inode *);
extern void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page);
extern void __cifs_fscache_uncache_page(struct inode *inode, struct page *page);
extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
extern int __cifs_readpages_from_fscache(struct inode *,
@@ -76,6 +79,20 @@ static inline void cifs_fscache_invalidate_page(struct page *page,
__cifs_fscache_invalidate_page(page, inode);
}

static inline void cifs_fscache_wait_on_page_write(struct inode *inode,
struct page *page)
{
if (PageFsCache(page))
__cifs_fscache_wait_on_page_write(inode, page);
}

static inline void cifs_fscache_uncache_page(struct inode *inode,
struct page *page)
{
if (PageFsCache(page))
__cifs_fscache_uncache_page(inode, page);
}

static inline int cifs_readpage_from_fscache(struct inode *inode,
struct page *page)
{
@@ -123,6 +140,7 @@ static inline void
cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {}

static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
static inline void cifs_fscache_update_inode_cookie(struct inode *inode) {}
static inline void cifs_fscache_set_inode_cookie(struct inode *inode,
struct file *filp) {}
static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {}
@@ -133,6 +151,11 @@ static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)

static inline void cifs_fscache_invalidate_page(struct page *page,
struct inode *inode) {}
static inline void cifs_fscache_wait_on_page_write(struct inode *inode,
struct page *page) {}
static inline void cifs_fscache_uncache_page(struct inode *inode,
struct page *page) {}

static inline int
cifs_readpage_from_fscache(struct inode *inode, struct page *page)
{

@@ -2297,6 +2297,7 @@ cifs_revalidate_mapping(struct inode *inode)
{
int rc;
unsigned long *flags = &CIFS_I(inode)->flags;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);

/* swapfiles are not supposed to be shared */
if (IS_SWAPFILE(inode))
@@ -2308,11 +2309,16 @@ cifs_revalidate_mapping(struct inode *inode)
return rc;

if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) {
/* for cache=singleclient, do not invalidate */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)
goto skip_invalidate;

rc = cifs_invalidate_mapping(inode);
if (rc)
set_bit(CIFS_INO_INVALID_MAPPING, flags);
}

skip_invalidate:
clear_bit_unlock(CIFS_INO_LOCK, flags);
smp_mb__after_atomic();
wake_up_bit(flags, CIFS_INO_LOCK);

@@ -369,7 +369,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
*/

static int
initiate_cifs_search(const unsigned int xid, struct file *file,
_initiate_cifs_search(const unsigned int xid, struct file *file,
const char *full_path)
{
__u16 search_flags;
@@ -451,6 +451,27 @@ error_exit:
return rc;
}

static int
initiate_cifs_search(const unsigned int xid, struct file *file,
const char *full_path)
{
int rc, retry_count = 0;

do {
rc = _initiate_cifs_search(xid, file, full_path);
/*
* If we don't have enough credits to start reading the
* directory just try again after short wait.
*/
if (rc != -EDEADLK)
break;

usleep_range(512, 2048);
} while (retry_count++ < 5);

return rc;
}

/* return length of unicode string in bytes */
static int cifs_unicode_bytelen(const char *str)
{

fs/cifs/sess.c (255 lines changed)
@@ -799,33 +799,19 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
}
case CIFS_NEGFLAVOR_UNENCAP:
switch (requested) {
case NTLM:
case NTLMv2:
return requested;
case Unspecified:
if (global_secflags & CIFSSEC_MAY_NTLMV2)
return NTLMv2;
if (global_secflags & CIFSSEC_MAY_NTLM)
return NTLM;
break;
default:
break;
}
fallthrough; /* to attempt LANMAN authentication next */
case CIFS_NEGFLAVOR_LANMAN:
switch (requested) {
case LANMAN:
return requested;
case Unspecified:
if (global_secflags & CIFSSEC_MAY_LANMAN)
return LANMAN;
fallthrough;
default:
return Unspecified;
}
default:
return Unspecified;
}
}

struct sess_data {
@@ -877,7 +863,7 @@ sess_alloc_buffer(struct sess_data *sess_data, int wct)
return 0;

out_free_smb_buf:
kfree(smb_buf);
cifs_small_buf_release(smb_buf);
sess_data->iov[0].iov_base = NULL;
sess_data->iov[0].iov_len = 0;
sess_data->buf0_type = CIFS_NO_BUFFER;
@@ -947,230 +933,6 @@ sess_sendreceive(struct sess_data *sess_data)
return rc;
}

/*
* LANMAN and plaintext are less secure and off by default.
* So we make this explicitly be turned on in kconfig (in the
* build) and turned on at runtime (changed from the default)
* in proc/fs/cifs or via mount parm. Unfortunately this is
* needed for old Win (e.g. Win95), some obscure NAS and OS/2
*/
#ifdef CONFIG_CIFS_WEAK_PW_HASH
static void
sess_auth_lanman(struct sess_data *sess_data)
{
int rc = 0;
struct smb_hdr *smb_buf;
SESSION_SETUP_ANDX *pSMB;
char *bcc_ptr;
struct cifs_ses *ses = sess_data->ses;
char lnm_session_key[CIFS_AUTH_RESP_SIZE];
__u16 bytes_remaining;

/* lanman 2 style sessionsetup */
/* wct = 10 */
rc = sess_alloc_buffer(sess_data, 10);
if (rc)
goto out;

pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
bcc_ptr = sess_data->iov[2].iov_base;
(void)cifs_ssetup_hdr(ses, pSMB);

pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;

if (ses->user_name != NULL) {
/* no capabilities flags in old lanman negotiation */
pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);

/* Calculate hash with password and copy into bcc_ptr.
* Encryption Key (stored as in cryptkey) gets used if the
* security mode bit in Negotiate Protocol response states
* to use challenge/response method (i.e. Password bit is 1).
*/
rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
true : false, lnm_session_key);
if (rc)
goto out;

memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
bcc_ptr += CIFS_AUTH_RESP_SIZE;
} else {
pSMB->old_req.PasswordLength = 0;
}

/*
* cannot sign if LANMAN negotiated so no need
* to calculate signing key? but what if server
* changed to do higher than lanman dialect and
* we reconnected would we ever calc signing_key?
*/

cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
/* Unicode not allowed for LANMAN dialects */
ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);

sess_data->iov[2].iov_len = (long) bcc_ptr -
(long) sess_data->iov[2].iov_base;

rc = sess_sendreceive(sess_data);
if (rc)
goto out;

pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;

/* lanman response has a word count of 3 */
if (smb_buf->WordCount != 3) {
rc = -EIO;
cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
goto out;
}

if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */

ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
cifs_dbg(FYI, "UID = %llu\n", ses->Suid);

bytes_remaining = get_bcc(smb_buf);
bcc_ptr = pByteArea(smb_buf);

/* BB check if Unicode and decode strings */
if (bytes_remaining == 0) {
/* no string area to decode, do nothing */
} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
++bcc_ptr;
--bytes_remaining;
}
decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
sess_data->nls_cp);
} else {
decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
sess_data->nls_cp);
}

rc = sess_establish_session(sess_data);
out:
sess_data->result = rc;
sess_data->func = NULL;
sess_free_buffer(sess_data);
}

#endif

static void
sess_auth_ntlm(struct sess_data *sess_data)
{
int rc = 0;
struct smb_hdr *smb_buf;
SESSION_SETUP_ANDX *pSMB;
char *bcc_ptr;
struct cifs_ses *ses = sess_data->ses;
__u32 capabilities;
__u16 bytes_remaining;

/* old style NTLM sessionsetup */
/* wct = 13 */
rc = sess_alloc_buffer(sess_data, 13);
if (rc)
goto out;

pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
bcc_ptr = sess_data->iov[2].iov_base;
capabilities = cifs_ssetup_hdr(ses, pSMB);

pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
if (ses->user_name != NULL) {
pSMB->req_no_secext.CaseInsensitivePasswordLength =
cpu_to_le16(CIFS_AUTH_RESP_SIZE);
pSMB->req_no_secext.CaseSensitivePasswordLength =
cpu_to_le16(CIFS_AUTH_RESP_SIZE);

/* calculate ntlm response and session key */
rc = setup_ntlm_response(ses, sess_data->nls_cp);
if (rc) {
cifs_dbg(VFS, "Error %d during NTLM authentication\n",
rc);
goto out;
}

/* copy ntlm response */
memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
CIFS_AUTH_RESP_SIZE);
bcc_ptr += CIFS_AUTH_RESP_SIZE;
memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
CIFS_AUTH_RESP_SIZE);
bcc_ptr += CIFS_AUTH_RESP_SIZE;
} else {
pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
pSMB->req_no_secext.CaseSensitivePasswordLength = 0;
}

if (ses->capabilities & CAP_UNICODE) {
/* unicode strings must be word aligned */
if (sess_data->iov[0].iov_len % 2) {
*bcc_ptr = 0;
bcc_ptr++;
}
unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
} else {
ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
}


sess_data->iov[2].iov_len = (long) bcc_ptr -
(long) sess_data->iov[2].iov_base;

rc = sess_sendreceive(sess_data);
if (rc)
goto out;

pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;

if (smb_buf->WordCount != 3) {
rc = -EIO;
cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
goto out;
}

if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */

ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
cifs_dbg(FYI, "UID = %llu\n", ses->Suid);

bytes_remaining = get_bcc(smb_buf);
bcc_ptr = pByteArea(smb_buf);

/* BB check if Unicode and decode strings */
if (bytes_remaining == 0) {
/* no string area to decode, do nothing */
} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
/* unicode string area must be word-aligned */
if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
++bcc_ptr;
--bytes_remaining;
}
decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
sess_data->nls_cp);
} else {
decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
sess_data->nls_cp);
}

rc = sess_establish_session(sess_data);
out:
sess_data->result = rc;
sess_data->func = NULL;
sess_free_buffer(sess_data);
kfree(ses->auth_key.response);
ses->auth_key.response = NULL;
}

static void
sess_auth_ntlmv2(struct sess_data *sess_data)
{
@@ -1675,21 +1437,6 @@ static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
}

switch (type) {
case LANMAN:
/* LANMAN and plaintext are less secure and off by default.
* So we make this explicitly be turned on in kconfig (in the
* build) and turned on at runtime (changed from the default)
* in proc/fs/cifs or via mount parm. Unfortunately this is
* needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
#ifdef CONFIG_CIFS_WEAK_PW_HASH
sess_data->func = sess_auth_lanman;
break;
#else
return -EOPNOTSUPP;
#endif
case NTLM:
sess_data->func = sess_auth_ntlm;
break;
case NTLMv2:
sess_data->func = sess_auth_ntlmv2;
break;

@@ -1,6 +1,5 @@
// SPDX-License-Identifier: LGPL-2.1
/*
* fs/smb2/smb2maperror.c
*
* Functions which do error mapping of SMB2 status codes to POSIX errors
*

@@ -18,13 +18,13 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <crypto/des.h>
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifs_debug.h"
#include "cifsproto.h"
#include "../cifs_common/md4.h"

#ifndef false
#define false 0
@@ -38,126 +38,29 @@
#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))

static void
str_to_key(unsigned char *str, unsigned char *key)
{
int i;

key[0] = str[0] >> 1;
key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
key[7] = str[6] & 0x7F;
for (i = 0; i < 8; i++)
key[i] = (key[i] << 1);
}

static int
smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
{
unsigned char key2[8];
struct des_ctx ctx;

str_to_key(key, key2);

if (fips_enabled) {
cifs_dbg(VFS, "FIPS compliance enabled: DES not permitted\n");
return -ENOENT;
}

des_expand_key(&ctx, key2, DES_KEY_SIZE);
des_encrypt(&ctx, out, in);
memzero_explicit(&ctx, sizeof(ctx));

return 0;
}

static int
E_P16(unsigned char *p14, unsigned char *p16)
{
int rc;
unsigned char sp8[8] =
{ 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };

rc = smbhash(p16, sp8, p14);
if (rc)
return rc;
rc = smbhash(p16 + 8, sp8, p14 + 7);
return rc;
}

static int
E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
{
int rc;

rc = smbhash(p24, c8, p21);
if (rc)
return rc;
rc = smbhash(p24 + 8, c8, p21 + 7);
if (rc)
return rc;
rc = smbhash(p24 + 16, c8, p21 + 14);
return rc;
}

/* produce a md4 message digest from data of length n bytes */
int
static int
mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
{
int rc;
struct crypto_shash *md4 = NULL;
struct sdesc *sdescmd4 = NULL;
struct md4_ctx mctx;

rc = cifs_alloc_hash("md4", &md4, &sdescmd4);
if (rc)
goto mdfour_err;

rc = crypto_shash_init(&sdescmd4->shash);
rc = cifs_md4_init(&mctx);
if (rc) {
cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__);
cifs_dbg(VFS, "%s: Could not init MD4\n", __func__);
goto mdfour_err;
}
rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
rc = cifs_md4_update(&mctx, link_str, link_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
cifs_dbg(VFS, "%s: Could not update MD4\n", __func__);
goto mdfour_err;
}
rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
rc = cifs_md4_final(&mctx, md4_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__);
cifs_dbg(VFS, "%s: Could not finalize MD4\n", __func__);


mdfour_err:
cifs_free_hash(&md4, &sdescmd4);
return rc;
}

/*
This implements the X/Open SMB password encryption.
It takes a password, an 8 byte "crypt key" and puts 24 bytes of
encrypted password into p24 */
/* Note that password must be uppercased and null terminated */
int
SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
{
int rc;
unsigned char p14[14], p16[16], p21[21];

memset(p14, '\0', 14);
memset(p16, '\0', 16);
memset(p21, '\0', 21);

memcpy(p14, passwd, 14);
rc = E_P16(p14, p16);
if (rc)
return rc;

memcpy(p21, p16, 16);
rc = E_P24(p21, c8, p24);

return rc;
}

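A sketch of the LM-response flow the helpers above implement: hash the uppercased, 14-byte-padded password against the fixed magic block (E_P16), zero-pad that 16-byte hash to 21 bytes, then split it into three 7-byte DES keys that each encrypt the server's 8-byte challenge (E_P24). Hypothetical in-file usage, assuming a caller in this translation unit:

/* illustrative only: challenge comes from the NEGOTIATE response */
unsigned char challenge[8];              /* server cryptkey */
unsigned char password[15] = "SECRET";   /* uppercased, NUL padded */
unsigned char lm_resp[24];               /* 24-byte LM response */

int rc = SMBencrypt(password, challenge, lm_resp);
if (rc)
	cifs_dbg(VFS, "LM response failed: %d\n", rc);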
@@ -186,25 +89,3 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,

return rc;
}

/* Does the NT MD4 hash then des encryption. */
int
SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
const struct nls_table *codepage)
{
int rc;
unsigned char p16[16], p21[21];

memset(p16, '\0', 16);
memset(p21, '\0', 21);

rc = E_md4hash(passwd, p16, codepage);
if (rc) {
cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
__func__, rc);
return rc;
}
memcpy(p21, p16, 16);
rc = E_P24(p21, c8, p24);
return rc;
}

fs/cifs_common/Makefile (new file, 7 lines)
@@ -0,0 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for Linux filesystem routines that are shared by client and server.
#

obj-$(CONFIG_CIFS_COMMON) += cifs_arc4.o
obj-$(CONFIG_CIFS_COMMON) += cifs_md4.o
fs/cifs_common/arc4.h (new file, 23 lines)
@@ -0,0 +1,23 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Common values for ARC4 Cipher Algorithm
*/

#ifndef _CRYPTO_ARC4_H
#define _CRYPTO_ARC4_H

#include <linux/types.h>

#define ARC4_MIN_KEY_SIZE 1
#define ARC4_MAX_KEY_SIZE 256
#define ARC4_BLOCK_SIZE 1

struct arc4_ctx {
u32 S[256];
u32 x, y;
};

int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len);
void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len);

#endif /* _CRYPTO_ARC4_H */
fs/cifs_common/cifs_arc4.c (new file, 87 lines)
@@ -0,0 +1,87 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
* Cryptographic API
|
||||
*
|
||||
* ARC4 Cipher Algorithm
|
||||
*
|
||||
* Jon Oberheide <jon@oberheide.org>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include "arc4.h"
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
|
||||
{
|
||||
int i, j = 0, k = 0;
|
||||
|
||||
ctx->x = 1;
|
||||
ctx->y = 0;
|
||||
|
||||
for (i = 0; i < 256; i++)
|
||||
ctx->S[i] = i;
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
u32 a = ctx->S[i];
|
||||
|
||||
j = (j + in_key[k] + a) & 0xff;
|
||||
ctx->S[i] = ctx->S[j];
|
||||
ctx->S[j] = a;
|
||||
if (++k >= key_len)
|
||||
k = 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cifs_arc4_setkey);
|
||||
|
||||
void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
|
||||
{
|
||||
u32 *const S = ctx->S;
|
||||
u32 x, y, a, b;
|
||||
u32 ty, ta, tb;
|
||||
|
||||
if (len == 0)
|
||||
return;
|
||||
|
||||
x = ctx->x;
|
||||
y = ctx->y;
|
||||
|
||||
a = S[x];
|
||||
y = (y + a) & 0xff;
|
||||
b = S[y];
|
||||
|
||||
do {
|
||||
S[y] = a;
|
||||
a = (a + b) & 0xff;
|
||||
S[x] = b;
|
||||
x = (x + 1) & 0xff;
|
||||
ta = S[x];
|
||||
ty = (y + ta) & 0xff;
|
||||
tb = S[ty];
|
||||
*out++ = *in++ ^ S[a];
|
||||
if (--len == 0)
|
||||
break;
|
||||
y = ty;
|
||||
a = ta;
|
||||
b = tb;
|
||||
} while (true);
|
||||
|
||||
ctx->x = x;
|
||||
ctx->y = y;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cifs_arc4_crypt);
|
||||
|
||||
static int __init
|
||||
init_cifs_common(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static void __init
|
||||
exit_cifs_common(void)
|
||||
{
|
||||
}
|
||||
|
||||
module_init(init_cifs_common)
|
||||
module_exit(exit_cifs_common)
|
||||
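A minimal usage sketch for this exported pair, assuming a kernel context that includes the arc4.h header above. RC4 is a symmetric stream cipher, so re-keying a fresh context and running the same keystream both encrypts and decrypts (the demo function is hypothetical):

#include "arc4.h"

static void arc4_demo(void)
{
	struct arc4_ctx ctx;
	u8 key[] = { 0xde, 0xad, 0xbe, 0xef, 0x01 }; /* illustrative key */
	u8 buf[] = "example payload";

	cifs_arc4_setkey(&ctx, key, sizeof(key));
	cifs_arc4_crypt(&ctx, buf, buf, sizeof(buf)); /* encrypt in place */

	/* decrypt: fresh key schedule, same keystream XORed back out */
	cifs_arc4_setkey(&ctx, key, sizeof(key));
	cifs_arc4_crypt(&ctx, buf, buf, sizeof(buf));
}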
fs/cifs_common/cifs_md4.c (new file, 197 lines)
@@ -0,0 +1,197 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Cryptographic API.
|
||||
*
|
||||
* MD4 Message Digest Algorithm (RFC1320).
|
||||
*
|
||||
* Implementation derived from Andrew Tridgell and Steve French's
|
||||
* CIFS MD4 implementation, and the cryptoapi implementation
|
||||
* originally based on the public domain implementation written
|
||||
* by Colin Plumb in 1993.
|
||||
*
|
||||
* Copyright (c) Andrew Tridgell 1997-1998.
|
||||
* Modified by Steve French (sfrench@us.ibm.com) 2002
|
||||
* Copyright (c) Cryptoapi developers.
|
||||
* Copyright (c) 2002 David S. Miller (davem@redhat.com)
|
||||
* Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
|
||||
*
|
||||
*/
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <asm/byteorder.h>
|
||||
#include "md4.h"
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static inline u32 lshift(u32 x, unsigned int s)
|
||||
{
|
||||
x &= 0xFFFFFFFF;
|
||||
return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
|
||||
}
|
||||
|
||||
static inline u32 F(u32 x, u32 y, u32 z)
|
||||
{
|
||||
return (x & y) | ((~x) & z);
|
||||
}
|
||||
|
||||
static inline u32 G(u32 x, u32 y, u32 z)
|
||||
{
|
||||
return (x & y) | (x & z) | (y & z);
|
||||
}
|
||||
|
||||
static inline u32 H(u32 x, u32 y, u32 z)
|
||||
{
|
||||
return x ^ y ^ z;
|
||||
}
|
||||
|
||||
#define ROUND1(a,b,c,d,k,s) (a = lshift(a + F(b,c,d) + k, s))
|
||||
#define ROUND2(a,b,c,d,k,s) (a = lshift(a + G(b,c,d) + k + (u32)0x5A827999,s))
|
||||
#define ROUND3(a,b,c,d,k,s) (a = lshift(a + H(b,c,d) + k + (u32)0x6ED9EBA1,s))
|
||||
|
||||
static void md4_transform(u32 *hash, u32 const *in)
|
||||
{
|
||||
u32 a, b, c, d;
|
||||
|
||||
a = hash[0];
|
||||
b = hash[1];
|
||||
c = hash[2];
|
||||
d = hash[3];
|
||||
|
||||
ROUND1(a, b, c, d, in[0], 3);
|
||||
ROUND1(d, a, b, c, in[1], 7);
|
||||
ROUND1(c, d, a, b, in[2], 11);
|
||||
ROUND1(b, c, d, a, in[3], 19);
|
||||
ROUND1(a, b, c, d, in[4], 3);
|
||||
ROUND1(d, a, b, c, in[5], 7);
|
||||
ROUND1(c, d, a, b, in[6], 11);
|
||||
ROUND1(b, c, d, a, in[7], 19);
|
||||
ROUND1(a, b, c, d, in[8], 3);
|
||||
ROUND1(d, a, b, c, in[9], 7);
|
||||
ROUND1(c, d, a, b, in[10], 11);
|
||||
ROUND1(b, c, d, a, in[11], 19);
|
||||
ROUND1(a, b, c, d, in[12], 3);
|
||||
ROUND1(d, a, b, c, in[13], 7);
|
||||
ROUND1(c, d, a, b, in[14], 11);
|
||||
ROUND1(b, c, d, a, in[15], 19);
|
||||
|
||||
ROUND2(a, b, c, d, in[0], 3);
|
||||
ROUND2(d, a, b, c, in[4], 5);
|
||||
ROUND2(c, d, a, b, in[8], 9);
|
||||
ROUND2(b, c, d, a, in[12], 13);
|
||||
ROUND2(a, b, c, d, in[1], 3);
|
||||
ROUND2(d, a, b, c, in[5], 5);
|
||||
ROUND2(c, d, a, b, in[9], 9);
|
||||
ROUND2(b, c, d, a, in[13], 13);
|
||||
ROUND2(a, b, c, d, in[2], 3);
|
||||
ROUND2(d, a, b, c, in[6], 5);
|
||||
ROUND2(c, d, a, b, in[10], 9);
|
||||
ROUND2(b, c, d, a, in[14], 13);
|
||||
ROUND2(a, b, c, d, in[3], 3);
|
||||
ROUND2(d, a, b, c, in[7], 5);
|
||||
ROUND2(c, d, a, b, in[11], 9);
|
||||
ROUND2(b, c, d, a, in[15], 13);
|
||||
|
||||
ROUND3(a, b, c, d, in[0], 3);
|
||||
ROUND3(d, a, b, c, in[8], 9);
|
||||
ROUND3(c, d, a, b, in[4], 11);
|
||||
ROUND3(b, c, d, a, in[12], 15);
|
||||
ROUND3(a, b, c, d, in[2], 3);
|
||||
ROUND3(d, a, b, c, in[10], 9);
|
||||
ROUND3(c, d, a, b, in[6], 11);
|
||||
ROUND3(b, c, d, a, in[14], 15);
|
||||
ROUND3(a, b, c, d, in[1], 3);
|
||||
ROUND3(d, a, b, c, in[9], 9);
|
||||
ROUND3(c, d, a, b, in[5], 11);
|
||||
ROUND3(b, c, d, a, in[13], 15);
|
||||
ROUND3(a, b, c, d, in[3], 3);
|
||||
ROUND3(d, a, b, c, in[11], 9);
|
||||
ROUND3(c, d, a, b, in[7], 11);
|
||||
ROUND3(b, c, d, a, in[15], 15);
|
||||
|
||||
hash[0] += a;
|
||||
hash[1] += b;
|
||||
hash[2] += c;
|
||||
hash[3] += d;
|
||||
}
|
||||
|
||||
static inline void md4_transform_helper(struct md4_ctx *ctx)
|
||||
{
|
||||
le32_to_cpu_array(ctx->block, ARRAY_SIZE(ctx->block));
|
||||
md4_transform(ctx->hash, ctx->block);
|
||||
}
|
||||
|
||||
int cifs_md4_init(struct md4_ctx *mctx)
|
||||
{
|
||||
memset(mctx, 0, sizeof(struct md4_ctx));
|
||||
mctx->hash[0] = 0x67452301;
|
||||
mctx->hash[1] = 0xefcdab89;
|
||||
mctx->hash[2] = 0x98badcfe;
|
||||
mctx->hash[3] = 0x10325476;
|
||||
mctx->byte_count = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cifs_md4_init);
|
||||
|
||||
int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len)
|
||||
{
|
||||
const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);
|
||||
|
||||
mctx->byte_count += len;
|
||||
|
||||
if (avail > len) {
|
||||
memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
|
||||
data, len);
|
||||
return 0;
|
||||
}
|
||||
|
||||
memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
|
||||
data, avail);
|
||||
|
||||
md4_transform_helper(mctx);
|
||||
data += avail;
|
||||
len -= avail;
|
||||
|
||||
while (len >= sizeof(mctx->block)) {
|
||||
memcpy(mctx->block, data, sizeof(mctx->block));
|
||||
md4_transform_helper(mctx);
|
||||
data += sizeof(mctx->block);
|
||||
len -= sizeof(mctx->block);
|
||||
}
|
||||
|
||||
memcpy(mctx->block, data, len);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cifs_md4_update);
|
||||
|
||||
int cifs_md4_final(struct md4_ctx *mctx, u8 *out)
|
||||
{
|
||||
const unsigned int offset = mctx->byte_count & 0x3f;
|
||||
char *p = (char *)mctx->block + offset;
|
||||
int padding = 56 - (offset + 1);
|
||||
|
||||
*p++ = 0x80;
|
||||
if (padding < 0) {
|
||||
memset(p, 0x00, padding + sizeof(u64));
|
||||
md4_transform_helper(mctx);
|
||||
p = (char *)mctx->block;
|
||||
padding = 56;
|
||||
}
|
||||
|
||||
memset(p, 0, padding);
|
||||
mctx->block[14] = mctx->byte_count << 3;
|
||||
mctx->block[15] = mctx->byte_count >> 29;
|
||||
le32_to_cpu_array(mctx->block, (sizeof(mctx->block) -
|
||||
sizeof(u64)) / sizeof(u32));
|
||||
md4_transform(mctx->hash, mctx->block);
|
||||
cpu_to_le32_array(mctx->hash, ARRAY_SIZE(mctx->hash));
|
||||
memcpy(out, mctx->hash, sizeof(mctx->hash));
|
||||
memset(mctx, 0, sizeof(*mctx));
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cifs_md4_final);
|
||||
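The exported init/update/final triple follows the usual digest pattern. A minimal in-kernel caller sketch, assuming the md4.h header below is included (the wrapper function is hypothetical):

#include "md4.h"

static int md4_digest_demo(const u8 *data, unsigned int len,
			   u8 out[MD4_DIGEST_SIZE])
{
	struct md4_ctx ctx;

	cifs_md4_init(&ctx);              /* load RFC 1320 initial state */
	cifs_md4_update(&ctx, data, len); /* may be called repeatedly */
	return cifs_md4_final(&ctx, out); /* writes 16 bytes, wipes ctx */
}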
fs/cifs_common/md4.h (new file, 27 lines)
@@ -0,0 +1,27 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Common values for the MD4 algorithm
 */

#ifndef _CIFS_MD4_H
#define _CIFS_MD4_H

#include <linux/types.h>

#define MD4_DIGEST_SIZE         16
#define MD4_HMAC_BLOCK_SIZE     64
#define MD4_BLOCK_WORDS         16
#define MD4_HASH_WORDS          4

struct md4_ctx {
        u32 hash[MD4_HASH_WORDS];
        u32 block[MD4_BLOCK_WORDS];
        u64 byte_count;
};

int cifs_md4_init(struct md4_ctx *mctx);
int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len);
int cifs_md4_final(struct md4_ctx *mctx, u8 *out);

#endif /* _CIFS_MD4_H */
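For orientation, a sketch of how a caller might drive this three-call API; the
wrapper function below is hypothetical, only cifs_md4_init/update/final come
from the header above, and the split update is just to show incremental
hashing:

#include "md4.h"

static void example_md4(const u8 *buf, unsigned int len,
                        u8 digest[MD4_DIGEST_SIZE])
{
        struct md4_ctx ctx;

        cifs_md4_init(&ctx);                    /* set the four IV words */
        cifs_md4_update(&ctx, buf, len / 2);    /* feed data in pieces... */
        cifs_md4_update(&ctx, buf + len / 2, len - len / 2);
        cifs_md4_final(&ctx, digest);           /* pad, emit, wipe ctx */
}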
@@ -26,7 +26,7 @@
 * it to find the directory entry again if requested. Naively, that would just
 * mean using the ciphertext filenames. However, since the ciphertext filenames
 * can contain illegal characters ('\0' and '/'), they must be encoded in some
 * way. We use base64. But that can cause names to exceed NAME_MAX (255
 * way. We use base64url. But that can cause names to exceed NAME_MAX (255
 * bytes), so we also need to use a strong hash to abbreviate long names.
 *
 * The filesystem may also need another kind of hash, the "dirhash", to quickly
@@ -38,7 +38,7 @@
 * casefolded directories use this type of dirhash. At least in these cases,
 * each no-key name must include the name's dirhash too.
 *
 * To meet all these requirements, we base64-encode the following
 * To meet all these requirements, we base64url-encode the following
 * variable-length structure. It contains the dirhash, or 0's if the filesystem
 * didn't provide one; up to 149 bytes of the ciphertext name; and for
 * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes.
@@ -52,15 +52,19 @@ struct fscrypt_nokey_name {
        u32 dirhash[2];
        u8 bytes[149];
        u8 sha256[SHA256_DIGEST_SIZE];
}; /* 189 bytes => 252 bytes base64-encoded, which is <= NAME_MAX (255) */
}; /* 189 bytes => 252 bytes base64url-encoded, which is <= NAME_MAX (255) */

/*
 * Decoded size of max-size nokey name, i.e. a name that was abbreviated using
 * Decoded size of max-size no-key name, i.e. a name that was abbreviated using
 * the strong hash and thus includes the 'sha256' field. This isn't simply
 * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included.
 */
#define FSCRYPT_NOKEY_NAME_MAX  offsetofend(struct fscrypt_nokey_name, sha256)

/* Encoded size of max-size no-key name */
#define FSCRYPT_NOKEY_NAME_MAX_ENCODED \
        FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX)

static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
        if (str->len == 1 && str->name[0] == '.')
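As a quick check of the size arithmetic in the comments above (the constants
are taken from the struct definition; the userspace harness itself is
illustrative):

#include <assert.h>

#define SHA256_DIGEST_SIZE      32
#define NAME_MAX                255
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        /* 2 * 4 dirhash bytes + 149 ciphertext bytes + SHA-256 digest */
        unsigned int max_nokey = 2 * 4 + 149 + SHA256_DIGEST_SIZE;

        assert(max_nokey == 189);
        assert(DIV_ROUND_UP(max_nokey * 4, 3) == 252);  /* encoded size */
        assert(DIV_ROUND_UP(max_nokey * 4, 3) <= NAME_MAX);
        return 0;
}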
@@ -175,62 +179,82 @@ static int fname_decrypt(const struct inode *inode,
        return 0;
}

static const char lookup_table[65] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
static const char base64url_table[65] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

#define BASE64_CHARS(nbytes)    DIV_ROUND_UP((nbytes) * 4, 3)
#define FSCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)

/**
 * base64_encode() - base64-encode some bytes
 * @src: the bytes to encode
 * @len: number of bytes to encode
 * @dst: (output) the base64-encoded string. Not NUL-terminated.
 * fscrypt_base64url_encode() - base64url-encode some binary data
 * @src: the binary data to encode
 * @srclen: the length of @src in bytes
 * @dst: (output) the base64url-encoded string. Not NUL-terminated.
 *
 * Encodes the input string using characters from the set [A-Za-z0-9+,].
 * The encoded string is roughly 4/3 times the size of the input string.
 * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL
 * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used,
 * as it's unneeded and not required by the RFC. base64url is used instead of
 * base64 to avoid the '/' character, which isn't allowed in filenames.
 *
 * Return: length of the encoded string
 * Return: the length of the resulting base64url-encoded string in bytes.
 * This will be equal to FSCRYPT_BASE64URL_CHARS(srclen).
 */
static int base64_encode(const u8 *src, int len, char *dst)
static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst)
{
        int i, bits = 0, ac = 0;
        u32 ac = 0;
        int bits = 0;
        int i;
        char *cp = dst;

        for (i = 0; i < len; i++) {
                ac += src[i] << bits;
        for (i = 0; i < srclen; i++) {
                ac = (ac << 8) | src[i];
                bits += 8;
                do {
                        *cp++ = lookup_table[ac & 0x3f];
                        ac >>= 6;
                        bits -= 6;
                        *cp++ = base64url_table[(ac >> bits) & 0x3f];
                } while (bits >= 6);
        }
        if (bits)
                *cp++ = lookup_table[ac & 0x3f];
                *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f];
        return cp - dst;
}

static int base64_decode(const char *src, int len, u8 *dst)
/**
 * fscrypt_base64url_decode() - base64url-decode a string
 * @src: the string to decode. Doesn't need to be NUL-terminated.
 * @srclen: the length of @src in bytes
 * @dst: (output) the decoded binary data
 *
 * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with
 * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't
 * accepted, nor are non-encoding characters such as whitespace.
 *
 * This implementation hasn't been optimized for performance.
 *
 * Return: the length of the resulting decoded binary data in bytes,
 * or -1 if the string isn't a valid base64url string.
 */
static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
{
        int i, bits = 0, ac = 0;
        const char *p;
        u8 *cp = dst;
        u32 ac = 0;
        int bits = 0;
        int i;
        u8 *bp = dst;

        for (i = 0; i < srclen; i++) {
                const char *p = strchr(base64url_table, src[i]);

        for (i = 0; i < len; i++) {
                p = strchr(lookup_table, src[i]);
                if (p == NULL || src[i] == 0)
                        return -2;
                ac += (p - lookup_table) << bits;
                        return -1;
                ac = (ac << 6) | (p - base64url_table);
                bits += 6;
                if (bits >= 8) {
                        *cp++ = ac & 0xff;
                        ac >>= 8;
                        bits -= 8;
                        *bp++ = (u8)(ac >> bits);
                }
        }
        if (ac)
        if (ac & ((1 << bits) - 1))
                return -1;
        return cp - dst;
        return bp - dst;
}

bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
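To make the bit-accumulator logic above concrete, here is a standalone
userspace mirror of the new encode/decode pair with a round-trip check. It is
an illustrative sketch that copies the algorithm, not the kernel build:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const char tbl[65] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

static int b64url_encode(const uint8_t *src, int srclen, char *dst)
{
        uint32_t ac = 0;
        int bits = 0, i;
        char *cp = dst;

        for (i = 0; i < srclen; i++) {
                ac = (ac << 8) | src[i];        /* pull in 8 bits */
                bits += 8;
                do {
                        bits -= 6;              /* emit 6 bits at a time */
                        *cp++ = tbl[(ac >> bits) & 0x3f];
                } while (bits >= 6);
        }
        if (bits)                               /* left-justify the tail */
                *cp++ = tbl[(ac << (6 - bits)) & 0x3f];
        return cp - dst;
}

static int b64url_decode(const char *src, int srclen, uint8_t *dst)
{
        uint32_t ac = 0;
        int bits = 0, i;
        uint8_t *bp = dst;

        for (i = 0; i < srclen; i++) {
                const char *p = strchr(tbl, src[i]);

                if (p == NULL || src[i] == 0)
                        return -1;
                ac = (ac << 6) | (p - tbl);
                bits += 6;
                if (bits >= 8) {
                        bits -= 8;
                        *bp++ = (uint8_t)(ac >> bits);
                }
        }
        if (ac & ((1 << bits) - 1))             /* trailing bits must be 0 */
                return -1;
        return bp - dst;
}

int main(void)
{
        const uint8_t in[] = { 0xde, 0xad, 0xbe, 0xef, 0x00 };
        char enc[16];
        uint8_t out[16];
        int elen = b64url_encode(in, sizeof(in), enc);
        int dlen = b64url_decode(enc, elen, out);

        assert(elen == (int)((sizeof(in) * 4 + 2) / 3));  /* ceil(n*4/3) */
        assert(dlen == (int)sizeof(in) && !memcmp(in, out, dlen));
        printf("round-trip OK: %.*s\n", elen, enc);
        return 0;
}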
@@ -263,10 +287,8 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                               struct fscrypt_str *crypto_str)
{
        const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX);
        u32 max_presented_len;

        max_presented_len = max(max_encoded_len, max_encrypted_len);
        u32 max_presented_len = max_t(u32, FSCRYPT_NOKEY_NAME_MAX_ENCODED,
                                      max_encrypted_len);

        crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS);
        if (!crypto_str->name)
@@ -342,7 +364,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
                     offsetof(struct fscrypt_nokey_name, bytes));
        BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) !=
                     offsetof(struct fscrypt_nokey_name, sha256));
        BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX);
        BUILD_BUG_ON(FSCRYPT_NOKEY_NAME_MAX_ENCODED > NAME_MAX);

        nokey_name.dirhash[0] = hash;
        nokey_name.dirhash[1] = minor_hash;
@@ -358,7 +380,8 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
                           nokey_name.sha256);
                size = FSCRYPT_NOKEY_NAME_MAX;
        }
        oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
        oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size,
                                              oname->name);
        return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
@@ -432,14 +455,15 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
         * user-supplied name
         */

        if (iname->len > BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX))
        if (iname->len > FSCRYPT_NOKEY_NAME_MAX_ENCODED)
                return -ENOENT;

        fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL);
        if (fname->crypto_buf.name == NULL)
                return -ENOMEM;

        ret = base64_decode(iname->name, iname->len, fname->crypto_buf.name);
        ret = fscrypt_base64url_decode(iname->name, iname->len,
                                       fname->crypto_buf.name);
        if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) ||
            (ret > offsetof(struct fscrypt_nokey_name, sha256) &&
             ret != FSCRYPT_NOKEY_NAME_MAX)) {
@@ -384,3 +384,47 @@ err_kfree:
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(fscrypt_get_symlink);

/**
 * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks
 * @path: the path for the encrypted symlink being queried
 * @stat: the struct being filled with the symlink's attributes
 *
 * Override st_size of encrypted symlinks to be the length of the decrypted
 * symlink target (or the no-key encoded symlink target, if the key is
 * unavailable) rather than the length of the encrypted symlink target. This is
 * necessary for st_size to match the symlink target that userspace actually
 * sees. POSIX requires this, and some userspace programs depend on it.
 *
 * This requires reading the symlink target from disk if needed, setting up the
 * inode's encryption key if possible, and then decrypting or encoding the
 * symlink target. This makes lstat() more heavyweight than is normally the
 * case. However, decrypted symlink targets will be cached in ->i_link, so
 * usually the symlink won't have to be read and decrypted again later if/when
 * it is actually followed, readlink() is called, or lstat() is called again.
 *
 * Return: 0 on success, -errno on failure
 */
int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat)
{
        struct dentry *dentry = path->dentry;
        struct inode *inode = d_inode(dentry);
        const char *link;
        DEFINE_DELAYED_CALL(done);

        /*
         * To get the symlink target that userspace will see (whether it's the
         * decrypted target or the no-key encoded target), we can just get it
         * in the same way the VFS does during path resolution and readlink().
         */
        link = READ_ONCE(inode->i_link);
        if (!link) {
                link = inode->i_op->get_link(dentry, inode, &done);
                if (IS_ERR(link))
                        return PTR_ERR(link);
        }
        stat->size = strlen(link);
        do_delayed_call(&done);
        return 0;
}
EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr);
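The st_size guarantee documented above can be observed from userspace: for any
symlink, lstat()'s st_size should equal the length returned by readlink(). A
small sketch (the path below is hypothetical):

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/mnt/enc/link";     /* hypothetical symlink */
        struct stat st;
        char buf[4096];
        ssize_t n;

        if (lstat(path, &st) != 0)
                return 1;
        n = readlink(path, buf, sizeof(buf) - 1);
        if (n < 0)
                return 1;
        buf[n] = '\0';
        printf("st_size=%lld readlink_len=%zd\n", (long long)st.st_size, n);
        return st.st_size == n ? 0 : 2;
}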
@@ -2070,10 +2070,8 @@ SYSCALL_DEFINE5(execveat,
                const char __user *const __user *, envp,
                int, flags)
{
        int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;

        return do_execveat(fd,
                           getname_flags(filename, lookup_flags, NULL),
                           getname_uflags(filename, flags),
                           argv, envp, flags);
}

@@ -2091,10 +2089,8 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
                       const compat_uptr_t __user *, envp,
                       int, flags)
{
        int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;

        return compat_do_execveat(fd,
                                  getname_flags(filename, lookup_flags, NULL),
                                  getname_uflags(filename, flags),
                                  argv, envp, flags);
}
#endif
@@ -52,10 +52,20 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
        return paddr;
}

static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns,
                                          const struct path *path,
                                          struct kstat *stat, u32 request_mask,
                                          unsigned int query_flags)
{
        ext4_getattr(mnt_userns, path, stat, request_mask, query_flags);

        return fscrypt_symlink_getattr(path, stat);
}

const struct inode_operations ext4_encrypted_symlink_inode_operations = {
        .get_link       = ext4_encrypted_get_link,
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
        .getattr        = ext4_encrypted_symlink_getattr,
        .listxattr      = ext4_listxattr,
};
@@ -1323,9 +1323,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
        return target;
}

static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns,
                                          const struct path *path,
                                          struct kstat *stat, u32 request_mask,
                                          unsigned int query_flags)
{
        f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags);

        return fscrypt_symlink_getattr(path, stat);
}

const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
        .get_link       = f2fs_encrypted_get_link,
        .getattr        = f2fs_getattr,
        .getattr        = f2fs_encrypted_symlink_getattr,
        .setattr        = f2fs_setattr,
        .listxattr      = f2fs_listxattr,
};
@@ -2729,23 +2729,6 @@ int write_inode_now(struct inode *inode, int sync)
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk. It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
        return writeback_single_inode(inode, wbc);
}
EXPORT_SYMBOL(sync_inode);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
@@ -2762,6 +2745,6 @@ int sync_inode_metadata(struct inode *inode, int wait)
                .nr_to_write = 0, /* metadata-only */
        };

        return sync_inode(inode, &wbc);
        return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);
@@ -574,10 +574,9 @@ void adjust_fs_space(struct inode *inode)
{
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
        struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
        struct buffer_head *m_bh, *l_bh;
        struct buffer_head *m_bh;
        u64 fs_total, new_free;

        if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0)
@@ -600,11 +599,7 @@ void adjust_fs_space(struct inode *inode)
                (unsigned long long)new_free);
        gfs2_statfs_change(sdp, new_free, new_free, 0);

        if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0)
                goto out2;
        update_statfs(sdp, m_bh, l_bh);
        brelse(l_bh);
out2:
        update_statfs(sdp, m_bh);
        brelse(m_bh);
out:
        sdp->sd_rindex_uptodate = 0;
@@ -1494,12 +1494,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)

        list_del_init(&gh->gh_list);
        clear_bit(HIF_HOLDER, &gh->gh_iflags);
        if (find_first_holder(gl) == NULL) {
        if (list_empty(&gl->gl_holders) &&
            !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
            !test_bit(GLF_DEMOTE, &gl->gl_flags))
                fast_path = 1;
        }

        if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
                gfs2_glock_add_to_lru(gl);

@@ -2077,8 +2076,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
                *p++ = 'H';
        if (test_bit(HIF_WAIT, &iflags))
                *p++ = 'W';
        if (test_bit(HIF_FIRST, &iflags))
                *p++ = 'F';
        *p = 0;
        return buf;
}
@@ -33,16 +33,18 @@ extern struct workqueue_struct *gfs2_control_wq;

static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
{
        fs_err(gl->gl_name.ln_sbd,
        struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;

        fs_err(sdp,
               "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
               "state 0x%lx\n",
               bh, (unsigned long long)bh->b_blocknr, bh->b_state,
               bh->b_page->mapping, bh->b_page->flags);
        fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
        fs_err(sdp, "AIL glock %u:%llu mapping %p\n",
               gl->gl_name.ln_type, gl->gl_name.ln_number,
               gfs2_glock2aspace(gl));
        gfs2_lm(gl->gl_name.ln_sbd, "AIL error\n");
        gfs2_withdraw(gl->gl_name.ln_sbd);
        gfs2_lm(sdp, "AIL error\n");
        gfs2_withdraw_delayed(sdp);
}

/**
@@ -610,17 +612,14 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl)
                j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);

                error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
                if (error)
                        gfs2_consist(sdp);
                if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
                        gfs2_consist(sdp);

                /* Initialize some head of the log stuff */
                if (!gfs2_withdrawn(sdp)) {
                if (gfs2_assert_withdraw_delayed(sdp, !error))
                        return error;
                if (gfs2_assert_withdraw_delayed(sdp, head.lh_flags &
                                                 GFS2_LOG_HEAD_UNMOUNT))
                        return -EIO;
                        sdp->sd_log_sequence = head.lh_sequence + 1;
                        gfs2_log_pointers_init(sdp, head.lh_blkno);
                }
        }
        return 0;
}
@@ -253,7 +253,6 @@ struct gfs2_lkstats {
enum {
        /* States */
        HIF_HOLDER      = 6,  /* Set for gh that "holds" the glock */
        HIF_FIRST       = 7,
        HIF_WAIT        = 10,
};

@@ -768,6 +767,7 @@ struct gfs2_sbd {
        struct gfs2_glock *sd_jinode_gl;

        struct gfs2_holder sd_sc_gh;
        struct buffer_head *sd_sc_bh;
        struct gfs2_holder sd_qc_gh;

        struct completion sd_journal_ready;
@@ -299,6 +299,11 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
        gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
        gfs2_update_request_times(gl);

        /* don't want to call dlm if we've unmounted the lock protocol */
        if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) {
                gfs2_glock_free(gl);
                return;
        }
        /* don't want to skip dlm_unlock writing the lvb when lock has one */

        if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
@@ -594,7 +594,7 @@ void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
{
        unsigned int blks = tr->tr_reserved;
        unsigned int revokes = tr->tr_revokes;
        unsigned int revoke_blks = 0;
        unsigned int revoke_blks;

        *extra_revokes = 0;
        if (revokes) {
@@ -761,6 +761,32 @@ static void buf_lo_before_scan(struct gfs2_jdesc *jd,
        jd->jd_replayed_blocks = 0;
}

#define obsolete_rgrp_replay \
        "Replaying 0x%llx from jid=%d/0x%llx but we already have a bh!\n"
#define obsolete_rgrp_replay2 \
        "busy:%d, pinned:%d rg_gen:0x%llx, j_gen:0x%llx\n"

static void obsolete_rgrp(struct gfs2_jdesc *jd, struct buffer_head *bh_log,
                          u64 blkno)
{
        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
        struct gfs2_rgrpd *rgd;
        struct gfs2_rgrp *jrgd = (struct gfs2_rgrp *)bh_log->b_data;

        rgd = gfs2_blk2rgrpd(sdp, blkno, false);
        if (rgd && rgd->rd_addr == blkno &&
            rgd->rd_bits && rgd->rd_bits->bi_bh) {
                fs_info(sdp, obsolete_rgrp_replay, (unsigned long long)blkno,
                        jd->jd_jid, bh_log->b_blocknr);
                fs_info(sdp, obsolete_rgrp_replay2,
                        buffer_busy(rgd->rd_bits->bi_bh) ? 1 : 0,
                        buffer_pinned(rgd->rd_bits->bi_bh),
                        rgd->rd_igeneration,
                        be64_to_cpu(jrgd->rg_igeneration));
                gfs2_dump_glock(NULL, rgd->rd_gl, true);
        }
}

static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
                                struct gfs2_log_descriptor *ld, __be64 *ptr,
                                int pass)
@@ -799,21 +825,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start,
                        struct gfs2_meta_header *mh =
                                (struct gfs2_meta_header *)bh_ip->b_data;

                        if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG)) {
                                struct gfs2_rgrpd *rgd;
                        if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG))
                                obsolete_rgrp(jd, bh_log, blkno);

                                rgd = gfs2_blk2rgrpd(sdp, blkno, false);
                                if (rgd && rgd->rd_addr == blkno &&
                                    rgd->rd_bits && rgd->rd_bits->bi_bh) {
                                        fs_info(sdp, "Replaying 0x%llx but we "
                                                "already have a bh!\n",
                                                (unsigned long long)blkno);
                                        fs_info(sdp, "busy:%d, pinned:%d\n",
                                                buffer_busy(rgd->rd_bits->bi_bh) ? 1 : 0,
                                                buffer_pinned(rgd->rd_bits->bi_bh));
                                        gfs2_dump_glock(NULL, rgd->rd_gl, true);
                                }
                        }
                        mark_buffer_dirty(bh_ip);
                }
                brelse(bh_log);
@@ -258,8 +258,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
        struct buffer_head *bh, *bhs[2];
        int num = 0;

        if (unlikely(gfs2_withdrawn(sdp)) &&
            (!sdp->sd_jdesc || gl != sdp->sd_jinode_gl)) {
        if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) {
                *bhp = NULL;
                return -EIO;
        }
@@ -317,7 +316,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,

int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
        if (unlikely(gfs2_withdrawn(sdp)))
        if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp))
                return -EIO;

        wait_on_buffer(bh);
@@ -328,7 +327,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
                gfs2_io_error_bh_wd(sdp, bh);
                return -EIO;
        }
        if (unlikely(gfs2_withdrawn(sdp)))
        if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp))
                return -EIO;

        return 0;
Some files were not shown because too many files have changed in this diff