Merge fd47ff55c9 ("Merge tag 'usb-5.15-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb") into android-mainline

Steps on the way to 5.15-rc1.

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I42ffa8818bbb2072f043923553c4d8f91d9647a5
Greg Kroah-Hartman
2021-09-14 14:42:51 +02:00
150 changed files with 33297 additions and 1301 deletions

View File

@@ -41,8 +41,7 @@ Description: This parameter controls the number of prefree segments to be
What: /sys/fs/f2fs/<disk>/main_blkaddr
Date: November 2019
Contact: "Ramon Pantin" <pantin@google.com>
Description:
Shows first block address of MAIN area.
Description: Shows first block address of MAIN area.
What: /sys/fs/f2fs/<disk>/ipu_policy
Date: November 2013
@@ -493,3 +492,23 @@ Contact: "Chao Yu" <yuchao0@huawei.com>
Description: When ATGC is on, it controls age threshold to bypass GCing young
candidates whose age is not beyond the threshold, by default it was
initialized as 604800 seconds (equals to 7 days).
What: /sys/fs/f2fs/<disk>/gc_reclaimed_segments
Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show how many segments have been reclaimed by GC during a specific
GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy,
3: GC idle AT, 4: GC urgent high, 5: GC urgent low)
You can re-initialize this value to "0".
What: /sys/fs/f2fs/<disk>/gc_segment_mode
Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can control for which GC mode the "gc_reclaimed_segments" node
shows its count. Refer to the description of the modes in "gc_reclaimed_segments".
What: /sys/fs/f2fs/<disk>/seq_file_ra_mul
Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can control the multiplier value of the bdi device readahead
window size, between 2 (default) and 256, for the POSIX_FADV_SEQUENTIAL advise option.
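
The two GC nodes above are meant to be used together: select the mode to
account via "gc_segment_mode", then poll "gc_reclaimed_segments". A minimal
userspace sketch (not part of this merge; the disk name "sdb" and mode 4 are
placeholder choices)::

  #include <stdio.h>

  int main(void)
  {
          FILE *f;
          unsigned long reclaimed;

          /* Account segments reclaimed in mode 4 (GC urgent high). */
          f = fopen("/sys/fs/f2fs/sdb/gc_segment_mode", "w");
          if (!f)
                  return 1;
          fprintf(f, "4\n");
          fclose(f);

          /* ... let GC run, then read the per-mode counter back. */
          f = fopen("/sys/fs/f2fs/sdb/gc_reclaimed_segments", "r");
          if (!f)
                  return 1;
          if (fscanf(f, "%lu", &reclaimed) == 1)
                  printf("segments reclaimed in GC urgent high: %lu\n",
                         reclaimed);
          fclose(f);

          /* The counter can be re-initialized by writing "0". */
          f = fopen("/sys/fs/f2fs/sdb/gc_reclaimed_segments", "w");
          if (f) {
                  fprintf(f, "0\n");
                  fclose(f);
          }
          return 0;
  }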

View File

@@ -122,7 +122,7 @@ on various other factors also like;
so the device should have enough free bytes available in its OOB/Spare
area to accommodate ECC for the entire page. In general the following
expression helps in determining if a given device can accommodate ECC syndrome:
"2 + (PAGESIZE / 512) * ECC_BYTES" >= OOBSIZE"
"2 + (PAGESIZE / 512) * ECC_BYTES" <= OOBSIZE"
where
OOBSIZE number of bytes in OOB/spare area
PAGESIZE number of bytes in main-area of device page
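
To make the corrected inequality concrete, a small standalone check
(illustrative only; the 2048/64/8 geometry is an assumed example device)::

  #include <stdio.h>

  int main(void)
  {
          /* Assumed example geometry: 2 KiB page, 64-byte OOB, 8 ECC bytes
           * per 512-byte step: 2 + (2048 / 512) * 8 = 34 <= 64, so it fits. */
          unsigned int pagesize = 2048, oobsize = 64, ecc_bytes = 8;
          unsigned int needed = 2 + (pagesize / 512) * ecc_bytes;

          printf("need %u OOB bytes, have %u: %s\n", needed, oobsize,
                 needed <= oobsize ? "OK" : "ECC does not fit");
          return 0;
  }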

View File

@@ -1,27 +0,0 @@
RedBoot FLASH Image System (FIS) Partitions
===========================================
The FLASH Image System (FIS) directory is a flash description
format closely associated with the RedBoot boot loader.
It uses one single flash eraseblock in the flash to store an index of
all images in the flash.
This block size will vary depending on flash but is typically
32 KB in size.
Required properties:
- compatible : (required) must be "redboot-fis"
- fis-index-block : (required) a index to the eraseblock containing
the FIS directory on this device. On a flash memory with 32KB
eraseblocks, 0 means the first eraseblock at 0x00000000, 1 means the
second eraseblock at 0x00008000 and so on.
Example:
flash@0 {
partitions {
compatible = "redboot-fis";
fis-index-block = <0>;
};
};

View File

@@ -0,0 +1,42 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/mtd/partitions/redboot-fis.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: RedBoot FLASH Image System (FIS) Partitions
description: The FLASH Image System (FIS) directory is a flash description
format closely associated with the RedBoot boot loader.
It uses one single flash eraseblock in the flash to store an index of
all images in the flash.
This block size will vary depending on flash but is typically
32 KB in size.
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
properties:
compatible:
const: redboot-fis
fis-index-block:
$ref: /schemas/types.yaml#/definitions/uint32
description: an index to the eraseblock containing the FIS directory on this
device. On a flash memory with 32KB eraseblocks, 0 means the first
eraseblock at 0x00000000, 1 means the second eraseblock at 0x00008000 and so on.
required:
- compatible
- fis-index-block
additionalProperties: false
examples:
- |
flash {
partitions {
compatible = "redboot-fis";
fis-index-block = <0>;
};
};

View File

@@ -185,6 +185,7 @@ fault_type=%d Support configuring fault injection type, should be
FAULT_KVMALLOC 0x000000002
FAULT_PAGE_ALLOC 0x000000004
FAULT_PAGE_GET 0x000000008
FAULT_ALLOC_BIO 0x000000010 (obsolete)
FAULT_ALLOC_NID 0x000000020
FAULT_ORPHAN 0x000000040
FAULT_BLOCK 0x000000080
@@ -195,6 +196,7 @@ fault_type=%d Support configuring fault injection type, should be
FAULT_CHECKPOINT 0x000001000
FAULT_DISCARD 0x000002000
FAULT_WRITE_IO 0x000004000
FAULT_SLAB_ALLOC 0x000008000
=================== ===========
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
@@ -312,6 +314,14 @@ inlinecrypt When possible, encrypt/decrypt the contents of encrypted
Documentation/block/inline-encryption.rst.
atgc Enable age-threshold garbage collection, it provides high
effectiveness and efficiency on background GC.
discard_unit=%s Control the discard unit; the argument can be "block", "segment"
or "section". The issued discard command's offset/size will be
aligned to the unit. By default, "discard_unit=block" is set,
so that small discard functionality is enabled.
For a blkzoned device, "discard_unit=section" will be set by
default; it is helpful for large-sized SMR or ZNS devices to
reduce memory cost by getting rid of the fs metadata that
supports small discard. See the mount sketch after this table.
======================== ============================================================
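
As referenced in the "discard_unit" entry above, a minimal sketch of passing
the option at mount time (the device and mount point are placeholders; this
needs CAP_SYS_ADMIN)::

  #include <stdio.h>
  #include <sys/mount.h>

  int main(void)
  {
          /* Mount an f2fs volume with section-granularity discard,
           * as suggested above for zoned (SMR/ZNS) devices. */
          if (mount("/dev/sdb1", "/mnt/f2fs", "f2fs", 0,
                    "discard_unit=section") != 0) {
                  perror("mount");
                  return 1;
          }
          return 0;
  }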
Debugfs Entries
@@ -857,8 +867,11 @@ Compression implementation
directly in order to guarantee potential data updates later to the space.
Instead, the main goal is to reduce data writes to flash disk as much as
possible, resulting in extending disk life time as well as relaxing IO
congestion. Alternatively, we've added ioctl interface to reclaim compressed
space and show it to user after putting the immutable bit.
congestion. Alternatively, we've added an ioctl(F2FS_IOC_RELEASE_COMPRESS_BLOCKS)
interface to reclaim compressed space and show it to the user after setting the
immutable bit. After the release, the immutable bit forbids writing/mmapping
the file until compressed space is reserved back via
ioctl(F2FS_IOC_RESERVE_COMPRESS_BLOCKS) or the file size is truncated to zero.
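
A hedged userspace sketch of the two ioctls described above (assuming the
uapi header <linux/f2fs.h> that defines them is installed, and that the file
lives on a compression-enabled f2fs volume)::

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/f2fs.h>

  int main(int argc, char **argv)
  {
          unsigned long long blocks = 0;
          int fd;

          if (argc != 2)
                  return 1;
          /* Both ioctls require the file to be open for writing. */
          fd = open(argv[1], O_RDWR);
          if (fd < 0)
                  return 1;

          /* Reclaim the saved blocks; the file turns immutable afterwards. */
          if (ioctl(fd, F2FS_IOC_RELEASE_COMPRESS_BLOCKS, &blocks) == 0)
                  printf("released %llu blocks\n", blocks);

          /* Reserve them back so the file becomes writable again. */
          if (ioctl(fd, F2FS_IOC_RESERVE_COMPRESS_BLOCKS, &blocks) == 0)
                  printf("reserved %llu blocks\n", blocks);

          close(fd);
          return 0;
  }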
Compress metadata layout::

View File

@@ -101,6 +101,7 @@ Documentation for filesystem implementations.
nilfs2
nfs/index
ntfs
ntfs3
ocfs2
ocfs2-online-filecheck
omfs

View File

@@ -0,0 +1,106 @@
.. SPDX-License-Identifier: GPL-2.0
=====
NTFS3
=====
Summary and Features
====================
NTFS3 is a fully functional NTFS read-write driver. The driver works with
NTFS versions up to 3.1, normal/compressed/sparse files
and journal replaying. The file system type to use on mount is 'ntfs3'.
- This driver implements NTFS read/write support for normal, sparse and
compressed files.
- Supports native journal replaying;
- Supports extended attributes
Predefined extended attributes:
- 'system.ntfs_security' gets/sets security
descriptor (SECURITY_DESCRIPTOR_RELATIVE)
- 'system.ntfs_attrib' gets/sets ntfs file/dir attributes.
Note: applied to empty files, this allows switching the type between
sparse (0x200), compressed (0x800) and normal (see the sketch after this list);
- Supports NFS export of mounted NTFS volumes.
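
As referenced in the feature list, a minimal sketch of reading the predefined
'system.ntfs_attrib' xattr on a mounted ntfs3 volume (the 0x200/0x800 bits
follow the note above; error handling kept short)::

  #include <stdio.h>
  #include <stdint.h>
  #include <sys/xattr.h>

  int main(int argc, char **argv)
  {
          uint32_t attrib;
          ssize_t n;

          if (argc != 2)
                  return 1;
          /* The attribute value is a 32-bit FILE_ATTRIBUTE_* bitmask. */
          n = getxattr(argv[1], "system.ntfs_attrib", &attrib, sizeof(attrib));
          if (n != (ssize_t)sizeof(attrib)) {
                  perror("getxattr");
                  return 1;
          }
          printf("ntfs attributes: 0x%x%s%s\n", attrib,
                 attrib & 0x200 ? " sparse" : "",
                 attrib & 0x800 ? " compressed" : "");
          return 0;
  }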
Mount Options
=============
The list below describes mount options supported by the NTFS3 driver in addition
to the generic ones.
===============================================================================
nls=name This option informs the driver how to interpret path
strings and translate them to Unicode and back. If
this option is not set, the default codepage will be
used (CONFIG_NLS_DEFAULT).
Examples:
'nls=utf8'
uid=
gid=
umask= Controls the default permissions for files/directories created
after the NTFS volume is mounted.
fmask=
dmask= Instead of specifying umask which applies both to
files and directories, fmask applies only to files and
dmask only to directories.
nohidden Files with the Windows-specific HIDDEN (FILE_ATTRIBUTE_HIDDEN)
attribute will not be shown under Linux.
sys_immutable Files with the Windows-specific SYSTEM
(FILE_ATTRIBUTE_SYSTEM) attribute will be marked as system
immutable files.
discard Enable support of the TRIM command for improved performance
on delete operations, which is recommended for use with
solid-state drives (SSDs).
force Forces the driver to mount partitions even if 'dirty' flag
(volume dirty) is set. Not recommended for use.
sparse Create new files as "sparse".
showmeta Use this parameter to show all meta-files (System Files) on
a mounted NTFS partition.
By default, all meta-files are hidden.
prealloc Preallocate space for files excessively when file size is
increasing on writes. Decreases fragmentation in case of
parallel write operations to different files.
no_acs_rules "No access rules" mount option sets access rights for
files/folders to 777 and owner/group to root. This mount
option absorbs all other permissions:
- permissions change for files/folders will be reported
as successful, but they will remain 777;
- owner/group change will be reported as successful, but
they will stay as root
acl Support POSIX ACLs (Access Control Lists). Effective if
supported by Kernel. Not to be confused with NTFS ACLs.
The option specified as acl enables support for POSIX ACLs.
noatime All files and directories will not update their last access
time attribute if a partition is mounted with this parameter.
This option can speed up file system operation.
===============================================================================
ToDo list
=========
- Full journaling support (currently journal replaying is supported) over JBD.
References
==========
https://www.paragon-software.com/home/ntfs-linux-professional/
- Commercial version of the NTFS driver for Linux.
almaz.alexandrovich@paragon-software.com
- Direct e-mail address for feedback and requests on the NTFS3 implementation.

View File

@@ -1496,7 +1496,7 @@ F: drivers/amba/
F: include/linux/amba/bus.h
ARM PRIMECELL PL35X NAND CONTROLLER DRIVER
M: Miquel Raynal <miquel.raynal@bootlin.com@bootlin.com>
M: Miquel Raynal <miquel.raynal@bootlin.com>
M: Naga Sureshkumar Relli <nagasure@xilinx.com>
L: linux-mtd@lists.infradead.org
S: Maintained
@@ -1504,7 +1504,7 @@ F: Documentation/devicetree/bindings/mtd/arm,pl353-nand-r2p1.yaml
F: drivers/mtd/nand/raw/pl35x-nand-controller.c
ARM PRIMECELL PL35X SMC DRIVER
M: Miquel Raynal <miquel.raynal@bootlin.com@bootlin.com>
M: Miquel Raynal <miquel.raynal@bootlin.com>
M: Naga Sureshkumar Relli <nagasure@xilinx.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
@@ -13353,6 +13353,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/aia21/ntfs.git
F: Documentation/filesystems/ntfs.rst
F: fs/ntfs/
NTFS3 FILESYSTEM
M: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
L: ntfs3@lists.linux.dev
S: Supported
W: http://www.paragon-software.com/
T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git
F: Documentation/filesystems/ntfs3.rst
F: fs/ntfs3/
NUBUS SUBSYSTEM
M: Finn Thain <fthain@linux-m68k.org>
L: linux-m68k@lists.linux-m68k.org

View File

@@ -202,8 +202,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
MAP_32BIT,
MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
fd_offset);
if (error != N_TXTADDR(ex))
@@ -211,8 +210,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
MAP_32BIT,
MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
fd_offset + ex.a_text);
if (error != N_DATADDR(ex))
return error;
@@ -293,7 +291,7 @@ static int load_aout_library(struct file *file)
/* Now use mmap to map the library into memory. */
error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
N_TXTOFF(ex));
retval = error;
if (error != start_addr)

View File

@@ -45,10 +45,9 @@ config MTD_BLOCK
on RAM chips in this manner. This block device is a user of MTD
devices performing that function.
At the moment, it is also required for the Journalling Flash File
System(s) to obtain a handle on the MTD device when it's mounted
(although JFFS and JFFS2 don't actually use any of the functionality
of the mtdblock device).
Note that mounting a JFFS2 filesystem doesn't require using mtdblock.
It's possible to mount a rootfs using the MTD device on the "root="
bootargs as "root=mtd2" or "root=mtd:name_of_device".
Later, it may be extended to perform read/erase/modify/write cycles
on flash chips to emulate a smaller block size. Needless to say,
@@ -70,6 +69,9 @@ config MTD_BLOCK_RO
You do not need this option for use with the DiskOnChip devices. For
those, enable NFTL support (CONFIG_NFTL) instead.
comment "Note that in some cases UBI block is preferred. See MTD_UBI_BLOCK."
depends on MTD_BLOCK || MTD_BLOCK_RO
config FTL
tristate "FTL (Flash Translation Layer) support"
depends on BLOCK

View File

@@ -1029,7 +1029,7 @@ static void ftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
partition->mbd.tr = tr;
partition->mbd.devnum = -1;
if (!add_mtd_blktrans_dev((void *)partition))
if (!add_mtd_blktrans_dev(&partition->mbd))
return;
}

View File

@@ -127,29 +127,6 @@ config MTD_PHYSMAP_GPIO_ADDR
Extend the physmap driver to allow flashes to be partially
physically addressed and assisted by GPIOs.
config MTD_PMC_MSP_EVM
tristate "CFI Flash device mapped on PMC-Sierra MSP"
depends on PMC_MSP && MTD_CFI
help
This provides a 'mapping' driver which supports the way
in which user-programmable flash chips are connected on the
PMC-Sierra MSP eval/demo boards.
choice
prompt "Maximum mappable memory available for flash IO"
depends on MTD_PMC_MSP_EVM
default MSP_FLASH_MAP_LIMIT_32M
config MSP_FLASH_MAP_LIMIT_32M
bool "32M"
endchoice
config MSP_FLASH_MAP_LIMIT
hex
default "0x02000000"
depends on MSP_FLASH_MAP_LIMIT_32M
config MTD_SUN_UFLASH
tristate "Sun Microsystems userflash support"
depends on SPARC && MTD_CFI && PCI

View File

@@ -25,7 +25,6 @@ physmap-objs-$(CONFIG_MTD_PHYSMAP_IXP4XX) += physmap-ixp4xx.o
physmap-objs := $(physmap-objs-y)
obj-$(CONFIG_MTD_PHYSMAP) += physmap.o
obj-$(CONFIG_MTD_PISMO) += pismo.o
obj-$(CONFIG_MTD_PMC_MSP_EVM) += pmcmsp-flash.o
obj-$(CONFIG_MTD_PCMCIA) += pcmciamtd.o
obj-$(CONFIG_MTD_SA1100) += sa1100-flash.o
obj-$(CONFIG_MTD_SBC_GXX) += sbc_gxx.o

View File

@@ -1,227 +0,0 @@
/*
* Mapping of a custom board with both AMD CFI and JEDEC flash in partitions.
* Config with both CFI and JEDEC device support.
*
* Basically physmap.c with the addition of partitions and
* an array of mapping info to accommodate more than one flash type per board.
*
* Copyright 2005-2007 PMC-Sierra, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at your
* option) any later version.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
* NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/map.h>
#include <linux/mtd/partitions.h>
#include <asm/io.h>
#include <msp_prom.h>
#include <msp_regs.h>
static struct mtd_info **msp_flash;
static struct mtd_partition **msp_parts;
static struct map_info *msp_maps;
static int fcnt;
#define DEBUG_MARKER printk(KERN_NOTICE "%s[%d]\n", __func__, __LINE__)
static int __init init_msp_flash(void)
{
int i, j, ret = -ENOMEM;
int offset, coff;
char *env;
int pcnt;
char flash_name[] = "flash0";
char part_name[] = "flash0_0";
unsigned addr, size;
/* If ELB is disabled by "ful-mux" mode, we can't get at flash */
if ((*DEV_ID_REG & DEV_ID_SINGLE_PC) &&
(*ELB_1PC_EN_REG & SINGLE_PCCARD)) {
printk(KERN_NOTICE "Single PC Card mode: no flash access\n");
return -ENXIO;
}
/* examine the prom environment for flash devices */
for (fcnt = 0; (env = prom_getenv(flash_name)); fcnt++)
flash_name[5] = '0' + fcnt + 1;
if (fcnt < 1)
return -ENXIO;
printk(KERN_NOTICE "Found %d PMC flash devices\n", fcnt);
msp_flash = kcalloc(fcnt, sizeof(*msp_flash), GFP_KERNEL);
if (!msp_flash)
return -ENOMEM;
msp_parts = kcalloc(fcnt, sizeof(*msp_parts), GFP_KERNEL);
if (!msp_parts)
goto free_msp_flash;
msp_maps = kcalloc(fcnt, sizeof(*msp_maps), GFP_KERNEL);
if (!msp_maps)
goto free_msp_parts;
/* loop over the flash devices, initializing each */
for (i = 0; i < fcnt; i++) {
/* examine the prom environment for flash partititions */
part_name[5] = '0' + i;
part_name[7] = '0';
for (pcnt = 0; (env = prom_getenv(part_name)); pcnt++)
part_name[7] = '0' + pcnt + 1;
if (pcnt == 0) {
printk(KERN_NOTICE "Skipping flash device %d "
"(no partitions defined)\n", i);
continue;
}
msp_parts[i] = kcalloc(pcnt, sizeof(struct mtd_partition),
GFP_KERNEL);
if (!msp_parts[i])
goto cleanup_loop;
/* now initialize the devices proper */
flash_name[5] = '0' + i;
env = prom_getenv(flash_name);
if (sscanf(env, "%x:%x", &addr, &size) < 2) {
ret = -ENXIO;
kfree(msp_parts[i]);
goto cleanup_loop;
}
addr = CPHYSADDR(addr);
printk(KERN_NOTICE
"MSP flash device \"%s\": 0x%08x at 0x%08x\n",
flash_name, size, addr);
/* This must matchs the actual size of the flash chip */
msp_maps[i].size = size;
msp_maps[i].phys = addr;
/*
* Platforms have a specific limit of the size of memory
* which may be mapped for flash:
*/
if (size > CONFIG_MSP_FLASH_MAP_LIMIT)
size = CONFIG_MSP_FLASH_MAP_LIMIT;
msp_maps[i].virt = ioremap(addr, size);
if (msp_maps[i].virt == NULL) {
ret = -ENXIO;
kfree(msp_parts[i]);
goto cleanup_loop;
}
msp_maps[i].bankwidth = 1;
msp_maps[i].name = kstrndup(flash_name, 7, GFP_KERNEL);
if (!msp_maps[i].name) {
iounmap(msp_maps[i].virt);
kfree(msp_parts[i]);
goto cleanup_loop;
}
for (j = 0; j < pcnt; j++) {
part_name[5] = '0' + i;
part_name[7] = '0' + j;
env = prom_getenv(part_name);
if (sscanf(env, "%x:%x:%n", &offset, &size,
&coff) < 2) {
ret = -ENXIO;
kfree(msp_maps[i].name);
iounmap(msp_maps[i].virt);
kfree(msp_parts[i]);
goto cleanup_loop;
}
msp_parts[i][j].size = size;
msp_parts[i][j].offset = offset;
msp_parts[i][j].name = env + coff;
}
/* now probe and add the device */
simple_map_init(&msp_maps[i]);
msp_flash[i] = do_map_probe("cfi_probe", &msp_maps[i]);
if (msp_flash[i]) {
msp_flash[i]->owner = THIS_MODULE;
mtd_device_register(msp_flash[i], msp_parts[i], pcnt);
} else {
printk(KERN_ERR "map probe failed for flash\n");
ret = -ENXIO;
kfree(msp_maps[i].name);
iounmap(msp_maps[i].virt);
kfree(msp_parts[i]);
goto cleanup_loop;
}
}
return 0;
cleanup_loop:
while (i--) {
mtd_device_unregister(msp_flash[i]);
map_destroy(msp_flash[i]);
kfree(msp_maps[i].name);
iounmap(msp_maps[i].virt);
kfree(msp_parts[i]);
}
kfree(msp_maps);
free_msp_parts:
kfree(msp_parts);
free_msp_flash:
kfree(msp_flash);
return ret;
}
static void __exit cleanup_msp_flash(void)
{
int i;
for (i = 0; i < fcnt; i++) {
mtd_device_unregister(msp_flash[i]);
map_destroy(msp_flash[i]);
iounmap((void *)msp_maps[i].virt);
/* free the memory */
kfree(msp_maps[i].name);
kfree(msp_parts[i]);
}
kfree(msp_flash);
kfree(msp_parts);
kfree(msp_maps);
}
MODULE_AUTHOR("PMC-Sierra, Inc");
MODULE_DESCRIPTION("MTD map driver for PMC-Sierra MSP boards");
MODULE_LICENSE("GPL");
module_init(init_msp_flash);
module_exit(cleanup_msp_flash);

View File

@@ -23,7 +23,6 @@
#include "mtdcore.h"
static LIST_HEAD(blktrans_majors);
static DEFINE_MUTEX(blktrans_ref_mutex);
static void blktrans_dev_release(struct kref *kref)
{
@@ -37,26 +36,9 @@ static void blktrans_dev_release(struct kref *kref)
kfree(dev);
}
static struct mtd_blktrans_dev *blktrans_dev_get(struct gendisk *disk)
{
struct mtd_blktrans_dev *dev;
mutex_lock(&blktrans_ref_mutex);
dev = disk->private_data;
if (!dev)
goto unlock;
kref_get(&dev->ref);
unlock:
mutex_unlock(&blktrans_ref_mutex);
return dev;
}
static void blktrans_dev_put(struct mtd_blktrans_dev *dev)
{
mutex_lock(&blktrans_ref_mutex);
kref_put(&dev->ref, blktrans_dev_release);
mutex_unlock(&blktrans_ref_mutex);
}
@@ -201,19 +183,16 @@ static blk_status_t mtd_queue_rq(struct blk_mq_hw_ctx *hctx,
static int blktrans_open(struct block_device *bdev, fmode_t mode)
{
struct mtd_blktrans_dev *dev = blktrans_dev_get(bdev->bd_disk);
struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data;
int ret = 0;
if (!dev)
return -ERESTARTSYS; /* FIXME: busy loop! -arnd*/
kref_get(&dev->ref);
mutex_lock(&mtd_table_mutex);
mutex_lock(&dev->lock);
if (dev->open)
goto unlock;
kref_get(&dev->ref);
__module_get(dev->tr->owner);
if (!dev->mtd)
@@ -233,8 +212,6 @@ static int blktrans_open(struct block_device *bdev, fmode_t mode)
unlock:
dev->open++;
mutex_unlock(&dev->lock);
mutex_unlock(&mtd_table_mutex);
blktrans_dev_put(dev);
return ret;
error_release:
@@ -242,27 +219,20 @@ error_release:
dev->tr->release(dev);
error_put:
module_put(dev->tr->owner);
kref_put(&dev->ref, blktrans_dev_release);
mutex_unlock(&dev->lock);
mutex_unlock(&mtd_table_mutex);
blktrans_dev_put(dev);
return ret;
}
static void blktrans_release(struct gendisk *disk, fmode_t mode)
{
struct mtd_blktrans_dev *dev = blktrans_dev_get(disk);
struct mtd_blktrans_dev *dev = disk->private_data;
if (!dev)
return;
mutex_lock(&mtd_table_mutex);
mutex_lock(&dev->lock);
if (--dev->open)
goto unlock;
kref_put(&dev->ref, blktrans_dev_release);
module_put(dev->tr->owner);
if (dev->mtd) {
@@ -272,18 +242,14 @@ static void blktrans_release(struct gendisk *disk, fmode_t mode)
}
unlock:
mutex_unlock(&dev->lock);
mutex_unlock(&mtd_table_mutex);
blktrans_dev_put(dev);
}
static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
struct mtd_blktrans_dev *dev = blktrans_dev_get(bdev->bd_disk);
struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data;
int ret = -ENXIO;
if (!dev)
return ret;
mutex_lock(&dev->lock);
if (!dev->mtd)
@@ -292,7 +258,6 @@ static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo)
ret = dev->tr->getgeo ? dev->tr->getgeo(dev, geo) : -ENOTTY;
unlock:
mutex_unlock(&dev->lock);
blktrans_dev_put(dev);
return ret;
}
@@ -315,12 +280,8 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
struct gendisk *gd;
int ret;
if (mutex_trylock(&mtd_table_mutex)) {
mutex_unlock(&mtd_table_mutex);
BUG();
}
lockdep_assert_held(&mtd_table_mutex);
mutex_lock(&blktrans_ref_mutex);
list_for_each_entry(d, &tr->devs, list) {
if (new->devnum == -1) {
/* Use first free number */
@@ -332,7 +293,6 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
}
} else if (d->devnum == new->devnum) {
/* Required number taken */
mutex_unlock(&blktrans_ref_mutex);
return -EBUSY;
} else if (d->devnum > new->devnum) {
/* Required number was free */
@@ -350,14 +310,11 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
* minor numbers and that the disk naming code below can cope
* with this number. */
if (new->devnum > (MINORMASK >> tr->part_bits) ||
(tr->part_bits && new->devnum >= 27 * 26)) {
mutex_unlock(&blktrans_ref_mutex);
(tr->part_bits && new->devnum >= 27 * 26))
return ret;
}
list_add_tail(&new->list, &tr->devs);
added:
mutex_unlock(&blktrans_ref_mutex);
mutex_init(&new->lock);
kref_init(&new->ref);
@@ -449,10 +406,7 @@ int del_mtd_blktrans_dev(struct mtd_blktrans_dev *old)
{
unsigned long flags;
if (mutex_trylock(&mtd_table_mutex)) {
mutex_unlock(&mtd_table_mutex);
BUG();
}
lockdep_assert_held(&mtd_table_mutex);
if (old->disk_attributes)
sysfs_remove_group(&disk_to_dev(old->disk)->kobj,

View File

@@ -322,6 +322,10 @@ static void mtdblock_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
if (!(mtd->flags & MTD_WRITEABLE))
dev->mbd.readonly = 1;
if (mtd_type_is_nand(mtd))
pr_warn("%s: MTD device '%s' is NAND, please consider using UBI block devices instead.\n",
tr->name, mtd->name);
if (add_mtd_blktrans_dev(&dev->mbd))
kfree(dev);
}

View File

@@ -46,6 +46,10 @@ static void mtdblock_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
dev->tr = tr;
dev->readonly = 1;
if (mtd_type_is_nand(mtd))
pr_warn("%s: MTD device '%s' is NAND, please consider using UBI block devices instead.\n",
tr->name, mtd->name);
if (add_mtd_blktrans_dev(dev))
kfree(dev);
}

View File

@@ -641,6 +641,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c
int i;
size_t size;
struct mtd_concat *concat;
struct mtd_info *subdev_master = NULL;
uint32_t max_erasesize, curr_erasesize;
int num_erase_region;
int max_writebufsize = 0;
@@ -679,18 +680,24 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c
concat->mtd.subpage_sft = subdev[0]->subpage_sft;
concat->mtd.oobsize = subdev[0]->oobsize;
concat->mtd.oobavail = subdev[0]->oobavail;
if (subdev[0]->_writev)
subdev_master = mtd_get_master(subdev[0]);
if (subdev_master->_writev)
concat->mtd._writev = concat_writev;
if (subdev[0]->_read_oob)
if (subdev_master->_read_oob)
concat->mtd._read_oob = concat_read_oob;
if (subdev[0]->_write_oob)
if (subdev_master->_write_oob)
concat->mtd._write_oob = concat_write_oob;
if (subdev[0]->_block_isbad)
if (subdev_master->_block_isbad)
concat->mtd._block_isbad = concat_block_isbad;
if (subdev[0]->_block_markbad)
if (subdev_master->_block_markbad)
concat->mtd._block_markbad = concat_block_markbad;
if (subdev[0]->_panic_write)
if (subdev_master->_panic_write)
concat->mtd._panic_write = concat_panic_write;
if (subdev_master->_read)
concat->mtd._read = concat_read;
if (subdev_master->_write)
concat->mtd._write = concat_write;
concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks;
@@ -721,14 +728,22 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c
subdev[i]->flags & MTD_WRITEABLE;
}
subdev_master = mtd_get_master(subdev[i]);
concat->mtd.size += subdev[i]->size;
concat->mtd.ecc_stats.badblocks +=
subdev[i]->ecc_stats.badblocks;
if (concat->mtd.writesize != subdev[i]->writesize ||
concat->mtd.subpage_sft != subdev[i]->subpage_sft ||
concat->mtd.oobsize != subdev[i]->oobsize ||
!concat->mtd._read_oob != !subdev[i]->_read_oob ||
!concat->mtd._write_oob != !subdev[i]->_write_oob) {
!concat->mtd._read_oob != !subdev_master->_read_oob ||
!concat->mtd._write_oob != !subdev_master->_write_oob) {
/*
* Check against subdev[i] for data members, because
* subdev's attributes may be different from master
* mtd device. Check against subdev's master mtd
* device for callbacks, because the existence of
* subdev's callbacks is decided by master mtd device.
*/
kfree(concat);
printk("Incompatible OOB or ECC data on \"%s\"\n",
subdev[i]->name);
@@ -744,8 +759,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c
concat->mtd.name = name;
concat->mtd._erase = concat_erase;
concat->mtd._read = concat_read;
concat->mtd._write = concat_write;
concat->mtd._sync = concat_sync;
concat->mtd._lock = concat_lock;
concat->mtd._unlock = concat_unlock;

View File

@@ -480,9 +480,9 @@ config MTD_NAND_RICOH
select MTD_SM_COMMON
help
Enable support for Ricoh R5C852 xD card reader
You also need to enable ether
You also need to enable either
NAND SSFDC (SmartMedia) read only translation layer' or new
expermental, readwrite
experimental, readwrite
'SmartMedia/xD new translation layer'
config MTD_NAND_DISKONCHIP

View File

@@ -751,7 +751,7 @@ static int cafe_nand_probe(struct pci_dev *pdev,
"CAFE NAND", mtd);
if (err) {
dev_warn(&pdev->dev, "Could not register IRQ %d\n", pdev->irq);
goto out_ior;
goto out_free_rs;
}
/* Disable master reset, enable NAND clock */
@@ -795,6 +795,8 @@ static int cafe_nand_probe(struct pci_dev *pdev,
/* Disable NAND IRQ in global IRQ mask register */
cafe_writel(cafe, ~1 & cafe_readl(cafe, GLOBAL_IRQ_MASK), GLOBAL_IRQ_MASK);
free_irq(pdev->irq, mtd);
out_free_rs:
free_rs(cafe->rs);
out_ior:
pci_iounmap(pdev, cafe->mmio);
out_free_mtd:

View File

@@ -631,19 +631,26 @@ static int ebu_nand_probe(struct platform_device *pdev)
ebu_host->clk_rate = clk_get_rate(ebu_host->clk);
ebu_host->dma_tx = dma_request_chan(dev, "tx");
if (IS_ERR(ebu_host->dma_tx))
return dev_err_probe(dev, PTR_ERR(ebu_host->dma_tx),
if (IS_ERR(ebu_host->dma_tx)) {
ret = dev_err_probe(dev, PTR_ERR(ebu_host->dma_tx),
"failed to request DMA tx chan!.\n");
goto err_disable_unprepare_clk;
}
ebu_host->dma_rx = dma_request_chan(dev, "rx");
if (IS_ERR(ebu_host->dma_rx))
return dev_err_probe(dev, PTR_ERR(ebu_host->dma_rx),
if (IS_ERR(ebu_host->dma_rx)) {
ret = dev_err_probe(dev, PTR_ERR(ebu_host->dma_rx),
"failed to request DMA rx chan!.\n");
ebu_host->dma_rx = NULL;
goto err_cleanup_dma;
}
resname = devm_kasprintf(dev, GFP_KERNEL, "addr_sel%d", cs);
res = platform_get_resource_byname(pdev, IORESOURCE_MEM, resname);
if (!res)
return -EINVAL;
if (!res) {
ret = -EINVAL;
goto err_cleanup_dma;
}
ebu_host->cs[cs].addr_sel = res->start;
writel(ebu_host->cs[cs].addr_sel | EBU_ADDR_MASK(5) | EBU_ADDR_SEL_REGEN,
ebu_host->ebu + EBU_ADDR_SEL(cs));
@@ -653,7 +660,8 @@ static int ebu_nand_probe(struct platform_device *pdev)
mtd = nand_to_mtd(&ebu_host->chip);
if (!mtd->name) {
dev_err(ebu_host->dev, "NAND label property is mandatory\n");
return -EINVAL;
ret = -EINVAL;
goto err_cleanup_dma;
}
mtd->dev.parent = dev;
@@ -681,6 +689,7 @@ err_clean_nand:
nand_cleanup(&ebu_host->chip);
err_cleanup_dma:
ebu_dma_cleanup(ebu_host);
err_disable_unprepare_clk:
clk_disable_unprepare(ebu_host->clk);
return ret;

View File

@@ -580,7 +580,7 @@ static int meson_nfc_rw_cmd_prepare_and_execute(struct nand_chip *nand,
u32 *addrs = nfc->cmdfifo.rw.addrs;
u32 cs = nfc->param.chip_select;
u32 cmd0, cmd_num, row_start;
int ret = 0, i;
int i;
cmd_num = sizeof(struct nand_rw_cmd) / sizeof(int);
@@ -620,7 +620,7 @@ static int meson_nfc_rw_cmd_prepare_and_execute(struct nand_chip *nand,
meson_nfc_cmd_idle(nfc, nfc->timing.tadl);
}
return ret;
return 0;
}
static int meson_nfc_write_page_sub(struct nand_chip *nand,

View File

@@ -447,6 +447,35 @@ static int scan_block_fast(struct nand_chip *this, struct nand_bbt_descr *bd,
return 0;
}
/* Check if a potential BBT block is marked as bad */
static int bbt_block_checkbad(struct nand_chip *this, struct nand_bbt_descr *td,
loff_t offs, uint8_t *buf)
{
struct nand_bbt_descr *bd = this->badblock_pattern;
/*
* No need to check for a bad BBT block if the BBM area overlaps with
* the bad block table marker area in OOB since writing a BBM here
* invalidates the bad block table marker anyway.
*/
if (!(td->options & NAND_BBT_NO_OOB) &&
td->offs >= bd->offs && td->offs < bd->offs + bd->len)
return 0;
/*
* There is no point in checking for a bad block marker if writing
* such marker is not supported
*/
if (this->bbt_options & NAND_BBT_NO_OOB_BBM ||
this->options & NAND_NO_BBM_QUIRK)
return 0;
if (scan_block_fast(this, bd, offs, buf) > 0)
return 1;
return 0;
}
/**
* create_bbt - [GENERIC] Create a bad block table by scanning the device
* @this: NAND chip object
@@ -560,6 +589,10 @@ static int search_bbt(struct nand_chip *this, uint8_t *buf,
int actblock = startblock + dir * block;
loff_t offs = (loff_t)actblock << this->bbt_erase_shift;
/* Check if block is marked bad */
if (bbt_block_checkbad(this, td, offs, buf))
continue;
/* Read first page */
scan_read(this, buf, offs, mtd->writesize, td);
if (!check_pattern(buf, scanlen, mtd->writesize, td)) {

View File

@@ -911,7 +911,7 @@ static int omap_correct_data(struct nand_chip *chip, u_char *dat,
}
/**
* omap_calcuate_ecc - Generate non-inverted ECC bytes.
* omap_calculate_ecc - Generate non-inverted ECC bytes.
* @chip: NAND chip object
* @dat: The pointer to data on which ecc is computed
* @ecc_code: The ecc_code buffer

View File

@@ -288,6 +288,8 @@ static int spinand_ondie_ecc_prepare_io_req(struct nand_device *nand,
struct spinand_device *spinand = nand_to_spinand(nand);
bool enable = (req->mode != MTD_OPS_RAW);
memset(spinand->oobbuf, 0xff, nanddev_per_page_oobsize(nand));
/* Only enable or disable the engine */
return spinand_ecc_enable(spinand, enable);
}
@@ -307,7 +309,7 @@ static int spinand_ondie_ecc_finish_io_req(struct nand_device *nand,
if (req->type == NAND_PAGE_WRITE)
return 0;
/* Finish a page write: check the status, report errors/bitflips */
/* Finish a page read: check the status, report errors/bitflips */
ret = spinand_check_ecc_status(spinand, engine_conf->status);
if (ret == -EBADMSG)
mtd->ecc_stats.failed++;

View File

@@ -126,7 +126,7 @@ static const struct spinand_info macronix_spinand_table[] = {
SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
&write_cache_variants,
&update_cache_variants),
0,
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&mx35lfxge4ab_ooblayout,
mx35lf1ge4ab_ecc_get_status)),
SPINAND_INFO("MX35LF4GE4AD",
@@ -136,7 +136,7 @@ static const struct spinand_info macronix_spinand_table[] = {
SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
&write_cache_variants,
&update_cache_variants),
0,
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&mx35lfxge4ab_ooblayout,
mx35lf1ge4ab_ecc_get_status)),
SPINAND_INFO("MX35LF1G24AD",
@@ -146,16 +146,16 @@ static const struct spinand_info macronix_spinand_table[] = {
SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
&write_cache_variants,
&update_cache_variants),
0,
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&mx35lfxge4ab_ooblayout, NULL)),
SPINAND_INFO("MX35LF2G24AD",
SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x24),
NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 2, 1, 1),
NAND_ECCREQ(8, 512),
SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
&write_cache_variants,
&update_cache_variants),
0,
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&mx35lfxge4ab_ooblayout, NULL)),
SPINAND_INFO("MX35LF4G24AD",
SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x35),
@@ -164,7 +164,7 @@ static const struct spinand_info macronix_spinand_table[] = {
SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
&write_cache_variants,
&update_cache_variants),
0,
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&mx35lfxge4ab_ooblayout, NULL)),
SPINAND_INFO("MX31LF1GE4BC",
SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x1e),
@@ -173,7 +173,7 @@ static const struct spinand_info macronix_spinand_table[] = {
SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
&write_cache_variants,
&update_cache_variants),
0 /*SPINAND_HAS_QE_BIT*/,
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&mx35lfxge4ab_ooblayout,
mx35lf1ge4ab_ecc_get_status)),
SPINAND_INFO("MX31UF1GE4BC",
@@ -183,7 +183,7 @@ static const struct spinand_info macronix_spinand_table[] = {
SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
&write_cache_variants,
&update_cache_variants),
0 /*SPINAND_HAS_QE_BIT*/,
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&mx35lfxge4ab_ooblayout,
mx35lf1ge4ab_ecc_get_status)),

View File

@@ -239,7 +239,7 @@ err:
static int rfd_ftl_readsect(struct mtd_blktrans_dev *dev, u_long sector, char *buf)
{
struct partition *part = (struct partition*)dev;
struct partition *part = container_of(dev, struct partition, mbd);
u_long addr;
size_t retlen;
int rc;
@@ -600,7 +600,7 @@ static int find_free_sector(const struct partition *part, const struct block *bl
static int do_writesect(struct mtd_blktrans_dev *dev, u_long sector, char *buf, ulong *old_addr)
{
struct partition *part = (struct partition*)dev;
struct partition *part = container_of(dev, struct partition, mbd);
struct block *block;
u_long addr;
int i;
@@ -666,7 +666,7 @@ err:
static int rfd_ftl_writesect(struct mtd_blktrans_dev *dev, u_long sector, char *buf)
{
struct partition *part = (struct partition*)dev;
struct partition *part = container_of(dev, struct partition, mbd);
u_long old_addr;
int i;
int rc = 0;
@@ -705,9 +705,37 @@ err:
return rc;
}
static int rfd_ftl_discardsect(struct mtd_blktrans_dev *dev,
unsigned long sector, unsigned int nr_sects)
{
struct partition *part = container_of(dev, struct partition, mbd);
u_long addr;
int rc;
while (nr_sects) {
if (sector >= part->sector_count)
return -EIO;
addr = part->sector_map[sector];
if (addr != -1) {
rc = mark_sector_deleted(part, addr);
if (rc)
return rc;
part->sector_map[sector] = -1;
}
sector++;
nr_sects--;
}
return 0;
}
static int rfd_ftl_getgeo(struct mtd_blktrans_dev *dev, struct hd_geometry *geo)
{
struct partition *part = (struct partition*)dev;
struct partition *part = container_of(dev, struct partition, mbd);
geo->heads = 1;
geo->sectors = SECTORS_PER_TRACK;
@@ -720,7 +748,8 @@ static void rfd_ftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
{
struct partition *part;
if (mtd->type != MTD_NORFLASH || mtd->size > UINT_MAX)
if ((mtd->type != MTD_NORFLASH && mtd->type != MTD_RAM) ||
mtd->size > UINT_MAX)
return;
part = kzalloc(sizeof(struct partition), GFP_KERNEL);
@@ -754,7 +783,7 @@ static void rfd_ftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
printk(KERN_INFO PREFIX "name: '%s' type: %d flags %x\n",
mtd->name, mtd->type, mtd->flags);
if (!add_mtd_blktrans_dev((void*)part))
if (!add_mtd_blktrans_dev(&part->mbd))
return;
}
out:
@@ -763,7 +792,7 @@ out:
static void rfd_ftl_remove_dev(struct mtd_blktrans_dev *dev)
{
struct partition *part = (struct partition*)dev;
struct partition *part = container_of(dev, struct partition, mbd);
int i;
for (i=0; i<part->total_blocks; i++) {
@@ -771,10 +800,10 @@ static void rfd_ftl_remove_dev(struct mtd_blktrans_dev *dev)
part->mbd.mtd->name, i, part->blocks[i].erases);
}
del_mtd_blktrans_dev(dev);
vfree(part->sector_map);
kfree(part->header_cache);
kfree(part->blocks);
del_mtd_blktrans_dev(&part->mbd);
}
static struct mtd_blktrans_ops rfd_ftl_tr = {
@@ -785,6 +814,7 @@ static struct mtd_blktrans_ops rfd_ftl_tr = {
.readsect = rfd_ftl_readsect,
.writesect = rfd_ftl_writesect,
.discard = rfd_ftl_discardsect,
.getgeo = rfd_ftl_getgeo,
.add_mtd = rfd_ftl_add_mtd,
.remove_dev = rfd_ftl_remove_dev,

View File

@@ -400,6 +400,7 @@ struct cp210x_special_chars {
};
/* CP210X_VENDOR_SPECIFIC values */
#define CP210X_GET_FW_VER 0x000E
#define CP210X_READ_2NCONFIG 0x000E
#define CP210X_GET_FW_VER_2N 0x0010
#define CP210X_READ_LATCH 0x00C2
@@ -638,7 +639,7 @@ static int cp210x_read_reg_block(struct usb_serial_port *port, u8 req,
result = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
req, REQTYPE_INTERFACE_TO_HOST, 0,
port_priv->bInterfaceNumber, dmabuf, bufsize,
USB_CTRL_SET_TIMEOUT);
USB_CTRL_GET_TIMEOUT);
if (result == bufsize) {
memcpy(buf, dmabuf, bufsize);
result = 0;
@@ -1145,33 +1146,6 @@ static void cp210x_disable_event_mode(struct usb_serial_port *port)
port_priv->event_mode = false;
}
static int cp210x_set_chars(struct usb_serial_port *port,
struct cp210x_special_chars *chars)
{
struct cp210x_port_private *port_priv = usb_get_serial_port_data(port);
struct usb_serial *serial = port->serial;
void *dmabuf;
int result;
dmabuf = kmemdup(chars, sizeof(*chars), GFP_KERNEL);
if (!dmabuf)
return -ENOMEM;
result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
CP210X_SET_CHARS, REQTYPE_HOST_TO_INTERFACE, 0,
port_priv->bInterfaceNumber,
dmabuf, sizeof(*chars), USB_CTRL_SET_TIMEOUT);
kfree(dmabuf);
if (result < 0) {
dev_err(&port->dev, "failed to set special chars: %d\n", result);
return result;
}
return 0;
}
static bool cp210x_termios_change(const struct ktermios *a, const struct ktermios *b)
{
bool iflag_change, cc_change;
@@ -1192,6 +1166,7 @@ static void cp210x_set_flow_control(struct tty_struct *tty,
struct cp210x_flow_ctl flow_ctl;
u32 flow_repl;
u32 ctl_hs;
bool crtscts;
int ret;
/*
@@ -1218,9 +1193,12 @@ static void cp210x_set_flow_control(struct tty_struct *tty,
chars.bXonChar = START_CHAR(tty);
chars.bXoffChar = STOP_CHAR(tty);
ret = cp210x_set_chars(port, &chars);
if (ret)
return;
ret = cp210x_write_reg_block(port, CP210X_SET_CHARS, &chars,
sizeof(chars));
if (ret) {
dev_err(&port->dev, "failed to set special chars: %d\n",
ret);
}
}
mutex_lock(&port_priv->mutex);
@@ -1249,14 +1227,14 @@ static void cp210x_set_flow_control(struct tty_struct *tty,
flow_repl |= CP210X_SERIAL_RTS_FLOW_CTL;
else
flow_repl |= CP210X_SERIAL_RTS_INACTIVE;
port_priv->crtscts = true;
crtscts = true;
} else {
ctl_hs &= ~CP210X_SERIAL_CTS_HANDSHAKE;
if (port_priv->rts)
flow_repl |= CP210X_SERIAL_RTS_ACTIVE;
else
flow_repl |= CP210X_SERIAL_RTS_INACTIVE;
port_priv->crtscts = false;
crtscts = false;
}
if (I_IXOFF(tty)) {
@@ -1279,8 +1257,12 @@ static void cp210x_set_flow_control(struct tty_struct *tty,
flow_ctl.ulControlHandshake = cpu_to_le32(ctl_hs);
flow_ctl.ulFlowReplace = cpu_to_le32(flow_repl);
cp210x_write_reg_block(port, CP210X_SET_FLOW, &flow_ctl,
ret = cp210x_write_reg_block(port, CP210X_SET_FLOW, &flow_ctl,
sizeof(flow_ctl));
if (ret)
goto out_unlock;
port_priv->crtscts = crtscts;
out_unlock:
mutex_unlock(&port_priv->mutex);
}
@@ -2111,12 +2093,26 @@ static int cp210x_get_fw_version(struct usb_serial *serial, u16 value)
return 0;
}
static void cp210x_determine_quirks(struct usb_serial *serial)
static void cp210x_determine_type(struct usb_serial *serial)
{
struct cp210x_serial_private *priv = usb_get_serial_data(serial);
int ret;
ret = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST,
CP210X_GET_PARTNUM, &priv->partnum,
sizeof(priv->partnum));
if (ret < 0) {
dev_warn(&serial->interface->dev,
"querying part number failed\n");
priv->partnum = CP210X_PARTNUM_UNKNOWN;
return;
}
switch (priv->partnum) {
case CP210X_PARTNUM_CP2105:
case CP210X_PARTNUM_CP2108:
cp210x_get_fw_version(serial, CP210X_GET_FW_VER);
break;
case CP210X_PARTNUM_CP2102N_QFN28:
case CP210X_PARTNUM_CP2102N_QFN24:
case CP210X_PARTNUM_CP2102N_QFN20:
@@ -2140,18 +2136,9 @@ static int cp210x_attach(struct usb_serial *serial)
if (!priv)
return -ENOMEM;
result = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST,
CP210X_GET_PARTNUM, &priv->partnum,
sizeof(priv->partnum));
if (result < 0) {
dev_warn(&serial->interface->dev,
"querying part number failed\n");
priv->partnum = CP210X_PARTNUM_UNKNOWN;
}
usb_set_serial_data(serial, priv);
cp210x_determine_quirks(serial);
cp210x_determine_type(serial);
cp210x_init_max_speed(serial);
result = cp210x_gpio_init(serial);

View File

@@ -1199,9 +1199,9 @@ MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
module_param(stats, bool, S_IRUGO | S_IWUSR);
module_param(stats, bool, 0644);
MODULE_PARM_DESC(stats, "Enable statistics or not");
module_param(interval, int, S_IRUGO | S_IWUSR);
module_param(interval, int, 0644);
MODULE_PARM_DESC(interval, "Overrides interrupt interval");
module_param(unstable_bauds, bool, S_IRUGO | S_IWUSR);
module_param(unstable_bauds, bool, 0644);
MODULE_PARM_DESC(unstable_bauds, "Allow unstable baud rates");

View File

@@ -2938,5 +2938,5 @@ MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
module_param(ndi_latency_timer, int, S_IRUGO | S_IWUSR);
module_param(ndi_latency_timer, int, 0644);
MODULE_PARM_DESC(ndi_latency_timer, "NDI device latency timer override");

View File

@@ -1444,5 +1444,5 @@ MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
module_param(initial_mode, int, S_IRUGO);
module_param(initial_mode, int, 0444);
MODULE_PARM_DESC(initial_mode, "Initial mode");

View File

@@ -389,39 +389,6 @@ static void update_edgeport_E2PROM(struct edgeport_serial *edge_serial)
release_firmware(fw);
}
#if 0
/************************************************************************
*
* Get string descriptor from device
*
************************************************************************/
static int get_string_desc(struct usb_device *dev, int Id,
struct usb_string_descriptor **pRetDesc)
{
struct usb_string_descriptor StringDesc;
struct usb_string_descriptor *pStringDesc;
dev_dbg(&dev->dev, "%s - USB String ID = %d\n", __func__, Id);
if (!usb_get_descriptor(dev, USB_DT_STRING, Id, &StringDesc,
sizeof(StringDesc)))
return 0;
pStringDesc = kmalloc(StringDesc.bLength, GFP_KERNEL);
if (!pStringDesc)
return -1;
if (!usb_get_descriptor(dev, USB_DT_STRING, Id, pStringDesc,
StringDesc.bLength)) {
kfree(pStringDesc);
return -1;
}
*pRetDesc = pStringDesc;
return 0;
}
#endif
static void dump_product_info(struct edgeport_serial *edge_serial,
struct edgeport_product_info *product_info)
{

View File

@@ -2746,9 +2746,9 @@ MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
MODULE_FIRMWARE("edgeport/down3.bin");
module_param(ignore_cpu_rev, bool, S_IRUGO | S_IWUSR);
module_param(ignore_cpu_rev, bool, 0644);
MODULE_PARM_DESC(ignore_cpu_rev,
"Ignore the cpu revision when connecting to a device");
module_param(default_uart_mode, int, S_IRUGO | S_IWUSR);
module_param(default_uart_mode, int, 0644);
MODULE_PARM_DESC(default_uart_mode, "Default uart_mode, 0=RS232, ...");

View File

@@ -599,10 +599,10 @@ MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
module_param(connect_retries, int, S_IRUGO|S_IWUSR);
module_param(connect_retries, int, 0644);
MODULE_PARM_DESC(connect_retries,
"Maximum number of connect retries (one second each)");
module_param(initial_wait, int, S_IRUGO|S_IWUSR);
module_param(initial_wait, int, 0644);
MODULE_PARM_DESC(initial_wait,
"Time to wait before attempting a connection (in seconds)");

View File

@@ -1188,20 +1188,20 @@ MODULE_AUTHOR("Alain Degreffe eczema@ecze.com");
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
module_param(xmas, bool, S_IRUGO | S_IWUSR);
module_param(xmas, bool, 0644);
MODULE_PARM_DESC(xmas, "Xmas colors enabled or not");
module_param(boost, int, S_IRUGO | S_IWUSR);
module_param(boost, int, 0644);
MODULE_PARM_DESC(boost, "Card overclock boost (in percent 100-500)");
module_param(clockmode, int, S_IRUGO | S_IWUSR);
module_param(clockmode, int, 0644);
MODULE_PARM_DESC(clockmode, "Card clock mode (1=3.579 MHz, 2=3.680 MHz, "
"3=6 Mhz)");
module_param(cdmode, int, S_IRUGO | S_IWUSR);
module_param(cdmode, int, 0644);
MODULE_PARM_DESC(cdmode, "Card detect mode (0=none, 1=CD, 2=!CD, 3=DSR, "
"4=!DSR, 5=CTS, 6=!CTS, 7=RING, 8=!RING)");
module_param(vcc_default, int, S_IRUGO | S_IWUSR);
module_param(vcc_default, int, 0644);
MODULE_PARM_DESC(vcc_default, "Set default VCC (either 3 for 3.3V or 5 "
"for 5V). Default to 5.");

View File

@@ -433,6 +433,7 @@ static int pl2303_detect_type(struct usb_serial *serial)
switch (bcdDevice) {
case 0x100:
case 0x305:
case 0x405:
/*
* Assume it's an HXN-type if the device doesn't
* support the old read request value.

View File

@@ -1056,5 +1056,5 @@ MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL v2");
module_param(nmea, bool, S_IRUGO | S_IWUSR);
module_param(nmea, bool, 0644);
MODULE_PARM_DESC(nmea, "NMEA streaming");

View File

@@ -137,6 +137,7 @@ menu "DOS/FAT/EXFAT/NT Filesystems"
source "fs/fat/Kconfig"
source "fs/exfat/Kconfig"
source "fs/ntfs/Kconfig"
source "fs/ntfs3/Kconfig"
endmenu
endif # BLOCK

View File

@@ -101,6 +101,7 @@ obj-$(CONFIG_CIFS) += cifs/
obj-$(CONFIG_SMB_SERVER) += ksmbd/
obj-$(CONFIG_HPFS_FS) += hpfs/
obj-$(CONFIG_NTFS_FS) += ntfs/
obj-$(CONFIG_NTFS3_FS) += ntfs3/
obj-$(CONFIG_UFS_FS) += ufs/
obj-$(CONFIG_EFS_FS) += efs/
obj-$(CONFIG_JFFS2_FS) += jffs2/

View File

@@ -221,8 +221,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
}
error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE,
fd_offset);
if (error != N_TXTADDR(ex))
@@ -230,7 +229,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
MAP_FIXED | MAP_PRIVATE,
fd_offset + ex.a_text);
if (error != N_DATADDR(ex))
return error;
@@ -309,7 +308,7 @@ static int load_aout_library(struct file *file)
/* Now use mmap to map the library into memory. */
error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
MAP_FIXED | MAP_PRIVATE,
N_TXTOFF(ex));
retval = error;
if (error != start_addr)

View File

@@ -622,7 +622,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
eppnt = interp_elf_phdata;
for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
if (eppnt->p_type == PT_LOAD) {
int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
int elf_type = MAP_PRIVATE;
int elf_prot = make_prot(eppnt->p_flags, arch_state,
true, true);
unsigned long vaddr = 0;
@@ -1070,7 +1070,7 @@ out_free_interp:
elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
!!interpreter, false);
elf_flags = MAP_PRIVATE | MAP_DENYWRITE;
elf_flags = MAP_PRIVATE;
vaddr = elf_ppnt->p_vaddr;
/*
@@ -1384,7 +1384,7 @@ static int load_elf_library(struct file *file)
(eppnt->p_filesz +
ELF_PAGEOFFSET(eppnt->p_vaddr)),
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE,
MAP_FIXED_NOREPLACE | MAP_PRIVATE,
(eppnt->p_offset -
ELF_PAGEOFFSET(eppnt->p_vaddr)));
if (error != ELF_PAGESTART(eppnt->p_vaddr))

View File

@@ -1041,7 +1041,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
if (phdr->p_flags & PF_W) prot |= PROT_WRITE;
if (phdr->p_flags & PF_X) prot |= PROT_EXEC;
flags = MAP_PRIVATE | MAP_DENYWRITE;
flags = MAP_PRIVATE;
maddr = 0;
switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) {

View File

@@ -1272,7 +1272,9 @@ int begin_new_exec(struct linux_binprm * bprm)
* not visible until then. This also enables the update
* to be lockless.
*/
set_mm_exe_file(bprm->mm, bprm->file);
retval = set_mm_exe_file(bprm->mm, bprm->file);
if (retval)
goto out;
/* If the binary is not readable then enforce mm->dumpable=0 */
would_dump(bprm, bprm->file);

View File

@@ -105,6 +105,13 @@ config F2FS_FS_LZO
help
Support LZO compress algorithm, if unsure, say Y.
config F2FS_FS_LZORLE
bool "LZO-RLE compression support"
depends on F2FS_FS_LZO
default y
help
Support LZO-RLE compress algorithm, if unsure, say Y.
config F2FS_FS_LZ4
bool "LZ4 compression support"
depends on F2FS_FS_COMPRESSION
@@ -114,7 +121,6 @@ config F2FS_FS_LZ4
config F2FS_FS_LZ4HC
bool "LZ4HC compression support"
depends on F2FS_FS_COMPRESSION
depends on F2FS_FS_LZ4
default y
help
@@ -128,10 +134,11 @@ config F2FS_FS_ZSTD
help
Support ZSTD compress algorithm, if unsure, say Y.
config F2FS_FS_LZORLE
bool "LZO-RLE compression support"
depends on F2FS_FS_COMPRESSION
depends on F2FS_FS_LZO
default y
help
Support LZO-RLE compress algorithm, if unsure, say Y.
config F2FS_IOSTAT
bool "F2FS IO statistics information"
depends on F2FS_FS
default y
help
Support getting IO statistics through sysfs and printing out periodic
IO statistics tracepoint events. You have to turn on "iostat_enable"
sysfs node to enable this feature.

View File

@@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_FS_VERITY) += verity.o
f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o
f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o

View File

@@ -18,6 +18,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
@@ -465,16 +466,29 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
unsigned int devidx, int type)
{
struct inode_management *im = &sbi->im[type];
struct ino_entry *e, *tmp;
struct ino_entry *e = NULL, *new = NULL;
tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
if (type == FLUSH_INO) {
rcu_read_lock();
e = radix_tree_lookup(&im->ino_root, ino);
rcu_read_unlock();
}
retry:
if (!e)
new = f2fs_kmem_cache_alloc(ino_entry_slab,
GFP_NOFS, true, NULL);
radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
spin_lock(&im->ino_lock);
e = radix_tree_lookup(&im->ino_root, ino);
if (!e) {
e = tmp;
if (!new) {
spin_unlock(&im->ino_lock);
goto retry;
}
e = new;
if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
f2fs_bug_on(sbi, 1);
@@ -492,8 +506,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
spin_unlock(&im->ino_lock);
radix_tree_preload_end();
if (e != tmp)
kmem_cache_free(ino_entry_slab, tmp);
if (new && e != new)
kmem_cache_free(ino_entry_slab, new);
}
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -1289,12 +1303,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long flags;
spin_lock_irqsave(&sbi->cp_lock, flags);
if (cpc->reason & CP_UMOUNT) {
if (le32_to_cpu(ckpt->cp_pack_total_block_count) >
sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) {
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Disable nat_bits due to no space");
} else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
f2fs_nat_bitmap_enabled(sbi)) {
f2fs_enable_nat_bits(sbi);
set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Rebuild and enable nat_bits");
}
}
if ((cpc->reason & CP_UMOUNT) &&
le32_to_cpu(ckpt->cp_pack_total_block_count) >
sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
disable_nat_bits(sbi, false);
spin_lock_irqsave(&sbi->cp_lock, flags);
if (cpc->reason & CP_TRIMMED)
__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
@@ -1480,7 +1502,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_next_addr(sbi);
/* write nat bits */
if (enabled_nat_bits(sbi, cpc)) {
if ((cpc->reason & CP_UMOUNT) &&
is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
__u64 cp_ver = cur_cp_version(ckpt);
block_t blk;
@@ -1639,8 +1662,11 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* write cached NAT/SIT entries to NAT/SIT area */
err = f2fs_flush_nat_entries(sbi, cpc);
if (err)
if (err) {
f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err);
f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
goto stop;
}
f2fs_flush_sit_entries(sbi, cpc);
@@ -1648,10 +1674,13 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_save_inmem_curseg(sbi);
err = do_checkpoint(sbi, cpc);
if (err)
if (err) {
f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err);
f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
f2fs_release_discard_addrs(sbi);
else
} else {
f2fs_clear_prefree_segments(sbi, cpc);
}
f2fs_restore_inmem_curseg(sbi);
stop:

View File

@@ -28,7 +28,8 @@ static void *page_array_alloc(struct inode *inode, int nr)
unsigned int size = sizeof(struct page *) * nr;
if (likely(size <= sbi->page_array_slab_size))
return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS);
return f2fs_kmem_cache_alloc(sbi->page_array_slab,
GFP_F2FS_ZERO, false, F2FS_I_SB(inode));
return f2fs_kzalloc(sbi, size, GFP_NOFS);
}
@@ -898,6 +899,54 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc)
return false;
}
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
bool compressed = dn->data_blkaddr == COMPRESS_ADDR;
int cluster_end = 0;
int i;
char *reason = "";
if (!compressed)
return false;
/* [..., COMPR_ADDR, ...] */
if (dn->ofs_in_node % cluster_size) {
reason = "[*|C|*|*]";
goto out;
}
for (i = 1; i < cluster_size; i++) {
block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + i);
/* [COMPR_ADDR, ..., COMPR_ADDR] */
if (blkaddr == COMPRESS_ADDR) {
reason = "[C|*|C|*]";
goto out;
}
if (compressed) {
if (!__is_valid_data_blkaddr(blkaddr)) {
if (!cluster_end)
cluster_end = i;
continue;
}
/* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */
if (cluster_end) {
reason = "[C|N|N|V]";
goto out;
}
}
}
return false;
out:
f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s",
dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason);
set_sbi_flag(sbi, SBI_NEED_FSCK);
return true;
}
static int __f2fs_cluster_blocks(struct inode *inode,
unsigned int cluster_idx, bool compr)
{
@@ -915,6 +964,11 @@ static int __f2fs_cluster_blocks(struct inode *inode,
goto fail;
}
if (f2fs_sanity_check_cluster(&dn)) {
ret = -EFSCORRUPTED;
goto fail;
}
if (dn.data_blkaddr == COMPRESS_ADDR) {
int i;
@@ -1228,7 +1282,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
fio.version = ni.version;
cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS);
cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi);
if (!cic)
goto out_put_dnode;
@@ -1340,12 +1394,6 @@ out_destroy_crypt:
for (--i; i >= 0; i--)
fscrypt_finalize_bounce_page(&cc->cpages[i]);
for (i = 0; i < cc->nr_cpages; i++) {
if (!cc->cpages[i])
continue;
f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
out_put_cic:
kmem_cache_free(cic_entry_slab, cic);
out_put_dnode:
@@ -1356,6 +1404,12 @@ out_unlock_op:
else
f2fs_unlock_op(sbi);
out_free:
for (i = 0; i < cc->nr_cpages; i++) {
if (!cc->cpages[i])
continue;
f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
return -EAGAIN;
@@ -1506,7 +1560,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
pgoff_t start_idx = start_idx_of_cluster(cc);
int i;
dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS);
dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO,
false, F2FS_I_SB(cc->inode));
if (!dic)
return ERR_PTR(-ENOMEM);
@@ -1666,6 +1721,30 @@ void f2fs_put_page_dic(struct page *page)
f2fs_put_dic(dic);
}
/*
* check whether cluster blocks are contiguous, and add extent cache entry
* only if cluster blocks are logically and physically contiguous.
*/
unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
{
bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR;
int i = compressed ? 1 : 0;
block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + i);
for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) {
block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
break;
if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr)
return 0;
}
return compressed ? i - 1 : i;
}
const struct address_space_operations f2fs_compress_aops = {
.releasepage = f2fs_release_page,
.invalidatepage = f2fs_invalidate_page,

fs/f2fs/data.c

@@ -25,6 +25,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#include <trace/events/android_fs.h>
@@ -117,6 +118,7 @@ struct bio_post_read_ctx {
struct f2fs_sb_info *sbi;
struct work_struct work;
unsigned int enabled_steps;
block_t fs_blkaddr;
};
static void f2fs_finish_read_bio(struct bio *bio)
@@ -229,7 +231,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
struct bio_vec *bv;
struct bvec_iter_all iter_all;
bool all_compressed = true;
block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector);
block_t blkaddr = ctx->fs_blkaddr;
bio_for_each_segment_all(bv, ctx->bio, iter_all) {
struct page *page = bv->bv_page;
@@ -270,7 +272,10 @@ static void f2fs_post_read_work(struct work_struct *work)
static void f2fs_read_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
struct bio_post_read_ctx *ctx = bio->bi_private;
struct bio_post_read_ctx *ctx;
iostat_update_and_unbind_ctx(bio, 0);
ctx = bio->bi_private;
if (time_to_inject(sbi, FAULT_READ_IO)) {
f2fs_show_injection_info(sbi, FAULT_READ_IO);
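Because iostat now wraps bio->bi_private, the end_io path has to unbind the iostat context before it can see its original private data; that is why ctx is read only after iostat_update_and_unbind_ctx(). A simplified model of the wrap/unwrap (type and function names here are hypothetical, not the kernel's):

struct post_read_ctx;			/* original payload, opaque here */

struct iostat_ctx_model {
	unsigned long submit_ts;	/* stamped at submit time */
	struct post_read_ctx *orig;	/* saved original bi_private */
};

/* at submit: the bio's private pointer becomes the iostat wrapper */
static void *bind_ctx(struct iostat_ctx_model *ic, struct post_read_ctx *orig)
{
	ic->orig = orig;
	return ic;
}

/* at completion: account latency, then hand back the original pointer */
static struct post_read_ctx *unbind_ctx(void *bi_private)
{
	struct iostat_ctx_model *ic = bi_private;

	/* latency accounting would use ic->submit_ts here */
	return ic->orig;
}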
@@ -292,10 +297,13 @@ static void f2fs_read_end_io(struct bio *bio)
static void f2fs_write_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = bio->bi_private;
struct f2fs_sb_info *sbi;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
iostat_update_and_unbind_ctx(bio, 1);
sbi = bio->bi_private;
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
@@ -399,6 +407,8 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
fio->type, fio->temp);
}
iostat_alloc_and_bind_ctx(sbi, bio, NULL);
if (fio->io_wbc)
wbc_init_bio(fio->io_wbc, bio);
@@ -484,6 +494,8 @@ submit_io:
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
else
trace_f2fs_submit_write_bio(sbi->sb, type, bio);
iostat_update_submit_ctx(bio, type);
submit_bio(bio);
}
@@ -728,7 +740,7 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
struct bio_entry *be;
be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS);
be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL);
be->bio = bio;
bio_get(bio);
@@ -975,7 +987,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
struct bio_post_read_ctx *ctx;
struct bio_post_read_ctx *ctx = NULL;
unsigned int post_read_steps = 0;
bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
@@ -1008,8 +1020,10 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
ctx->bio = bio;
ctx->sbi = sbi;
ctx->enabled_steps = post_read_steps;
ctx->fs_blkaddr = blkaddr;
bio->bi_private = ctx;
}
iostat_alloc_and_bind_ctx(sbi, bio, ctx);
return bio;
}
@@ -1138,7 +1152,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
{
struct extent_info ei = {0, 0, 0};
struct extent_info ei = {0, };
struct inode *inode = dn->inode;
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
@@ -1155,7 +1169,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
struct extent_info ei = {0,0,0};
struct extent_info ei = {0, };
int err;
page = f2fs_grab_cache_page(mapping, index, for_write);
@@ -1453,7 +1467,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int err = 0, ofs = 1;
unsigned int ofs_in_node, last_ofs_in_node;
blkcnt_t prealloc;
struct extent_info ei = {0,0,0};
struct extent_info ei = {0, };
block_t blkaddr;
unsigned int start_pgofs;
@@ -1495,7 +1509,21 @@ next_dnode:
if (err) {
if (flag == F2FS_GET_BLOCK_BMAP)
map->m_pblk = 0;
if (err == -ENOENT) {
/*
* There is one exceptional case: read_node_page()
* may return -ENOENT because the filesystem has been
* shut down or hit cp_error, so force-convert the
* error number to EIO for that case.
*/
if (map->m_may_create &&
(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
f2fs_cp_error(sbi))) {
err = -EIO;
goto unlock_out;
}
err = 0;
if (map->m_next_pgofs)
*map->m_next_pgofs =
@@ -1555,6 +1583,13 @@ next_block:
map->m_flags |= F2FS_MAP_NEW;
blkaddr = dn.data_blkaddr;
} else {
if (f2fs_compressed_file(inode) &&
f2fs_sanity_check_cluster(&dn) &&
(flag != F2FS_GET_BLOCK_FIEMAP ||
IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
err = -EFSCORRUPTED;
goto sync_out;
}
if (flag == F2FS_GET_BLOCK_BMAP) {
map->m_pblk = 0;
goto sync_out;
@@ -1848,8 +1883,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
int ret = 0;
bool compr_cluster = false;
bool compr_cluster = false, compr_appended;
unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
unsigned int count_in_cluster = 0;
loff_t maxbytes;
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
@@ -1897,15 +1933,17 @@ next:
map.m_next_pgofs = &next_pgofs;
map.m_seg_type = NO_CHECK_TYPE;
if (compr_cluster)
map.m_len = cluster_size - 1;
if (compr_cluster) {
map.m_lblk += 1;
map.m_len = cluster_size - count_in_cluster;
}
ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
if (ret)
goto out;
/* HOLE */
if (!(map.m_flags & F2FS_MAP_FLAGS)) {
if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) {
start_blk = next_pgofs;
if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode,
@@ -1915,6 +1953,14 @@ next:
flags |= FIEMAP_EXTENT_LAST;
}
compr_appended = false;
/* In a case of compressed cluster, append this to the last extent */
if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) ||
!(map.m_flags & F2FS_MAP_FLAGS))) {
compr_appended = true;
goto skip_fill;
}
if (size) {
flags |= FIEMAP_EXTENT_MERGED;
if (IS_ENCRYPTED(inode))
@@ -1931,38 +1977,36 @@ next:
if (start_blk > last_blk)
goto out;
if (compr_cluster) {
compr_cluster = false;
logical = blks_to_bytes(inode, start_blk - 1);
phys = blks_to_bytes(inode, map.m_pblk);
size = blks_to_bytes(inode, cluster_size);
flags |= FIEMAP_EXTENT_ENCODED;
start_blk += cluster_size - 1;
if (start_blk > last_blk)
goto out;
goto prep_next;
}
skip_fill:
if (map.m_pblk == COMPRESS_ADDR) {
compr_cluster = true;
start_blk++;
goto prep_next;
}
count_in_cluster = 1;
} else if (compr_appended) {
unsigned int appended_blks = cluster_size -
count_in_cluster + 1;
size += blks_to_bytes(inode, appended_blks);
start_blk += appended_blks;
compr_cluster = false;
} else {
logical = blks_to_bytes(inode, start_blk);
phys = blks_to_bytes(inode, map.m_pblk);
phys = __is_valid_data_blkaddr(map.m_pblk) ?
blks_to_bytes(inode, map.m_pblk) : 0;
size = blks_to_bytes(inode, map.m_len);
flags = 0;
if (map.m_flags & F2FS_MAP_UNWRITTEN)
if (compr_cluster) {
flags = FIEMAP_EXTENT_ENCODED;
count_in_cluster += map.m_len;
if (count_in_cluster == cluster_size) {
compr_cluster = false;
size += blks_to_bytes(inode, 1);
}
} else if (map.m_flags & F2FS_MAP_UNWRITTEN) {
flags = FIEMAP_EXTENT_UNWRITTEN;
}
start_blk += bytes_to_blks(inode, size);
}
prep_next:
cond_resched();
@@ -2120,6 +2164,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
sector_t last_block_in_file;
const unsigned blocksize = blks_to_bytes(inode, 1);
struct decompress_io_ctx *dic = NULL;
struct extent_info ei = {0, };
bool from_dnode = true;
int i;
int ret = 0;
@@ -2142,6 +2188,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
continue;
}
unlock_page(page);
if (for_write)
put_page(page);
cc->rpages[i] = NULL;
cc->nr_rpages--;
}
@@ -2150,6 +2198,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
if (f2fs_cluster_is_empty(cc))
goto out;
if (f2fs_lookup_extent_cache(inode, start_idx, &ei))
from_dnode = false;
if (!from_dnode)
goto skip_reading_dnode;
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
if (ret)
@@ -2157,11 +2211,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR);
skip_reading_dnode:
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i);
blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i) :
ei.blk + i - 1;
if (!__is_valid_data_blkaddr(blkaddr))
break;
@@ -2171,6 +2227,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
goto out_put_dnode;
}
cc->nr_cpages++;
if (!from_dnode && i >= ei.c_len)
break;
}
/* nothing to decompress */
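The from_dnode shortcut above avoids reading the dnode page entirely when a cached extent covers the cluster; compressed block addresses are then derived from the extent, and the loop stops once the index reaches the extent's physical length. A sketch with a simplified model of f2fs's extent_info (field names assumed):

struct extent_info_model {
	unsigned int fofs;	/* logical start, in blocks */
	unsigned int blk;	/* physical start block */
	unsigned int c_len;	/* physical length of the compressed run */
};

/* i is the 1-based slot inside the cluster (slot 0 is COMPRESS_ADDR) */
static unsigned int cpage_blkaddr(const struct extent_info_model *ei, int i)
{
	return ei->blk + i - 1;
}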
@@ -2190,8 +2249,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
block_t blkaddr;
struct bio_post_read_ctx *ctx;
blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1) :
ei.blk + i;
f2fs_wait_on_block_writeback(inode, blkaddr);
@@ -2225,7 +2285,7 @@ submit_and_realloc:
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
goto submit_and_realloc;
ctx = bio->bi_private;
ctx = get_post_read_ctx(bio);
ctx->enabled_steps |= STEP_DECOMPRESS;
refcount_inc(&dic->refcnt);
@@ -2236,12 +2296,14 @@ submit_and_realloc:
*last_block_in_bio = blkaddr;
}
if (from_dnode)
f2fs_put_dnode(&dn);
*bio_ret = bio;
return 0;
out_put_dnode:
if (from_dnode)
f2fs_put_dnode(&dn);
out:
for (i = 0; i < cc->cluster_size; i++) {
@@ -2277,6 +2339,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
.nr_rpages = 0,
.nr_cpages = 0,
};
pgoff_t nc_cluster_idx = NULL_CLUSTER;
#endif
unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
@@ -2309,12 +2372,23 @@ static int f2fs_mpage_readpages(struct inode *inode,
if (ret)
goto set_error_page;
}
if (cc.cluster_idx == NULL_CLUSTER) {
if (nc_cluster_idx ==
page->index >> cc.log_cluster_size) {
goto read_single_page;
}
ret = f2fs_is_compressed_cluster(inode, page->index);
if (ret < 0)
goto set_error_page;
else if (!ret)
else if (!ret) {
nc_cluster_idx =
page->index >> cc.log_cluster_size;
goto read_single_page;
}
nc_cluster_idx = NULL_CLUSTER;
}
ret = f2fs_init_compress_ctx(&cc);
if (ret)
goto set_error_page;
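A compact model of the nc_cluster_idx memoization added here: consecutive pages that fall inside the last cluster known to be non-compressed skip the cluster lookup entirely (the metadata check below is a stand-in for the real one):

#include <stdio.h>

#define NULL_CLUSTER_M ((unsigned long)-1)

static int is_compressed_cluster_m(unsigned long idx)
{
	return idx % 3 == 0;	/* stand-in for the real metadata check */
}

int main(void)
{
	unsigned long nc_cluster_idx = NULL_CLUSTER_M;
	int log_cluster_size = 2, lookups = 0;

	for (unsigned long pgidx = 0; pgidx < 16; pgidx++) {
		unsigned long cidx = pgidx >> log_cluster_size;

		if (cidx == nc_cluster_idx)
			continue;		/* memoized: single-page path */
		lookups++;
		if (!is_compressed_cluster_m(cidx))
			nc_cluster_idx = cidx;	/* remember the miss */
		else
			nc_cluster_idx = NULL_CLUSTER_M;
	}
	printf("lookups: %d (instead of 16)\n", lookups);
	return 0;
}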
@@ -2503,6 +2577,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
return true;
if (f2fs_is_atomic_file(inode))
return true;
if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
return true;
/* swap file is migrating in aligned write mode */
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
@@ -2535,7 +2611,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
struct page *page = fio->page;
struct inode *inode = page->mapping->host;
struct dnode_of_data dn;
struct extent_info ei = {0,0,0};
struct extent_info ei = {0, };
struct node_info ni;
bool ipu_force = false;
int err = 0;
@@ -3181,9 +3257,8 @@ static int f2fs_write_data_pages(struct address_space *mapping,
FS_CP_DATA_IO : FS_DATA_IO);
}
static void f2fs_write_failed(struct address_space *mapping, loff_t to)
static void f2fs_write_failed(struct inode *inode, loff_t to)
{
struct inode *inode = mapping->host;
loff_t i_size = i_size_read(inode);
if (IS_NOQUOTA(inode))
@@ -3192,12 +3267,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
if (to > i_size && !f2fs_verity_in_progress(inode)) {
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache(inode, i_size);
f2fs_truncate_blocks(inode, i_size, true);
filemap_invalidate_unlock(mapping);
filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -3211,7 +3286,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
struct dnode_of_data dn;
struct page *ipage;
bool locked = false;
struct extent_info ei = {0,0,0};
struct extent_info ei = {0, };
int err = 0;
int flag;
@@ -3343,6 +3418,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
*fsdata = NULL;
if (len == PAGE_SIZE)
goto repeat;
ret = f2fs_prepare_compress_overwrite(inode, pagep,
index, fsdata);
if (ret < 0) {
@@ -3425,7 +3503,7 @@ repeat:
fail:
f2fs_put_page(page, 1);
f2fs_write_failed(mapping, pos + len);
f2fs_write_failed(inode, pos + len);
if (drop_atomic)
f2fs_drop_inmem_pages_all(sbi, false);
return err;
@@ -3568,7 +3646,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (f2fs_force_buffered_io(inode, iocb, iter))
return 0;
do_opu = allow_outplace_dio(inode, iocb, iter);
do_opu = rw == WRITE && f2fs_lfs_mode(sbi);
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
@@ -3639,7 +3717,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
count - iov_iter_count(iter));
} else if (err < 0) {
f2fs_write_failed(mapping, offset + count);
f2fs_write_failed(inode, offset + count);
}
} else {
if (err > 0)

fs/f2fs/debug.c

@@ -323,11 +323,27 @@ get_cache:
#endif
}
static char *s_flag[] = {
[SBI_IS_DIRTY] = " fs_dirty",
[SBI_IS_CLOSE] = " closing",
[SBI_NEED_FSCK] = " need_fsck",
[SBI_POR_DOING] = " recovering",
[SBI_NEED_SB_WRITE] = " sb_dirty",
[SBI_NEED_CP] = " need_cp",
[SBI_IS_SHUTDOWN] = " shutdown",
[SBI_IS_RECOVERED] = " recovered",
[SBI_CP_DISABLED] = " cp_disabled",
[SBI_CP_DISABLED_QUICK] = " cp_disabled_quick",
[SBI_QUOTA_NEED_FLUSH] = " quota_need_flush",
[SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush",
[SBI_QUOTA_NEED_REPAIR] = " quota_need_repair",
[SBI_IS_RESIZEFS] = " resizefs",
};
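The dump produced from this table is a straightforward walk of the s_flag bit word; a userspace sketch, with a plain loop standing in for the kernel's for_each_set_bit and an abbreviated flag table:

#include <stdio.h>

static const char *s_flag_model[] = {
	" fs_dirty", " closing", " need_fsck", " recovering",
};

int main(void)
{
	unsigned long flags = 0x5;	/* fs_dirty + need_fsck, illustrative */

	printf("[SBI:");
	for (int j = 0; j < 4; j++)
		if (flags & (1UL << j))
			fputs(s_flag_model[j], stdout);
	puts("]");
	return 0;
}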
static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
int i = 0;
int j;
int i = 0, j = 0;
mutex_lock(&f2fs_stat_mutex);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
@@ -338,6 +354,12 @@ static int stat_show(struct seq_file *s, void *v)
f2fs_readonly(si->sbi->sb) ? "RO": "RW",
is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
"Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good"));
if (si->sbi->s_flag) {
seq_puts(s, "[SBI:");
for_each_set_bit(j, &si->sbi->s_flag, 32)
seq_puts(s, s_flag[j]);
seq_puts(s, "]\n");
}
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -450,6 +472,15 @@ static int stat_show(struct seq_file *s, void *v)
si->data_segs, si->bg_data_segs);
seq_printf(s, " - node segments : %d (%d)\n",
si->node_segs, si->bg_node_segs);
seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
"Idle Greedy (%d), Idle AT (%d), "
"Urgent High (%d), Urgent Low (%d)\n",
si->sbi->gc_reclaimed_segs[GC_NORMAL],
si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
@@ -611,7 +642,7 @@ void __init f2fs_create_root_stats(void)
#ifdef CONFIG_DEBUG_FS
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL,
&stat_fops);
#endif
}

fs/f2fs/dir.c

@@ -83,8 +83,8 @@ int f2fs_init_casefolded_name(const struct inode *dir,
struct super_block *sb = dir->i_sb;
if (IS_CASEFOLDED(dir)) {
fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab,
GFP_NOFS);
fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
GFP_NOFS, false, F2FS_SB(sb));
if (!fname->cf_name.name)
return -ENOMEM;
fname->cf_name.len = utf8_casefold(sb->s_encoding,
@@ -1000,6 +1000,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
struct blk_plug plug;
bool readdir_ra = sbi->readdir_ra == 1;
bool found_valid_dirent = false;
int err = 0;
bit_pos = ((unsigned long)ctx->pos % d->max);
@@ -1014,13 +1015,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
de = &d->dentry[bit_pos];
if (de->name_len == 0) {
bit_pos++;
ctx->pos = start_pos + bit_pos;
if (found_valid_dirent || !bit_pos) {
printk_ratelimited(
"%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
KERN_WARNING, sbi->sb->s_id,
le32_to_cpu(de->ino));
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
bit_pos++;
ctx->pos = start_pos + bit_pos;
continue;
}
@@ -1063,6 +1066,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));
ctx->pos = start_pos + bit_pos;
found_valid_dirent = true;
}
out:
if (readdir_ra)

fs/f2fs/extent_cache.c

@@ -239,7 +239,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
{
struct extent_node *en;
en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi);
if (!en)
return NULL;
@@ -292,7 +292,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
mutex_lock(&sbi->extent_tree_lock);
et = radix_tree_lookup(&sbi->extent_tree_root, ino);
if (!et) {
et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
et = f2fs_kmem_cache_alloc(extent_tree_slab,
GFP_NOFS, true, NULL);
f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
memset(et, 0, sizeof(struct extent_tree));
et->ino = ino;
@@ -661,6 +662,47 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
f2fs_mark_inode_dirty_sync(inode, true);
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
void f2fs_update_extent_tree_range_compressed(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int llen,
unsigned int c_len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et = F2FS_I(inode)->extent_tree;
struct extent_node *en = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
bool leftmost = false;
trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen);
/* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */
if (is_inode_flag_set(inode, FI_NO_EXTENT))
return;
write_lock(&et->lock);
en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
(struct rb_entry *)et->cached_en, fofs,
(struct rb_entry **)&prev_en,
(struct rb_entry **)&next_en,
&insert_p, &insert_parent, false,
&leftmost);
if (en)
goto unlock_out;
set_extent_info(&ei, fofs, blkaddr, llen);
ei.c_len = c_len;
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
__insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent, leftmost);
unlock_out:
write_unlock(&et->lock);
}
#endif
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct extent_tree *et, *next;

fs/f2fs/f2fs.h

@@ -43,6 +43,7 @@ enum {
FAULT_KVMALLOC,
FAULT_PAGE_ALLOC,
FAULT_PAGE_GET,
FAULT_ALLOC_BIO, /* obsolete, since bio_alloc() never fails */
FAULT_ALLOC_NID,
FAULT_ORPHAN,
FAULT_BLOCK,
@@ -53,6 +54,7 @@ enum {
FAULT_CHECKPOINT,
FAULT_DISCARD,
FAULT_WRITE_IO,
FAULT_SLAB_ALLOC,
FAULT_MAX,
};
@@ -139,6 +141,11 @@ struct f2fs_mount_info {
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
int discard_unit; /*
* discard command's offset/size should
* be aligned to this unit: block,
* segment or section
*/
struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */
block_t unusable_cap_perc; /* percentage for cap */
block_t unusable_cap; /* Amount of space allowed to be
@@ -542,7 +549,7 @@ enum {
*/
};
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */
/* congestion wait timeout value, default: 20ms */
#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20))
@@ -575,6 +582,9 @@ struct extent_info {
unsigned int fofs; /* start offset in a file */
unsigned int len; /* length of the extent */
u32 blk; /* start block address of the extent */
#ifdef CONFIG_F2FS_FS_COMPRESSION
unsigned int c_len; /* physical extent length of compressed blocks */
#endif
};
struct extent_node {
@@ -793,6 +803,9 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
ei->fofs = fofs;
ei->blk = blk;
ei->len = len;
#ifdef CONFIG_F2FS_FS_COMPRESSION
ei->c_len = 0;
#endif
}
static inline bool __is_discard_mergeable(struct discard_info *back,
@@ -817,6 +830,12 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur,
static inline bool __is_extent_mergeable(struct extent_info *back,
struct extent_info *front)
{
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (back->c_len && back->len != back->c_len)
return false;
if (front->c_len && front->len != front->c_len)
return false;
#endif
return (back->fofs + back->len == front->fofs &&
back->blk + back->len == front->blk);
}
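A trimmed model of the merge rule with the new c_len checks: an extent that tracks a compressed cluster (c_len set) whose logical and physical lengths disagree must never merge, since c_len would otherwise describe only part of the merged run (struct name here is a model, not the kernel type):

#include <stdbool.h>

struct ei_model {
	unsigned int fofs, blk, len, c_len;
};

static bool mergeable(const struct ei_model *back, const struct ei_model *front)
{
	/* a compressed-cluster extent only merges while its logical and
	 * physical lengths still agree */
	if (back->c_len && back->len != back->c_len)
		return false;
	if (front->c_len && front->len != front->c_len)
		return false;
	return back->fofs + back->len == front->fofs &&
	       back->blk + back->len == front->blk;
}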
@@ -1252,6 +1271,7 @@ enum {
GC_IDLE_AT,
GC_URGENT_HIGH,
GC_URGENT_LOW,
MAX_GC_MODE,
};
enum {
@@ -1297,6 +1317,12 @@ enum {
*/
};
enum {
DISCARD_UNIT_BLOCK, /* basic discard unit is block */
DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */
DISCARD_UNIT_SECTION, /* basic discard unit is section */
};
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1686,14 +1712,6 @@ struct f2fs_sb_info {
#endif
spinlock_t stat_lock; /* lock for stat operations */
/* For app/fs IO statistics */
spinlock_t iostat_lock;
unsigned long long rw_iostat[NR_IO_TYPE];
unsigned long long prev_rw_iostat[NR_IO_TYPE];
bool iostat_enable;
unsigned long iostat_next_period;
unsigned int iostat_period_ms;
/* to attach REQ_META|REQ_FUA flags */
unsigned int data_io_flag;
unsigned int node_io_flag;
@@ -1732,6 +1750,12 @@ struct f2fs_sb_info {
struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
/* For reclaimed segs statistics per each GC mode */
unsigned int gc_segment_mode; /* GC state for reclaimed segments */
unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */
unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -1747,6 +1771,20 @@ struct f2fs_sb_info {
unsigned int compress_watermark; /* cache page watermark */
atomic_t compress_page_hit; /* cache hit count */
#endif
#ifdef CONFIG_F2FS_IOSTAT
/* For app/fs IO statistics */
spinlock_t iostat_lock;
unsigned long long rw_iostat[NR_IO_TYPE];
unsigned long long prev_rw_iostat[NR_IO_TYPE];
bool iostat_enable;
unsigned long iostat_next_period;
unsigned int iostat_period_ms;
/* For io latency related statistics info in one iostat period */
spinlock_t iostat_lat_lock;
struct iostat_lat_info *iostat_io_lat;
#endif
};
struct f2fs_private_dio {
@@ -2034,36 +2072,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
{
unsigned long flags;
unsigned char *nat_bits;
/*
* In order to re-enable nat_bits we need to call fsck.f2fs by
* set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost,
* so let's rely on regular fsck or unclean shutdown.
*/
if (lock)
spin_lock_irqsave(&sbi->cp_lock, flags);
__clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
nat_bits = NM_I(sbi)->nat_bits;
NM_I(sbi)->nat_bits = NULL;
if (lock)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
kvfree(nat_bits);
}
static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
struct cp_control *cpc)
{
bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
}
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
down_read(&sbi->cp_rwsem);
@@ -2587,7 +2595,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL);
}
static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep,
gfp_t flags)
{
void *entry;
@@ -2598,6 +2606,20 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
return entry;
}
static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
gfp_t flags, bool nofail, struct f2fs_sb_info *sbi)
{
if (nofail)
return f2fs_kmem_cache_alloc_nofail(cachep, flags);
if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) {
f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC);
return NULL;
}
return kmem_cache_alloc(cachep, flags);
}
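The pattern here is a single allocation entry point that is either no-fail (retry loop) or failable under fault injection. A userspace analogue, with rand() standing in for the kernel's fault-injection rate bookkeeping:

#include <stdlib.h>

/* no-fail path: keep retrying until the allocation succeeds */
static void *alloc_nofail(size_t size)
{
	void *p;

	do {
		p = malloc(size);
	} while (!p);
	return p;
}

/* failable path: a forced NULL return simulates a FAULT_SLAB_ALLOC hit */
static void *alloc_maybe_inject(size_t size, int nofail, int inject_rate)
{
	if (nofail)
		return alloc_nofail(size);
	if (inject_rate && (rand() % inject_rate) == 0)
		return NULL;
	return malloc(size);
}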
static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type)
{
if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
@@ -3210,47 +3232,6 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
sizeof((f2fs_inode)->field)) \
<= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \
#define DEFAULT_IOSTAT_PERIOD_MS 3000
#define MIN_IOSTAT_PERIOD_MS 100
/* maximum period of iostat tracing is 1 day */
#define MAX_IOSTAT_PERIOD_MS 8640000
static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
{
int i;
spin_lock(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
sbi->rw_iostat[i] = 0;
sbi->prev_rw_iostat[i] = 0;
}
spin_unlock(&sbi->iostat_lock);
}
extern void f2fs_record_iostat(struct f2fs_sb_info *sbi);
static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes)
{
if (!sbi->iostat_enable)
return;
spin_lock(&sbi->iostat_lock);
sbi->rw_iostat[type] += io_bytes;
if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
sbi->rw_iostat[APP_BUFFERED_IO] =
sbi->rw_iostat[APP_WRITE_IO] -
sbi->rw_iostat[APP_DIRECT_IO];
if (type == APP_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_BUFFERED_READ_IO] =
sbi->rw_iostat[APP_READ_IO] -
sbi->rw_iostat[APP_DIRECT_READ_IO];
spin_unlock(&sbi->iostat_lock);
f2fs_record_iostat(sbi);
}
#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1)
#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META)
@@ -3417,6 +3398,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
int f2fs_truncate_xattr_node(struct inode *inode);
int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
unsigned int seq_id);
bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi);
int f2fs_remove_inode_page(struct inode *inode);
struct page *f2fs_new_inode_page(struct inode *inode);
struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
@@ -3441,6 +3423,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page);
int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum);
void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi);
int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
int f2fs_build_node_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi);
@@ -3464,6 +3447,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
int f2fs_start_discard_thread(struct f2fs_sb_info *sbi);
void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi);
void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi);
bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi);
@@ -3986,6 +3970,9 @@ void f2fs_destroy_extent_cache(void);
/*
* sysfs.c
*/
#define MIN_RA_MUL 2
#define MAX_RA_MUL 256
int __init f2fs_init_sysfs(void);
void f2fs_exit_sysfs(void);
int f2fs_register_sysfs(struct f2fs_sb_info *sbi);
@@ -4040,18 +4027,23 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed,
block_t blkaddr);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
int f2fs_write_multi_pages(struct compress_ctx *cc,
int *submitted,
struct writeback_control *wbc,
enum iostat_type io_type);
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
void f2fs_update_extent_tree_range_compressed(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int llen,
unsigned int c_len);
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
void f2fs_put_page_dic(struct page *page);
unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
@@ -4106,6 +4098,8 @@ static inline void f2fs_put_page_dic(struct page *page)
{
WARN_ON_ONCE(1);
}
static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; }
static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; }
static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { }
static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
@@ -4121,6 +4115,9 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
nid_t ino) { }
#define inc_compr_inode_stat(inode) do { } while (0)
static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int llen,
unsigned int c_len) { }
#endif
static inline void set_compress_context(struct inode *inode)
@@ -4136,7 +4133,8 @@ static inline void set_compress_context(struct inode *inode)
1 << COMPRESS_CHKSUM : 0;
F2FS_I(inode)->i_cluster_size =
1 << F2FS_I(inode)->i_log_cluster_size;
if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 &&
if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 ||
F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) &&
F2FS_OPTION(sbi).compress_level)
F2FS_I(inode)->i_compress_flag |=
F2FS_OPTION(sbi).compress_level <<
@@ -4304,16 +4302,6 @@ static inline int block_unaligned_IO(struct inode *inode,
return align & blocksize_mask;
}
static inline int allow_outplace_dio(struct inode *inode,
struct kiocb *iocb, struct iov_iter *iter)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int rw = iov_iter_rw(iter);
return (f2fs_lfs_mode(sbi) && (rw == WRITE) &&
!block_unaligned_IO(inode, iocb, iter));
}
static inline bool f2fs_force_buffered_io(struct inode *inode,
struct kiocb *iocb, struct iov_iter *iter)
{
@@ -4372,6 +4360,11 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
return false;
}
static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
{
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
}
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */

fs/f2fs/file.c

@@ -23,6 +23,7 @@
#include <linux/nls.h>
#include <linux/sched/signal.h>
#include <linux/fileattr.h>
#include <linux/fadvise.h>
#include "f2fs.h"
#include "node.h"
@@ -30,6 +31,7 @@
#include "xattr.h"
#include "acl.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#include <uapi/linux/f2fs.h>
@@ -258,8 +260,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
};
unsigned int seq_id = 0;
if (unlikely(f2fs_readonly(inode->i_sb) ||
is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
if (unlikely(f2fs_readonly(inode->i_sb)))
return 0;
trace_f2fs_sync_file_enter(inode);
@@ -273,7 +274,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
ret = file_write_and_wait_range(file, start, end);
clear_inode_flag(inode, FI_NEED_IPU);
if (ret) {
if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
return ret;
}
@@ -298,6 +299,18 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
f2fs_exist_written_data(sbi, ino, UPDATE_INO))
goto flush_out;
goto out;
} else {
/*
* In the OPU case, during fsync(), the node can be persisted
* before the data when the lower device doesn't support write
* barriers, resulting in data corruption after SPO.
* So for strict fsync mode, force atomic write semantics to
* keep the write order between data/node and the last node,
* avoiding potential data corruption.
*/
if (F2FS_OPTION(sbi).fsync_mode ==
FSYNC_MODE_STRICT && !atomic)
atomic = true;
}
go_write:
/*
@@ -737,6 +750,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
return err;
#ifdef CONFIG_F2FS_FS_COMPRESSION
/*
* For a compressed file, direct write is not allowed after its
* compress blocks have been released, but it should be allowed
* again after the file is truncated to zero.
*/
if (f2fs_compressed_file(inode) && !free_from
&& is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
if (from != free_from) {
err = f2fs_truncate_partial_cluster(inode, from, lock);
if (err)
@@ -1082,7 +1103,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
if (pg_start < pg_end) {
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1092,16 +1112,15 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_end = (loff_t)pg_end << PAGE_SHIFT;
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
filemap_invalidate_lock(inode->i_mapping);
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
truncate_pagecache_range(inode, blk_start, blk_end - 1);
f2fs_lock_op(sbi);
ret = f2fs_truncate_hole(inode, pg_start, pg_end);
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -3473,8 +3492,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
released_blocks += ret;
}
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
out:
inode_unlock(inode);
@@ -3626,8 +3645,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
reserved_blocks += ret;
}
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_unlock(inode->i_mapping);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
@@ -4290,7 +4309,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* back to buffered IO.
*/
if (!f2fs_force_buffered_io(inode, iocb, from) &&
allow_outplace_dio(inode, iocb, from))
f2fs_lfs_mode(F2FS_I_SB(inode)))
goto write;
}
preallocated = true;
@@ -4330,6 +4349,34 @@ out:
return ret;
}
static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
int advice)
{
struct inode *inode;
struct address_space *mapping;
struct backing_dev_info *bdi;
if (advice == POSIX_FADV_SEQUENTIAL) {
inode = file_inode(filp);
if (S_ISFIFO(inode->i_mode))
return -ESPIPE;
mapping = filp->f_mapping;
if (!mapping || len < 0)
return -EINVAL;
bdi = inode_to_bdi(mapping->host);
filp->f_ra.ra_pages = bdi->ra_pages *
F2FS_I_SB(inode)->seq_file_ra_mul;
spin_lock(&filp->f_lock);
filp->f_mode &= ~FMODE_RANDOM;
spin_unlock(&filp->f_lock);
return 0;
}
return generic_fadvise(filp, offset, len, advice);
}
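From an application's point of view nothing f2fs-specific is needed to hit this path: a plain POSIX_FADV_SEQUENTIAL advise scales the per-file readahead window by seq_file_ra_mul (bounded by MIN_RA_MUL/MAX_RA_MUL above). A sketch, with an illustrative path:

#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/mnt/f2fs/big.dat", O_RDONLY);	/* illustrative path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* len == 0 means "from offset to end of file" */
	if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL) != 0)
		return 1;
	/* subsequent sequential reads use ra_pages * seq_file_ra_mul */
	return 0;
}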
#ifdef CONFIG_COMPAT
struct compat_f2fs_gc_range {
u32 sync;
@@ -4458,4 +4505,5 @@ const struct file_operations f2fs_file_operations = {
#endif
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fadvise = f2fs_file_fadvise,
};

fs/f2fs/gc.c

@@ -19,6 +19,7 @@
#include "node.h"
#include "segment.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
static struct kmem_cache *victim_entry_slab;
@@ -371,7 +372,8 @@ static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi,
struct atgc_management *am = &sbi->am;
struct victim_entry *ve;
ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS);
ve = f2fs_kmem_cache_alloc(victim_entry_slab,
GFP_NOFS, true, NULL);
ve->mtime = mtime;
ve->segno = segno;
@@ -849,7 +851,8 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
iput(inode);
return;
}
new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab,
GFP_NOFS, true, NULL);
new_ie->inode = inode;
f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
@@ -1497,8 +1500,10 @@ next_step:
int err;
if (S_ISREG(inode->i_mode)) {
if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
sbi->skipped_gc_rwsem++;
continue;
}
if (!down_write_trylock(
&fi->i_gc_rwsem[WRITE])) {
sbi->skipped_gc_rwsem++;
@@ -1646,6 +1651,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
force_migrate);
stat_inc_seg_count(sbi, type, gc_type);
sbi->gc_reclaimed_segs[sbi->gc_mode]++;
migrated++;
freed:
@@ -1747,7 +1753,7 @@ gc_more:
round++;
}
if (gc_type == FG_GC && seg_freed)
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
if (sync)
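The new counter is bumped once per migrated segment, indexed by whichever GC mode was active at the time; a minimal model of that bookkeeping:

enum gc_mode_model { GC_NORMAL_M, GC_IDLE_CB_M, GC_IDLE_GREEDY_M,
		     GC_IDLE_AT_M, GC_URGENT_HIGH_M, GC_URGENT_LOW_M,
		     MAX_GC_MODE_M };

struct gc_stats_model {
	enum gc_mode_model gc_mode;		/* currently active mode */
	unsigned int reclaimed[MAX_GC_MODE_M];	/* per-mode totals */
};

/* called once per successfully migrated segment */
static void segment_reclaimed(struct gc_stats_model *st)
{
	st->reclaimed[st->gc_mode]++;
}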

fs/f2fs/iostat.c (new file, 287 lines)

@@ -0,0 +1,287 @@
// SPDX-License-Identifier: GPL-2.0
/*
* f2fs iostat support
*
* Copyright 2021 Google LLC
* Author: Daeho Jeong <daehojeong@google.com>
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/seq_file.h>
#include "f2fs.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#define NUM_PREALLOC_IOSTAT_CTXS 128
static struct kmem_cache *bio_iostat_ctx_cache;
static mempool_t *bio_iostat_ctx_pool;
int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
time64_t now = ktime_get_real_seconds();
if (!sbi->iostat_enable)
return 0;
seq_printf(seq, "time: %-16llu\n", now);
/* print app write IOs */
seq_puts(seq, "[WRITE]\n");
seq_printf(seq, "app buffered: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_IO]);
seq_printf(seq, "app direct: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_IO]);
/* print fs write IOs */
seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_IO]);
seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GC_DATA_IO]);
seq_printf(seq, "fs gc node: %-16llu\n",
sbi->rw_iostat[FS_GC_NODE_IO]);
seq_printf(seq, "fs cp data: %-16llu\n",
sbi->rw_iostat[FS_CP_DATA_IO]);
seq_printf(seq, "fs cp node: %-16llu\n",
sbi->rw_iostat[FS_CP_NODE_IO]);
seq_printf(seq, "fs cp meta: %-16llu\n",
sbi->rw_iostat[FS_CP_META_IO]);
/* print app read IOs */
seq_puts(seq, "[READ]\n");
seq_printf(seq, "app buffered: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_READ_IO]);
seq_printf(seq, "app direct: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_READ_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_READ_IO]);
/* print fs read IOs */
seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_READ_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GDATA_READ_IO]);
seq_printf(seq, "fs compr_data: %-16llu\n",
sbi->rw_iostat[FS_CDATA_READ_IO]);
seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_READ_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_READ_IO]);
/* print other IOs */
seq_puts(seq, "[OTHER]\n");
seq_printf(seq, "fs discard: %-16llu\n",
sbi->rw_iostat[FS_DISCARD]);
return 0;
}
static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
{
int io, idx = 0;
unsigned int cnt;
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
spin_lock_irq(&sbi->iostat_lat_lock);
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
for (io = 0; io < NR_PAGE_TYPE; io++) {
cnt = io_lat->bio_cnt[idx][io];
iostat_lat[idx][io].peak_lat =
jiffies_to_msecs(io_lat->peak_lat[idx][io]);
iostat_lat[idx][io].cnt = cnt;
iostat_lat[idx][io].avg_lat = cnt ?
jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0;
io_lat->sum_lat[idx][io] = 0;
io_lat->peak_lat[idx][io] = 0;
io_lat->bio_cnt[idx][io] = 0;
}
}
spin_unlock_irq(&sbi->iostat_lat_lock);
trace_f2fs_iostat_latency(sbi, iostat_lat);
}
static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
{
unsigned long long iostat_diff[NR_IO_TYPE];
int i;
if (time_is_after_jiffies(sbi->iostat_next_period))
return;
/* Need double check under the lock */
spin_lock(&sbi->iostat_lock);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
spin_unlock(&sbi->iostat_lock);
return;
}
sbi->iostat_next_period = jiffies +
msecs_to_jiffies(sbi->iostat_period_ms);
for (i = 0; i < NR_IO_TYPE; i++) {
iostat_diff[i] = sbi->rw_iostat[i] -
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
spin_unlock(&sbi->iostat_lock);
trace_f2fs_iostat(sbi, iostat_diff);
__record_iostat_latency(sbi);
}
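f2fs_record_iostat() gates the periodic trace with a double-checked test: a cheap unlocked comparison first, then a re-check under the lock so only one caller per period emits the snapshot. The same shape in a standalone pthread sketch:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static pthread_mutex_t period_lock = PTHREAD_MUTEX_INITIALIZER;
static time_t next_period;

static bool try_begin_period(unsigned int period_secs)
{
	time_t now = time(NULL);

	if (now < next_period)		/* cheap, racy fast path */
		return false;
	pthread_mutex_lock(&period_lock);
	if (now < next_period) {	/* re-check under the lock */
		pthread_mutex_unlock(&period_lock);
		return false;
	}
	next_period = now + period_secs;
	pthread_mutex_unlock(&period_lock);
	return true;			/* this caller owns the period */
}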
void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
{
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int i;
spin_lock(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
sbi->rw_iostat[i] = 0;
sbi->prev_rw_iostat[i] = 0;
}
spin_unlock(&sbi->iostat_lock);
spin_lock_irq(&sbi->iostat_lat_lock);
memset(io_lat, 0, sizeof(struct iostat_lat_info));
spin_unlock_irq(&sbi->iostat_lat_lock);
}
void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes)
{
if (!sbi->iostat_enable)
return;
spin_lock(&sbi->iostat_lock);
sbi->rw_iostat[type] += io_bytes;
if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
sbi->rw_iostat[APP_BUFFERED_IO] =
sbi->rw_iostat[APP_WRITE_IO] -
sbi->rw_iostat[APP_DIRECT_IO];
if (type == APP_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_BUFFERED_READ_IO] =
sbi->rw_iostat[APP_READ_IO] -
sbi->rw_iostat[APP_DIRECT_READ_IO];
spin_unlock(&sbi->iostat_lock);
f2fs_record_iostat(sbi);
}
static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
int rw, bool is_sync)
{
unsigned long ts_diff;
unsigned int iotype = iostat_ctx->type;
unsigned long flags;
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int idx;
if (!sbi->iostat_enable)
return;
ts_diff = jiffies - iostat_ctx->submit_ts;
if (iotype >= META_FLUSH)
iotype = META;
if (rw == 0) {
idx = READ_IO;
} else {
if (is_sync)
idx = WRITE_SYNC_IO;
else
idx = WRITE_ASYNC_IO;
}
spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
io_lat->sum_lat[idx][iotype] += ts_diff;
io_lat->bio_cnt[idx][iotype]++;
if (ts_diff > io_lat->peak_lat[idx][iotype])
io_lat->peak_lat[idx][iotype] = ts_diff;
spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
}
void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
{
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
bool is_sync = bio->bi_opf & REQ_SYNC;
if (rw == 0)
bio->bi_private = iostat_ctx->post_read_ctx;
else
bio->bi_private = iostat_ctx->sbi;
__update_iostat_latency(iostat_ctx, rw, is_sync);
mempool_free(iostat_ctx, bio_iostat_ctx_pool);
}
void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
struct bio *bio, struct bio_post_read_ctx *ctx)
{
struct bio_iostat_ctx *iostat_ctx;
/* Due to the mempool, this never fails. */
iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS);
iostat_ctx->sbi = sbi;
iostat_ctx->submit_ts = 0;
iostat_ctx->type = 0;
iostat_ctx->post_read_ctx = ctx;
bio->bi_private = iostat_ctx;
}
int __init f2fs_init_iostat_processing(void)
{
bio_iostat_ctx_cache =
kmem_cache_create("f2fs_bio_iostat_ctx",
sizeof(struct bio_iostat_ctx), 0, 0, NULL);
if (!bio_iostat_ctx_cache)
goto fail;
bio_iostat_ctx_pool =
mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS,
bio_iostat_ctx_cache);
if (!bio_iostat_ctx_pool)
goto fail_free_cache;
return 0;
fail_free_cache:
kmem_cache_destroy(bio_iostat_ctx_cache);
fail:
return -ENOMEM;
}
void f2fs_destroy_iostat_processing(void)
{
mempool_destroy(bio_iostat_ctx_pool);
kmem_cache_destroy(bio_iostat_ctx_cache);
}
int f2fs_init_iostat(struct f2fs_sb_info *sbi)
{
/* init iostat info */
spin_lock_init(&sbi->iostat_lock);
spin_lock_init(&sbi->iostat_lat_lock);
sbi->iostat_enable = false;
sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info),
GFP_KERNEL);
if (!sbi->iostat_io_lat)
return -ENOMEM;
return 0;
}
void f2fs_destroy_iostat(struct f2fs_sb_info *sbi)
{
kfree(sbi->iostat_io_lat);
}

fs/f2fs/iostat.h (new file, 84 lines)

@@ -0,0 +1,84 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright 2021 Google LLC
* Author: Daeho Jeong <daehojeong@google.com>
*/
#ifndef __F2FS_IOSTAT_H__
#define __F2FS_IOSTAT_H__
struct bio_post_read_ctx;
#ifdef CONFIG_F2FS_IOSTAT
#define DEFAULT_IOSTAT_PERIOD_MS 3000
#define MIN_IOSTAT_PERIOD_MS 100
/* maximum period of iostat tracing is 1 day */
#define MAX_IOSTAT_PERIOD_MS 8640000
enum {
READ_IO,
WRITE_SYNC_IO,
WRITE_ASYNC_IO,
MAX_IO_TYPE,
};
struct iostat_lat_info {
unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */
unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */
unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */
};
extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
void *offset);
extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi);
extern void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes);
struct bio_iostat_ctx {
struct f2fs_sb_info *sbi;
unsigned long submit_ts;
enum page_type type;
struct bio_post_read_ctx *post_read_ctx;
};
static inline void iostat_update_submit_ctx(struct bio *bio,
enum page_type type)
{
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
iostat_ctx->submit_ts = jiffies;
iostat_ctx->type = type;
}
static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
{
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
return iostat_ctx->post_read_ctx;
}
extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw);
extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
struct bio *bio, struct bio_post_read_ctx *ctx);
extern int f2fs_init_iostat_processing(void);
extern void f2fs_destroy_iostat_processing(void);
extern int f2fs_init_iostat(struct f2fs_sb_info *sbi);
extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi);
#else
static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes) {}
static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {}
static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
struct bio *bio, struct bio_post_read_ctx *ctx) {}
static inline void iostat_update_submit_ctx(struct bio *bio,
enum page_type type) {}
static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
{
return bio->bi_private;
}
static inline int f2fs_init_iostat_processing(void) { return 0; }
static inline void f2fs_destroy_iostat_processing(void) {}
static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {}
#endif
#endif /* __F2FS_IOSTAT_H__ */

fs/f2fs/node.c

@@ -17,6 +17,7 @@
#include "node.h"
#include "segment.h"
#include "xattr.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
@@ -162,14 +163,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
return dst_page;
}
static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi,
nid_t nid, bool no_fail)
{
struct nat_entry *new;
if (no_fail)
new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
else
new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
new = f2fs_kmem_cache_alloc(nat_entry_slab,
GFP_F2FS_ZERO, no_fail, sbi);
if (new) {
nat_set_nid(new, nid);
nat_reset_flag(new);
@@ -242,7 +242,8 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (!head) {
head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
head = f2fs_kmem_cache_alloc(nat_entry_set_slab,
GFP_NOFS, true, NULL);
INIT_LIST_HEAD(&head->entry_list);
INIT_LIST_HEAD(&head->set_list);
@@ -329,7 +330,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
unsigned long flags;
unsigned int seq_id;
fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS);
fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab,
GFP_NOFS, true, NULL);
get_page(page);
fn->page = page;
@@ -428,7 +430,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *new, *e;
new = __alloc_nat_entry(nid, false);
new = __alloc_nat_entry(sbi, nid, false);
if (!new)
return;
@@ -451,7 +453,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
@@ -552,7 +554,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
int i;
ni->nid = nid;
retry:
/* Check nat cache */
down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
@@ -564,10 +566,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
return 0;
}
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
/* Check current segment summary */
/*
* Check the current segment summary by trying to grab journal_rwsem first.
* This rwsem is on the critical path of checkpoint, which also requires the
* above nat_tree_lock. Therefore, if we fail to grab it here, retry rather
* than block and stall checkpoint.
*/
if (!rwsem_is_locked(&sbi->cp_global_sem)) {
down_read(&curseg->journal_rwsem);
} else if (!down_read_trylock(&curseg->journal_rwsem)) {
up_read(&nm_i->nat_tree_lock);
goto retry;
}
i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
ne = nat_in_journal(journal, i);
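The retry shape used here, in isolation: never block on the inner lock while holding the outer one when checkpoint may take them in the opposite order; trylock, and on failure drop everything and start over. A userspace sketch with pthread rwlocks (lock names mirror the kernel's, the lookup bodies are elided):

#include <pthread.h>

static pthread_rwlock_t nat_tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t journal_rwsem = PTHREAD_RWLOCK_INITIALIZER;

static void lookup_with_retry(void)
{
retry:
	pthread_rwlock_rdlock(&nat_tree_lock);
	/* ... NAT cache lookup under nat_tree_lock ... */
	if (pthread_rwlock_tryrdlock(&journal_rwsem) != 0) {
		pthread_rwlock_unlock(&nat_tree_lock);
		goto retry;	/* don't hold A while waiting on B */
	}
	/* ... journal lookup ... */
	pthread_rwlock_unlock(&journal_rwsem);
	pthread_rwlock_unlock(&nat_tree_lock);
}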
@@ -832,6 +843,26 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
dn->data_blkaddr = f2fs_data_blkaddr(dn);
if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
f2fs_sb_has_readonly(sbi)) {
unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
block_t blkaddr;
if (!c_len)
goto out;
blkaddr = f2fs_data_blkaddr(dn);
if (blkaddr == COMPRESS_ADDR)
blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + 1);
f2fs_update_extent_tree_range_compressed(dn->inode,
index, blkaddr,
F2FS_I(dn->inode)->i_cluster_size,
c_len);
}
out:
return 0;
release_pages:
@@ -1321,7 +1352,8 @@ static int read_node_page(struct page *page, int op_flags)
if (err)
return err;
if (unlikely(ni.blk_addr == NULL_ADDR) ||
/* NEW_ADDR can be seen after cp_error drops some dirty node pages */
if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
ClearPageUptodate(page);
return -ENOENT;
@@ -2181,6 +2213,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
}
}
bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int i;
bool ret = true;
down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
ret = false;
break;
}
}
up_read(&nm_i->nat_tree_lock);
return ret;
}
static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
bool set, bool build)
{
@@ -2222,7 +2272,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
if (unlikely(f2fs_check_nid_range(sbi, nid)))
return false;
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL);
i->nid = nid;
i->state = FREE_NID;
@@ -2812,7 +2862,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
ne = __alloc_nat_entry(nid, true);
ne = __alloc_nat_entry(sbi, nid, true);
__init_nat_entry(nm_i, ne, &raw_ne, true);
}
@@ -2852,7 +2902,23 @@ add_out:
list_add_tail(&nes->set_list, head);
}
static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs,
unsigned int valid)
{
if (valid == 0) {
__set_bit_le(nat_ofs, nm_i->empty_nat_bits);
__clear_bit_le(nat_ofs, nm_i->full_nat_bits);
return;
}
__clear_bit_le(nat_ofs, nm_i->empty_nat_bits);
if (valid == NAT_ENTRY_PER_BLOCK)
__set_bit_le(nat_ofs, nm_i->full_nat_bits);
else
__clear_bit_le(nat_ofs, nm_i->full_nat_bits);
}
static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
struct page *page)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -2861,7 +2927,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
int valid = 0;
int i = 0;
if (!enabled_nat_bits(sbi, NULL))
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return;
if (nat_index == 0) {
@@ -2872,17 +2938,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
valid++;
}
if (valid == 0) {
__set_bit_le(nat_index, nm_i->empty_nat_bits);
__clear_bit_le(nat_index, nm_i->full_nat_bits);
return;
__update_nat_bits(nm_i, nat_index, valid);
}
__clear_bit_le(nat_index, nm_i->empty_nat_bits);
if (valid == NAT_ENTRY_PER_BLOCK)
__set_bit_le(nat_index, nm_i->full_nat_bits);
else
__clear_bit_le(nat_index, nm_i->full_nat_bits);
void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_ofs;
down_read(&nm_i->nat_tree_lock);
for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
unsigned int valid = 0, nid_ofs = 0;
/* handle nid zero, since it should never be used */
if (unlikely(nat_ofs == 0)) {
valid = 1;
nid_ofs = 1;
}
for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) {
if (!test_bit_le(nid_ofs,
nm_i->free_nid_bitmap[nat_ofs]))
valid++;
}
__update_nat_bits(nm_i, nat_ofs, valid);
}
up_read(&nm_i->nat_tree_lock);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@@ -2901,7 +2986,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
if (enabled_nat_bits(sbi, cpc) ||
if ((cpc->reason & CP_UMOUNT) ||
!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
@@ -2948,7 +3033,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
if (to_journal) {
up_write(&curseg->journal_rwsem);
} else {
__update_nat_bits(sbi, start_nid, page);
update_nat_bits(sbi, start_nid, page);
f2fs_put_page(page, 1);
}
@@ -2979,7 +3064,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* during unmount, let's flush nat_bits before checking
* nat_cnt[DIRTY_NAT].
*/
if (enabled_nat_bits(sbi, cpc)) {
if (cpc->reason & CP_UMOUNT) {
down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
up_write(&nm_i->nat_tree_lock);
@@ -2995,7 +3080,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
if (enabled_nat_bits(sbi, cpc) ||
if (cpc->reason & CP_UMOUNT ||
!__has_cursum_space(journal,
nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
remove_nats_in_journal(sbi);
@@ -3032,15 +3117,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
__u64 cp_ver = cur_cp_version(ckpt);
block_t nat_bits_addr;
if (!enabled_nat_bits(sbi, NULL))
return 0;
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
nm_i->nat_bits = f2fs_kvzalloc(sbi,
nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
if (!nm_i->nat_bits)
return -ENOMEM;
nm_i->full_nat_bits = nm_i->nat_bits + 8;
nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return 0;
nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++) {
@@ -3057,13 +3145,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
cp_ver |= (cur_cp_crc(ckpt) << 32);
if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
disable_nat_bits(sbi, true);
clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)",
cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits));
return 0;
}
nm_i->full_nat_bits = nm_i->nat_bits + 8;
nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
f2fs_notice(sbi, "Found nat_bits in checkpoint");
return 0;
}
@@ -3074,7 +3161,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
unsigned int i = 0;
nid_t nid, last_nid;
if (!enabled_nat_bits(sbi, NULL))
if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
return;
for (i = 0; i < nm_i->nat_blocks; i++) {


@@ -91,7 +91,8 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
goto err_out;
}
entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
entry = f2fs_kmem_cache_alloc(fsync_entry_slab,
GFP_F2FS_ZERO, true, NULL);
entry->inode = inode;
list_add_tail(&entry->list, head);


@@ -20,6 +20,7 @@
#include "segment.h"
#include "node.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
#define __reverse_ffz(x) __reverse_ffs(~(x))
@@ -188,7 +189,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
set_page_private_atomic(page);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
new = f2fs_kmem_cache_alloc(inmem_entry_slab,
GFP_NOFS, true, NULL);
/* add atomic page indices to the list */
new->page = page;
@@ -776,11 +778,22 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
return 0;
for (i = 1; i < sbi->s_ndevs; i++) {
int count = DEFAULT_RETRY_IO_COUNT;
if (!f2fs_test_bit(i, (char *)&sbi->dirty_device))
continue;
do {
ret = __submit_flush_wait(sbi, FDEV(i).bdev);
if (ret)
congestion_wait(BLK_RW_ASYNC,
DEFAULT_IO_TIMEOUT);
} while (ret && --count);
if (ret) {
f2fs_stop_checkpoint(sbi, false);
break;
}
spin_lock(&sbi->dev_lock);
f2fs_clear_bit(i, (char *)&sbi->dirty_device);
@@ -990,7 +1003,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
pend_list = &dcc->pend_list[plist_idx(len)];
dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS);
dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL);
INIT_LIST_HEAD(&dc->list);
dc->bdev = bdev;
dc->lstart = lstart;
@@ -1893,7 +1906,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
offset = GET_BLKOFF_FROM_SEG0(sbi, i);
if (!f2fs_test_and_set_bit(offset, se->discard_map))
if (f2fs_block_unit_discard(sbi) &&
!f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
}
@@ -1918,7 +1932,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
int i;
if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi))
if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) ||
!f2fs_block_unit_discard(sbi))
return false;
if (!force) {
@@ -1949,7 +1964,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
if (!de) {
de = f2fs_kmem_cache_alloc(discard_entry_slab,
GFP_F2FS_ZERO);
GFP_F2FS_ZERO, true, NULL);
de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
list_add_tail(&de->list, head);
}
@@ -2003,14 +2018,18 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
bool force = (cpc->reason & CP_DISCARD);
bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
bool section_alignment = F2FS_OPTION(sbi).discard_unit ==
DISCARD_UNIT_SECTION;
if (f2fs_lfs_mode(sbi) && __is_large_section(sbi))
section_alignment = true;
mutex_lock(&dirty_i->seglist_lock);
while (1) {
int i;
if (need_align && end != -1)
if (section_alignment && end != -1)
end--;
start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
if (start >= MAIN_SEGS(sbi))
@@ -2018,7 +2037,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
start + 1);
if (need_align) {
if (section_alignment) {
start = rounddown(start, sbi->segs_per_sec);
end = roundup(end, sbi->segs_per_sec);
}
@@ -2056,6 +2075,9 @@ next:
}
mutex_unlock(&dirty_i->seglist_lock);
if (!f2fs_block_unit_discard(sbi))
goto wakeup;
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
unsigned int cur_pos = 0, next_pos, len, total_len = 0;
@@ -2089,12 +2111,29 @@ skip:
dcc->nr_discards -= total_len;
}
wakeup:
wake_up_discard_thread(sbi, false);
}
int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
int err = 0;
if (!f2fs_realtime_discard_enable(sbi))
return 0;
dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(dcc->f2fs_issue_discard))
err = PTR_ERR(dcc->f2fs_issue_discard);
return err;
}
static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct discard_cmd_control *dcc;
int err = 0, i;
@@ -2108,6 +2147,11 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
return -ENOMEM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
dcc->discard_granularity = sbi->blocks_per_seg;
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
dcc->discard_granularity = BLKS_PER_SEC(sbi);
INIT_LIST_HEAD(&dcc->entry_list);
for (i = 0; i < MAX_PLIST_NUM; i++)
INIT_LIST_HEAD(&dcc->pend_list[i]);
@@ -2127,13 +2171,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
init_waitqueue_head(&dcc->discard_wait_queue);
SM_I(sbi)->dcc_info = dcc;
init_thread:
dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(dcc->f2fs_issue_discard)) {
err = PTR_ERR(dcc->f2fs_issue_discard);
err = f2fs_start_discard_thread(sbi);
if (err) {
kfree(dcc);
SM_I(sbi)->dcc_info = NULL;
return err;
}
return err;
@@ -2255,7 +2296,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
del = 0;
}
if (!f2fs_test_and_set_bit(offset, se->discard_map))
if (f2fs_block_unit_discard(sbi) &&
!f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
/*
@@ -2297,7 +2339,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
}
}
if (f2fs_test_and_clear_bit(offset, se->discard_map))
if (f2fs_block_unit_discard(sbi) &&
f2fs_test_and_clear_bit(offset, se->discard_map))
sbi->discard_blks++;
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
@@ -3563,7 +3606,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
goto drop_bio;
}
if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) {
if (f2fs_cp_error(sbi)) {
err = -EIO;
goto drop_bio;
}
@@ -4071,7 +4114,8 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
static struct sit_entry_set *grab_sit_entry_set(void)
{
struct sit_entry_set *ses =
f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS);
f2fs_kmem_cache_alloc(sit_entry_set_slab,
GFP_NOFS, true, NULL);
ses->entry_cnt = 0;
INIT_LIST_HEAD(&ses->set_list);
@@ -4282,6 +4326,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
unsigned int sit_segs, start;
char *src_bitmap, *bitmap;
unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size;
unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0;
/* allocate memory for SIT information */
sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL);
@@ -4304,9 +4349,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
#ifdef CONFIG_F2FS_CHECK_FS
bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 4;
bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map);
#else
bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 3;
bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map);
#endif
sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
if (!sit_i->bitmap)
@@ -4326,9 +4371,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
bitmap += SIT_VBLOCK_MAP_SIZE;
#endif
if (discard_map) {
sit_i->sentries[start].discard_map = bitmap;
bitmap += SIT_VBLOCK_MAP_SIZE;
}
}
sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
if (!sit_i->tmp_map)
@@ -4489,6 +4536,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (IS_NODESEG(se->type))
total_node_blocks += se->valid_blocks;
if (f2fs_block_unit_discard(sbi)) {
/* build discard map only one time */
if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
memset(se->discard_map, 0xff,
@@ -4501,6 +4549,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
sbi->blocks_per_seg -
se->valid_blocks;
}
}
if (__is_large_section(sbi))
get_sec_entry(sbi, start)->valid_blocks +=
@@ -4535,6 +4584,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (IS_NODESEG(se->type))
total_node_blocks += se->valid_blocks;
if (f2fs_block_unit_discard(sbi)) {
if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
} else {
@@ -4543,6 +4593,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
sbi->discard_blks += old_valid_blocks;
sbi->discard_blks -= se->valid_blocks;
}
}
if (__is_large_section(sbi)) {
get_sec_entry(sbi, start)->valid_blocks +=
@@ -5159,7 +5210,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec;
sm_info->min_seq_blocks = sbi->blocks_per_seg;
sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
sm_info->min_ssr_sections = reserved_sections(sbi);


@@ -142,7 +142,7 @@ enum {
};
/*
* In the victim_sel_policy->alloc_mode, there are two block allocation modes.
* In the victim_sel_policy->alloc_mode, there are three block allocation modes.
* LFS writes data sequentially with cleaning operations.
* SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
* AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into
@@ -155,7 +155,7 @@ enum {
};
/*
* In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
* In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes.
* GC_CB is based on cost-benefit algorithm.
* GC_GREEDY is based on greedy algorithm.
* GC_AT is based on age-threshold algorithm.


@@ -33,6 +33,7 @@
#include "segment.h"
#include "xattr.h"
#include "gc.h"
#include "iostat.h"
#define CREATE_TRACE_POINTS
#include <trace/events/f2fs.h>
@@ -56,6 +57,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_CHECKPOINT] = "checkpoint error",
[FAULT_DISCARD] = "discard error",
[FAULT_WRITE_IO] = "write IO error",
[FAULT_SLAB_ALLOC] = "slab alloc",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -155,6 +157,7 @@ enum {
Opt_atgc,
Opt_gc_merge,
Opt_nogc_merge,
Opt_discard_unit,
Opt_err,
};
@@ -231,6 +234,7 @@ static match_table_t f2fs_tokens = {
{Opt_atgc, "atgc"},
{Opt_gc_merge, "gc_merge"},
{Opt_nogc_merge, "nogc_merge"},
{Opt_discard_unit, "discard_unit=%s"},
{Opt_err, NULL},
};
@@ -657,10 +661,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
return -EINVAL;
break;
case Opt_discard:
if (!f2fs_hw_support_discard(sbi)) {
f2fs_warn(sbi, "device does not support discard");
break;
}
set_opt(sbi, DISCARD);
break;
case Opt_nodiscard:
if (f2fs_sb_has_blkzoned(sbi)) {
if (f2fs_hw_should_discard(sbi)) {
f2fs_warn(sbi, "discard is required for zoned block devices");
return -EINVAL;
}
@@ -1173,6 +1181,25 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_nogc_merge:
clear_opt(sbi, GC_MERGE);
break;
case Opt_discard_unit:
name = match_strdup(&args[0]);
if (!name)
return -ENOMEM;
if (!strcmp(name, "block")) {
F2FS_OPTION(sbi).discard_unit =
DISCARD_UNIT_BLOCK;
} else if (!strcmp(name, "segment")) {
F2FS_OPTION(sbi).discard_unit =
DISCARD_UNIT_SEGMENT;
} else if (!strcmp(name, "section")) {
F2FS_OPTION(sbi).discard_unit =
DISCARD_UNIT_SECTION;
} else {
kfree(name);
return -EINVAL;
}
kfree(name);
break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1211,6 +1238,14 @@ default_check:
return -EINVAL;
}
#endif
if (f2fs_sb_has_blkzoned(sbi)) {
if (F2FS_OPTION(sbi).discard_unit !=
DISCARD_UNIT_SECTION) {
f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default");
F2FS_OPTION(sbi).discard_unit =
DISCARD_UNIT_SECTION;
}
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_test_compress_extension(sbi)) {
@@ -1271,7 +1306,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO);
fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep,
GFP_F2FS_ZERO, false, F2FS_SB(sb));
if (!fi)
return NULL;
@@ -1541,6 +1577,7 @@ static void f2fs_put_super(struct super_block *sb)
#endif
fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
destroy_percpu_info(sbi);
f2fs_destroy_iostat(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
#ifdef CONFIG_UNICODE
@@ -1924,6 +1961,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, ATGC))
seq_puts(seq, ",atgc");
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK)
seq_printf(seq, ",discard_unit=%s", "block");
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
seq_printf(seq, ",discard_unit=%s", "segment");
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
seq_printf(seq, ",discard_unit=%s", "section");
return 0;
}
@@ -1959,11 +2004,15 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).unusable_cap = 0;
sbi->sb->s_flags |= SB_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi))
set_opt(sbi, DISCARD);
if (f2fs_sb_has_blkzoned(sbi))
if (f2fs_sb_has_blkzoned(sbi)) {
F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
else
F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION;
} else {
F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK;
}
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -2038,8 +2087,17 @@ restore_flag:
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
int retry = DEFAULT_RETRY_IO_COUNT;
/* we should flush all the data to keep data consistency */
do {
sync_inodes_sb(sbi->sb);
cond_resched();
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
} while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
if (unlikely(retry < 0))
f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
@@ -2060,12 +2118,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool need_restart_gc = false, need_stop_gc = false;
bool need_restart_ckpt = false, need_stop_ckpt = false;
bool need_restart_flush = false, need_stop_flush = false;
bool need_restart_discard = false, need_stop_discard = false;
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
bool no_io_align = !F2FS_IO_ALIGNED(sbi);
bool no_atgc = !test_opt(sbi, ATGC);
bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
bool checkpoint_changed;
bool block_unit_discard = f2fs_block_unit_discard(sbi);
struct discard_cmd_control *dcc;
#ifdef CONFIG_QUOTA
int i, j;
#endif
@@ -2110,8 +2171,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
err = parse_options(sb, data, true);
if (err)
goto restore_opts;
checkpoint_changed =
disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT);
/*
* Previous and new state of filesystem is RO,
@@ -2168,6 +2227,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
if (block_unit_discard != f2fs_block_unit_discard(sbi)) {
err = -EINVAL;
f2fs_warn(sbi, "switch discard_unit option is not allowed");
goto restore_opts;
}
if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
err = -EINVAL;
f2fs_warn(sbi, "disabling checkpoint not compatible with read-only");
@@ -2233,11 +2298,26 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_flush = true;
}
if (checkpoint_changed) {
if (no_discard == !!test_opt(sbi, DISCARD)) {
if (test_opt(sbi, DISCARD)) {
err = f2fs_start_discard_thread(sbi);
if (err)
goto restore_flush;
need_stop_discard = true;
} else {
dcc = SM_I(sbi)->dcc_info;
f2fs_stop_discard_thread(sbi);
if (atomic_read(&dcc->discard_cmd_cnt))
f2fs_issue_discard_timeout(sbi);
need_restart_discard = true;
}
}
if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) {
if (test_opt(sbi, DISABLE_CHECKPOINT)) {
err = f2fs_disable_checkpoint(sbi);
if (err)
goto restore_flush;
goto restore_discard;
} else {
f2fs_enable_checkpoint(sbi);
}
@@ -2257,6 +2337,13 @@ skip:
adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
restore_discard:
if (need_restart_discard) {
if (f2fs_start_discard_thread(sbi))
f2fs_warn(sbi, "discard has been stopped");
} else if (need_stop_discard) {
f2fs_stop_discard_thread(sbi);
}
restore_flush:
if (need_restart_flush) {
if (f2fs_create_flush_cmd_control(sbi))
@@ -2517,6 +2604,33 @@ static int f2fs_enable_quotas(struct super_block *sb)
return 0;
}
static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type)
{
struct quota_info *dqopt = sb_dqopt(sbi->sb);
struct address_space *mapping = dqopt->files[type]->i_mapping;
int ret = 0;
ret = dquot_writeback_dquots(sbi->sb, type);
if (ret)
goto out;
ret = filemap_fdatawrite(mapping);
if (ret)
goto out;
/* if we are using journalled quota */
if (is_journalled_quota(sbi))
goto out;
ret = filemap_fdatawait(mapping);
truncate_inode_pages(&dqopt->files[type]->i_data, 0);
out:
if (ret)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
return ret;
}
int f2fs_quota_sync(struct super_block *sb, int type)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -2524,6 +2638,20 @@ int f2fs_quota_sync(struct super_block *sb, int type)
int cnt;
int ret;
/*
* Now when everything is written we can discard the pagecache so
* that userspace sees the changes.
*/
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
if (!sb_has_quota_active(sb, type))
return 0;
inode_lock(dqopt->files[cnt]);
/*
* do_quotactl
* f2fs_quota_sync
@@ -2534,47 +2662,18 @@ int f2fs_quota_sync(struct super_block *sb, int type)
* down_read(quota_sem)
*/
f2fs_lock_op(sbi);
down_read(&sbi->quota_sem);
ret = dquot_writeback_dquots(sb, type);
if (ret)
goto out;
/*
* Now when everything is written we can discard the pagecache so
* that userspace sees the changes.
*/
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
struct address_space *mapping;
ret = f2fs_quota_sync_file(sbi, cnt);
if (type != -1 && cnt != type)
continue;
if (!sb_has_quota_active(sb, cnt))
continue;
mapping = dqopt->files[cnt]->i_mapping;
ret = filemap_fdatawrite(mapping);
if (ret)
goto out;
/* if we are using journalled quota */
if (is_journalled_quota(sbi))
continue;
ret = filemap_fdatawait(mapping);
if (ret)
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
inode_lock(dqopt->files[cnt]);
truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
inode_unlock(dqopt->files[cnt]);
}
out:
if (ret)
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
up_read(&sbi->quota_sem);
f2fs_unlock_op(sbi);
inode_unlock(dqopt->files[cnt]);
if (ret)
break;
}
return ret;
}
@@ -3207,11 +3306,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
if (le32_to_cpu(raw_super->cp_payload) >
(blocks_per_seg - F2FS_CP_PACKS)) {
f2fs_info(sbi, "Insane cp_payload (%u > %u)",
if (le32_to_cpu(raw_super->cp_payload) >=
(blocks_per_seg - F2FS_CP_PACKS -
NR_CURSEG_PERSIST_TYPE)) {
f2fs_info(sbi, "Insane cp_payload (%u >= %u)",
le32_to_cpu(raw_super->cp_payload),
blocks_per_seg - F2FS_CP_PACKS);
blocks_per_seg - F2FS_CP_PACKS -
NR_CURSEG_PERSIST_TYPE);
return -EFSCORRUPTED;
}
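
Editor's note: a worked instance of the tightened bound, under assumed default constants:

/* assuming blocks_per_seg = 512, F2FS_CP_PACKS = 2 and
 * NR_CURSEG_PERSIST_TYPE = 6, the check now requires
 * cp_payload < 512 - 2 - 6 = 504, reserving room for the checkpoint
 * packs and the persistent curseg summaries.
 */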
@@ -3247,6 +3348,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
unsigned int cp_pack_start_sum, cp_payload;
block_t user_block_count, valid_user_blocks;
block_t avail_node_count, valid_node_count;
unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks;
int i, j;
total = le32_to_cpu(raw_super->segment_count);
@@ -3377,6 +3479,17 @@ skip_cross:
return 1;
}
nat_blocks = nat_segs << log_blocks_per_seg;
nat_bits_bytes = nat_blocks / BITS_PER_BYTE;
nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) &&
(cp_payload + F2FS_CP_PACKS +
NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) {
f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)",
cp_payload, nat_bits_blocks);
return -EFSCORRUPTED;
}
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_err(sbi, "A bug case: need to run fsck");
return 1;
@@ -3409,6 +3522,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
sbi->migration_granularity = sbi->segs_per_sec;
sbi->seq_file_ra_mul = MIN_RA_MUL;
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
@@ -3768,6 +3882,7 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
/* adjust parameters according to the volume size */
if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) {
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
if (f2fs_block_unit_discard(sbi))
sm_i->dcc_info->discard_granularity = 1;
sm_i->ipu_policy = 1 << F2FS_IPU_FORCE;
}
@@ -3889,11 +4004,6 @@ try_onemore:
set_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
/* init iostat info */
spin_lock_init(&sbi->iostat_lock);
sbi->iostat_enable = false;
sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS;
for (i = 0; i < NR_PAGE_TYPE; i++) {
int n = (i == META) ? 1 : NR_TEMP_TYPE;
int j;
@@ -3924,10 +4034,14 @@ try_onemore:
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
err = init_percpu_info(sbi);
err = f2fs_init_iostat(sbi);
if (err)
goto free_bio_info;
err = init_percpu_info(sbi);
if (err)
goto free_iostat;
if (F2FS_IO_ALIGNED(sbi)) {
sbi->write_io_dummy =
mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0);
@@ -4259,6 +4373,8 @@ free_io_dummy:
mempool_destroy(sbi->write_io_dummy);
free_percpu:
destroy_percpu_info(sbi);
free_iostat:
f2fs_destroy_iostat(sbi);
free_bio_info:
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
@@ -4401,9 +4517,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_post_read_processing();
if (err)
goto free_root_stats;
err = f2fs_init_bio_entry_cache();
err = f2fs_init_iostat_processing();
if (err)
goto free_post_read;
err = f2fs_init_bio_entry_cache();
if (err)
goto free_iostat;
err = f2fs_init_bioset();
if (err)
goto free_bio_entry_cache;
@@ -4425,6 +4544,8 @@ free_bioset:
f2fs_destroy_bioset();
free_bio_entry_cache:
f2fs_destroy_bio_entry_cache();
free_iostat:
f2fs_destroy_iostat_processing();
free_post_read:
f2fs_destroy_post_read_processing();
free_root_stats:
@@ -4459,6 +4580,7 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_compress_mempool();
f2fs_destroy_bioset();
f2fs_destroy_bio_entry_cache();
f2fs_destroy_iostat_processing();
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);


@@ -17,6 +17,7 @@
#include "f2fs.h"
#include "segment.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
static struct proc_dir_entry *f2fs_proc_root;
@@ -307,6 +308,14 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
if (!strcmp(a->attr.name, "gc_segment_mode"))
return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
sbi->gc_reclaimed_segs[sbi->gc_segment_mode]);
}
ui = (unsigned int *)(ptr + a->offset);
return sprintf(buf, "%u\n", *ui);
@@ -343,7 +352,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
set = false;
}
if (strlen(name) >= F2FS_EXTENSION_LEN)
if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
return -EINVAL;
down_write(&sbi->sb_lock);
@@ -420,6 +429,8 @@ out:
if (!strcmp(a->attr.name, "discard_granularity")) {
if (t == 0 || t > MAX_PLIST_NUM)
return -EINVAL;
if (!f2fs_block_unit_discard(sbi))
return -EINVAL;
if (t == *ui)
return count;
*ui = t;
@@ -467,6 +478,7 @@ out:
return count;
}
#ifdef CONFIG_F2FS_IOSTAT
if (!strcmp(a->attr.name, "iostat_enable")) {
sbi->iostat_enable = !!t;
if (!sbi->iostat_enable)
@@ -482,6 +494,7 @@ out:
spin_unlock(&sbi->iostat_lock);
return count;
}
#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (!strcmp(a->attr.name, "compr_written_block") ||
@@ -515,6 +528,29 @@ out:
return count;
}
if (!strcmp(a->attr.name, "gc_segment_mode")) {
if (t < MAX_GC_MODE)
sbi->gc_segment_mode = t;
else
return -EINVAL;
return count;
}
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
if (t != 0)
return -EINVAL;
sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0;
return count;
}
if (!strcmp(a->attr.name, "seq_file_ra_mul")) {
if (t >= MIN_RA_MUL && t <= MAX_RA_MUL)
sbi->seq_file_ra_mul = t;
else
return -EINVAL;
return count;
}
*ui = (unsigned int)t;
return count;
@@ -667,8 +703,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval,
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms);
#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
@@ -740,6 +778,10 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_cou
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight);
F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_urgent_sleep_time),
@@ -770,8 +812,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
ATTR_LIST(umount_discard_timeout),
#ifdef CONFIG_F2FS_IOSTAT
ATTR_LIST(iostat_enable),
ATTR_LIST(iostat_period_ms),
#endif
ATTR_LIST(readdir_ra),
ATTR_LIST(max_io_bytes),
ATTR_LIST(gc_pin_file_thresh),
@@ -812,6 +856,9 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(atgc_candidate_count),
ATTR_LIST(atgc_age_weight),
ATTR_LIST(atgc_age_threshold),
ATTR_LIST(seq_file_ra_mul),
ATTR_LIST(gc_segment_mode),
ATTR_LIST(gc_reclaimed_segments),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1036,101 +1083,6 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq,
return 0;
}
void f2fs_record_iostat(struct f2fs_sb_info *sbi)
{
unsigned long long iostat_diff[NR_IO_TYPE];
int i;
if (time_is_after_jiffies(sbi->iostat_next_period))
return;
/* Need double check under the lock */
spin_lock(&sbi->iostat_lock);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
spin_unlock(&sbi->iostat_lock);
return;
}
sbi->iostat_next_period = jiffies +
msecs_to_jiffies(sbi->iostat_period_ms);
for (i = 0; i < NR_IO_TYPE; i++) {
iostat_diff[i] = sbi->rw_iostat[i] -
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
spin_unlock(&sbi->iostat_lock);
trace_f2fs_iostat(sbi, iostat_diff);
}
static int __maybe_unused iostat_info_seq_show(struct seq_file *seq,
void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
time64_t now = ktime_get_real_seconds();
if (!sbi->iostat_enable)
return 0;
seq_printf(seq, "time: %-16llu\n", now);
/* print app write IOs */
seq_puts(seq, "[WRITE]\n");
seq_printf(seq, "app buffered: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_IO]);
seq_printf(seq, "app direct: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_IO]);
/* print fs write IOs */
seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_IO]);
seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GC_DATA_IO]);
seq_printf(seq, "fs gc node: %-16llu\n",
sbi->rw_iostat[FS_GC_NODE_IO]);
seq_printf(seq, "fs cp data: %-16llu\n",
sbi->rw_iostat[FS_CP_DATA_IO]);
seq_printf(seq, "fs cp node: %-16llu\n",
sbi->rw_iostat[FS_CP_NODE_IO]);
seq_printf(seq, "fs cp meta: %-16llu\n",
sbi->rw_iostat[FS_CP_META_IO]);
/* print app read IOs */
seq_puts(seq, "[READ]\n");
seq_printf(seq, "app buffered: %-16llu\n",
sbi->rw_iostat[APP_BUFFERED_READ_IO]);
seq_printf(seq, "app direct: %-16llu\n",
sbi->rw_iostat[APP_DIRECT_READ_IO]);
seq_printf(seq, "app mapped: %-16llu\n",
sbi->rw_iostat[APP_MAPPED_READ_IO]);
/* print fs read IOs */
seq_printf(seq, "fs data: %-16llu\n",
sbi->rw_iostat[FS_DATA_READ_IO]);
seq_printf(seq, "fs gc data: %-16llu\n",
sbi->rw_iostat[FS_GDATA_READ_IO]);
seq_printf(seq, "fs compr_data: %-16llu\n",
sbi->rw_iostat[FS_CDATA_READ_IO]);
seq_printf(seq, "fs node: %-16llu\n",
sbi->rw_iostat[FS_NODE_READ_IO]);
seq_printf(seq, "fs meta: %-16llu\n",
sbi->rw_iostat[FS_META_READ_IO]);
/* print other IOs */
seq_puts(seq, "[OTHER]\n");
seq_printf(seq, "fs discard: %-16llu\n",
sbi->rw_iostat[FS_DISCARD]);
return 0;
}
static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
void *offset)
{
@@ -1213,13 +1165,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
if (sbi->s_proc) {
proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc,
proc_create_single_data("segment_info", 0444, sbi->s_proc,
segment_info_seq_show, sb);
proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc,
proc_create_single_data("segment_bits", 0444, sbi->s_proc,
segment_bits_seq_show, sb);
proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc,
#ifdef CONFIG_F2FS_IOSTAT
proc_create_single_data("iostat_info", 0444, sbi->s_proc,
iostat_info_seq_show, sb);
proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc,
#endif
proc_create_single_data("victim_bits", 0444, sbi->s_proc,
victim_bits_seq_show, sb);
}
return 0;
@@ -1238,7 +1192,9 @@ put_sb_kobj:
void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
{
if (sbi->s_proc) {
#ifdef CONFIG_F2FS_IOSTAT
remove_proc_entry("iostat_info", sbi->s_proc);
#endif
remove_proc_entry("segment_info", sbi->s_proc);
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry("victim_bits", sbi->s_proc);


@@ -27,7 +27,8 @@ static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
{
if (likely(size == sbi->inline_xattr_slab_size)) {
*is_inline = true;
return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS);
return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab,
GFP_F2FS_ZERO, false, sbi);
}
*is_inline = false;
return f2fs_kzalloc(sbi, size, GFP_NOFS);


@@ -649,6 +649,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp)
switch (rqstp->rq_authop->flavour) {
case RPC_AUTH_NULL:
case RPC_AUTH_UNIX:
rqstp->rq_auth_stat = rpc_auth_ok;
if (rqstp->rq_proc == 0)
return SVC_OK;
if (is_callback(rqstp->rq_proc)) {
@@ -659,6 +660,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp)
}
return svc_set_client(rqstp);
}
rqstp->rq_auth_stat = rpc_autherr_badcred;
return SVC_DENIED;
}


@@ -429,6 +429,8 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
*/
static int nfs_callback_authenticate(struct svc_rqst *rqstp)
{
rqstp->rq_auth_stat = rpc_autherr_badcred;
switch (rqstp->rq_authop->flavour) {
case RPC_AUTH_NULL:
if (rqstp->rq_proc != CB_NULL)
@@ -439,6 +441,8 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
if (svc_is_backchannel(rqstp))
return SVC_DENIED;
}
rqstp->rq_auth_stat = rpc_auth_ok;
return SVC_OK;
}


@@ -63,11 +63,10 @@ static __be32 nfs4_callback_null(struct svc_rqst *rqstp)
return htonl(NFS4_OK);
}
static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p)
{
return xdr_argsize_check(rqstp, p);
}
/*
* svc_process_common() looks for an XDR encoder to know when
* not to drop a Reply.
*/
static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
{
return xdr_ressize_check(rqstp, p);
@@ -864,17 +863,16 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
}
static __be32 process_op(int nop, struct svc_rqst *rqstp,
struct xdr_stream *xdr_in, void *argp,
struct xdr_stream *xdr_out, void *resp,
struct cb_process_state *cps)
{
struct xdr_stream *xdr_out = &rqstp->rq_res_stream;
struct callback_op *op = &callback_ops[0];
unsigned int op_nr;
__be32 status;
long maxlen;
__be32 res;
status = decode_op_hdr(xdr_in, &op_nr);
status = decode_op_hdr(&rqstp->rq_arg_stream, &op_nr);
if (unlikely(status))
return status;
@@ -904,9 +902,11 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp,
maxlen = xdr_out->end - xdr_out->p;
if (maxlen > 0 && maxlen < PAGE_SIZE) {
status = op->decode_args(rqstp, xdr_in, argp);
status = op->decode_args(rqstp, &rqstp->rq_arg_stream,
rqstp->rq_argp);
if (likely(status == 0))
status = op->process_op(argp, resp, cps);
status = op->process_op(rqstp->rq_argp, rqstp->rq_resp,
cps);
} else
status = htonl(NFS4ERR_RESOURCE);
@@ -915,7 +915,7 @@ encode_hdr:
if (unlikely(res))
return res;
if (op->encode_res != NULL && status == 0)
status = op->encode_res(rqstp, xdr_out, resp);
status = op->encode_res(rqstp, xdr_out, rqstp->rq_resp);
return status;
}
@@ -926,22 +926,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
{
struct cb_compound_hdr_arg hdr_arg = { 0 };
struct cb_compound_hdr_res hdr_res = { NULL };
struct xdr_stream xdr_in, xdr_out;
__be32 *p, status;
struct cb_process_state cps = {
.drc_status = 0,
.clp = NULL,
.net = SVC_NET(rqstp),
};
unsigned int nops = 0;
__be32 status;
xdr_init_decode(&xdr_in, &rqstp->rq_arg,
rqstp->rq_arg.head[0].iov_base, NULL);
p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL);
status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
status = decode_compound_hdr_arg(&rqstp->rq_arg_stream, &hdr_arg);
if (status == htonl(NFS4ERR_RESOURCE))
return rpc_garbage_args;
@@ -961,15 +954,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
cps.minorversion = hdr_arg.minorversion;
hdr_res.taglen = hdr_arg.taglen;
hdr_res.tag = hdr_arg.tag;
if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) {
if (encode_compound_hdr_res(&rqstp->rq_res_stream, &hdr_res) != 0) {
if (cps.clp)
nfs_put_client(cps.clp);
return rpc_system_err;
}
while (status == 0 && nops != hdr_arg.nops) {
status = process_op(nops, rqstp, &xdr_in,
rqstp->rq_argp, &xdr_out, rqstp->rq_resp,
&cps);
status = process_op(nops, rqstp, &cps);
nops++;
}
@@ -988,7 +979,20 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
out_invalidcred:
pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
return svc_return_autherr(rqstp, rpc_autherr_badcred);
rqstp->rq_auth_stat = rpc_autherr_badcred;
return rpc_success;
}
static int
nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
const struct svc_procedure *procp = rqstp->rq_procinfo;
svcxdr_init_decode(rqstp);
svcxdr_init_encode(rqstp);
*statp = procp->pc_func(rqstp);
return 1;
}
/*
@@ -1057,7 +1061,6 @@ static struct callback_op callback_ops[] = {
static const struct svc_procedure nfs4_callback_procedures1[] = {
[CB_NULL] = {
.pc_func = nfs4_callback_null,
.pc_decode = nfs4_decode_void,
.pc_encode = nfs4_encode_void,
.pc_xdrressize = 1,
.pc_name = "NULL",
@@ -1079,7 +1082,7 @@ const struct svc_version nfs4_callback_version1 = {
.vs_proc = nfs4_callback_procedures1,
.vs_count = nfs4_callback_count1,
.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
.vs_dispatch = NULL,
.vs_dispatch = nfs_callback_dispatch,
.vs_hidden = true,
.vs_need_cong_ctrl = true,
};
@@ -1091,7 +1094,7 @@ const struct svc_version nfs4_callback_version4 = {
.vs_proc = nfs4_callback_procedures1,
.vs_count = nfs4_callback_count4,
.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
.vs_dispatch = NULL,
.vs_dispatch = nfs_callback_dispatch,
.vs_hidden = true,
.vs_need_cong_ctrl = true,
};


@@ -179,6 +179,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_proto = cl_init->proto;
clp->cl_nconnect = cl_init->nconnect;
clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1;
clp->cl_net = get_net(cl_init->net);
clp->cl_principal = "*";
@@ -540,6 +541,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
clnt->cl_principal = clp->cl_principal;
clp->cl_rpcclient = clnt;
clnt->cl_max_connect = clp->cl_max_connect;
return 0;
}
EXPORT_SYMBOL_GPL(nfs_create_rpc_client);


@@ -60,6 +60,7 @@ enum nfs_param {
Opt_mountvers,
Opt_namelen,
Opt_nconnect,
Opt_max_connect,
Opt_port,
Opt_posix,
Opt_proto,
@@ -158,6 +159,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_u32 ("mountvers", Opt_mountvers),
fsparam_u32 ("namlen", Opt_namelen),
fsparam_u32 ("nconnect", Opt_nconnect),
fsparam_u32 ("max_connect", Opt_max_connect),
fsparam_string("nfsvers", Opt_vers),
fsparam_u32 ("port", Opt_port),
fsparam_flag_no("posix", Opt_posix),
@@ -770,6 +772,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
goto out_of_bounds;
ctx->nfs_server.nconnect = result.uint_32;
break;
case Opt_max_connect:
if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS)
goto out_of_bounds;
ctx->nfs_server.max_connect = result.uint_32;
break;
case Opt_lookupcache:
switch (result.uint_32) {
case Opt_lookupcache_all:


@@ -67,6 +67,7 @@ struct nfs_client_initdata {
int proto;
u32 minorversion;
unsigned int nconnect;
unsigned int max_connect;
struct net *net;
const struct rpc_timeout *timeparms;
const struct cred *cred;
@@ -121,6 +122,7 @@ struct nfs_fs_context {
int port;
unsigned short protocol;
unsigned short nconnect;
unsigned short max_connect;
unsigned short export_path_len;
} nfs_server;


@@ -49,7 +49,6 @@ nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
{
if (task->tk_status != -EJUKEBOX)
return 0;
if (task->tk_status == -EJUKEBOX)
nfs_inc_stats(inode, NFSIOS_DELAY);
task->tk_status = 0;
rpc_restart_call(task);


@@ -402,6 +402,33 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
return nfs4_init_callback(clp);
}
static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
{
struct sockaddr_storage clp_addr, old_addr;
struct sockaddr *clp_sap = (struct sockaddr *)&clp_addr;
struct sockaddr *old_sap = (struct sockaddr *)&old_addr;
size_t clp_salen;
struct xprt_create xprt_args = {
.ident = old->cl_proto,
.net = old->cl_net,
.servername = old->cl_hostname,
};
if (clp->cl_proto != old->cl_proto)
return;
clp_salen = rpc_peeraddr(clp->cl_rpcclient, clp_sap, sizeof(clp_addr));
rpc_peeraddr(old->cl_rpcclient, old_sap, sizeof(old_addr));
if (clp_addr.ss_family != old_addr.ss_family)
return;
xprt_args.dstaddr = clp_sap;
xprt_args.addrlen = clp_salen;
rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args,
rpc_clnt_test_and_add_xprt, NULL);
}
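
/* editor's note: per the hunk below, this is called from nfs4_init_client()
 * when a new mount resolves to an already-known server, so the extra address
 * becomes another transport on the existing rpc client via rpc_clnt_add_xprt()
 * instead of creating a duplicate nfs_client.
 */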
/**
* nfs4_init_client - Initialise an NFS4 client record
*
@@ -436,6 +463,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
* won't try to use it.
*/
nfs_mark_client_ready(clp, -EPERM);
if (old->cl_mvops->session_trunk)
nfs4_add_trunk(clp, old);
}
clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags);
nfs_put_client(clp);
@@ -865,6 +894,7 @@ static int nfs4_set_client(struct nfs_server *server,
const char *ip_addr,
int proto, const struct rpc_timeout *timeparms,
u32 minorversion, unsigned int nconnect,
unsigned int max_connect,
struct net *net)
{
struct nfs_client_initdata cl_init = {
@@ -883,6 +913,8 @@ static int nfs4_set_client(struct nfs_server *server,
if (minorversion == 0)
__set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags);
else
cl_init.max_connect = max_connect;
if (proto == XPRT_TRANSPORT_TCP)
cl_init.nconnect = nconnect;
@@ -952,8 +984,10 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) {
cl_init.nconnect = mds_clp->cl_nconnect;
cl_init.max_connect = NFS_MAX_TRANSPORTS;
}
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
@@ -1122,6 +1156,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
&timeparms,
ctx->minorversion,
ctx->nfs_server.nconnect,
ctx->nfs_server.max_connect,
fc->net_ns);
if (error < 0)
return error;
@@ -1211,6 +1246,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
parent_server->client->cl_timeout,
parent_client->cl_mvops->minor_version,
parent_client->cl_nconnect,
parent_client->cl_max_connect,
parent_client->cl_net);
if (!error)
goto init_server;
@@ -1226,6 +1262,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
parent_server->client->cl_timeout,
parent_client->cl_mvops->minor_version,
parent_client->cl_nconnect,
parent_client->cl_max_connect,
parent_client->cl_net);
if (error < 0)
goto error;
@@ -1323,7 +1360,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
error = nfs4_set_client(server, hostname, sap, salen, buf,
clp->cl_proto, clnt->cl_timeout,
clp->cl_minorversion,
clp->cl_nconnect, net);
clp->cl_nconnect, clp->cl_max_connect, net);
clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
if (error != 0) {
nfs_server_insert_lists(server);


@@ -158,13 +158,11 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
sync = true;
retry:
if (!nfs42_files_from_same_server(file_in, file_out)) {
/* for inter copy, if copy size if smaller than 12 RPC
* payloads, fallback to traditional copy. There are
* 14 RPCs during an NFSv4.x mount between source/dest
* servers.
/*
* for inter copy, if copy size is too small
* then fallback to generic copy.
*/
if (sync ||
count <= 14 * NFS_SERVER(file_inode(file_in))->rsize)
if (sync)
return -EOPNOTSUPP;
cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res),
GFP_NOFS);


@@ -335,7 +335,7 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq)
{
if (pnfs_seqid_is_newer(newseq, lo->plh_barrier))
if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier)
lo->plh_barrier = newseq;
}
@@ -347,12 +347,16 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode;
set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
if (seq != 0) {
WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
/*
* We must set lo->plh_return_seq to avoid livelocks with
* pnfs_layout_need_return()
*/
if (seq == 0)
seq = be32_to_cpu(lo->plh_stateid.seqid);
if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
lo->plh_return_seq = seq;
pnfs_barrier_update(lo, seq);
}
}
static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
@@ -592,10 +596,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
inode = lo->plh_inode;
if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
spin_unlock(&inode->i_lock);
return;
}
pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
@@ -1000,7 +1000,7 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
{
u32 seqid = be32_to_cpu(stateid->seqid);
return !pnfs_seqid_is_newer(seqid, lo->plh_barrier) && lo->plh_barrier;
return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid);
}
/* lget is set to 1 if called from inside send_layoutget call chain */


@@ -293,15 +293,19 @@ static int
readpage_async_filler(void *data, struct page *page)
{
struct nfs_readdesc *desc = data;
struct inode *inode = page_file_mapping(page)->host;
unsigned int rsize = NFS_SERVER(inode)->rsize;
struct nfs_page *new;
unsigned int len;
unsigned int len, aligned_len;
int error;
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
new = nfs_create_request(desc->ctx, page, 0, len);
aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE);
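/* editor's example, with assumed values len = 3000 and rsize = 4096:
 * ALIGN(3000, 4096) = 4096, clamped to PAGE_SIZE, so the request covers
 * the whole page instead of issuing a short read.
 */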
new = nfs_create_request(desc->ctx, page, 0, aligned_len);
if (IS_ERR(new))
goto out_error;


@@ -480,6 +480,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (clp->cl_nconnect > 0)
seq_printf(m, ",nconnect=%u", clp->cl_nconnect);
if (version == 4) {
if (clp->cl_max_connect > 1)
seq_printf(m, ",max_connect=%u", clp->cl_max_connect);
if (nfss->port != NFS_PORT)
seq_printf(m, ",port=%u", nfss->port);
} else

fs/ntfs3/Kconfig (new file, 46 lines)

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-2.0-only
config NTFS3_FS
tristate "NTFS Read-Write file system support"
select NLS
help
Windows OS native file system (NTFS) support up to NTFS version 3.1.
Y or M enables the NTFS3 driver with full features enabled (read,
write, journal replaying, sparse/compressed files support).
File system type to use on mount is "ntfs3". Module name (M option)
is also "ntfs3".
Documentation: <file:Documentation/filesystems/ntfs3.rst>
config NTFS3_64BIT_CLUSTER
bool "64 bits per NTFS clusters"
depends on NTFS3_FS && 64BIT
help
Windows implementation of ntfs.sys uses 32 bits per cluster.
If 64 bits per cluster are activated, you will be able to use 4k clusters
for 16T+ volumes. Windows will not be able to mount such volumes.
It is recommended to say N here.
config NTFS3_LZX_XPRESS
bool "activate support of external compressions lzx/xpress"
depends on NTFS3_FS
help
In Windows 10 one can use the "compact" command to compress any file.
The four possible compression variants are: xpress4k, xpress8k, xpress16k and lzx.
If activated, you will be able to read such files correctly.
It is recommended to say Y here.
config NTFS3_FS_POSIX_ACL
bool "NTFS POSIX Access Control Lists"
depends on NTFS3_FS
select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support additional access rights
for users and groups beyond the standard owner/group/world scheme,
and this option selects support for ACLs specifically for ntfs
filesystems.
NOTE: this is a Linux-only feature. Windows will ignore these ACLs.
If you don't know what Access Control Lists are, say N.

fs/ntfs3/Makefile (new file, 36 lines)

@@ -0,0 +1,36 @@
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the ntfs3 filesystem support.
#
# to check robot warnings
ccflags-y += -Wint-to-pointer-cast \
$(call cc-option,-Wunused-but-set-variable,-Wunused-const-variable) \
$(call cc-option,-Wold-style-declaration,-Wout-of-line-declaration)
obj-$(CONFIG_NTFS3_FS) += ntfs3.o
ntfs3-y := attrib.o \
attrlist.o \
bitfunc.o \
bitmap.o \
dir.o \
fsntfs.o \
frecord.o \
file.o \
fslog.o \
inode.o \
index.o \
lznt.o \
namei.o \
record.o \
run.o \
super.o \
upcase.o \
xattr.o
ntfs3-$(CONFIG_NTFS3_LZX_XPRESS) += $(addprefix lib/,\
decompress_common.o \
lzx_decompress.o \
xpress_decompress.o \
)

fs/ntfs3/attrib.c (new file, 2093 lines; diff suppressed because it is too large)

fs/ntfs3/attrlist.c (new file, 460 lines)

@@ -0,0 +1,460 @@
// SPDX-License-Identifier: GPL-2.0
/*
*
* Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
*
*/
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
/*
* al_is_valid_le
*
* Return: True if @le is valid.
*/
static inline bool al_is_valid_le(const struct ntfs_inode *ni,
struct ATTR_LIST_ENTRY *le)
{
if (!le || !ni->attr_list.le || !ni->attr_list.size)
return false;
return PtrOffset(ni->attr_list.le, le) + le16_to_cpu(le->size) <=
ni->attr_list.size;
}
void al_destroy(struct ntfs_inode *ni)
{
run_close(&ni->attr_list.run);
kfree(ni->attr_list.le);
ni->attr_list.le = NULL;
ni->attr_list.size = 0;
ni->attr_list.dirty = false;
}
/*
* ntfs_load_attr_list
*
* This method makes sure that the ATTRIB list, if present,
* has been properly set up.
*/
int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr)
{
int err;
size_t lsize;
void *le = NULL;
if (ni->attr_list.size)
return 0;
if (!attr->non_res) {
lsize = le32_to_cpu(attr->res.data_size);
le = kmalloc(al_aligned(lsize), GFP_NOFS);
if (!le) {
err = -ENOMEM;
goto out;
}
memcpy(le, resident_data(attr), lsize);
} else if (attr->nres.svcn) {
err = -EINVAL;
goto out;
} else {
u16 run_off = le16_to_cpu(attr->nres.run_off);
lsize = le64_to_cpu(attr->nres.data_size);
run_init(&ni->attr_list.run);
err = run_unpack_ex(&ni->attr_list.run, ni->mi.sbi, ni->mi.rno,
0, le64_to_cpu(attr->nres.evcn), 0,
Add2Ptr(attr, run_off),
le32_to_cpu(attr->size) - run_off);
if (err < 0)
goto out;
le = kmalloc(al_aligned(lsize), GFP_NOFS);
if (!le) {
err = -ENOMEM;
goto out;
}
err = ntfs_read_run_nb(ni->mi.sbi, &ni->attr_list.run, 0, le,
lsize, NULL);
if (err)
goto out;
}
ni->attr_list.size = lsize;
ni->attr_list.le = le;
return 0;
out:
ni->attr_list.le = le;
al_destroy(ni);
return err;
}
/*
* al_enumerate
*
* Return:
* * The next list le.
* * If @le is NULL then return the first le.
*/
struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
struct ATTR_LIST_ENTRY *le)
{
size_t off;
u16 sz;
if (!le) {
le = ni->attr_list.le;
} else {
sz = le16_to_cpu(le->size);
if (sz < sizeof(struct ATTR_LIST_ENTRY)) {
/* Impossible, because we should not return such a le. */
return NULL;
}
le = Add2Ptr(le, sz);
}
/* Check boundary. */
off = PtrOffset(ni->attr_list.le, le);
if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) {
/* The regular end of list. */
return NULL;
}
sz = le16_to_cpu(le->size);
/* Check le for errors. */
if (sz < sizeof(struct ATTR_LIST_ENTRY) ||
off + sz > ni->attr_list.size ||
sz < le->name_off + le->name_len * sizeof(short)) {
return NULL;
}
return le;
}
/*
* al_find_le
*
* Find the first le in the list which matches type, name and VCN.
*
* Return: NULL if not found.
*/
struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni,
struct ATTR_LIST_ENTRY *le,
const struct ATTRIB *attr)
{
CLST svcn = attr_svcn(attr);
return al_find_ex(ni, le, attr->type, attr_name(attr), attr->name_len,
&svcn);
}
/*
* al_find_ex
*
* Find the first le in the list which matches type, name and VCN.
*
* Return: NULL if not found.
*/
struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni,
struct ATTR_LIST_ENTRY *le,
enum ATTR_TYPE type, const __le16 *name,
u8 name_len, const CLST *vcn)
{
struct ATTR_LIST_ENTRY *ret = NULL;
u32 type_in = le32_to_cpu(type);
while ((le = al_enumerate(ni, le))) {
u64 le_vcn;
int diff = le32_to_cpu(le->type) - type_in;
/* List entries are sorted by type, name and VCN. */
if (diff < 0)
continue;
if (diff > 0)
return ret;
if (le->name_len != name_len)
continue;
le_vcn = le64_to_cpu(le->vcn);
if (!le_vcn) {
/*
* Compare entry names only for entry with vcn == 0.
*/
diff = ntfs_cmp_names(le_name(le), name_len, name,
name_len, ni->mi.sbi->upcase,
true);
if (diff < 0)
continue;
if (diff > 0)
return ret;
}
if (!vcn)
return le;
if (*vcn == le_vcn)
return le;
if (*vcn < le_vcn)
return ret;
ret = le;
}
return ret;
}
/*
* al_find_le_to_insert
*
* Find the first list entry which matches type, name and VCN.
*/
static struct ATTR_LIST_ENTRY *al_find_le_to_insert(struct ntfs_inode *ni,
enum ATTR_TYPE type,
const __le16 *name,
u8 name_len, CLST vcn)
{
struct ATTR_LIST_ENTRY *le = NULL, *prev;
u32 type_in = le32_to_cpu(type);
/* List entries are sorted by type, name and VCN. */
while ((le = al_enumerate(ni, prev = le))) {
int diff = le32_to_cpu(le->type) - type_in;
if (diff < 0)
continue;
if (diff > 0)
return le;
if (!le->vcn) {
/*
* Compare entry names only for entry with vcn == 0.
*/
diff = ntfs_cmp_names(le_name(le), le->name_len, name,
name_len, ni->mi.sbi->upcase,
true);
if (diff < 0)
continue;
if (diff > 0)
return le;
}
if (le64_to_cpu(le->vcn) >= vcn)
return le;
}
return prev ? Add2Ptr(prev, le16_to_cpu(prev->size)) : ni->attr_list.le;
}
/*
* al_add_le
*
* Add an "attribute list entry" to the list.
*/
int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref,
struct ATTR_LIST_ENTRY **new_le)
{
int err;
struct ATTRIB *attr;
struct ATTR_LIST_ENTRY *le;
size_t off;
u16 sz;
size_t asize, new_asize, old_size;
u64 new_size;
typeof(ni->attr_list) *al = &ni->attr_list;
/*
* Compute the size of the new 'le'
*/
sz = le_size(name_len);
old_size = al->size;
new_size = old_size + sz;
asize = al_aligned(old_size);
new_asize = al_aligned(new_size);
/* Scan forward to the point at which the new 'le' should be inserted. */
le = al_find_le_to_insert(ni, type, name, name_len, svcn);
off = PtrOffset(al->le, le);
if (new_size > asize) {
void *ptr = kmalloc(new_asize, GFP_NOFS);
if (!ptr)
return -ENOMEM;
memcpy(ptr, al->le, off);
memcpy(Add2Ptr(ptr, off + sz), le, old_size - off);
le = Add2Ptr(ptr, off);
kfree(al->le);
al->le = ptr;
} else {
memmove(Add2Ptr(le, sz), le, old_size - off);
}
*new_le = le;
al->size = new_size;
le->type = type;
le->size = cpu_to_le16(sz);
le->name_len = name_len;
le->name_off = offsetof(struct ATTR_LIST_ENTRY, name);
le->vcn = cpu_to_le64(svcn);
le->ref = *ref;
le->id = id;
memcpy(le->name, name, sizeof(short) * name_len);
err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, new_size,
&new_size, true, &attr);
if (err) {
/* Undo memmove above. */
memmove(le, Add2Ptr(le, sz), old_size - off);
al->size = old_size;
return err;
}
al->dirty = true;
if (attr && attr->non_res) {
err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le,
al->size);
if (err)
return err;
al->dirty = false;
}
return 0;
}
/*
* al_remove_le - Remove @le from attribute list.
*/
bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le)
{
u16 size;
size_t off;
typeof(ni->attr_list) *al = &ni->attr_list;
if (!al_is_valid_le(ni, le))
return false;
/* Save on stack the size of 'le' */
size = le16_to_cpu(le->size);
off = PtrOffset(al->le, le);
memmove(le, Add2Ptr(le, size), al->size - (off + size));
al->size -= size;
al->dirty = true;
return true;
}
/*
* al_delete_le - Delete first le from the list which matches its parameters.
*/
bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
const __le16 *name, size_t name_len,
const struct MFT_REF *ref)
{
u16 size;
struct ATTR_LIST_ENTRY *le;
size_t off;
typeof(ni->attr_list) *al = &ni->attr_list;
/* Scan forward to the first le that matches the input. */
le = al_find_ex(ni, NULL, type, name, name_len, &vcn);
if (!le)
return false;
off = PtrOffset(al->le, le);
next:
if (off >= al->size)
return false;
if (le->type != type)
return false;
if (le->name_len != name_len)
return false;
if (name_len && ntfs_cmp_names(le_name(le), name_len, name, name_len,
ni->mi.sbi->upcase, true))
return false;
if (le64_to_cpu(le->vcn) != vcn)
return false;
/*
* The caller specified a segment reference, so we have to
* scan through the matching entries until we find that segment
* reference or we run out of matching entries.
*/
if (ref && memcmp(ref, &le->ref, sizeof(*ref))) {
off += le16_to_cpu(le->size);
le = Add2Ptr(al->le, off);
goto next;
}
/* Save on stack the size of 'le'. */
size = le16_to_cpu(le->size);
/* Delete the le. */
memmove(le, Add2Ptr(le, size), al->size - (off + size));
al->size -= size;
al->dirty = true;
return true;
}
int al_update(struct ntfs_inode *ni)
{
int err;
struct ATTRIB *attr;
typeof(ni->attr_list) *al = &ni->attr_list;
if (!al->dirty || !al->size)
return 0;
/*
* The attribute list is grown on demand in al_add_le()
* and shrunk here.
*/
err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, al->size, NULL,
false, &attr);
if (err)
goto out;
if (!attr->non_res) {
memcpy(resident_data(attr), al->le, al->size);
} else {
err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le,
al->size);
if (err)
goto out;
attr->nres.valid_size = attr->nres.data_size;
}
ni->mi.dirty = true;
al->dirty = false;
out:
return err;
}

134
fs/ntfs3/bitfunc.c Normal file

@@ -0,0 +1,134 @@
// SPDX-License-Identifier: GPL-2.0
/*
*
* Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
*
*/
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
#define BITS_IN_SIZE_T (sizeof(size_t) * 8)
/*
* fill_mask[i] - first i bits are '1', i = 0,1,2,3,4,5,6,7,8
* fill_mask[i] = 0xFF >> (8-i)
*/
static const u8 fill_mask[] = { 0x00, 0x01, 0x03, 0x07, 0x0F,
0x1F, 0x3F, 0x7F, 0xFF };
/*
* zero_mask[i] - first i bits are '0', i = 0,1,2,3,4,5,6,7,8
* zero_mask[i] = 0xFF << i
*/
static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0,
0xE0, 0xC0, 0x80, 0x00 };
/*
* are_bits_clear
*
* Return: True if all bits [bit, bit+nbits) are zeros "0".
*/
bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits)
{
size_t pos = bit & 7;
const u8 *map = (u8 *)lmap + (bit >> 3);
if (pos) {
if (8 - pos >= nbits)
return !nbits || !(*map & fill_mask[pos + nbits] &
zero_mask[pos]);
if (*map++ & zero_mask[pos])
return false;
nbits -= 8 - pos;
}
pos = ((size_t)map) & (sizeof(size_t) - 1);
if (pos) {
pos = sizeof(size_t) - pos;
if (nbits >= pos * 8) {
for (nbits -= pos * 8; pos; pos--, map++) {
if (*map)
return false;
}
}
}
for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) {
if (*((size_t *)map))
return false;
}
for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) {
if (*map)
return false;
}
pos = nbits & 7;
if (pos && (*map & fill_mask[pos]))
return false;
return true;
}
/*
* are_bits_set
*
* Return: True if all bits [bit, bit+nbits) are ones "1".
*/
bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits)
{
u8 mask;
size_t pos = bit & 7;
const u8 *map = (u8 *)lmap + (bit >> 3);
if (pos) {
if (8 - pos >= nbits) {
mask = fill_mask[pos + nbits] & zero_mask[pos];
return !nbits || (*map & mask) == mask;
}
mask = zero_mask[pos];
if ((*map++ & mask) != mask)
return false;
nbits -= 8 - pos;
}
pos = ((size_t)map) & (sizeof(size_t) - 1);
if (pos) {
pos = sizeof(size_t) - pos;
if (nbits >= pos * 8) {
for (nbits -= pos * 8; pos; pos--, map++) {
if (*map != 0xFF)
return false;
}
}
}
for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) {
if (*((size_t *)map) != MINUS_ONE_T)
return false;
}
for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) {
if (*map != 0xFF)
return false;
}
pos = nbits & 7;
if (pos) {
u8 mask = fill_mask[pos];
if ((*map & mask) != mask)
return false;
}
return true;
}
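Both helpers above avoid a per-bit loop by masking the partial bytes at the ends of the range and testing whole bytes and words in between. A standalone reference sketch that checks one range bit-by-bit, useful for comparing against the masked fast path (userspace; the bitmap contents are invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* The behaviour are_bits_clear() implements with fill_mask/zero_mask,
 * written as the obvious slow loop.
 */
static bool bits_clear_slow(const uint8_t *map, size_t bit, size_t nbits)
{
	for (size_t i = bit; i < bit + nbits; i++)
		if (map[i >> 3] & (1u << (i & 7)))
			return false;
	return true;
}

int main(void)
{
	uint8_t map[2] = { 0x81, 0x00 };	/* bits 0 and 7 set */

	printf("%d\n", bits_clear_slow(map, 1, 6));	/* 1: bits 1..6 clear */
	printf("%d\n", bits_clear_slow(map, 1, 7));	/* 0: bit 7 is set */
	return 0;
}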

1493
fs/ntfs3/bitmap.c Normal file

File diff suppressed because it is too large

52
fs/ntfs3/debug.h Normal file

@@ -0,0 +1,52 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
*
* Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
*
* Useful functions for debugging.
*
*/
// clang-format off
#ifndef _LINUX_NTFS3_DEBUG_H
#define _LINUX_NTFS3_DEBUG_H
#ifndef Add2Ptr
#define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I)))
#define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B)))
#endif
#ifdef CONFIG_PRINTK
__printf(2, 3)
void ntfs_printk(const struct super_block *sb, const char *fmt, ...);
__printf(2, 3)
void ntfs_inode_printk(struct inode *inode, const char *fmt, ...);
#else
static inline __printf(2, 3)
void ntfs_printk(const struct super_block *sb, const char *fmt, ...)
{
}
static inline __printf(2, 3)
void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
{
}
#endif
/*
* Logging macros. Thanks Joe Perches <joe@perches.com> for implementation.
*/
#define ntfs_err(sb, fmt, ...) ntfs_printk(sb, KERN_ERR fmt, ##__VA_ARGS__)
#define ntfs_warn(sb, fmt, ...) ntfs_printk(sb, KERN_WARNING fmt, ##__VA_ARGS__)
#define ntfs_info(sb, fmt, ...) ntfs_printk(sb, KERN_INFO fmt, ##__VA_ARGS__)
#define ntfs_notice(sb, fmt, ...) \
ntfs_printk(sb, KERN_NOTICE fmt, ##__VA_ARGS__)
#define ntfs_inode_err(inode, fmt, ...) \
ntfs_inode_printk(inode, KERN_ERR fmt, ##__VA_ARGS__)
#define ntfs_inode_warn(inode, fmt, ...) \
ntfs_inode_printk(inode, KERN_WARNING fmt, ##__VA_ARGS__)
#endif /* _LINUX_NTFS3_DEBUG_H */
// clang-format on
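A couple of hypothetical call sites for the helpers above (sketch only: the variables and message text are invented, and this assumes debug.h is included from driver code):

static void log_examples(struct super_block *sb, struct inode *inode)
{
	void *p = Add2Ptr(sb, 16);		/* sb + 16 bytes */
	size_t off = PtrOffset(sb, p);		/* off == 16 */

	ntfs_warn(sb, "volume is dirty, mounting read-only");
	ntfs_inode_err(inode, "bad record, offset %zu", off);
}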

599
fs/ntfs3/dir.c Normal file

@@ -0,0 +1,599 @@
// SPDX-License-Identifier: GPL-2.0
/*
*
* Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
*
* Directory handling functions for NTFS-based filesystems.
*
*/
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/iversion.h>
#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
/* Convert little endian UTF-16 to NLS string. */
int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni,
u8 *buf, int buf_len)
{
int ret, uni_len, warn;
const __le16 *ip;
u8 *op;
struct nls_table *nls = sbi->options.nls;
static_assert(sizeof(wchar_t) == sizeof(__le16));
if (!nls) {
/* UTF-16 -> UTF-8 */
ret = utf16s_to_utf8s((wchar_t *)uni->name, uni->len,
UTF16_LITTLE_ENDIAN, buf, buf_len);
buf[ret] = '\0';
return ret;
}
ip = uni->name;
op = buf;
uni_len = uni->len;
warn = 0;
while (uni_len--) {
u16 ec;
int charlen;
char dump[5];
if (buf_len < NLS_MAX_CHARSET_SIZE) {
ntfs_warn(sbi->sb,
"filename was truncated while converting.");
break;
}
ec = le16_to_cpu(*ip++);
charlen = nls->uni2char(ec, op, buf_len);
if (charlen > 0) {
op += charlen;
buf_len -= charlen;
continue;
}
*op++ = '_';
buf_len -= 1;
if (warn)
continue;
warn = 1;
hex_byte_pack(&dump[0], ec >> 8);
hex_byte_pack(&dump[2], ec);
dump[4] = 0;
ntfs_err(sbi->sb, "failed to convert \"%s\" to %s", dump,
nls->charset);
}
*op = '\0';
return op - buf;
}
// clang-format off
#define PLANE_SIZE 0x00010000
#define SURROGATE_PAIR 0x0000d800
#define SURROGATE_LOW 0x00000400
#define SURROGATE_BITS 0x000003ff
// clang-format on
/*
* put_utf16 - Modified version of put_utf16 from fs/nls/nls_base.c
*
* Function is sparse warnings free.
*/
static inline void put_utf16(wchar_t *s, unsigned int c,
enum utf16_endian endian)
{
static_assert(sizeof(wchar_t) == sizeof(__le16));
static_assert(sizeof(wchar_t) == sizeof(__be16));
switch (endian) {
default:
*s = (wchar_t)c;
break;
case UTF16_LITTLE_ENDIAN:
*(__le16 *)s = __cpu_to_le16(c);
break;
case UTF16_BIG_ENDIAN:
*(__be16 *)s = __cpu_to_be16(c);
break;
}
}
/*
* _utf8s_to_utf16s
*
* Modified version of 'utf8s_to_utf16s' that allows
* detecting -ENAMETOOLONG without writing past the expected maximum.
*/
static int _utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian,
wchar_t *pwcs, int maxout)
{
u16 *op;
int size;
unicode_t u;
op = pwcs;
while (inlen > 0 && *s) {
if (*s & 0x80) {
size = utf8_to_utf32(s, inlen, &u);
if (size < 0)
return -EINVAL;
s += size;
inlen -= size;
if (u >= PLANE_SIZE) {
if (maxout < 2)
return -ENAMETOOLONG;
u -= PLANE_SIZE;
put_utf16(op++,
SURROGATE_PAIR |
((u >> 10) & SURROGATE_BITS),
endian);
put_utf16(op++,
SURROGATE_PAIR | SURROGATE_LOW |
(u & SURROGATE_BITS),
endian);
maxout -= 2;
} else {
if (maxout < 1)
return -ENAMETOOLONG;
put_utf16(op++, u, endian);
maxout--;
}
} else {
if (maxout < 1)
return -ENAMETOOLONG;
put_utf16(op++, *s++, endian);
inlen--;
maxout--;
}
}
return op - pwcs;
}
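The surrogate arithmetic above: a code point u >= PLANE_SIZE is reduced by 0x10000 and split into a high surrogate 0xD800 | (u >> 10) and a low surrogate 0xDC00 | (u & 0x3FF) (0xDC00 being SURROGATE_PAIR | SURROGATE_LOW). A standalone check for U+1F600, with the values re-derived here rather than taken from the driver:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t u = 0x1F600 - 0x10000;		/* u == 0xF600 */
	uint16_t hi = 0xD800 | (u >> 10);	/* 0xD800 | 0x3D == 0xD83D */
	uint16_t lo = 0xDC00 | (u & 0x3FF);	/* 0xDC00 | 0x200 == 0xDE00 */

	printf("U+1F600 -> %04X %04X\n", hi, lo);	/* D83D DE00 */
	return 0;
}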
/*
* ntfs_nls_to_utf16 - Convert input string to UTF-16.
* @name: Input name.
* @name_len: Input name length.
* @uni: Destination memory.
* @max_ulen: Size of the destination, in UTF-16 characters.
* @endian: Endian of target UTF-16 string.
*
* This function is called:
* - to create NTFS name
* - to create symlink
*
* Return: UTF-16 string length or error (if negative).
*/
int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
struct cpu_str *uni, u32 max_ulen,
enum utf16_endian endian)
{
int ret, slen;
const u8 *end;
struct nls_table *nls = sbi->options.nls;
u16 *uname = uni->name;
static_assert(sizeof(wchar_t) == sizeof(u16));
if (!nls) {
/* utf8 -> utf16 */
ret = _utf8s_to_utf16s(name, name_len, endian, uname, max_ulen);
uni->len = ret;
return ret;
}
for (ret = 0, end = name + name_len; name < end; ret++, name += slen) {
if (ret >= max_ulen)
return -ENAMETOOLONG;
slen = nls->char2uni(name, end - name, uname + ret);
if (!slen)
return -EINVAL;
if (slen < 0)
return slen;
}
#ifdef __BIG_ENDIAN
if (endian == UTF16_LITTLE_ENDIAN) {
int i = ret;
while (i--) {
__cpu_to_le16s(uname);
uname++;
}
}
#else
if (endian == UTF16_BIG_ENDIAN) {
int i = ret;
while (i--) {
__cpu_to_be16s(uname);
uname++;
}
}
#endif
uni->len = ret;
return ret;
}
/*
* dir_search_u - Helper function.
*/
struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni,
struct ntfs_fnd *fnd)
{
int err = 0;
struct super_block *sb = dir->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(dir);
struct NTFS_DE *e;
int diff;
struct inode *inode = NULL;
struct ntfs_fnd *fnd_a = NULL;
if (!fnd) {
fnd_a = fnd_get();
if (!fnd_a) {
err = -ENOMEM;
goto out;
}
fnd = fnd_a;
}
err = indx_find(&ni->dir, ni, NULL, uni, 0, sbi, &diff, &e, fnd);
if (err)
goto out;
if (diff) {
err = -ENOENT;
goto out;
}
inode = ntfs_iget5(sb, &e->ref, uni);
if (!IS_ERR(inode) && is_bad_inode(inode)) {
iput(inode);
err = -EINVAL;
}
out:
fnd_put(fnd_a);
return err == -ENOENT ? NULL : err ? ERR_PTR(err) : inode;
}
static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
const struct NTFS_DE *e, u8 *name,
struct dir_context *ctx)
{
const struct ATTR_FILE_NAME *fname;
unsigned long ino;
int name_len;
u32 dt_type;
fname = Add2Ptr(e, sizeof(struct NTFS_DE));
if (fname->type == FILE_NAME_DOS)
return 0;
if (!mi_is_ref(&ni->mi, &fname->home))
return 0;
ino = ino_get(&e->ref);
if (ino == MFT_REC_ROOT)
return 0;
/* Skip meta files, unless the option to show metafiles is set. */
if (!sbi->options.showmeta && ntfs_is_meta_file(sbi, ino))
return 0;
if (sbi->options.nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
return 0;
name_len = ntfs_utf16_to_nls(sbi, (struct le_str *)&fname->name_len,
name, PATH_MAX);
if (name_len <= 0) {
ntfs_warn(sbi->sb, "failed to convert name for inode %lx.",
ino);
return 0;
}
dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
}
/*
* ntfs_read_hdr - Helper function for ntfs_readdir().
*/
static int ntfs_read_hdr(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
const struct INDEX_HDR *hdr, u64 vbo, u64 pos,
u8 *name, struct dir_context *ctx)
{
int err;
const struct NTFS_DE *e;
u32 e_size;
u32 end = le32_to_cpu(hdr->used);
u32 off = le32_to_cpu(hdr->de_off);
for (;; off += e_size) {
if (off + sizeof(struct NTFS_DE) > end)
return -1;
e = Add2Ptr(hdr, off);
e_size = le16_to_cpu(e->size);
if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
return -1;
if (de_is_last(e))
return 0;
/* Skip already enumerated. */
if (vbo + off < pos)
continue;
if (le16_to_cpu(e->key_size) < SIZEOF_ATTRIBUTE_FILENAME)
return -1;
ctx->pos = vbo + off;
/* Submit the name to the filldir callback. */
err = ntfs_filldir(sbi, ni, e, name, ctx);
if (err)
return err;
}
}
/*
* ntfs_readdir - file_operations::iterate_shared
*
* Use non-sorted enumeration.
* We have an example of a broken volume where sorted enumeration
* counts each name twice.
*/
static int ntfs_readdir(struct file *file, struct dir_context *ctx)
{
const struct INDEX_ROOT *root;
u64 vbo;
size_t bit;
loff_t eod;
int err = 0;
struct inode *dir = file_inode(file);
struct ntfs_inode *ni = ntfs_i(dir);
struct super_block *sb = dir->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
loff_t i_size = i_size_read(dir);
u32 pos = ctx->pos;
u8 *name = NULL;
struct indx_node *node = NULL;
u8 index_bits = ni->dir.index_bits;
/* Name is a buffer of PATH_MAX length. */
static_assert(NTFS_NAME_LEN * 4 < PATH_MAX);
eod = i_size + sbi->record_size;
if (pos >= eod)
return 0;
if (!dir_emit_dots(file, ctx))
return 0;
/* Allocate PATH_MAX bytes. */
name = __getname();
if (!name)
return -ENOMEM;
if (!ni->mi_loaded && ni->attr_list.size) {
/*
* Directory inode is locked for read.
* Load all subrecords to avoid 'write' access to 'ni' during
* directory reading.
*/
ni_lock(ni);
if (!ni->mi_loaded && ni->attr_list.size) {
err = ni_load_all_mi(ni);
if (!err)
ni->mi_loaded = true;
}
ni_unlock(ni);
if (err)
goto out;
}
root = indx_get_root(&ni->dir, ni, NULL, NULL);
if (!root) {
err = -EINVAL;
goto out;
}
if (pos >= sbi->record_size) {
bit = (pos - sbi->record_size) >> index_bits;
} else {
err = ntfs_read_hdr(sbi, ni, &root->ihdr, 0, pos, name, ctx);
if (err)
goto out;
bit = 0;
}
if (!i_size) {
ctx->pos = eod;
goto out;
}
for (;;) {
vbo = (u64)bit << index_bits;
if (vbo >= i_size) {
ctx->pos = eod;
goto out;
}
err = indx_used_bit(&ni->dir, ni, &bit);
if (err)
goto out;
if (bit == MINUS_ONE_T) {
ctx->pos = eod;
goto out;
}
vbo = (u64)bit << index_bits;
if (vbo >= i_size) {
ntfs_inode_err(dir, "Looks like your dir is corrupt");
err = -EINVAL;
goto out;
}
err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
&node);
if (err)
goto out;
err = ntfs_read_hdr(sbi, ni, &node->index->ihdr,
vbo + sbi->record_size, pos, name, ctx);
if (err)
goto out;
bit += 1;
}
out:
__putname(name);
put_indx_node(node);
if (err == -ENOENT) {
err = 0;
ctx->pos = pos;
}
return err;
}
static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
size_t *files)
{
int err = 0;
struct ntfs_inode *ni = ntfs_i(dir);
struct NTFS_DE *e = NULL;
struct INDEX_ROOT *root;
struct INDEX_HDR *hdr;
const struct ATTR_FILE_NAME *fname;
u32 e_size, off, end;
u64 vbo = 0;
size_t drs = 0, fles = 0, bit = 0;
loff_t i_size = ni->vfs_inode.i_size;
struct indx_node *node = NULL;
u8 index_bits = ni->dir.index_bits;
if (is_empty)
*is_empty = true;
root = indx_get_root(&ni->dir, ni, NULL, NULL);
if (!root)
return -EINVAL;
hdr = &root->ihdr;
for (;;) {
end = le32_to_cpu(hdr->used);
off = le32_to_cpu(hdr->de_off);
for (; off + sizeof(struct NTFS_DE) <= end; off += e_size) {
e = Add2Ptr(hdr, off);
e_size = le16_to_cpu(e->size);
if (e_size < sizeof(struct NTFS_DE) ||
off + e_size > end)
break;
if (de_is_last(e))
break;
fname = de_get_fname(e);
if (!fname)
continue;
if (fname->type == FILE_NAME_DOS)
continue;
if (is_empty) {
*is_empty = false;
if (!dirs && !files)
goto out;
}
if (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY)
drs += 1;
else
fles += 1;
}
if (vbo >= i_size)
goto out;
err = indx_used_bit(&ni->dir, ni, &bit);
if (err)
goto out;
if (bit == MINUS_ONE_T)
goto out;
vbo = (u64)bit << index_bits;
if (vbo >= i_size)
goto out;
err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
&node);
if (err)
goto out;
hdr = &node->index->ihdr;
bit += 1;
vbo = (u64)bit << ni->dir.idx2vbn_bits;
}
out:
put_indx_node(node);
if (dirs)
*dirs = drs;
if (files)
*files = fles;
return err;
}
bool dir_is_empty(struct inode *dir)
{
bool is_empty = false;
ntfs_dir_count(dir, &is_empty, NULL, NULL);
return is_empty;
}
// clang-format off
const struct file_operations ntfs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate_shared = ntfs_readdir,
.fsync = generic_file_fsync,
.open = ntfs_file_open,
};
// clang-format on

1251
fs/ntfs3/file.c Normal file

File diff suppressed because it is too large

3257
fs/ntfs3/frecord.c Normal file

File diff suppressed because it is too large

5217
fs/ntfs3/fslog.c Normal file

File diff suppressed because it is too large

2509
fs/ntfs3/fsntfs.c Normal file

File diff suppressed because it is too large

2650
fs/ntfs3/index.c Normal file

File diff suppressed because it is too large

1957
fs/ntfs3/inode.c Normal file

File diff suppressed because it is too large

319
fs/ntfs3/lib/decompress_common.c Normal file

@@ -0,0 +1,319 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* decompress_common.c - Code shared by the XPRESS and LZX decompressors
*
* Copyright (C) 2015 Eric Biggers
*/
#include "decompress_common.h"
/*
* make_huffman_decode_table() -
*
* Build a decoding table for a canonical prefix code, or "Huffman code".
*
* This is an internal function, not part of the library API!
*
* This takes as input the length of the codeword for each symbol in the
* alphabet and produces as output a table that can be used for fast
* decoding of prefix-encoded symbols using read_huffsym().
*
* Strictly speaking, a canonical prefix code might not be a Huffman
* code. But this algorithm will work either way; and in fact, since
* Huffman codes are defined in terms of symbol frequencies, there is no
* way for the decompressor to know whether the code is a true Huffman
* code or not until all symbols have been decoded.
*
* Because the prefix code is assumed to be "canonical", it can be
* reconstructed directly from the codeword lengths. A prefix code is
* canonical if and only if a longer codeword never lexicographically
* precedes a shorter codeword, and the lexicographic ordering of
* codewords of the same length is the same as the lexicographic ordering
* of the corresponding symbols. Consequently, we can sort the symbols
* primarily by codeword length and secondarily by symbol value, then
* reconstruct the prefix code by generating codewords lexicographically
* in that order.
*
* This function does not, however, generate the prefix code explicitly.
* Instead, it directly builds a table for decoding symbols using the
* code. The basic idea is this: given the next 'max_codeword_len' bits
* in the input, we can look up the decoded symbol by indexing a table
* containing 2**max_codeword_len entries. A codeword with length
* 'max_codeword_len' will have exactly one entry in this table, whereas
* a codeword shorter than 'max_codeword_len' will have multiple entries
* in this table. Precisely, a codeword of length n will be represented
* by 2**(max_codeword_len - n) entries in this table. The 0-based index
* of each such entry will contain the corresponding codeword as a prefix
* when zero-padded on the left to 'max_codeword_len' binary digits.
*
* That's the basic idea, but we implement two optimizations regarding
* the format of the decode table itself:
*
* - For many compression formats, the maximum codeword length is too
* long for it to be efficient to build the full decoding table
* whenever a new prefix code is used. Instead, we can build the table
* using only 2**table_bits entries, where 'table_bits' is some number
* less than or equal to 'max_codeword_len'. Then, only codewords of
* length 'table_bits' and shorter can be directly looked up. For
* longer codewords, the direct lookup instead produces the root of a
* binary tree. Using this tree, the decoder can do traditional
* bit-by-bit decoding of the remainder of the codeword. Child nodes
* are allocated in extra entries at the end of the table; leaf nodes
* contain symbols. Note that the long-codeword case is, in general,
* not performance critical, since in Huffman codes the most frequently
* used symbols are assigned the shortest codeword lengths.
*
* - When we decode a symbol using a direct lookup of the table, we still
* need to know its length so that the bitstream can be advanced by the
* appropriate number of bits. The simple solution is to simply retain
* the 'lens' array and use the decoded symbol as an index into it.
* However, this requires two separate array accesses in the fast path.
* The optimization is to store the length directly in the decode
* table. We use the bottom 11 bits for the symbol and the top 5 bits
* for the length. In addition, to combine this optimization with the
* previous one, we introduce a special case where the top 2 bits of
* the length are both set if the entry is actually the root of a
* binary tree.
*
* @decode_table:
* The array in which to create the decoding table. This must have
* a length of at least ((2**table_bits) + 2 * num_syms) entries.
*
* @num_syms:
* The number of symbols in the alphabet; also, the length of the
* 'lens' array. Must be less than or equal to 2048.
*
* @table_bits:
* The order of the decode table size, as explained above. Must be
* less than or equal to 13.
*
* @lens:
* An array of length @num_syms, indexable by symbol, that gives the
* length of the codeword, in bits, for that symbol. The length can
* be 0, which means that the symbol does not have a codeword
* assigned.
*
* @max_codeword_len:
* The longest codeword length allowed in the compression format.
* All entries in 'lens' must be less than or equal to this value.
* This must be less than or equal to 23.
*
* @working_space:
* A temporary array of length '2 * (max_codeword_len + 1) +
* num_syms'.
*
* Returns 0 on success, or -1 if the lengths do not form a valid prefix
* code.
*/
int make_huffman_decode_table(u16 decode_table[], const u32 num_syms,
const u32 table_bits, const u8 lens[],
const u32 max_codeword_len,
u16 working_space[])
{
const u32 table_num_entries = 1 << table_bits;
u16 * const len_counts = &working_space[0];
u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
int left;
void *decode_table_ptr;
u32 sym_idx;
u32 codeword_len;
u32 stores_per_loop;
u32 decode_table_pos;
u32 len;
u32 sym;
/* Count how many symbols have each possible codeword length.
* Note that a length of 0 indicates the corresponding symbol is not
* used in the code and therefore does not have a codeword.
*/
for (len = 0; len <= max_codeword_len; len++)
len_counts[len] = 0;
for (sym = 0; sym < num_syms; sym++)
len_counts[lens[sym]]++;
/* We can assume all lengths are <= max_codeword_len, but we
* cannot assume they form a valid prefix code. A codeword of
* length n should require a proportion of the codespace equaling
* (1/2)^n. The code is valid if and only if the codespace is
* exactly filled by the lengths, by this measure.
*/
left = 1;
for (len = 1; len <= max_codeword_len; len++) {
left <<= 1;
left -= len_counts[len];
if (left < 0) {
/* The lengths overflow the codespace; that is, the code
* is over-subscribed.
*/
return -1;
}
}
if (left) {
/* The lengths do not fill the codespace; that is, they form an
* incomplete set.
*/
if (left == (1 << max_codeword_len)) {
/* The code is completely empty. This is arguably
* invalid, but in fact it is valid in LZX and XPRESS,
* so we must allow it. By definition, no symbols can
* be decoded with an empty code. Consequently, we
* technically don't even need to fill in the decode
* table. However, to avoid accessing uninitialized
* memory if the algorithm nevertheless attempts to
* decode symbols using such a code, we zero out the
* decode table.
*/
memset(decode_table, 0,
table_num_entries * sizeof(decode_table[0]));
return 0;
}
return -1;
}
/* Sort the symbols primarily by length and secondarily by symbol order.
*/
/* Initialize 'offsets' so that offsets[len] for 1 <= len <=
* max_codeword_len is the number of codewords shorter than 'len' bits.
*/
offsets[1] = 0;
for (len = 1; len < max_codeword_len; len++)
offsets[len + 1] = offsets[len] + len_counts[len];
/* Use the 'offsets' array to sort the symbols. Note that we do not
* include symbols that are not used in the code. Consequently, fewer
* than 'num_syms' entries in 'sorted_syms' may be filled.
*/
for (sym = 0; sym < num_syms; sym++)
if (lens[sym])
sorted_syms[offsets[lens[sym]]++] = sym;
/* Fill entries for codewords with length <= table_bits
* --- that is, those short enough for a direct mapping.
*
* The table will start with entries for the shortest codeword(s), which
* have the most entries. From there, the number of entries per
* codeword will decrease.
*/
decode_table_ptr = decode_table;
sym_idx = 0;
codeword_len = 1;
stores_per_loop = (1 << (table_bits - codeword_len));
for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
u32 end_sym_idx = sym_idx + len_counts[codeword_len];
for (; sym_idx < end_sym_idx; sym_idx++) {
u16 entry;
u16 *p;
u32 n;
entry = ((u32)codeword_len << 11) | sorted_syms[sym_idx];
p = (u16 *)decode_table_ptr;
n = stores_per_loop;
do {
*p++ = entry;
} while (--n);
decode_table_ptr = p;
}
}
/* If we've filled in the entire table, we are done. Otherwise,
* there are codewords longer than table_bits for which we must
* generate binary trees.
*/
decode_table_pos = (u16 *)decode_table_ptr - decode_table;
if (decode_table_pos != table_num_entries) {
u32 j;
u32 next_free_tree_slot;
u32 cur_codeword;
/* First, zero out the remaining entries. This is
* necessary so that these entries appear as
* "unallocated" in the next part. Each of these entries
* will eventually be filled with the representation of
* the root node of a binary tree.
*/
j = decode_table_pos;
do {
decode_table[j] = 0;
} while (++j != table_num_entries);
/* We allocate child nodes starting at the end of the
* direct lookup table. Note that there should be
* 2*num_syms extra entries for this purpose, although
* fewer than this may actually be needed.
*/
next_free_tree_slot = table_num_entries;
/* Iterate through each codeword with length greater than
* 'table_bits', primarily in order of codeword length
* and secondarily in order of symbol.
*/
for (cur_codeword = decode_table_pos << 1;
codeword_len <= max_codeword_len;
codeword_len++, cur_codeword <<= 1) {
u32 end_sym_idx = sym_idx + len_counts[codeword_len];
for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++) {
/* 'sorted_sym' is the symbol represented by the
* codeword.
*/
u32 sorted_sym = sorted_syms[sym_idx];
u32 extra_bits = codeword_len - table_bits;
u32 node_idx = cur_codeword >> extra_bits;
/* Go through each bit of the current codeword
* beyond the prefix of length @table_bits and
* walk the appropriate binary tree, allocating
* any slots that have not yet been allocated.
*
* Note that the 'pointer' entry to the binary
* tree, which is stored in the direct lookup
* portion of the table, is represented
* identically to other internal (non-leaf)
* nodes of the binary tree; it can be thought
* of as simply the root of the tree. The
* representation of these internal nodes is
* simply the index of the left child combined
* with the special bits 0xC000 to distinguish
* the entry from direct mapping and leaf node
* entries.
*/
do {
/* At least one bit remains in the
* codeword, but the current node is an
* unallocated leaf. Change it to an
* internal node.
*/
if (decode_table[node_idx] == 0) {
decode_table[node_idx] =
next_free_tree_slot | 0xC000;
decode_table[next_free_tree_slot++] = 0;
decode_table[next_free_tree_slot++] = 0;
}
/* Go to the left child if the next bit
* in the codeword is 0; otherwise go to
* the right child.
*/
node_idx = decode_table[node_idx] & 0x3FFF;
--extra_bits;
node_idx += (cur_codeword >> extra_bits) & 1;
} while (extra_bits != 0);
/* We've traversed the tree using the entire
* codeword, and we're now at the entry where
* the actual symbol will be stored. This is
* distinguished from internal nodes by not
* having its high two bits set.
*/
decode_table[node_idx] = sorted_sym;
}
}
}
return 0;
}
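To make the direct-lookup layout concrete, take the canonical code with lengths {1, 2, 3, 3} for symbols {0, 1, 2, 3} (codewords 0, 10, 110, 111) and table_bits = 3: a length-n codeword occupies 2^(3-n) consecutive slots, and each entry packs (length << 11) | symbol. A standalone sketch that rebuilds that table by hand and performs one lookup (illustration only, not the kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* What make_huffman_decode_table() produces for lens {1,2,3,3}. */
	uint16_t table[8] = {
		(1 << 11) | 0, (1 << 11) | 0, (1 << 11) | 0, (1 << 11) | 0,
		(2 << 11) | 1, (2 << 11) | 1,
		(3 << 11) | 2,
		(3 << 11) | 3,
	};
	uint16_t entry = table[5];	/* next 3 input bits are 101 */

	printf("symbol %u, advance %u bits\n",
	       entry & 0x7FF, entry >> 11);	/* symbol 1, advance 2 */
	return 0;
}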

338
fs/ntfs3/lib/decompress_common.h Normal file

@@ -0,0 +1,338 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* decompress_common.h - Code shared by the XPRESS and LZX decompressors
*
* Copyright (C) 2015 Eric Biggers
*/
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
/* "Force inline" macro (not required, but helpful for performance) */
#define forceinline __always_inline
/* Enable whole-word match copying on selected architectures */
#if defined(__i386__) || defined(__x86_64__) || defined(__ARM_FEATURE_UNALIGNED)
# define FAST_UNALIGNED_ACCESS
#endif
/* Size of a machine word */
#define WORDBYTES (sizeof(size_t))
static forceinline void
copy_unaligned_word(const void *src, void *dst)
{
put_unaligned(get_unaligned((const size_t *)src), (size_t *)dst);
}
/* Generate a "word" with platform-dependent size whose bytes all contain the
* value 'b'.
*/
static forceinline size_t repeat_byte(u8 b)
{
size_t v;
v = b;
v |= v << 8;
v |= v << 16;
v |= v << ((WORDBYTES == 8) ? 32 : 0);
return v;
}
/* Structure that encapsulates a block of in-memory data being interpreted as a
* stream of bits, optionally with interwoven literal bytes. Bits are assumed
* to be stored in little endian 16-bit coding units, with the bits ordered high
* to low.
*/
struct input_bitstream {
/* Bits that have been read from the input buffer. The bits are
* left-justified; the next bit is always bit 31.
*/
u32 bitbuf;
/* Number of bits currently held in @bitbuf. */
u32 bitsleft;
/* Pointer to the next byte to be retrieved from the input buffer. */
const u8 *next;
/* Pointer to just past the end of the input buffer. */
const u8 *end;
};
/* Initialize a bitstream to read from the specified input buffer. */
static forceinline void init_input_bitstream(struct input_bitstream *is,
const void *buffer, u32 size)
{
is->bitbuf = 0;
is->bitsleft = 0;
is->next = buffer;
is->end = is->next + size;
}
/* Ensure the bit buffer variable for the bitstream contains at least @num_bits
* bits. Following this, bitstream_peek_bits() and/or bitstream_remove_bits()
* may be called on the bitstream to peek or remove up to @num_bits bits. Note
* that @num_bits must be <= 16.
*/
static forceinline void bitstream_ensure_bits(struct input_bitstream *is,
u32 num_bits)
{
if (is->bitsleft < num_bits) {
if (is->end - is->next >= 2) {
is->bitbuf |= (u32)get_unaligned_le16(is->next)
<< (16 - is->bitsleft);
is->next += 2;
}
is->bitsleft += 16;
}
}
/* Return the next @num_bits bits from the bitstream, without removing them.
* There must be at least @num_bits remaining in the buffer variable, from a
* previous call to bitstream_ensure_bits().
*/
static forceinline u32
bitstream_peek_bits(const struct input_bitstream *is, const u32 num_bits)
{
return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1);
}
/* Remove @num_bits from the bitstream. There must be at least @num_bits
* remaining in the buffer variable, from a previous call to
* bitstream_ensure_bits().
*/
static forceinline void
bitstream_remove_bits(struct input_bitstream *is, u32 num_bits)
{
is->bitbuf <<= num_bits;
is->bitsleft -= num_bits;
}
/* Remove and return @num_bits bits from the bitstream. There must be at least
* @num_bits remaining in the buffer variable, from a previous call to
* bitstream_ensure_bits().
*/
static forceinline u32
bitstream_pop_bits(struct input_bitstream *is, u32 num_bits)
{
u32 bits = bitstream_peek_bits(is, num_bits);
bitstream_remove_bits(is, num_bits);
return bits;
}
/* Read and return the next @num_bits bits from the bitstream. */
static forceinline u32
bitstream_read_bits(struct input_bitstream *is, u32 num_bits)
{
bitstream_ensure_bits(is, num_bits);
return bitstream_pop_bits(is, num_bits);
}
/* Read and return the next literal byte embedded in the bitstream. */
static forceinline u8
bitstream_read_byte(struct input_bitstream *is)
{
if (unlikely(is->end == is->next))
return 0;
return *is->next++;
}
/* Read and return the next 16-bit integer embedded in the bitstream. */
static forceinline u16
bitstream_read_u16(struct input_bitstream *is)
{
u16 v;
if (unlikely(is->end - is->next < 2))
return 0;
v = get_unaligned_le16(is->next);
is->next += 2;
return v;
}
/* Read and return the next 32-bit integer embedded in the bitstream. */
static forceinline u32
bitstream_read_u32(struct input_bitstream *is)
{
u32 v;
if (unlikely(is->end - is->next < 4))
return 0;
v = get_unaligned_le32(is->next);
is->next += 4;
return v;
}
/* Read into @dst_buffer an array of literal bytes embedded in the bitstream.
* Return either a pointer to the byte past the last written, or NULL if the
* read overflows the input buffer.
*/
static forceinline void *bitstream_read_bytes(struct input_bitstream *is,
void *dst_buffer, size_t count)
{
if ((size_t)(is->end - is->next) < count)
return NULL;
memcpy(dst_buffer, is->next, count);
is->next += count;
return (u8 *)dst_buffer + count;
}
/* Align the input bitstream on a coding-unit boundary. */
static forceinline void bitstream_align(struct input_bitstream *is)
{
is->bitsleft = 0;
is->bitbuf = 0;
}
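The layout these helpers maintain: ensure_bits() pulls whole little-endian 16-bit coding units and left-justifies them in bitbuf, so the next bit is always bit 31 and peek returns the top num_bits. A standalone sketch re-deriving one ensure/peek/remove cycle with the same arithmetic (not the kernel header itself):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint8_t in[2] = { 0x34, 0x12 };	/* one LE16 unit: 0x1234 */
	uint32_t bitbuf = 0, bitsleft = 0;

	/* ensure_bits(4): load a unit, left-justified in bitbuf */
	bitbuf |= (uint32_t)(in[0] | (in[1] << 8)) << (16 - bitsleft);
	bitsleft += 16;				/* bitbuf == 0x12340000 */

	/* peek_bits(4): the top four bits of the buffer */
	printf("%X\n", (bitbuf >> 1) >> (32 - 4 - 1));	/* prints 1 */

	/* remove_bits(4) */
	bitbuf <<= 4;
	bitsleft -= 4;				/* bitbuf == 0x23400000 */
	return 0;
}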
extern int make_huffman_decode_table(u16 decode_table[], const u32 num_syms,
const u32 num_bits, const u8 lens[],
const u32 max_codeword_len,
u16 working_space[]);
/* Reads and returns the next Huffman-encoded symbol from a bitstream. If the
* input data is exhausted, the Huffman symbol is decoded as if the missing bits
* are all zeroes.
*/
static forceinline u32 read_huffsym(struct input_bitstream *istream,
const u16 decode_table[],
u32 table_bits,
u32 max_codeword_len)
{
u32 entry;
u32 key_bits;
bitstream_ensure_bits(istream, max_codeword_len);
/* Index the decode table by the next table_bits bits of the input. */
key_bits = bitstream_peek_bits(istream, table_bits);
entry = decode_table[key_bits];
if (entry < 0xC000) {
/* Fast case: The decode table directly provided the
* symbol and codeword length. The low 11 bits are the
* symbol, and the high 5 bits are the codeword length.
*/
bitstream_remove_bits(istream, entry >> 11);
return entry & 0x7FF;
}
/* Slow case: The codeword for the symbol is longer than
* table_bits, so the symbol does not have an entry
* directly in the first (1 << table_bits) entries of the
* decode table. Traverse the appropriate binary tree
* bit-by-bit to decode the symbol.
*/
bitstream_remove_bits(istream, table_bits);
do {
key_bits = (entry & 0x3FFF) + bitstream_pop_bits(istream, 1);
} while ((entry = decode_table[key_bits]) >= 0xC000);
return entry;
}
/*
* Copy an LZ77 match at (dst - offset) to dst.
*
* The length and offset must be already validated --- that is, (dst - offset)
* can't underrun the output buffer, and (dst + length) can't overrun the output
* buffer. Also, the length cannot be 0.
*
* @bufend points to the byte past the end of the output buffer. This function
* won't write any data beyond this position.
*
* Returns dst + length.
*/
static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend,
u32 min_length)
{
const u8 *src = dst - offset;
/*
* Try to copy one machine word at a time. On i386 and x86_64 this is
* faster than copying one byte at a time, unless the data is
* near-random and all the matches have very short lengths. Note that
* since this requires unaligned memory accesses, it won't necessarily
* be faster on every architecture.
*
* Also note that we might copy more than the length of the match. For
* example, if a word is 8 bytes and the match is of length 5, then
* we'll simply copy 8 bytes. This is okay as long as we don't write
* beyond the end of the output buffer, hence the check for (bufend -
* end >= WORDBYTES - 1).
*/
#ifdef FAST_UNALIGNED_ACCESS
u8 * const end = dst + length;
if (bufend - end >= (ptrdiff_t)(WORDBYTES - 1)) {
if (offset >= WORDBYTES) {
/* The source and destination words don't overlap. */
/* To improve branch prediction, one iteration of this
* loop is unrolled. Most matches are short and will
* fail the first check. But if that check passes, then
* it becomes increasingly likely that the match is long
* and we'll need to continue copying.
*/
copy_unaligned_word(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
if (dst < end) {
do {
copy_unaligned_word(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
} while (dst < end);
}
return end;
} else if (offset == 1) {
/* Offset 1 matches are equivalent to run-length
* encoding of the previous byte. This case is common
* if the data contains many repeated bytes.
*/
size_t v = repeat_byte(*(dst - 1));
do {
put_unaligned(v, (size_t *)dst);
src += WORDBYTES;
dst += WORDBYTES;
} while (dst < end);
return end;
}
/*
* We don't bother with special cases for other 'offset <
* WORDBYTES', which are usually rarer than 'offset == 1'. Extra
* checks will just slow things down. Actually, it's possible
* to handle all the 'offset < WORDBYTES' cases using the same
* code, but it becomes more complicated and doesn't seem any
* faster overall; it definitely slows down the more common
* 'offset == 1' case.
*/
}
#endif /* FAST_UNALIGNED_ACCESS */
/* Fall back to a bytewise copy. */
if (min_length >= 2) {
*dst++ = *src++;
length--;
}
if (min_length >= 3) {
*dst++ = *src++;
length--;
}
do {
*dst++ = *src++;
} while (--length);
return dst;
}
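Because lz_copy() reads from dst - offset while writing forward, an offset-1 match replicates the byte just written, i.e. run-length expansion. A bytewise standalone demo of that effect (the word-at-a-time path above only accelerates this same result):

#include <stdio.h>

int main(void)
{
	char buf[16] = "a";
	char *dst = buf + 1;

	/* LZ77 match: length 5, offset 1 -- source is dst - 1 */
	for (int n = 5; n--; dst++)
		*dst = *(dst - 1);
	*dst = '\0';

	printf("%s\n", buf);	/* "aaaaaa" */
	return 0;
}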

26
fs/ntfs3/lib/lib.h Normal file

@@ -0,0 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Adapted for linux kernel by Alexander Mamaev:
* - remove implementations of get_unaligned_
* - assume GCC is always defined
* - ISO C90
* - linux kernel code style
*/
/* globals from xpress_decompress.c */
struct xpress_decompressor *xpress_allocate_decompressor(void);
void xpress_free_decompressor(struct xpress_decompressor *d);
int xpress_decompress(struct xpress_decompressor *__restrict d,
const void *__restrict compressed_data,
size_t compressed_size,
void *__restrict uncompressed_data,
size_t uncompressed_size);
/* globals from lzx_decompress.c */
struct lzx_decompressor *lzx_allocate_decompressor(void);
void lzx_free_decompressor(struct lzx_decompressor *d);
int lzx_decompress(struct lzx_decompressor *__restrict d,
const void *__restrict compressed_data,
size_t compressed_size, void *__restrict uncompressed_data,
size_t uncompressed_size);

670
fs/ntfs3/lib/lzx_decompress.c Normal file

@@ -0,0 +1,670 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* lzx_decompress.c - A decompressor for the LZX compression format, which can
* be used in "System Compressed" files. This is based on the code from wimlib.
* This code only supports a window size (dictionary size) of 32768 bytes, since
* this is the only size used in System Compression.
*
* Copyright (C) 2015 Eric Biggers
*/
#include "decompress_common.h"
#include "lib.h"
/* Number of literal byte values */
#define LZX_NUM_CHARS 256
/* The smallest and largest allowed match lengths */
#define LZX_MIN_MATCH_LEN 2
#define LZX_MAX_MATCH_LEN 257
/* Number of distinct match lengths that can be represented */
#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
/* Number of match lengths for which no length symbol is required */
#define LZX_NUM_PRIMARY_LENS 7
#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1)
/* Valid values of the 3-bit block type field */
#define LZX_BLOCKTYPE_VERBATIM 1
#define LZX_BLOCKTYPE_ALIGNED 2
#define LZX_BLOCKTYPE_UNCOMPRESSED 3
/* Number of offset slots for a window size of 32768 */
#define LZX_NUM_OFFSET_SLOTS 30
/* Number of symbols in the main code for a window size of 32768 */
#define LZX_MAINCODE_NUM_SYMBOLS \
(LZX_NUM_CHARS + (LZX_NUM_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS))
/* Number of symbols in the length code */
#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS)
/* Number of symbols in the precode */
#define LZX_PRECODE_NUM_SYMBOLS 20
/* Number of bits in which each precode codeword length is represented */
#define LZX_PRECODE_ELEMENT_SIZE 4
/* Number of low-order bits of each match offset that are entropy-encoded in
* aligned offset blocks
*/
#define LZX_NUM_ALIGNED_OFFSET_BITS 3
/* Number of symbols in the aligned offset code */
#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS)
/* Mask for the match offset bits that are entropy-encoded in aligned offset
* blocks
*/
#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1)
/* Number of bits in which each aligned offset codeword length is represented */
#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3
/* Maximum lengths (in bits) of the codewords in each Huffman code */
#define LZX_MAX_MAIN_CODEWORD_LEN 16
#define LZX_MAX_LEN_CODEWORD_LEN 16
#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1)
#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1)
/* The default "filesize" value used in pre/post-processing. In the LZX format
* used in cabinet files this value must be given to the decompressor, whereas
* in the LZX format used in WIM files and system-compressed files this value is
* fixed at 12000000.
*/
#define LZX_DEFAULT_FILESIZE 12000000
/* Assumed block size when the encoded block size begins with a 0 bit. */
#define LZX_DEFAULT_BLOCK_SIZE 32768
/* Number of offsets in the recent (or "repeat") offsets queue. */
#define LZX_NUM_RECENT_OFFSETS 3
/* These values are chosen for fast decompression. */
#define LZX_MAINCODE_TABLEBITS 11
#define LZX_LENCODE_TABLEBITS 10
#define LZX_PRECODE_TABLEBITS 6
#define LZX_ALIGNEDCODE_TABLEBITS 7
#define LZX_READ_LENS_MAX_OVERRUN 50
/* Mapping: offset slot => first match offset that uses that offset slot.
*/
static const u32 lzx_offset_slot_base[LZX_NUM_OFFSET_SLOTS + 1] = {
0, 1, 2, 3, 4, /* 0 --- 4 */
6, 8, 12, 16, 24, /* 5 --- 9 */
32, 48, 64, 96, 128, /* 10 --- 14 */
192, 256, 384, 512, 768, /* 15 --- 19 */
1024, 1536, 2048, 3072, 4096, /* 20 --- 24 */
6144, 8192, 12288, 16384, 24576, /* 25 --- 29 */
32768, /* extra */
};
/* Mapping: offset slot => how many extra bits must be read and added to the
* corresponding offset slot base to decode the match offset.
*/
static const u8 lzx_extra_offset_bits[LZX_NUM_OFFSET_SLOTS] = {
0, 0, 0, 0, 1,
1, 2, 2, 3, 3,
4, 4, 5, 5, 6,
6, 7, 7, 8, 8,
9, 9, 10, 10, 11,
11, 12, 12, 13, 13,
};
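The two tables are consistent by construction: slot s covers exactly the offsets [base[s], base[s+1]), so base[s+1] - base[s] == 1 << extra_bits[s] for every slot. A standalone check of that invariant, with the arrays copied from above:

#include <stdint.h>
#include <stdio.h>

static const uint32_t base[31] = {
	0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192,
	256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192,
	12288, 16384, 24576, 32768,
};
static const uint8_t extra[30] = {
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
	7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
};

int main(void)
{
	for (int s = 0; s < 30; s++)
		if (base[s + 1] - base[s] != (1u << extra[s]))
			printf("slot %d inconsistent\n", s);
	printf("done\n");	/* prints only "done" */
	return 0;
}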
/* Reusable heap-allocated memory for LZX decompression */
struct lzx_decompressor {
/* Huffman decoding tables, and arrays that map symbols to codeword
* lengths
*/
u16 maincode_decode_table[(1 << LZX_MAINCODE_TABLEBITS) +
(LZX_MAINCODE_NUM_SYMBOLS * 2)];
u8 maincode_lens[LZX_MAINCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];
u16 lencode_decode_table[(1 << LZX_LENCODE_TABLEBITS) +
(LZX_LENCODE_NUM_SYMBOLS * 2)];
u8 lencode_lens[LZX_LENCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];
u16 alignedcode_decode_table[(1 << LZX_ALIGNEDCODE_TABLEBITS) +
(LZX_ALIGNEDCODE_NUM_SYMBOLS * 2)];
u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS];
u16 precode_decode_table[(1 << LZX_PRECODE_TABLEBITS) +
(LZX_PRECODE_NUM_SYMBOLS * 2)];
u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
/* Temporary space for make_huffman_decode_table() */
u16 working_space[2 * (1 + LZX_MAX_MAIN_CODEWORD_LEN) +
LZX_MAINCODE_NUM_SYMBOLS];
};
static void undo_e8_translation(void *target, s32 input_pos)
{
s32 abs_offset, rel_offset;
abs_offset = get_unaligned_le32(target);
if (abs_offset >= 0) {
if (abs_offset < LZX_DEFAULT_FILESIZE) {
/* "good translation" */
rel_offset = abs_offset - input_pos;
put_unaligned_le32(rel_offset, target);
}
} else {
if (abs_offset >= -input_pos) {
/* "compensating translation" */
rel_offset = abs_offset + LZX_DEFAULT_FILESIZE;
put_unaligned_le32(rel_offset, target);
}
}
}
/*
* Undo the 'E8' preprocessing used in LZX. Before compression, the
* uncompressed data was preprocessed by changing the targets of suspected x86
* CALL instructions from relative offsets to absolute offsets. After
* match/literal decoding, the decompressor must undo the translation.
*/
static void lzx_postprocess(u8 *data, u32 size)
{
/*
* A worthwhile optimization is to push the end-of-buffer check into the
* relatively rare E8 case. This is possible if we replace the last six
* bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte
* before reaching end-of-buffer. In addition, this scheme guarantees
* that no translation can begin following an E8 byte in the last 10
* bytes because a 4-byte offset containing E8 as its high byte is a
* large negative number that is not valid for translation. That is
* exactly what we need.
*/
u8 *tail;
u8 saved_bytes[6];
u8 *p;
if (size <= 10)
return;
tail = &data[size - 6];
memcpy(saved_bytes, tail, 6);
memset(tail, 0xE8, 6);
p = data;
for (;;) {
while (*p != 0xE8)
p++;
if (p >= tail)
break;
undo_e8_translation(p + 1, p - data);
p += 5;
}
memcpy(tail, saved_bytes, 6);
}
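Numerically, undo_e8_translation() inverts the compressor's CALL-target rewrite: a stored absolute target in [0, LZX_DEFAULT_FILESIZE) becomes rel = abs - input_pos, and a negative target not below -input_pos becomes rel = abs + LZX_DEFAULT_FILESIZE. A standalone sketch of that arithmetic with invented values:

#include <stdint.h>
#include <stdio.h>

#define FILESIZE 12000000	/* LZX_DEFAULT_FILESIZE */

int main(void)
{
	int32_t input_pos = 0x100;	/* stream offset of the E8 opcode */
	int32_t abs_target = 0x1234;	/* absolute target stored after it */

	if (abs_target >= 0 && abs_target < FILESIZE)	/* good translation */
		printf("rel = 0x%X\n", abs_target - input_pos);	/* 0x1134 */
	else if (abs_target < 0 && abs_target >= -input_pos)
		printf("rel = 0x%X\n", abs_target + FILESIZE);	/* compensating */
	return 0;
}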
/* Read a Huffman-encoded symbol using the precode. */
static forceinline u32 read_presym(const struct lzx_decompressor *d,
struct input_bitstream *is)
{
return read_huffsym(is, d->precode_decode_table,
LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN);
}
/* Read a Huffman-encoded symbol using the main code. */
static forceinline u32 read_mainsym(const struct lzx_decompressor *d,
struct input_bitstream *is)
{
return read_huffsym(is, d->maincode_decode_table,
LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN);
}
/* Read a Huffman-encoded symbol using the length code. */
static forceinline u32 read_lensym(const struct lzx_decompressor *d,
struct input_bitstream *is)
{
return read_huffsym(is, d->lencode_decode_table,
LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN);
}
/* Read a Huffman-encoded symbol using the aligned offset code. */
static forceinline u32 read_alignedsym(const struct lzx_decompressor *d,
struct input_bitstream *is)
{
return read_huffsym(is, d->alignedcode_decode_table,
LZX_ALIGNEDCODE_TABLEBITS,
LZX_MAX_ALIGNED_CODEWORD_LEN);
}
/*
* Read the precode from the compressed input bitstream, then use it to decode
* @num_lens codeword length values.
*
* @is: The input bitstream.
*
* @lens: An array that contains the length values from the previous time
* the codeword lengths for this Huffman code were read, or all 0's
* if this is the first time. This array must have at least
* (@num_lens + LZX_READ_LENS_MAX_OVERRUN) entries.
*
* @num_lens: Number of length values to decode.
*
* Returns 0 on success, or -1 if the data was invalid.
*/
static int lzx_read_codeword_lens(struct lzx_decompressor *d,
struct input_bitstream *is,
u8 *lens, u32 num_lens)
{
u8 *len_ptr = lens;
u8 *lens_end = lens + num_lens;
int i;
/* Read the lengths of the precode codewords. These are given
* explicitly.
*/
for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) {
d->precode_lens[i] =
bitstream_read_bits(is, LZX_PRECODE_ELEMENT_SIZE);
}
/* Make the decoding table for the precode. */
if (make_huffman_decode_table(d->precode_decode_table,
LZX_PRECODE_NUM_SYMBOLS,
LZX_PRECODE_TABLEBITS,
d->precode_lens,
LZX_MAX_PRE_CODEWORD_LEN,
d->working_space))
return -1;
/* Decode the codeword lengths. */
do {
u32 presym;
u8 len;
/* Read the next precode symbol. */
presym = read_presym(d, is);
if (presym < 17) {
/* Difference from old length */
len = *len_ptr - presym;
if ((s8)len < 0)
len += 17;
*len_ptr++ = len;
} else {
/* Special RLE values */
u32 run_len;
if (presym == 17) {
/* Run of 0's */
run_len = 4 + bitstream_read_bits(is, 4);
len = 0;
} else if (presym == 18) {
/* Longer run of 0's */
run_len = 20 + bitstream_read_bits(is, 5);
len = 0;
} else {
/* Run of identical lengths */
run_len = 4 + bitstream_read_bits(is, 1);
presym = read_presym(d, is);
if (presym > 17)
return -1;
len = *len_ptr - presym;
if ((s8)len < 0)
len += 17;
}
do {
*len_ptr++ = len;
} while (--run_len);
/* Worst case overrun is when presym == 18,
* run_len == 20 + 31, and only 1 length was remaining.
* So LZX_READ_LENS_MAX_OVERRUN == 50.
*
* Overrun while reading the first half of maincode_lens
* can corrupt the previous values in the second half.
* This doesn't really matter because the resulting
* lengths will still be in range, and data that
* generates overruns is invalid anyway.
*/
}
} while (len_ptr < lens_end);
return 0;
}
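The presym scheme above is delta coding mod 17 with three RLE escapes: a presym below 17 gives new_len = (old_len - presym) mod 17, while 17 and 18 emit runs of zeros and 19 repeats one delta-coded length. A standalone check of the delta rule (invented inputs):

#include <stdio.h>

/* new_len = (old_len - presym) mod 17, as in lzx_read_codeword_lens() */
static unsigned delta_len(unsigned old_len, unsigned presym)
{
	int len = (int)old_len - (int)presym;

	return len < 0 ? len + 17 : len;
}

int main(void)
{
	printf("%u\n", delta_len(5, 3));	/* 2 */
	printf("%u\n", delta_len(2, 4));	/* -2 mod 17 == 15 */
	printf("%u\n", delta_len(0, 0));	/* 0: length unchanged */
	return 0;
}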
/*
* Read the header of an LZX block and save the block type and (uncompressed)
* size in *block_type_ret and *block_size_ret, respectively.
*
* If the block is compressed, also update the Huffman decode @tables with the
* new Huffman codes. If the block is uncompressed, also update the match
* offset @queue with the new match offsets.
*
* Return 0 on success, or -1 if the data was invalid.
*/
static int lzx_read_block_header(struct lzx_decompressor *d,
struct input_bitstream *is,
int *block_type_ret,
u32 *block_size_ret,
u32 recent_offsets[])
{
int block_type;
u32 block_size;
int i;
bitstream_ensure_bits(is, 4);
/* The first three bits tell us what kind of block it is, and should be
* one of the LZX_BLOCKTYPE_* values.
*/
block_type = bitstream_pop_bits(is, 3);
/* Read the block size. */
if (bitstream_pop_bits(is, 1)) {
block_size = LZX_DEFAULT_BLOCK_SIZE;
} else {
block_size = 0;
block_size |= bitstream_read_bits(is, 8);
block_size <<= 8;
block_size |= bitstream_read_bits(is, 8);
}
switch (block_type) {
case LZX_BLOCKTYPE_ALIGNED:
/* Read the aligned offset code and prepare its decode table.
*/
for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
d->alignedcode_lens[i] =
bitstream_read_bits(is,
LZX_ALIGNEDCODE_ELEMENT_SIZE);
}
if (make_huffman_decode_table(d->alignedcode_decode_table,
LZX_ALIGNEDCODE_NUM_SYMBOLS,
LZX_ALIGNEDCODE_TABLEBITS,
d->alignedcode_lens,
LZX_MAX_ALIGNED_CODEWORD_LEN,
d->working_space))
return -1;
/* Fall through, since the rest of the header for aligned offset
* blocks is the same as that for verbatim blocks.
*/
fallthrough;
case LZX_BLOCKTYPE_VERBATIM:
/* Read the main code and prepare its decode table.
*
* Note that the codeword lengths in the main code are encoded
* in two parts: one part for literal symbols, and one part for
* match symbols.
*/
if (lzx_read_codeword_lens(d, is, d->maincode_lens,
LZX_NUM_CHARS))
return -1;
if (lzx_read_codeword_lens(d, is,
d->maincode_lens + LZX_NUM_CHARS,
LZX_MAINCODE_NUM_SYMBOLS - LZX_NUM_CHARS))
return -1;
if (make_huffman_decode_table(d->maincode_decode_table,
LZX_MAINCODE_NUM_SYMBOLS,
LZX_MAINCODE_TABLEBITS,
d->maincode_lens,
LZX_MAX_MAIN_CODEWORD_LEN,
d->working_space))
return -1;
/* Read the length code and prepare its decode table. */
if (lzx_read_codeword_lens(d, is, d->lencode_lens,
LZX_LENCODE_NUM_SYMBOLS))
return -1;
if (make_huffman_decode_table(d->lencode_decode_table,
LZX_LENCODE_NUM_SYMBOLS,
LZX_LENCODE_TABLEBITS,
d->lencode_lens,
LZX_MAX_LEN_CODEWORD_LEN,
d->working_space))
return -1;
break;
case LZX_BLOCKTYPE_UNCOMPRESSED:
/* Before reading the three recent offsets from the uncompressed
* block header, the stream must be aligned on a 16-bit
* boundary. But if the stream is *already* aligned, then the
* next 16 bits must be discarded.
*/
bitstream_ensure_bits(is, 1);
bitstream_align(is);
recent_offsets[0] = bitstream_read_u32(is);
recent_offsets[1] = bitstream_read_u32(is);
recent_offsets[2] = bitstream_read_u32(is);
/* Offsets of 0 are invalid. */
if (recent_offsets[0] == 0 || recent_offsets[1] == 0 ||
recent_offsets[2] == 0)
return -1;
break;
default:
/* Unrecognized block type. */
return -1;
}
*block_type_ret = block_type;
*block_size_ret = block_size;
return 0;
}
/* Decompress a block of LZX-compressed data. */
static int lzx_decompress_block(const struct lzx_decompressor *d,
struct input_bitstream *is,
int block_type, u32 block_size,
u8 * const out_begin, u8 *out_next,
u32 recent_offsets[])
{
u8 * const block_end = out_next + block_size;
u32 ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED);
do {
u32 mainsym;
u32 match_len;
u32 match_offset;
u32 offset_slot;
u32 num_extra_bits;
mainsym = read_mainsym(d, is);
if (mainsym < LZX_NUM_CHARS) {
/* Literal */
*out_next++ = mainsym;
continue;
}
/* Match */
/* Decode the length header and offset slot. */
mainsym -= LZX_NUM_CHARS;
match_len = mainsym % LZX_NUM_LEN_HEADERS;
offset_slot = mainsym / LZX_NUM_LEN_HEADERS;
/* If needed, read a length symbol to decode the full length. */
if (match_len == LZX_NUM_PRIMARY_LENS)
match_len += read_lensym(d, is);
match_len += LZX_MIN_MATCH_LEN;
if (offset_slot < LZX_NUM_RECENT_OFFSETS) {
/* Repeat offset */
/* Note: This isn't a real LRU queue, since using the R2
* offset doesn't bump the R1 offset down to R2. This
* quirk allows all 3 recent offsets to be handled by
* the same code. (For R0, the swap is a no-op.)
*/
match_offset = recent_offsets[offset_slot];
recent_offsets[offset_slot] = recent_offsets[0];
recent_offsets[0] = match_offset;
} else {
/* Explicit offset */
/* Look up the number of extra bits that need to be read
* to decode offsets with this offset slot.
*/
num_extra_bits = lzx_extra_offset_bits[offset_slot];
/* Start with the offset slot base value. */
match_offset = lzx_offset_slot_base[offset_slot];
/* In aligned offset blocks, the low-order 3 bits of
* each offset are encoded using the aligned offset
* code. Otherwise, all the extra bits are literal.
*/
if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) {
match_offset +=
bitstream_read_bits(is, num_extra_bits -
LZX_NUM_ALIGNED_OFFSET_BITS)
<< LZX_NUM_ALIGNED_OFFSET_BITS;
match_offset += read_alignedsym(d, is);
} else {
match_offset += bitstream_read_bits(is, num_extra_bits);
}
/* Adjust the offset. */
match_offset -= (LZX_NUM_RECENT_OFFSETS - 1);
/* Update the recent offsets. */
recent_offsets[2] = recent_offsets[1];
recent_offsets[1] = recent_offsets[0];
recent_offsets[0] = match_offset;
}
/* Validate the match, then copy it to the current position. */
if (match_len > (size_t)(block_end - out_next))
return -1;
if (match_offset > (size_t)(out_next - out_begin))
return -1;
out_next = lz_copy(out_next, match_len, match_offset,
block_end, LZX_MIN_MATCH_LEN);
} while (out_next != block_end);
return 0;
}
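As the comment in the repeat-offset branch notes, the queue update is a swap with R0, not a true LRU rotation. A small standalone demo of the difference (offsets invented):

#include <stdio.h>

int main(void)
{
	unsigned recent[3] = { 10, 20, 30 };	/* R0, R1, R2 */
	unsigned slot = 2, off;

	/* The swap used above when a repeat-offset slot is selected. */
	off = recent[slot];
	recent[slot] = recent[0];
	recent[0] = off;

	/* Prints 30 20 10; a true LRU rotation would give 30 10 20. */
	printf("%u %u %u\n", recent[0], recent[1], recent[2]);
	return 0;
}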
/*
* lzx_allocate_decompressor - Allocate an LZX decompressor
*
* Return the pointer to the decompressor on success, or return NULL and set
* errno on failure.
*/
struct lzx_decompressor *lzx_allocate_decompressor(void)
{
return kmalloc(sizeof(struct lzx_decompressor), GFP_NOFS);
}
/*
* lzx_decompress - Decompress a buffer of LZX-compressed data
*
* @decompressor: A decompressor allocated with lzx_allocate_decompressor()
* @compressed_data: The buffer of data to decompress
* @compressed_size: Number of bytes of compressed data
* @uncompressed_data: The buffer in which to store the decompressed data
* @uncompressed_size: The number of bytes the data decompresses into
*
* Return 0 on success, or return -1 and set errno on failure.
*/
int lzx_decompress(struct lzx_decompressor *decompressor,
const void *compressed_data, size_t compressed_size,
void *uncompressed_data, size_t uncompressed_size)
{
struct lzx_decompressor *d = decompressor;
u8 * const out_begin = uncompressed_data;
u8 *out_next = out_begin;
u8 * const out_end = out_begin + uncompressed_size;
struct input_bitstream is;
u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
int e8_status = 0;
init_input_bitstream(&is, compressed_data, compressed_size);
/* Codeword lengths begin as all 0's for delta encoding purposes. */
memset(d->maincode_lens, 0, LZX_MAINCODE_NUM_SYMBOLS);
memset(d->lencode_lens, 0, LZX_LENCODE_NUM_SYMBOLS);
/* Decompress blocks until we have all the uncompressed data. */
while (out_next != out_end) {
int block_type;
u32 block_size;
if (lzx_read_block_header(d, &is, &block_type, &block_size,
recent_offsets))
goto invalid;
if (block_size < 1 || block_size > (size_t)(out_end - out_next))
goto invalid;
if (block_type != LZX_BLOCKTYPE_UNCOMPRESSED) {
/* Compressed block */
if (lzx_decompress_block(d,
&is,
block_type,
block_size,
out_begin,
out_next,
recent_offsets))
goto invalid;
e8_status |= d->maincode_lens[0xe8];
out_next += block_size;
} else {
/* Uncompressed block */
out_next = bitstream_read_bytes(&is, out_next,
block_size);
if (!out_next)
goto invalid;
if (block_size & 1)
bitstream_read_byte(&is);
e8_status = 1;
}
}
/* Postprocess the data unless it cannot possibly contain 0xe8 bytes. */
if (e8_status)
lzx_postprocess(uncompressed_data, uncompressed_size);
return 0;
invalid:
return -1;
}
/*
* lzx_free_decompressor - Free an LZX decompressor
*
* @decompressor: A decompressor that was allocated with
* lzx_allocate_decompressor(), or NULL.
*/
void lzx_free_decompressor(struct lzx_decompressor *decompressor)
{
kfree(decompressor);
}
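
Taken together, lzx_allocate_decompressor(), lzx_decompress() and lzx_free_decompressor() are the whole public surface of this file. Below is a minimal usage sketch, not part of the source: the caller, buffer names and error mapping are hypothetical.

static int example_unpack_lzx(const void *cdata, size_t csize,
			      void *udata, size_t usize)
{
	struct lzx_decompressor *d = lzx_allocate_decompressor();
	int err;

	if (!d)
		return -ENOMEM;

	/* Decompresses exactly @usize bytes or fails. */
	err = lzx_decompress(d, cdata, csize, udata, usize);
	lzx_free_decompressor(d);
	return err ? -EINVAL : 0;
}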


@@ -0,0 +1,142 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* xpress_decompress.c - A decompressor for the XPRESS compression format
* (Huffman variant), which can be used in "System Compressed" files. This is
* based on the code from wimlib.
*
* Copyright (C) 2015 Eric Biggers
*/
#include "decompress_common.h"
#include "lib.h"
#define XPRESS_NUM_SYMBOLS 512
#define XPRESS_MAX_CODEWORD_LEN 15
#define XPRESS_MIN_MATCH_LEN 3
/* This value is chosen for fast decompression. */
#define XPRESS_TABLEBITS 12
/* Reusable heap-allocated memory for XPRESS decompression */
struct xpress_decompressor {
/* The Huffman decoding table */
u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS];
/* An array that maps symbols to codeword lengths */
u8 lens[XPRESS_NUM_SYMBOLS];
/* Temporary space for make_huffman_decode_table() */
u16 working_space[2 * (1 + XPRESS_MAX_CODEWORD_LEN) +
XPRESS_NUM_SYMBOLS];
};
/*
* xpress_allocate_decompressor - Allocate an XPRESS decompressor
*
* Return the pointer to the decompressor on success, or return NULL and set
* errno on failure.
*/
struct xpress_decompressor *xpress_allocate_decompressor(void)
{
return kmalloc(sizeof(struct xpress_decompressor), GFP_NOFS);
}
/*
* xpress_decompress - Decompress a buffer of XPRESS-compressed data
*
* @decompressor: A decompressor that was allocated with
* xpress_allocate_decompressor()
* @compressed_data: The buffer of data to decompress
* @compressed_size: Number of bytes of compressed data
* @uncompressed_data: The buffer in which to store the decompressed data
* @uncompressed_size: The number of bytes the data decompresses into
*
* Return 0 on success, or return -1 and set errno on failure.
*/
int xpress_decompress(struct xpress_decompressor *decompressor,
const void *compressed_data, size_t compressed_size,
void *uncompressed_data, size_t uncompressed_size)
{
struct xpress_decompressor *d = decompressor;
const u8 * const in_begin = compressed_data;
u8 * const out_begin = uncompressed_data;
u8 *out_next = out_begin;
u8 * const out_end = out_begin + uncompressed_size;
struct input_bitstream is;
u32 i;
/* Read the Huffman codeword lengths. */
if (compressed_size < XPRESS_NUM_SYMBOLS / 2)
goto invalid;
for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) {
d->lens[i*2 + 0] = in_begin[i] & 0xF;
d->lens[i*2 + 1] = in_begin[i] >> 4;
}
/* Build a decoding table for the Huffman code. */
if (make_huffman_decode_table(d->decode_table, XPRESS_NUM_SYMBOLS,
XPRESS_TABLEBITS, d->lens,
XPRESS_MAX_CODEWORD_LEN,
d->working_space))
goto invalid;
/* Decode the matches and literals. */
init_input_bitstream(&is, in_begin + XPRESS_NUM_SYMBOLS / 2,
compressed_size - XPRESS_NUM_SYMBOLS / 2);
while (out_next != out_end) {
u32 sym;
u32 log2_offset;
u32 length;
u32 offset;
sym = read_huffsym(&is, d->decode_table,
XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN);
if (sym < 256) {
/* Literal */
*out_next++ = sym;
} else {
/* Match */
length = sym & 0xf;
log2_offset = (sym >> 4) & 0xf;
bitstream_ensure_bits(&is, 16);
offset = ((u32)1 << log2_offset) |
bitstream_pop_bits(&is, log2_offset);
if (length == 0xf) {
length += bitstream_read_byte(&is);
if (length == 0xf + 0xff)
length = bitstream_read_u16(&is);
}
length += XPRESS_MIN_MATCH_LEN;
if (offset > (size_t)(out_next - out_begin))
goto invalid;
if (length > (size_t)(out_end - out_next))
goto invalid;
out_next = lz_copy(out_next, length, offset, out_end,
XPRESS_MIN_MATCH_LEN);
}
}
return 0;
invalid:
return -1;
}
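
For reference, each match symbol (sym >= 256) packs two nibbles: bits 0-3 carry the length code and bits 4-7 carry log2 of the offset, so the decoded offset always falls in [1 << log2_offset, 2 << log2_offset). A standalone illustration follows; the sample symbol value is hypothetical.

#include <stdio.h>

int main(void)
{
	unsigned sym = 0x1a5;	/* example match symbol, >= 256 */
	unsigned length_code = sym & 0xf;		/* 5 */
	unsigned log2_offset = (sym >> 4) & 0xf;	/* 10 */

	/* The decoder ORs log2_offset extra bits below the leading 1 bit. */
	printf("length code %u, offset in [%u, %u)\n",
	       length_code, 1u << log2_offset, 2u << log2_offset);
	return 0;
}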
/*
* xpress_free_decompressor - Free an XPRESS decompressor
*
* @decompressor: A decompressor that was allocated with
* xpress_allocate_decompressor(), or NULL.
*/
void xpress_free_decompressor(struct xpress_decompressor *decompressor)
{
kfree(decompressor);
}

fs/ntfs3/lznt.c Normal file

@@ -0,0 +1,453 @@
// SPDX-License-Identifier: GPL-2.0
/*
*
* Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
*
*/
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
// clang-format off
/* The source buffer is all zeros. */
#define LZNT_ERROR_ALL_ZEROS 1
#define LZNT_CHUNK_SIZE 0x1000
// clang-format on
struct lznt_hash {
const u8 *p1;
const u8 *p2;
};
struct lznt {
const u8 *unc;
const u8 *unc_end;
const u8 *best_match;
size_t max_len;
bool std;
struct lznt_hash hash[LZNT_CHUNK_SIZE];
};
static inline size_t get_match_len(const u8 *ptr, const u8 *end, const u8 *prev,
size_t max_len)
{
size_t len = 0;
while (ptr + len < end && ptr[len] == prev[len] && ++len < max_len)
;
return len;
}
static size_t longest_match_std(const u8 *src, struct lznt *ctx)
{
size_t hash_index;
size_t len1 = 0, len2 = 0;
const u8 **hash;
hash_index =
((40543U * ((((src[0] << 4) ^ src[1]) << 4) ^ src[2])) >> 4) &
(LZNT_CHUNK_SIZE - 1);
hash = &(ctx->hash[hash_index].p1);
if (hash[0] >= ctx->unc && hash[0] < src && hash[0][0] == src[0] &&
hash[0][1] == src[1] && hash[0][2] == src[2]) {
len1 = 3;
if (ctx->max_len > 3)
len1 += get_match_len(src + 3, ctx->unc_end,
hash[0] + 3, ctx->max_len - 3);
}
if (hash[1] >= ctx->unc && hash[1] < src && hash[1][0] == src[0] &&
hash[1][1] == src[1] && hash[1][2] == src[2]) {
len2 = 3;
if (ctx->max_len > 3)
len2 += get_match_len(src + 3, ctx->unc_end,
hash[1] + 3, ctx->max_len - 3);
}
/* Compare two matches and select the best one. */
if (len1 < len2) {
ctx->best_match = hash[1];
len1 = len2;
} else {
ctx->best_match = hash[0];
}
hash[1] = hash[0];
hash[0] = src;
return len1;
}
static size_t longest_match_best(const u8 *src, struct lznt *ctx)
{
size_t max_len;
const u8 *ptr;
if (ctx->unc >= src || !ctx->max_len)
return 0;
max_len = 0;
for (ptr = ctx->unc; ptr < src; ++ptr) {
size_t len =
get_match_len(src, ctx->unc_end, ptr, ctx->max_len);
if (len >= max_len) {
max_len = len;
ctx->best_match = ptr;
}
}
return max_len >= 3 ? max_len : 0;
}
static const size_t s_max_len[] = {
0x1002, 0x802, 0x402, 0x202, 0x102, 0x82, 0x42, 0x22, 0x12,
};
static const size_t s_max_off[] = {
0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
};
static inline u16 make_pair(size_t offset, size_t len, size_t index)
{
return ((offset - 1) << (12 - index)) |
((len - 3) & (((1 << (12 - index)) - 1)));
}
static inline size_t parse_pair(u16 pair, size_t *offset, size_t index)
{
*offset = 1 + (pair >> (12 - index));
return 3 + (pair & ((1 << (12 - index)) - 1));
}
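
The two helpers above split one 16-bit word between offset and length; the split point (12 - index) moves toward the length field each time the current position crosses the next s_max_off[] boundary. A round-trip check, assuming the two inline helpers are in scope and using hypothetical index-0 values (offsets 1..0x10, lengths 3..0x1002):

#include <assert.h>
#include <stddef.h>

int main(void)
{
	size_t offset;
	u16 pair = make_pair(8, 100, 0);	/* offset 8, length 100 */

	assert(parse_pair(pair, &offset, 0) == 100);
	assert(offset == 8);
	return 0;
}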
/*
* compress_chunk
*
* Return:
* * 0 - Ok, @cmpr contains @cmpr_chunk_size bytes of compressed data.
 * 1 - Input buffer is entirely zeros.
* * -2 - The compressed buffer is too small to hold the compressed data.
*/
static inline int compress_chunk(size_t (*match)(const u8 *, struct lznt *),
const u8 *unc, const u8 *unc_end, u8 *cmpr,
u8 *cmpr_end, size_t *cmpr_chunk_size,
struct lznt *ctx)
{
size_t cnt = 0;
size_t idx = 0;
const u8 *up = unc;
u8 *cp = cmpr + 3;
u8 *cp2 = cmpr + 2;
u8 not_zero = 0;
	/* Control byte: one flag bit per value (0 - literal byte, 1 - compressed pair). */
u8 ohdr = 0;
u8 *last;
u16 t16;
if (unc + LZNT_CHUNK_SIZE < unc_end)
unc_end = unc + LZNT_CHUNK_SIZE;
last = min(cmpr + LZNT_CHUNK_SIZE + sizeof(short), cmpr_end);
ctx->unc = unc;
ctx->unc_end = unc_end;
ctx->max_len = s_max_len[0];
while (up < unc_end) {
size_t max_len;
while (unc + s_max_off[idx] < up)
ctx->max_len = s_max_len[++idx];
/* Find match. */
max_len = up + 3 <= unc_end ? (*match)(up, ctx) : 0;
if (!max_len) {
if (cp >= last)
goto NotCompressed;
not_zero |= *cp++ = *up++;
} else if (cp + 1 >= last) {
goto NotCompressed;
} else {
t16 = make_pair(up - ctx->best_match, max_len, idx);
*cp++ = t16;
*cp++ = t16 >> 8;
ohdr |= 1 << cnt;
up += max_len;
}
cnt = (cnt + 1) & 7;
if (!cnt) {
*cp2 = ohdr;
ohdr = 0;
cp2 = cp;
cp += 1;
}
}
if (cp2 < last)
*cp2 = ohdr;
else
cp -= 1;
*cmpr_chunk_size = cp - cmpr;
t16 = (*cmpr_chunk_size - 3) | 0xB000;
cmpr[0] = t16;
cmpr[1] = t16 >> 8;
return not_zero ? 0 : LZNT_ERROR_ALL_ZEROS;
NotCompressed:
if ((cmpr + LZNT_CHUNK_SIZE + sizeof(short)) > last)
return -2;
	/*
	 * Copy the data uncompressed.
	 * 0x3FFF == ((LZNT_CHUNK_SIZE + 2 - 3) | 0x3000)
	 */
cmpr[0] = 0xff;
cmpr[1] = 0x3f;
memcpy(cmpr + sizeof(short), unc, LZNT_CHUNK_SIZE);
*cmpr_chunk_size = LZNT_CHUNK_SIZE + sizeof(short);
return 0;
}
static inline ssize_t decompress_chunk(u8 *unc, u8 *unc_end, const u8 *cmpr,
const u8 *cmpr_end)
{
u8 *up = unc;
u8 ch = *cmpr++;
size_t bit = 0;
size_t index = 0;
u16 pair;
size_t offset, length;
	/* Decompress while both pointers remain inside their buffers. */
while (up < unc_end && cmpr < cmpr_end) {
/* Correct index */
while (unc + s_max_off[index] < up)
index += 1;
/* Check the current flag for zero. */
if (!(ch & (1 << bit))) {
/* Just copy byte. */
*up++ = *cmpr++;
goto next;
}
/* Check for boundary. */
if (cmpr + 1 >= cmpr_end)
return -EINVAL;
		/* Read a short from the little-endian stream. */
pair = cmpr[1];
pair <<= 8;
pair |= cmpr[0];
cmpr += 2;
/* Translate packed information into offset and length. */
length = parse_pair(pair, &offset, index);
/* Check offset for boundary. */
if (unc + offset > up)
return -EINVAL;
/* Truncate the length if necessary. */
if (up + length >= unc_end)
length = unc_end - up;
		/* Now we copy bytes. This is the heart of the LZ algorithm. */
for (; length > 0; length--, up++)
*up = *(up - offset);
next:
/* Advance flag bit value. */
bit = (bit + 1) & 7;
if (!bit) {
if (cmpr >= cmpr_end)
break;
ch = *cmpr++;
}
}
/* Return the size of uncompressed data. */
return up - unc;
}
/*
* get_lznt_ctx
* @level: 0 - Standard compression.
 * !0 - Best compression, requires a lot of CPU.
*/
struct lznt *get_lznt_ctx(int level)
{
struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash)
: sizeof(struct lznt),
GFP_NOFS);
if (r)
r->std = !level;
return r;
}
/*
* compress_lznt - Compresses @unc into @cmpr
*
* Return:
* * +x - Ok, @cmpr contains 'final_compressed_size' bytes of compressed data.
 * * 0 - Input buffer is entirely zeros.
*/
size_t compress_lznt(const void *unc, size_t unc_size, void *cmpr,
size_t cmpr_size, struct lznt *ctx)
{
int err;
size_t (*match)(const u8 *src, struct lznt *ctx);
u8 *p = cmpr;
u8 *end = p + cmpr_size;
const u8 *unc_chunk = unc;
const u8 *unc_end = unc_chunk + unc_size;
bool is_zero = true;
if (ctx->std) {
match = &longest_match_std;
memset(ctx->hash, 0, sizeof(ctx->hash));
} else {
match = &longest_match_best;
}
/* Compression cycle. */
for (; unc_chunk < unc_end; unc_chunk += LZNT_CHUNK_SIZE) {
cmpr_size = 0;
err = compress_chunk(match, unc_chunk, unc_end, p, end,
&cmpr_size, ctx);
if (err < 0)
return unc_size;
if (is_zero && err != LZNT_ERROR_ALL_ZEROS)
is_zero = false;
p += cmpr_size;
}
if (p <= end - 2)
p[0] = p[1] = 0;
return is_zero ? 0 : PtrOffset(cmpr, p);
}
/*
* decompress_lznt - Decompress @cmpr into @unc.
*/
ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc,
size_t unc_size)
{
const u8 *cmpr_chunk = cmpr;
const u8 *cmpr_end = cmpr_chunk + cmpr_size;
u8 *unc_chunk = unc;
u8 *unc_end = unc_chunk + unc_size;
u16 chunk_hdr;
if (cmpr_size < sizeof(short))
return -EINVAL;
/* Read chunk header. */
chunk_hdr = cmpr_chunk[1];
chunk_hdr <<= 8;
chunk_hdr |= cmpr_chunk[0];
/* Loop through decompressing chunks. */
for (;;) {
size_t chunk_size_saved;
size_t unc_use;
size_t cmpr_use = 3 + (chunk_hdr & (LZNT_CHUNK_SIZE - 1));
/* Check that the chunk actually fits the supplied buffer. */
if (cmpr_chunk + cmpr_use > cmpr_end)
return -EINVAL;
/* First make sure the chunk contains compressed data. */
if (chunk_hdr & 0x8000) {
/* Decompress a chunk and return if we get an error. */
ssize_t err =
decompress_chunk(unc_chunk, unc_end,
cmpr_chunk + sizeof(chunk_hdr),
cmpr_chunk + cmpr_use);
if (err < 0)
return err;
unc_use = err;
} else {
/* This chunk does not contain compressed data. */
unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end
? unc_end - unc_chunk
: LZNT_CHUNK_SIZE;
if (cmpr_chunk + sizeof(chunk_hdr) + unc_use >
cmpr_end) {
return -EINVAL;
}
memcpy(unc_chunk, cmpr_chunk + sizeof(chunk_hdr),
unc_use);
}
/* Advance pointers. */
cmpr_chunk += cmpr_use;
unc_chunk += unc_use;
/* Check for the end of unc buffer. */
if (unc_chunk >= unc_end)
break;
		/* Proceed to the next chunk. */
if (cmpr_chunk > cmpr_end - 2)
break;
chunk_size_saved = LZNT_CHUNK_SIZE;
/* Read chunk header. */
chunk_hdr = cmpr_chunk[1];
chunk_hdr <<= 8;
chunk_hdr |= cmpr_chunk[0];
if (!chunk_hdr)
break;
/* Check the size of unc buffer. */
if (unc_use < chunk_size_saved) {
size_t t1 = chunk_size_saved - unc_use;
u8 *t2 = unc_chunk + t1;
/* 'Zero' memory. */
if (t2 >= unc_end)
break;
memset(unc_chunk, 0, t1);
unc_chunk = t2;
}
}
/* Check compression boundary. */
if (cmpr_chunk > cmpr_end)
return -EINVAL;
	/*
	 * The uncompressed size is just the difference between the
	 * current pointer and the original one.
	 */
return PtrOffset(unc, unc_chunk);
}
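
get_lznt_ctx(), compress_lznt() and decompress_lznt() are the three entry points the rest of the driver uses; each 2-byte chunk header stores (chunk size - 3) in its low 12 bits and the "compressed" flag in bit 15. A minimal round-trip sketch follows; the caller, buffers and error mapping are illustrative only.

static int example_lznt_roundtrip(const void *unc, size_t unc_size,
				  void *cmpr, size_t cmpr_size,
				  void *back, size_t back_size)
{
	struct lznt *ctx = get_lznt_ctx(0);	/* 0 = standard compression */
	size_t csize;
	ssize_t usize;

	if (!ctx)
		return -ENOMEM;

	csize = compress_lznt(unc, unc_size, cmpr, cmpr_size, ctx);
	kfree(ctx);

	/* 0 means the input was all zeros; unc_size signals failure
	 * (e.g. the output buffer was too small).
	 */
	if (!csize || csize == unc_size)
		return 0;

	usize = decompress_lznt(cmpr, csize, back, back_size);
	return usize < 0 ? (int)usize : 0;
}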

fs/ntfs3/namei.c Normal file

@@ -0,0 +1,411 @@
// SPDX-License-Identifier: GPL-2.0
/*
*
* Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
*
*/
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/iversion.h>
#include <linux/namei.h>
#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
/*
* fill_name_de - Format NTFS_DE in @buf.
*/
int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name,
const struct cpu_str *uni)
{
int err;
struct NTFS_DE *e = buf;
u16 data_size;
struct ATTR_FILE_NAME *fname = (struct ATTR_FILE_NAME *)(e + 1);
#ifndef CONFIG_NTFS3_64BIT_CLUSTER
e->ref.high = fname->home.high = 0;
#endif
if (uni) {
#ifdef __BIG_ENDIAN
int ulen = uni->len;
__le16 *uname = fname->name;
const u16 *name_cpu = uni->name;
while (ulen--)
*uname++ = cpu_to_le16(*name_cpu++);
#else
memcpy(fname->name, uni->name, uni->len * sizeof(u16));
#endif
fname->name_len = uni->len;
} else {
/* Convert input string to unicode. */
err = ntfs_nls_to_utf16(sbi, name->name, name->len,
(struct cpu_str *)&fname->name_len,
NTFS_NAME_LEN, UTF16_LITTLE_ENDIAN);
if (err < 0)
return err;
}
fname->type = FILE_NAME_POSIX;
data_size = fname_full_size(fname);
e->size = cpu_to_le16(ALIGN(data_size, 8) + sizeof(struct NTFS_DE));
e->key_size = cpu_to_le16(data_size);
e->flags = 0;
e->res = 0;
return 0;
}
/*
* ntfs_lookup - inode_operations::lookup
*/
static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry,
u32 flags)
{
struct ntfs_inode *ni = ntfs_i(dir);
struct cpu_str *uni = __getname();
struct inode *inode;
int err;
if (!uni)
inode = ERR_PTR(-ENOMEM);
else {
err = ntfs_nls_to_utf16(ni->mi.sbi, dentry->d_name.name,
dentry->d_name.len, uni, NTFS_NAME_LEN,
UTF16_HOST_ENDIAN);
if (err < 0)
inode = ERR_PTR(err);
else {
ni_lock(ni);
inode = dir_search_u(dir, uni, NULL);
ni_unlock(ni);
}
__putname(uni);
}
return d_splice_alias(inode, dentry);
}
/*
* ntfs_create - inode_operations::create
*/
static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
struct ntfs_inode *ni = ntfs_i(dir);
struct inode *inode;
ni_lock_dir(ni);
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode,
0, NULL, 0, NULL);
ni_unlock(ni);
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
/*
* ntfs_mknod
*
* inode_operations::mknod
*/
static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct ntfs_inode *ni = ntfs_i(dir);
struct inode *inode;
ni_lock_dir(ni);
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev,
NULL, 0, NULL);
ni_unlock(ni);
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
/*
* ntfs_link - inode_operations::link
*/
static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
{
int err;
struct inode *inode = d_inode(ode);
struct ntfs_inode *ni = ntfs_i(inode);
if (S_ISDIR(inode->i_mode))
return -EPERM;
if (inode->i_nlink >= NTFS_LINK_MAX)
return -EMLINK;
ni_lock_dir(ntfs_i(dir));
if (inode != dir)
ni_lock(ni);
inc_nlink(inode);
ihold(inode);
err = ntfs_link_inode(inode, de);
if (!err) {
dir->i_ctime = dir->i_mtime = inode->i_ctime =
current_time(dir);
mark_inode_dirty(inode);
mark_inode_dirty(dir);
d_instantiate(de, inode);
} else {
drop_nlink(inode);
iput(inode);
}
if (inode != dir)
ni_unlock(ni);
ni_unlock(ntfs_i(dir));
return err;
}
/*
* ntfs_unlink - inode_operations::unlink
*/
static int ntfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct ntfs_inode *ni = ntfs_i(dir);
int err;
ni_lock_dir(ni);
err = ntfs_unlink_inode(dir, dentry);
ni_unlock(ni);
return err;
}
/*
* ntfs_symlink - inode_operations::symlink
*/
static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, const char *symname)
{
u32 size = strlen(symname);
struct inode *inode;
struct ntfs_inode *ni = ntfs_i(dir);
ni_lock_dir(ni);
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777,
0, symname, size, NULL);
ni_unlock(ni);
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
/*
 * ntfs_mkdir - inode_operations::mkdir
*/
static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct ntfs_inode *ni = ntfs_i(dir);
ni_lock_dir(ni);
inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode,
0, NULL, 0, NULL);
ni_unlock(ni);
return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
/*
 * ntfs_rmdir - inode_operations::rmdir
*/
static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct ntfs_inode *ni = ntfs_i(dir);
int err;
ni_lock_dir(ni);
err = ntfs_unlink_inode(dir, dentry);
ni_unlock(ni);
return err;
}
/*
* ntfs_rename - inode_operations::rename
*/
static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, struct inode *new_dir,
struct dentry *new_dentry, u32 flags)
{
int err;
struct super_block *sb = dir->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *dir_ni = ntfs_i(dir);
struct ntfs_inode *new_dir_ni = ntfs_i(new_dir);
struct inode *inode = d_inode(dentry);
struct ntfs_inode *ni = ntfs_i(inode);
struct inode *new_inode = d_inode(new_dentry);
struct NTFS_DE *de, *new_de;
bool is_same, is_bad;
/*
* de - memory of PATH_MAX bytes:
* [0-1024) - original name (dentry->d_name)
* [1024-2048) - paired to original name, usually DOS variant of dentry->d_name
* [2048-3072) - new name (new_dentry->d_name)
*/
static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + SIZEOF_RESIDENT < 1024);
static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + sizeof(struct NTFS_DE) <
1024);
static_assert(PATH_MAX >= 4 * 1024);
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
is_same = dentry->d_name.len == new_dentry->d_name.len &&
!memcmp(dentry->d_name.name, new_dentry->d_name.name,
dentry->d_name.len);
if (is_same && dir == new_dir) {
/* Nothing to do. */
return 0;
}
if (ntfs_is_meta_file(sbi, inode->i_ino)) {
/* Should we print an error? */
return -EINVAL;
}
if (new_inode) {
/* Target name exists. Unlink it. */
dget(new_dentry);
ni_lock_dir(new_dir_ni);
err = ntfs_unlink_inode(new_dir, new_dentry);
ni_unlock(new_dir_ni);
dput(new_dentry);
if (err)
return err;
}
/* Allocate PATH_MAX bytes. */
de = __getname();
if (!de)
return -ENOMEM;
/* Translate dentry->d_name into unicode form. */
err = fill_name_de(sbi, de, &dentry->d_name, NULL);
if (err < 0)
goto out;
if (is_same) {
/* Reuse 'de'. */
new_de = de;
} else {
/* Translate new_dentry->d_name into unicode form. */
new_de = Add2Ptr(de, 2048);
err = fill_name_de(sbi, new_de, &new_dentry->d_name, NULL);
if (err < 0)
goto out;
}
ni_lock_dir(dir_ni);
ni_lock(ni);
is_bad = false;
err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
if (is_bad) {
		/* The attempt to restore after the failed rename failed too. */
make_bad_inode(inode);
ntfs_inode_err(inode, "failed to undo rename");
ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
} else if (!err) {
inode->i_ctime = dir->i_ctime = dir->i_mtime =
current_time(dir);
mark_inode_dirty(inode);
mark_inode_dirty(dir);
if (dir != new_dir) {
new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime;
mark_inode_dirty(new_dir);
}
if (IS_DIRSYNC(dir))
ntfs_sync_inode(dir);
if (IS_DIRSYNC(new_dir))
ntfs_sync_inode(inode);
}
ni_unlock(ni);
ni_unlock(dir_ni);
out:
__putname(de);
return err;
}
struct dentry *ntfs3_get_parent(struct dentry *child)
{
struct inode *inode = d_inode(child);
struct ntfs_inode *ni = ntfs_i(inode);
struct ATTR_LIST_ENTRY *le = NULL;
struct ATTRIB *attr = NULL;
struct ATTR_FILE_NAME *fname;
while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL,
NULL))) {
fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
if (!fname)
continue;
return d_obtain_alias(
ntfs_iget5(inode->i_sb, &fname->home, NULL));
}
return ERR_PTR(-ENOENT);
}
// clang-format off
const struct inode_operations ntfs_dir_inode_operations = {
.lookup = ntfs_lookup,
.create = ntfs_create,
.link = ntfs_link,
.unlink = ntfs_unlink,
.symlink = ntfs_symlink,
.mkdir = ntfs_mkdir,
.rmdir = ntfs_rmdir,
.mknod = ntfs_mknod,
.rename = ntfs_rename,
.permission = ntfs_permission,
.get_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
.fiemap = ntfs_fiemap,
};
const struct inode_operations ntfs_special_inode_operations = {
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
.get_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
};
// clang-format on

fs/ntfs3/ntfs.h Normal file

File diff suppressed because it is too large

fs/ntfs3/ntfs_fs.h Normal file

File diff suppressed because it is too large

fs/ntfs3/record.c Normal file

@@ -0,0 +1,605 @@
// SPDX-License-Identifier: GPL-2.0
/*
*
* Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
*
*/
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/nls.h>
#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type,
const __le16 *name, u8 name_len,
const u16 *upcase)
{
/* First, compare the type codes. */
int diff = le32_to_cpu(left->type) - le32_to_cpu(type);
if (diff)
return diff;
/* They have the same type code, so we have to compare the names. */
return ntfs_cmp_names(attr_name(left), left->name_len, name, name_len,
upcase, true);
}
/*
* mi_new_attt_id
*
* Return: Unused attribute id that is less than mrec->next_attr_id.
*/
static __le16 mi_new_attt_id(struct mft_inode *mi)
{
u16 free_id, max_id, t16;
struct MFT_REC *rec = mi->mrec;
struct ATTRIB *attr;
__le16 id;
id = rec->next_attr_id;
free_id = le16_to_cpu(id);
if (free_id < 0x7FFF) {
rec->next_attr_id = cpu_to_le16(free_id + 1);
return id;
}
/* One record can store up to 1024/24 ~= 42 attributes. */
free_id = 0;
max_id = 0;
attr = NULL;
for (;;) {
attr = mi_enum_attr(mi, attr);
if (!attr) {
rec->next_attr_id = cpu_to_le16(max_id + 1);
mi->dirty = true;
return cpu_to_le16(free_id);
}
t16 = le16_to_cpu(attr->id);
if (t16 == free_id) {
free_id += 1;
attr = NULL;
} else if (max_id < t16)
max_id = t16;
}
}
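
Once the cheap path is exhausted (next_attr_id has reached 0x7FFF), the loop above restarts the enumeration whenever the candidate id collides; that is quadratic, but harmless for at most ~42 attributes per record. A standalone model of that scan, with hypothetical ids:

#include <assert.h>
#include <stddef.h>

/* Toy model of the scan above: find the smallest unused id. */
static unsigned smallest_unused(const unsigned *ids, size_t n)
{
	unsigned free_id = 0;
	size_t i = 0;

	while (i < n) {
		if (ids[i] == free_id) {
			free_id++;
			i = 0;	/* restart, mirroring attr = NULL */
		} else {
			i++;
		}
	}
	return free_id;
}

int main(void)
{
	unsigned ids[] = { 0, 2, 1, 4 };

	assert(smallest_unused(ids, 4) == 3);
	return 0;
}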
int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi)
{
int err;
struct mft_inode *m = kzalloc(sizeof(struct mft_inode), GFP_NOFS);
if (!m)
return -ENOMEM;
err = mi_init(m, sbi, rno);
if (err) {
kfree(m);
return err;
}
err = mi_read(m, false);
if (err) {
mi_put(m);
return err;
}
*mi = m;
return 0;
}
void mi_put(struct mft_inode *mi)
{
mi_clear(mi);
kfree(mi);
}
int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno)
{
mi->sbi = sbi;
mi->rno = rno;
mi->mrec = kmalloc(sbi->record_size, GFP_NOFS);
if (!mi->mrec)
return -ENOMEM;
return 0;
}
/*
* mi_read - Read MFT data.
*/
int mi_read(struct mft_inode *mi, bool is_mft)
{
int err;
struct MFT_REC *rec = mi->mrec;
struct ntfs_sb_info *sbi = mi->sbi;
u32 bpr = sbi->record_size;
u64 vbo = (u64)mi->rno << sbi->record_bits;
struct ntfs_inode *mft_ni = sbi->mft.ni;
struct runs_tree *run = mft_ni ? &mft_ni->file.run : NULL;
struct rw_semaphore *rw_lock = NULL;
if (is_mounted(sbi)) {
if (!is_mft) {
rw_lock = &mft_ni->file.run_lock;
down_read(rw_lock);
}
}
err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb);
if (rw_lock)
up_read(rw_lock);
if (!err)
goto ok;
if (err == -E_NTFS_FIXUP) {
mi->dirty = true;
goto ok;
}
if (err != -ENOENT)
goto out;
if (rw_lock) {
ni_lock(mft_ni);
down_write(rw_lock);
}
err = attr_load_runs_vcn(mft_ni, ATTR_DATA, NULL, 0, &mft_ni->file.run,
vbo >> sbi->cluster_bits);
if (rw_lock) {
up_write(rw_lock);
ni_unlock(mft_ni);
}
if (err)
goto out;
if (rw_lock)
down_read(rw_lock);
err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb);
if (rw_lock)
up_read(rw_lock);
if (err == -E_NTFS_FIXUP) {
mi->dirty = true;
goto ok;
}
if (err)
goto out;
ok:
/* Check field 'total' only here. */
if (le32_to_cpu(rec->total) != bpr) {
err = -EINVAL;
goto out;
}
return 0;
out:
return err;
}
struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
{
const struct MFT_REC *rec = mi->mrec;
u32 used = le32_to_cpu(rec->used);
u32 t32, off, asize;
u16 t16;
if (!attr) {
u32 total = le32_to_cpu(rec->total);
off = le16_to_cpu(rec->attr_off);
if (used > total)
return NULL;
if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 ||
!IS_ALIGNED(off, 4)) {
return NULL;
}
		/* Skip records that are not in use. */
if (!is_rec_inuse(rec))
return NULL;
attr = Add2Ptr(rec, off);
} else {
		/* Check that the input attr is inside the record. */
off = PtrOffset(rec, attr);
if (off >= used)
return NULL;
asize = le32_to_cpu(attr->size);
if (asize < SIZEOF_RESIDENT) {
			/* Impossible, because we should never have returned such an attribute. */
return NULL;
}
attr = Add2Ptr(attr, asize);
off += asize;
}
asize = le32_to_cpu(attr->size);
	/* Can we use the first field (attr->type)? */
if (off + 8 > used) {
static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8);
return NULL;
}
if (attr->type == ATTR_END) {
/* End of enumeration. */
return NULL;
}
	/* 0x100 is the last known attribute type for now. */
t32 = le32_to_cpu(attr->type);
if ((t32 & 0xf) || (t32 > 0x100))
return NULL;
/* Check boundary. */
if (off + asize > used)
return NULL;
/* Check size of attribute. */
if (!attr->non_res) {
if (asize < SIZEOF_RESIDENT)
return NULL;
t16 = le16_to_cpu(attr->res.data_off);
if (t16 > asize)
return NULL;
t32 = le32_to_cpu(attr->res.data_size);
if (t16 + t32 > asize)
return NULL;
return attr;
}
/* Check some nonresident fields. */
if (attr->name_len &&
le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len >
le16_to_cpu(attr->nres.run_off)) {
return NULL;
}
if (attr->nres.svcn || !is_attr_ext(attr)) {
if (asize + 8 < SIZEOF_NONRESIDENT)
return NULL;
if (attr->nres.c_unit)
return NULL;
} else if (asize + 8 < SIZEOF_NONRESIDENT_EX)
return NULL;
return attr;
}
/*
* mi_find_attr - Find the attribute by type and name and id.
*/
struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr,
enum ATTR_TYPE type, const __le16 *name,
size_t name_len, const __le16 *id)
{
u32 type_in = le32_to_cpu(type);
u32 atype;
next_attr:
attr = mi_enum_attr(mi, attr);
if (!attr)
return NULL;
atype = le32_to_cpu(attr->type);
if (atype > type_in)
return NULL;
if (atype < type_in)
goto next_attr;
if (attr->name_len != name_len)
goto next_attr;
if (name_len && memcmp(attr_name(attr), name, name_len * sizeof(short)))
goto next_attr;
if (id && *id != attr->id)
goto next_attr;
return attr;
}
int mi_write(struct mft_inode *mi, int wait)
{
struct MFT_REC *rec;
int err;
struct ntfs_sb_info *sbi;
if (!mi->dirty)
return 0;
sbi = mi->sbi;
rec = mi->mrec;
err = ntfs_write_bh(sbi, &rec->rhdr, &mi->nb, wait);
if (err)
return err;
if (mi->rno < sbi->mft.recs_mirr)
sbi->flags |= NTFS_FLAGS_MFTMIRR;
mi->dirty = false;
return 0;
}
int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
__le16 flags, bool is_mft)
{
int err;
u16 seq = 1;
struct MFT_REC *rec;
u64 vbo = (u64)rno << sbi->record_bits;
err = mi_init(mi, sbi, rno);
if (err)
return err;
rec = mi->mrec;
if (rno == MFT_REC_MFT) {
;
} else if (rno < MFT_REC_FREE) {
seq = rno;
} else if (rno >= sbi->mft.used) {
;
} else if (mi_read(mi, is_mft)) {
;
} else if (rec->rhdr.sign == NTFS_FILE_SIGNATURE) {
/* Record is reused. Update its sequence number. */
seq = le16_to_cpu(rec->seq) + 1;
if (!seq)
seq = 1;
}
memcpy(rec, sbi->new_rec, sbi->record_size);
rec->seq = cpu_to_le16(seq);
rec->flags = RECORD_FLAG_IN_USE | flags;
mi->dirty = true;
if (!mi->nb.nbufs) {
struct ntfs_inode *ni = sbi->mft.ni;
bool lock = false;
if (is_mounted(sbi) && !is_mft) {
down_read(&ni->file.run_lock);
lock = true;
}
err = ntfs_get_bh(sbi, &ni->file.run, vbo, sbi->record_size,
&mi->nb);
if (lock)
up_read(&ni->file.run_lock);
}
return err;
}
/*
 * mi_mark_free - Mark the record as unused and mark it free in the bitmap.
*/
void mi_mark_free(struct mft_inode *mi)
{
CLST rno = mi->rno;
struct ntfs_sb_info *sbi = mi->sbi;
if (rno >= MFT_REC_RESERVED && rno < MFT_REC_FREE) {
ntfs_clear_mft_tail(sbi, rno, rno + 1);
mi->dirty = false;
return;
}
if (mi->mrec) {
clear_rec_inuse(mi->mrec);
mi->dirty = true;
mi_write(mi, 0);
}
ntfs_mark_rec_free(sbi, rno);
}
/*
* mi_insert_attr - Reserve space for new attribute.
*
 * Return: A not fully constructed attribute, or NULL if it cannot be created.
*/
struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
const __le16 *name, u8 name_len, u32 asize,
u16 name_off)
{
size_t tail;
struct ATTRIB *attr;
__le16 id;
struct MFT_REC *rec = mi->mrec;
struct ntfs_sb_info *sbi = mi->sbi;
u32 used = le32_to_cpu(rec->used);
const u16 *upcase = sbi->upcase;
int diff;
	/* Can the new attribute fit within the record? */
if (used + asize > mi->sbi->record_size)
return NULL;
/*
* Scan through the list of attributes to find the point
* at which we should insert it.
*/
attr = NULL;
while ((attr = mi_enum_attr(mi, attr))) {
diff = compare_attr(attr, type, name, name_len, upcase);
if (diff > 0)
break;
if (diff < 0)
continue;
if (!is_attr_indexed(attr))
return NULL;
break;
}
if (!attr) {
tail = 8; /* Not used, just to suppress warning. */
attr = Add2Ptr(rec, used - 8);
} else {
tail = used - PtrOffset(rec, attr);
}
id = mi_new_attt_id(mi);
memmove(Add2Ptr(attr, asize), attr, tail);
memset(attr, 0, asize);
attr->type = type;
attr->size = cpu_to_le32(asize);
attr->name_len = name_len;
attr->name_off = cpu_to_le16(name_off);
attr->id = id;
memmove(Add2Ptr(attr, name_off), name, name_len * sizeof(short));
rec->used = cpu_to_le32(used + asize);
mi->dirty = true;
return attr;
}
/*
* mi_remove_attr - Remove the attribute from record.
*
 * NOTE: The source attr will point to the next attribute.
*/
bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
struct ATTRIB *attr)
{
struct MFT_REC *rec = mi->mrec;
u32 aoff = PtrOffset(rec, attr);
u32 used = le32_to_cpu(rec->used);
u32 asize = le32_to_cpu(attr->size);
if (aoff + asize > used)
return false;
if (ni && is_attr_indexed(attr)) {
le16_add_cpu(&ni->mi.mrec->hard_links, -1);
ni->mi.dirty = true;
}
used -= asize;
memmove(attr, Add2Ptr(attr, asize), used - aoff);
rec->used = cpu_to_le32(used);
mi->dirty = true;
return true;
}
/* bytes = "new attribute size" - "old attribute size" */
bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes)
{
struct MFT_REC *rec = mi->mrec;
u32 aoff = PtrOffset(rec, attr);
u32 total, used = le32_to_cpu(rec->used);
u32 nsize, asize = le32_to_cpu(attr->size);
u32 rsize = le32_to_cpu(attr->res.data_size);
int tail = (int)(used - aoff - asize);
int dsize;
char *next;
if (tail < 0 || aoff >= used)
return false;
if (!bytes)
return true;
total = le32_to_cpu(rec->total);
next = Add2Ptr(attr, asize);
if (bytes > 0) {
dsize = ALIGN(bytes, 8);
if (used + dsize > total)
return false;
nsize = asize + dsize;
/* Move tail */
memmove(next + dsize, next, tail);
memset(next, 0, dsize);
used += dsize;
rsize += dsize;
} else {
dsize = ALIGN(-bytes, 8);
if (dsize > asize)
return false;
nsize = asize - dsize;
memmove(next - dsize, next, tail);
used -= dsize;
rsize -= dsize;
}
rec->used = cpu_to_le32(used);
attr->size = cpu_to_le32(nsize);
if (!attr->non_res)
attr->res.data_size = cpu_to_le32(rsize);
mi->dirty = true;
return true;
}
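
Because attributes live on an 8-byte grid, the requested delta is rounded up before the tail is moved: growing by +5 bytes shifts the tail by 8, and shrinking by -9 releases 16. A standalone check of that rounding (ALIGN re-created locally for the demo):

#include <assert.h>

#define DEMO_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	assert(DEMO_ALIGN(5, 8) == 8);	/* grow by 5 -> tail moves 8 */
	assert(DEMO_ALIGN(9, 8) == 16);	/* shrink by 9 -> 16 freed */
	return 0;
}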
int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr,
struct runs_tree *run, CLST len)
{
int err = 0;
struct ntfs_sb_info *sbi = mi->sbi;
u32 new_run_size;
CLST plen;
struct MFT_REC *rec = mi->mrec;
CLST svcn = le64_to_cpu(attr->nres.svcn);
u32 used = le32_to_cpu(rec->used);
u32 aoff = PtrOffset(rec, attr);
u32 asize = le32_to_cpu(attr->size);
char *next = Add2Ptr(attr, asize);
u16 run_off = le16_to_cpu(attr->nres.run_off);
u32 run_size = asize - run_off;
u32 tail = used - aoff - asize;
u32 dsize = sbi->record_size - used;
	/* Make a maximum gap in the current record. */
memmove(next + dsize, next, tail);
/* Pack as much as possible. */
err = run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size + dsize,
&plen);
if (err < 0) {
memmove(next, next + dsize, tail);
return err;
}
new_run_size = ALIGN(err, 8);
memmove(next + new_run_size - run_size, next + dsize, tail);
attr->size = cpu_to_le32(asize + new_run_size - run_size);
attr->nres.evcn = cpu_to_le64(svcn + plen - 1);
rec->used = cpu_to_le32(used + new_run_size - run_size);
mi->dirty = true;
return 0;
}

fs/ntfs3/run.c Normal file

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff.