Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm and dax updates from Dan Williams:
 "Save for a few late fixes, all of these commits have shipped in -next
  releases since before the merge window opened, and 0day has given a
  build success notification.

  The ext4 touches came from Jan, and the xfs touches have Darrick's
  reviewed-by. An xfstest for the MAP_SYNC feature has been through
  a few round of reviews and is on track to be merged.

   - Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable
     'userspace flush' of persistent memory updates via filesystem-dax
     mappings. It arranges for any filesystem metadata updates that may
     be required to satisfy a write fault to also be flushed ("on disk")
     before the kernel returns to userspace from the fault handler.
     Effectively every write-fault that dirties metadata completes an
     fsync() before returning from the fault handler. The new
     MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag
     is validated as supported by the filesystem's ->mmap() file
     operation.

   - Add support for the standard ACPI 6.2 label access methods that
     replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods.
     This enables interoperability with environments that only implement
     the standardized methods.

   - Add support for the ACPI 6.2 NVDIMM media error injection methods.

   - Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for
     latch last shutdown status, firmware update, SMART error injection,
     and SMART alarm threshold control.

   - Cleanup physical address information disclosures to be root-only.

   - Fix revalidation of the DIMM "locked label area" status to support
     dynamic unlock of the label area.

   - Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA
     (system-physical-address) command and error injection commands.

  Acknowledgements that came after the commits were pushed to -next:

   - 957ac8c421 ("dax: fix PMD faults on zero-length files"):
       Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

   - a39e596baa ("xfs: support for synchronous DAX faults") and
     7b565c9f96 ("xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()")
        Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>"

* tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits)
  acpi, nfit: add 'Enable Latch System Shutdown Status' command support
  dax: fix general protection fault in dax_alloc_inode
  dax: fix PMD faults on zero-length files
  dax: stop requiring a live device for dax_flush()
  brd: remove dax support
  dax: quiet bdev_dax_supported()
  fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core
  tools/testing/nvdimm: unit test clear-error commands
  acpi, nfit: validate commands against the device type
  tools/testing/nvdimm: stricter bounds checking for error injection commands
  xfs: support for synchronous DAX faults
  xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()
  ext4: Support for synchronous DAX faults
  ext4: Simplify error handling in ext4_dax_huge_fault()
  dax: Implement dax_finish_sync_fault()
  dax, iomap: Add support for synchronous faults
  mm: Define MAP_SYNC and VM_SYNC flags
  dax: Allow tuning whether dax_insert_mapping_entry() dirties entry
  dax: Allow dax_iomap_fault() to return pfn
  dax: Fix comment describing dax_iomap_fault()
  ...
This commit is contained in:
Linus Torvalds
2017-11-17 09:51:57 -08:00
48 changed files with 1407 additions and 562 deletions

323
fs/dax.c
View File

@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
static void *dax_insert_mapping_entry(struct address_space *mapping,
struct vm_fault *vmf,
void *entry, sector_t sector,
unsigned long flags)
unsigned long flags, bool dirty)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
void *new_entry;
pgoff_t index = vmf->pgoff;
if (vmf->flags & FAULT_FLAG_WRITE)
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
entry = new_entry;
}
if (vmf->flags & FAULT_FLAG_WRITE)
if (dirty)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
@@ -825,38 +825,42 @@ out:
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, struct dax_device *dax_dev,
sector_t sector, size_t size, void *entry,
struct vm_area_struct *vma, struct vm_fault *vmf)
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
unsigned long vaddr = vmf->address;
void *ret, *kaddr;
pgoff_t pgoff;
int id, rc;
pfn_t pfn;
return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}
rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
{
const sector_t sector = dax_iomap_sector(iomap, pos);
pgoff_t pgoff;
void *kaddr;
int id, rc;
long length;
rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
if (rc)
return rc;
id = dax_read_lock();
rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
if (rc < 0) {
dax_read_unlock(id);
return rc;
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
&kaddr, pfnp);
if (length < 0) {
rc = length;
goto out;
}
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
goto out;
/* For larger pages we need devmap */
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;
out:
dax_read_unlock(id);
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
trace_dax_insert_mapping(mapping->host, vmf, ret);
if (vmf->flags & FAULT_FLAG_WRITE)
return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
else
return vm_insert_mixed(vma, vaddr, pfn);
return rc;
}
/*
@@ -882,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
}
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
RADIX_DAX_ZERO_PAGE);
RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(entry2)) {
ret = VM_FAULT_SIGBUS;
goto out;
@@ -941,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
@@ -1085,19 +1084,33 @@ static int dax_fault_return(int error)
return VM_FAULT_SIGBUS;
}
static int dax_iomap_pte_fault(struct vm_fault *vmf,
/*
* MAP_SYNC on a dax mapping guarantees dirty metadata is
* flushed on write-faults (non-cow), but not read-faults.
*/
static bool dax_fault_is_synchronous(unsigned long flags,
struct vm_area_struct *vma, struct iomap *iomap)
{
return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
&& (iomap->flags & IOMAP_F_DIRTY);
}
static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
sector_t sector;
struct iomap iomap = { 0 };
unsigned flags = IOMAP_FAULT;
int error, major = 0;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
int vmf_ret = 0;
void *entry;
pfn_t pfn;
trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
@@ -1110,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto out;
}
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
if (write && !vmf->cow_page)
flags |= IOMAP_WRITE;
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1145,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto error_finish_iomap;
}
sector = dax_iomap_sector(&iomap, pos);
if (vmf->cow_page) {
sector_t sector = dax_iomap_sector(&iomap, pos);
switch (iomap.type) {
case IOMAP_HOLE:
case IOMAP_UNWRITTEN:
@@ -1173,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto finish_iomap;
}
sync = dax_fault_is_synchronous(flags, vma, &iomap);
switch (iomap.type) {
case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
sector, PAGE_SIZE, entry, vmf->vma, vmf);
error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
if (error < 0)
goto error_finish_iomap;
entry = dax_insert_mapping_entry(mapping, vmf, entry,
dax_iomap_sector(&iomap, pos),
0, write && !sync);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto error_finish_iomap;
}
/*
* If we are doing synchronous page fault and inode needs fsync,
* we can insert PTE into page tables only after that happens.
* Skip insertion for now and return the pfn so that caller can
* insert it after fsync is done.
*/
if (sync) {
if (WARN_ON_ONCE(!pfnp)) {
error = -EIO;
goto error_finish_iomap;
}
*pfnp = pfn;
vmf_ret = VM_FAULT_NEEDDSYNC | major;
goto finish_iomap;
}
trace_dax_insert_mapping(inode, vmf, entry);
if (write)
error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
else
error = vm_insert_mixed(vma, vaddr, pfn);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
if (!write) {
vmf_ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
@@ -1223,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
}
#ifdef CONFIG_FS_DAX_PMD
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
loff_t pos, void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const sector_t sector = dax_iomap_sector(iomap, pos);
struct dax_device *dax_dev = iomap->dax_dev;
struct block_device *bdev = iomap->bdev;
struct inode *inode = mapping->host;
const size_t size = PMD_SIZE;
void *ret = NULL, *kaddr;
long length = 0;
pgoff_t pgoff;
pfn_t pfn = {};
int id;
if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
goto fallback;
id = dax_read_lock();
length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
if (length < 0)
goto unlock_fallback;
length = PFN_PHYS(length);
if (length < size)
goto unlock_fallback;
if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
goto unlock_fallback;
if (!pfn_t_devmap(pfn))
goto unlock_fallback;
dax_read_unlock(id);
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, vmf->flags & FAULT_FLAG_WRITE);
unlock_fallback:
dax_read_unlock(id);
fallback:
trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
return VM_FAULT_FALLBACK;
}
/*
* The 'colour' (ie low bits) within a PMD of a page offset. This comes up
* more often than one might expect in the below functions.
*/
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
void *entry)
@@ -1288,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
goto fallback;
ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(ret))
goto fallback;
@@ -1310,13 +1314,14 @@ fallback:
return VM_FAULT_FALLBACK;
}
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
int result = VM_FAULT_FALLBACK;
@@ -1325,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
void *entry;
loff_t pos;
int error;
pfn_t pfn;
/*
* Check whether offset isn't beyond end of file now. Caller is
@@ -1332,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
* this is a reliable test.
*/
pgoff = linear_page_index(vma, pmd_addr);
max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1356,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
if (pgoff > max_pgoff) {
if (pgoff >= max_pgoff) {
result = VM_FAULT_SIGBUS;
goto out;
}
/* If the PMD would extend beyond the file size */
if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
goto fallback;
/*
@@ -1400,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;
sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
switch (iomap.type) {
case IOMAP_MAPPED:
result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
if (error < 0)
goto finish_iomap;
entry = dax_insert_mapping_entry(mapping, vmf, entry,
dax_iomap_sector(&iomap, pos),
RADIX_DAX_PMD, write && !sync);
if (IS_ERR(entry))
goto finish_iomap;
/*
* If we are doing synchronous page fault and inode needs fsync,
* we can insert PMD into page tables only after that happens.
* Skip insertion for now and return the pfn so that caller can
* insert it after fsync is done.
*/
if (sync) {
if (WARN_ON_ONCE(!pfnp))
goto finish_iomap;
*pfnp = pfn;
result = VM_FAULT_NEEDDSYNC;
goto finish_iomap;
}
trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
write);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
@@ -1442,7 +1476,7 @@ out:
return result;
}
#else
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
return VM_FAULT_FALLBACK;
@@ -1452,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
/**
* dax_iomap_fault - handle a page fault on a DAX file
* @vmf: The description of the fault
* @ops: iomap ops passed from the file system
* @pe_size: Size of the page to fault in
* @pfnp: PFN to insert for synchronous faults if fsync is required
* @ops: Iomap ops passed from the file system
*
* When a page fault occurs, filesystems may call this helper in
* their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1460,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
* successfully.
*/
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
const struct iomap_ops *ops)
pfn_t *pfnp, const struct iomap_ops *ops)
{
switch (pe_size) {
case PE_SIZE_PTE:
return dax_iomap_pte_fault(vmf, ops);
return dax_iomap_pte_fault(vmf, pfnp, ops);
case PE_SIZE_PMD:
return dax_iomap_pmd_fault(vmf, ops);
return dax_iomap_pmd_fault(vmf, pfnp, ops);
default:
return VM_FAULT_FALLBACK;
}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
/**
* dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
* @vmf: The description of the fault
* @pe_size: Size of entry to be inserted
* @pfn: PFN to insert
*
* This function inserts writeable PTE or PMD entry into page tables for mmaped
* DAX file. It takes care of marking corresponding radix tree entry as dirty
* as well.
*/
static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
enum page_entry_size pe_size,
pfn_t pfn)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
int vmf_ret, error;
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
/* Did we race with someone splitting entry or so? */
if (!entry ||
(pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
(pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
entry = lock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
switch (pe_size) {
case PE_SIZE_PTE:
error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
vmf_ret = dax_fault_return(error);
break;
#ifdef CONFIG_FS_DAX_PMD
case PE_SIZE_PMD:
vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, true);
break;
#endif
default:
vmf_ret = VM_FAULT_FALLBACK;
}
put_locked_mapping_entry(mapping, index);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
return vmf_ret;
}
/**
* dax_finish_sync_fault - finish synchronous page fault
* @vmf: The description of the fault
* @pe_size: Size of entry to be inserted
* @pfn: PFN to insert
*
* This function ensures that the file range touched by the page fault is
* stored persistently on the media and handles inserting of appropriate page
* table entry.
*/
int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
pfn_t pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
size_t len = 0;
if (pe_size == PE_SIZE_PTE)
len = PAGE_SIZE;
else if (pe_size == PE_SIZE_PMD)
len = PMD_SIZE;
else
WARN_ON_ONCE(1);
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
if (err)
return VM_FAULT_SIGBUS;
return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);