From c78bc072ac80823a5934574e791325dd6817ec10 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 1 Dec 2018 10:58:38 -0500 Subject: [PATCH 001/574] remoteproc: convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Yangtao Li Link: https://lore.kernel.org/r/20181201155838.8619-1-tiny.windzz@gmail.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_debugfs.c | 28 ++++--------------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/drivers/remoteproc/remoteproc_debugfs.c b/drivers/remoteproc/remoteproc_debugfs.c index d734cadb16e3..732770e92b99 100644 --- a/drivers/remoteproc/remoteproc_debugfs.c +++ b/drivers/remoteproc/remoteproc_debugfs.c @@ -269,17 +269,7 @@ static int rproc_rsc_table_show(struct seq_file *seq, void *p) return 0; } -static int rproc_rsc_table_open(struct inode *inode, struct file *file) -{ - return single_open(file, rproc_rsc_table_show, inode->i_private); -} - -static const struct file_operations rproc_rsc_table_ops = { - .open = rproc_rsc_table_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(rproc_rsc_table); /* Expose carveout content via debugfs */ static int rproc_carveouts_show(struct seq_file *seq, void *p) @@ -299,17 +289,7 @@ static int rproc_carveouts_show(struct seq_file *seq, void *p) return 0; } -static int rproc_carveouts_open(struct inode *inode, struct file *file) -{ - return single_open(file, rproc_carveouts_show, inode->i_private); -} - -static const struct file_operations rproc_carveouts_ops = { - .open = rproc_carveouts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(rproc_carveouts); void rproc_remove_trace_file(struct dentry *tfile) { @@ -354,9 +334,9 @@ void rproc_create_debug_dir(struct rproc *rproc) debugfs_create_file("crash", 0200, rproc->dbg_dir, rproc, &rproc_crash_ops); debugfs_create_file("resource_table", 0400, rproc->dbg_dir, - rproc, &rproc_rsc_table_ops); + rproc, &rproc_rsc_table_fops); debugfs_create_file("carveout_memories", 0400, rproc->dbg_dir, - rproc, &rproc_carveouts_ops); + rproc, &rproc_carveouts_fops); } void __init rproc_init_debugfs(void) From 075894d45656fe9aeced4f34ef692b52791d78dc Mon Sep 17 00:00:00 2001 From: Wang Wenhu Date: Fri, 13 Mar 2020 09:50:49 -0700 Subject: [PATCH 002/574] rpmsg: fix a comment typo for rpmsg_device_match() Should be 'a' rather than 'an'. Signed-off-by: WANG Wenhu Link: https://lore.kernel.org/r/20200313165049.62907-1-wenhu.wang@vivo.com Signed-off-by: Bjorn Andersson --- drivers/rpmsg/rpmsg_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c index e330ec4dfc33..a6361cad608b 100644 --- a/drivers/rpmsg/rpmsg_core.c +++ b/drivers/rpmsg/rpmsg_core.c @@ -284,7 +284,7 @@ int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst, EXPORT_SYMBOL(rpmsg_trysend_offchannel); /* - * match an rpmsg channel with a channel info struct. + * match a rpmsg channel with a channel info struct. * this is used to make sure we're not creating rpmsg devices for channels * that already exist. */ From 6442df49400b466431979e7634849a464a5f1861 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 15 Apr 2020 14:48:52 -0600 Subject: [PATCH 003/574] remoteproc: Fix IDR initialisation in rproc_alloc() If ida_simple_get() returns an error when called in rproc_alloc(), put_device() is called to clean things up. By this time the rproc device type has been assigned, with rproc_type_release() as the release function. The first thing rproc_type_release() does is call: idr_destroy(&rproc->notifyids); But at the time the ida_simple_get() call is made, the notifyids field in the remoteproc structure has not been initialized. I'm not actually sure this case causes an observable problem, but it's incorrect. Fix this by initializing the notifyids field before calling ida_simple_get() in rproc_alloc(). Fixes: b5ab5e24e960 ("remoteproc: maintain a generic child device for each rproc") Signed-off-by: Alex Elder Reviewed-by: Mathieu Poirier Reviewed-by: Suman Anna Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20200415204858.2448-2-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index e12a54e67588..80056513ae71 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2053,6 +2053,7 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, rproc->dev.type = &rproc_type; rproc->dev.class = &rproc_class; rproc->dev.driver_data = rproc; + idr_init(&rproc->notifyids); /* Assign a unique device index and name */ rproc->index = ida_simple_get(&rproc_dev_index, 0, 0, GFP_KERNEL); @@ -2078,8 +2079,6 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, mutex_init(&rproc->lock); - idr_init(&rproc->notifyids); - INIT_LIST_HEAD(&rproc->carveouts); INIT_LIST_HEAD(&rproc->mappings); INIT_LIST_HEAD(&rproc->traces); From 0c2ae2b1afdfffa5e485614569d2ff12dee97fc5 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Wed, 15 Apr 2020 14:48:53 -0600 Subject: [PATCH 004/574] remoteproc: Split firmware name allocation from rproc_alloc() Make the firmware name allocation a function on its own in an effort to cleanup function rproc_alloc(). Reviewed-by: Alex Elder Reviewed-by: Bjorn Andersson Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200415204858.2448-3-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 66 ++++++++++++++++------------ 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 80056513ae71..4dee63f319ba 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1979,6 +1979,33 @@ static const struct device_type rproc_type = { .release = rproc_type_release, }; +static int rproc_alloc_firmware(struct rproc *rproc, + const char *name, const char *firmware) +{ + char *p, *template = "rproc-%s-fw"; + int name_len; + + if (!firmware) { + /* + * If the caller didn't pass in a firmware name then + * construct a default name. + */ + name_len = strlen(name) + strlen(template) - 2 + 1; + p = kmalloc(name_len, GFP_KERNEL); + if (!p) + return -ENOMEM; + snprintf(p, name_len, template, name); + } else { + p = kstrdup(firmware, GFP_KERNEL); + if (!p) + return -ENOMEM; + } + + rproc->firmware = p; + + return 0; +} + /** * rproc_alloc() - allocate a remote processor handle * @dev: the underlying device @@ -2007,42 +2034,21 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, const char *firmware, int len) { struct rproc *rproc; - char *p, *template = "rproc-%s-fw"; - int name_len; if (!dev || !name || !ops) return NULL; - if (!firmware) { - /* - * If the caller didn't pass in a firmware name then - * construct a default name. - */ - name_len = strlen(name) + strlen(template) - 2 + 1; - p = kmalloc(name_len, GFP_KERNEL); - if (!p) - return NULL; - snprintf(p, name_len, template, name); - } else { - p = kstrdup(firmware, GFP_KERNEL); - if (!p) - return NULL; - } - rproc = kzalloc(sizeof(struct rproc) + len, GFP_KERNEL); - if (!rproc) { - kfree(p); + if (!rproc) return NULL; - } + + if (rproc_alloc_firmware(rproc, name, firmware)) + goto free_rproc; rproc->ops = kmemdup(ops, sizeof(*ops), GFP_KERNEL); - if (!rproc->ops) { - kfree(p); - kfree(rproc); - return NULL; - } + if (!rproc->ops) + goto free_firmware; - rproc->firmware = p; rproc->name = name; rproc->priv = &rproc[1]; rproc->auto_boot = true; @@ -2091,6 +2097,12 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, rproc->state = RPROC_OFFLINE; return rproc; + +free_firmware: + kfree(rproc->firmware); +free_rproc: + kfree(rproc); + return NULL; } EXPORT_SYMBOL(rproc_alloc); From 4df4f8be8b3e9ce807ba47c030893d711abe6ee3 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Wed, 15 Apr 2020 14:48:54 -0600 Subject: [PATCH 005/574] remoteproc: Simplify default name allocation In an effort to cleanup firmware name allocation, replace the cumbersome mechanic used to allocate a default firmware name with function kasprintf(). Reviewed-by: Alex Elder Reviewed-by: Bjorn Andersson Suggested-by: Bjorn Andersson Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200415204858.2448-4-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 4dee63f319ba..9899467fa1cf 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1982,24 +1982,19 @@ static const struct device_type rproc_type = { static int rproc_alloc_firmware(struct rproc *rproc, const char *name, const char *firmware) { - char *p, *template = "rproc-%s-fw"; - int name_len; + char *p; - if (!firmware) { + if (!firmware) /* * If the caller didn't pass in a firmware name then * construct a default name. */ - name_len = strlen(name) + strlen(template) - 2 + 1; - p = kmalloc(name_len, GFP_KERNEL); - if (!p) - return -ENOMEM; - snprintf(p, name_len, template, name); - } else { + p = kasprintf(GFP_KERNEL, "rproc-%s-fw", name); + else p = kstrdup(firmware, GFP_KERNEL); - if (!p) - return -ENOMEM; - } + + if (!p) + return -ENOMEM; rproc->firmware = p; From 418fd78771220f5e522d02676758ed824c349fd5 Mon Sep 17 00:00:00 2001 From: Clement Leger Date: Fri, 10 Apr 2020 12:24:32 +0200 Subject: [PATCH 006/574] remoteproc: add rproc_coredump_set_elf_info This function allows drivers to correctly setup the coredump output elf information. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Clement Leger Link: https://lore.kernel.org/r/20200410102433.2672-2-cleger@kalray.eu Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 32 ++++++++++++++++++++-- drivers/remoteproc/remoteproc_elf_loader.c | 3 -- include/linux/remoteproc.h | 2 ++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 9899467fa1cf..d9e6949e4ac1 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1565,6 +1565,28 @@ int rproc_coredump_add_custom_segment(struct rproc *rproc, } EXPORT_SYMBOL(rproc_coredump_add_custom_segment); +/** + * rproc_coredump_set_elf_info() - set coredump elf information + * @rproc: handle of a remote processor + * @class: elf class for coredump elf file + * @machine: elf machine for coredump elf file + * + * Set elf information which will be used for coredump elf file. + * + * Return: 0 on success, negative errno on error. + */ +int rproc_coredump_set_elf_info(struct rproc *rproc, u8 class, u16 machine) +{ + if (class != ELFCLASS64 && class != ELFCLASS32) + return -EINVAL; + + rproc->elf_class = class; + rproc->elf_machine = machine; + + return 0; +} +EXPORT_SYMBOL(rproc_coredump_set_elf_info); + /** * rproc_coredump() - perform coredump * @rproc: rproc handle @@ -1587,6 +1609,11 @@ static void rproc_coredump(struct rproc *rproc) if (list_empty(&rproc->dump_segments)) return; + if (class == ELFCLASSNONE) { + dev_err(&rproc->dev, "Elf class is not set\n"); + return; + } + data_size = elf_size_of_hdr(class); list_for_each_entry(segment, &rproc->dump_segments, node) { data_size += elf_size_of_phdr(class) + segment->size; @@ -1605,7 +1632,7 @@ static void rproc_coredump(struct rproc *rproc) elf_hdr_init_ident(ehdr, class); elf_hdr_set_e_type(class, ehdr, ET_CORE); - elf_hdr_set_e_machine(class, ehdr, EM_NONE); + elf_hdr_set_e_machine(class, ehdr, rproc->elf_machine); elf_hdr_set_e_version(class, ehdr, EV_CURRENT); elf_hdr_set_e_entry(class, ehdr, rproc->bootaddr); elf_hdr_set_e_phoff(class, ehdr, elf_size_of_hdr(class)); @@ -2047,7 +2074,8 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, rproc->name = name; rproc->priv = &rproc[1]; rproc->auto_boot = true; - rproc->elf_class = ELFCLASS32; + rproc->elf_class = ELFCLASSNONE; + rproc->elf_machine = EM_NONE; device_initialize(&rproc->dev); rproc->dev.parent = dev; diff --git a/drivers/remoteproc/remoteproc_elf_loader.c b/drivers/remoteproc/remoteproc_elf_loader.c index 16e2c496fd45..4869fb7d8fe4 100644 --- a/drivers/remoteproc/remoteproc_elf_loader.c +++ b/drivers/remoteproc/remoteproc_elf_loader.c @@ -248,9 +248,6 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw) memset(ptr + filesz, 0, memsz - filesz); } - if (ret == 0) - rproc->elf_class = class; - return ret; } EXPORT_SYMBOL(rproc_elf_load_segments); diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 9c07d7958c53..0547676479d3 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -518,6 +518,7 @@ struct rproc { struct list_head dump_segments; int nb_vdev; u8 elf_class; + u16 elf_machine; }; /** @@ -622,6 +623,7 @@ int rproc_coredump_add_custom_segment(struct rproc *rproc, struct rproc_dump_segment *segment, void *dest), void *priv); +int rproc_coredump_set_elf_info(struct rproc *rproc, u8 class, u16 machine); static inline struct rproc_vdev *vdev_to_rvdev(struct virtio_device *vdev) { From 3898fc99d199346348b3efe1f6657b9eb7fa56cd Mon Sep 17 00:00:00 2001 From: Clement Leger Date: Fri, 10 Apr 2020 12:24:33 +0200 Subject: [PATCH 007/574] remoteproc: use rproc_coredump_set_elf_info in drivers Modify drivers which are using remoteproc coredump functionality to use rproc_coredump_set_elf_info in order to create correct elf coredump format. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Clement Leger Link: https://lore.kernel.org/r/20200410102433.2672-3-cleger@kalray.eu Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_adsp.c | 1 + drivers/remoteproc/qcom_q6v5_mss.c | 3 +++ drivers/remoteproc/qcom_q6v5_pas.c | 1 + drivers/remoteproc/qcom_wcnss.c | 1 + drivers/remoteproc/stm32_rproc.c | 1 + 5 files changed, 7 insertions(+) diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c b/drivers/remoteproc/qcom_q6v5_adsp.c index 24a3db961d5e..c60dabc6939e 100644 --- a/drivers/remoteproc/qcom_q6v5_adsp.c +++ b/drivers/remoteproc/qcom_q6v5_adsp.c @@ -431,6 +431,7 @@ static int adsp_probe(struct platform_device *pdev) dev_err(&pdev->dev, "unable to allocate remoteproc\n"); return -ENOMEM; } + rproc_coredump_set_elf_info(rproc, ELFCLASS32, EM_NONE); adsp = (struct qcom_adsp *)rproc->priv; adsp->dev = &pdev->dev; diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index ce49c3236ff7..a335d2eebe97 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -1357,6 +1357,8 @@ static int qcom_q6v5_register_dump_segments(struct rproc *rproc, return ret; } + rproc_coredump_set_elf_info(rproc, ELFCLASS32, EM_NONE); + ehdr = (struct elf32_hdr *)fw->data; phdrs = (struct elf32_phdr *)(ehdr + 1); qproc->dump_complete_mask = 0; @@ -1667,6 +1669,7 @@ static int q6v5_probe(struct platform_device *pdev) } rproc->auto_boot = false; + rproc_coredump_set_elf_info(rproc, ELFCLASS32, EM_NONE); qproc = (struct q6v5 *)rproc->priv; qproc->dev = &pdev->dev; diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 7a63efb85405..8ecc157f1ed1 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -398,6 +398,7 @@ static int adsp_probe(struct platform_device *pdev) } rproc->auto_boot = desc->auto_boot; + rproc_coredump_set_elf_info(rproc, ELFCLASS32, EM_NONE); adsp = (struct qcom_adsp *)rproc->priv; adsp->dev = &pdev->dev; diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c index 0c7afd038f0d..5d65e1a9329a 100644 --- a/drivers/remoteproc/qcom_wcnss.c +++ b/drivers/remoteproc/qcom_wcnss.c @@ -480,6 +480,7 @@ static int wcnss_probe(struct platform_device *pdev) dev_err(&pdev->dev, "unable to allocate remoteproc\n"); return -ENOMEM; } + rproc_coredump_set_elf_info(rproc, ELFCLASS32, EM_NONE); wcnss = (struct qcom_wcnss *)rproc->priv; wcnss->dev = &pdev->dev; diff --git a/drivers/remoteproc/stm32_rproc.c b/drivers/remoteproc/stm32_rproc.c index 6a66dbf2df40..0f9d02ca4f5a 100644 --- a/drivers/remoteproc/stm32_rproc.c +++ b/drivers/remoteproc/stm32_rproc.c @@ -625,6 +625,7 @@ static int stm32_rproc_probe(struct platform_device *pdev) if (!rproc) return -ENOMEM; + rproc_coredump_set_elf_info(rproc, ELFCLASS32, EM_NONE); rproc->has_iommu = false; ddata = rproc->priv; ddata->workqueue = create_workqueue(dev_name(dev)); From 66a4347e9a3e6adf9d3d5ceb9856cbed3d805beb Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Wed, 8 Apr 2020 16:36:38 -0700 Subject: [PATCH 008/574] remoteproc: sysmon: Add ability to send type of notification Current implementation of the sysmon driver does not support adding notifications for other remoteproc events - prepare, start, unprepare. Clients on the remoteproc side might be interested in knowing when a remoteproc boots up. This change adds the ability to send the notification type along with the name. For example, audio DSP is interested in knowing when modem has crashed so that it can perform cleanup and wait for modem to boot up before it starts processing data again. Acked-by: Mathieu Poirier Reviewed-by: Bjorn Andersson Signed-off-by: Siddharth Gupta Link: https://lore.kernel.org/r/1586389003-26675-2-git-send-email-sidgup@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_sysmon.c | 54 ++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/drivers/remoteproc/qcom_sysmon.c b/drivers/remoteproc/qcom_sysmon.c index faf3822d8791..13660506b9b0 100644 --- a/drivers/remoteproc/qcom_sysmon.c +++ b/drivers/remoteproc/qcom_sysmon.c @@ -46,6 +46,25 @@ struct qcom_sysmon { struct sockaddr_qrtr ssctl; }; +enum { + SSCTL_SSR_EVENT_BEFORE_POWERUP, + SSCTL_SSR_EVENT_AFTER_POWERUP, + SSCTL_SSR_EVENT_BEFORE_SHUTDOWN, + SSCTL_SSR_EVENT_AFTER_SHUTDOWN, +}; + +static const char * const sysmon_state_string[] = { + [SSCTL_SSR_EVENT_BEFORE_POWERUP] = "before_powerup", + [SSCTL_SSR_EVENT_AFTER_POWERUP] = "after_powerup", + [SSCTL_SSR_EVENT_BEFORE_SHUTDOWN] = "before_shutdown", + [SSCTL_SSR_EVENT_AFTER_SHUTDOWN] = "after_shutdown", +}; + +struct sysmon_event { + const char *subsys_name; + u32 ssr_event; +}; + static DEFINE_MUTEX(sysmon_lock); static LIST_HEAD(sysmon_list); @@ -54,13 +73,15 @@ static LIST_HEAD(sysmon_list); * @sysmon: sysmon context * @name: other remote's name */ -static void sysmon_send_event(struct qcom_sysmon *sysmon, const char *name) +static void sysmon_send_event(struct qcom_sysmon *sysmon, + const struct sysmon_event *event) { char req[50]; int len; int ret; - len = snprintf(req, sizeof(req), "ssr:%s:before_shutdown", name); + len = snprintf(req, sizeof(req), "ssr:%s:%s", event->subsys_name, + sysmon_state_string[event->ssr_event]); if (len >= sizeof(req)) return; @@ -148,13 +169,6 @@ static int sysmon_callback(struct rpmsg_device *rpdev, void *data, int count, #define SSCTL_SUBSYS_NAME_LENGTH 15 -enum { - SSCTL_SSR_EVENT_BEFORE_POWERUP, - SSCTL_SSR_EVENT_AFTER_POWERUP, - SSCTL_SSR_EVENT_BEFORE_SHUTDOWN, - SSCTL_SSR_EVENT_AFTER_SHUTDOWN, -}; - enum { SSCTL_SSR_EVENT_FORCED, SSCTL_SSR_EVENT_GRACEFUL, @@ -331,7 +345,8 @@ static void ssctl_request_shutdown(struct qcom_sysmon *sysmon) * @sysmon: sysmon context * @name: other remote's name */ -static void ssctl_send_event(struct qcom_sysmon *sysmon, const char *name) +static void ssctl_send_event(struct qcom_sysmon *sysmon, + const struct sysmon_event *event) { struct ssctl_subsys_event_resp resp; struct ssctl_subsys_event_req req; @@ -346,9 +361,9 @@ static void ssctl_send_event(struct qcom_sysmon *sysmon, const char *name) } memset(&req, 0, sizeof(req)); - strlcpy(req.subsys_name, name, sizeof(req.subsys_name)); + strlcpy(req.subsys_name, event->subsys_name, sizeof(req.subsys_name)); req.subsys_name_len = strlen(req.subsys_name); - req.event = SSCTL_SSR_EVENT_BEFORE_SHUTDOWN; + req.event = event->ssr_event; req.evt_driven_valid = true; req.evt_driven = SSCTL_SSR_EVENT_FORCED; @@ -432,8 +447,12 @@ static int sysmon_start(struct rproc_subdev *subdev) static void sysmon_stop(struct rproc_subdev *subdev, bool crashed) { struct qcom_sysmon *sysmon = container_of(subdev, struct qcom_sysmon, subdev); + struct sysmon_event event = { + .subsys_name = sysmon->name, + .ssr_event = SSCTL_SSR_EVENT_BEFORE_SHUTDOWN + }; - blocking_notifier_call_chain(&sysmon_notifiers, 0, (void *)sysmon->name); + blocking_notifier_call_chain(&sysmon_notifiers, 0, (void *)&event); /* Don't request graceful shutdown if we've crashed */ if (crashed) @@ -456,19 +475,20 @@ static int sysmon_notify(struct notifier_block *nb, unsigned long event, { struct qcom_sysmon *sysmon = container_of(nb, struct qcom_sysmon, nb); struct rproc *rproc = sysmon->rproc; - const char *ssr_name = data; + struct sysmon_event *sysmon_event = data; /* Skip non-running rprocs and the originating instance */ - if (rproc->state != RPROC_RUNNING || !strcmp(data, sysmon->name)) { + if (rproc->state != RPROC_RUNNING || + !strcmp(sysmon_event->subsys_name, sysmon->name)) { dev_dbg(sysmon->dev, "not notifying %s\n", sysmon->name); return NOTIFY_DONE; } /* Only SSCTL version 2 supports SSR events */ if (sysmon->ssctl_version == 2) - ssctl_send_event(sysmon, ssr_name); + ssctl_send_event(sysmon, sysmon_event); else if (sysmon->ept) - sysmon_send_event(sysmon, ssr_name); + sysmon_send_event(sysmon, sysmon_event); return NOTIFY_DONE; } From 1877f54f75ad16549861ae92a708bd04c23f2a72 Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Wed, 8 Apr 2020 16:36:39 -0700 Subject: [PATCH 009/574] remoteproc: sysmon: Add notifications for events Add notification for other stages of remoteproc boot and shutdown. This includes adding callback functions for the prepare and unprepare events, and fleshing out the callback function for start. Acked-by: Mathieu Poirier Reviewed-by: Bjorn Andersson Signed-off-by: Siddharth Gupta Link: https://lore.kernel.org/r/1586389003-26675-3-git-send-email-sidgup@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_sysmon.c | 37 ++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/drivers/remoteproc/qcom_sysmon.c b/drivers/remoteproc/qcom_sysmon.c index 13660506b9b0..851664e1aabe 100644 --- a/drivers/remoteproc/qcom_sysmon.c +++ b/drivers/remoteproc/qcom_sysmon.c @@ -439,8 +439,31 @@ static const struct qmi_ops ssctl_ops = { .del_server = ssctl_del_server, }; +static int sysmon_prepare(struct rproc_subdev *subdev) +{ + struct qcom_sysmon *sysmon = container_of(subdev, struct qcom_sysmon, + subdev); + struct sysmon_event event = { + .subsys_name = sysmon->name, + .ssr_event = SSCTL_SSR_EVENT_BEFORE_POWERUP + }; + + blocking_notifier_call_chain(&sysmon_notifiers, 0, (void *)&event); + + return 0; +} + static int sysmon_start(struct rproc_subdev *subdev) { + struct qcom_sysmon *sysmon = container_of(subdev, struct qcom_sysmon, + subdev); + struct sysmon_event event = { + .subsys_name = sysmon->name, + .ssr_event = SSCTL_SSR_EVENT_AFTER_POWERUP + }; + + blocking_notifier_call_chain(&sysmon_notifiers, 0, (void *)&event); + return 0; } @@ -464,6 +487,18 @@ static void sysmon_stop(struct rproc_subdev *subdev, bool crashed) sysmon_request_shutdown(sysmon); } +static void sysmon_unprepare(struct rproc_subdev *subdev) +{ + struct qcom_sysmon *sysmon = container_of(subdev, struct qcom_sysmon, + subdev); + struct sysmon_event event = { + .subsys_name = sysmon->name, + .ssr_event = SSCTL_SSR_EVENT_AFTER_SHUTDOWN + }; + + blocking_notifier_call_chain(&sysmon_notifiers, 0, (void *)&event); +} + /** * sysmon_notify() - notify sysmon target of another's SSR * @nb: notifier_block associated with sysmon instance @@ -563,8 +598,10 @@ struct qcom_sysmon *qcom_add_sysmon_subdev(struct rproc *rproc, qmi_add_lookup(&sysmon->qmi, 43, 0, 0); + sysmon->subdev.prepare = sysmon_prepare; sysmon->subdev.start = sysmon_start; sysmon->subdev.stop = sysmon_stop; + sysmon->subdev.unprepare = sysmon_unprepare; rproc_add_subdev(rproc, &sysmon->subdev); From 1f36ab3f6e3b791eadad94f792c874706e153b66 Mon Sep 17 00:00:00 2001 From: Siddharth Gupta Date: Wed, 8 Apr 2020 16:36:40 -0700 Subject: [PATCH 010/574] remoteproc: sysmon: Inform current rproc about all active rprocs Clients/services running on a remoteproc that booted up might need to be aware of the state of already running remoteprocs. When a remoteproc boots up (fresh or after recovery) it is not aware of the remoteprocs that booted before it, i.e., the system state is incomplete. So to keep track of it we send sysmon on behalf of all 'ONLINE' remoteprocs. Acked-by: Mathieu Poirier Reviewed-by: Bjorn Andersson Signed-off-by: Siddharth Gupta Link: https://lore.kernel.org/r/1586389003-26675-4-git-send-email-sidgup@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_sysmon.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/remoteproc/qcom_sysmon.c b/drivers/remoteproc/qcom_sysmon.c index 851664e1aabe..8d8996d714f0 100644 --- a/drivers/remoteproc/qcom_sysmon.c +++ b/drivers/remoteproc/qcom_sysmon.c @@ -453,10 +453,20 @@ static int sysmon_prepare(struct rproc_subdev *subdev) return 0; } +/** + * sysmon_start() - start callback for the sysmon remoteproc subdevice + * @subdev: instance of the sysmon subdevice + * + * Inform all the listners of sysmon notifications that the rproc associated + * to @subdev has booted up. The rproc that booted up also needs to know + * which rprocs are already up and running, so send start notifications + * on behalf of all the online rprocs. + */ static int sysmon_start(struct rproc_subdev *subdev) { struct qcom_sysmon *sysmon = container_of(subdev, struct qcom_sysmon, subdev); + struct qcom_sysmon *target; struct sysmon_event event = { .subsys_name = sysmon->name, .ssr_event = SSCTL_SSR_EVENT_AFTER_POWERUP @@ -464,6 +474,21 @@ static int sysmon_start(struct rproc_subdev *subdev) blocking_notifier_call_chain(&sysmon_notifiers, 0, (void *)&event); + mutex_lock(&sysmon_lock); + list_for_each_entry(target, &sysmon_list, node) { + if (target == sysmon || + target->rproc->state != RPROC_RUNNING) + continue; + + event.subsys_name = target->name; + + if (sysmon->ssctl_version == 2) + ssctl_send_event(sysmon, &event); + else if (sysmon->ept) + sysmon_send_event(sysmon, &event); + } + mutex_unlock(&sysmon_lock); + return 0; } From 93920f61c2ad7edb01e63323832585796af75fc9 Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 17:32:42 +0200 Subject: [PATCH 011/574] x86/cpu: Add 'table' argument to cpu_matches() To make cpu_matches() reusable for other matching tables, have it take a pointer to a x86_cpu_id table as an argument. [ bp: Flip arguments order. ] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf --- arch/x86/kernel/cpu/common.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..1131ae032bf2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1075,9 +1075,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; -static bool __init cpu_matches(unsigned long which) +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { - const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + const struct x86_cpu_id *m = x86_match_cpu(table); return m && !!(m->driver_data & which); } @@ -1097,31 +1097,34 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) u64 ia32_cap = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ - if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); - if (cpu_matches(NO_SPECULATION)) + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); - if (cpu_matches(MSBDS_ONLY)) + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); } - if (!cpu_matches(NO_SWAPGS)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); /* @@ -1139,7 +1142,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); - if (cpu_matches(NO_MELTDOWN)) + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -1148,7 +1151,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (cpu_matches(NO_L1TF)) + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); From 7e5b3c267d256822407a22fdce6afdf9cd13f9fb Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 17:54:04 +0200 Subject: [PATCH 012/574] x86/speculation: Add Special Register Buffer Data Sampling (SRBDS) mitigation SRBDS is an MDS-like speculative side channel that can leak bits from the random number generator (RNG) across cores and threads. New microcode serializes the processor access during the execution of RDRAND and RDSEED. This ensures that the shared buffer is overwritten before it is released for reuse. While it is present on all affected CPU models, the microcode mitigation is not needed on models that enumerate ARCH_CAPABILITIES[MDS_NO] in the cases where TSX is not supported or has been disabled with TSX_CTRL. The mitigation is activated by default on affected processors and it increases latency for RDRAND and RDSEED instructions. Among other effects this will reduce throughput from /dev/urandom. * Enable administrator to configure the mitigation off when desired using either mitigations=off or srbds=off. * Export vulnerability status via sysfs * Rename file-scoped macros to apply for non-whitelist table initializations. [ bp: Massage, - s/VULNBL_INTEL_STEPPING/VULNBL_INTEL_STEPPINGS/g, - do not read arch cap MSR a second time in tsx_fused_off() - just pass it in, - flip check in cpu_set_bug_bits() to save an indentation level, - reflow comments. jpoimboe: s/Mitigated/Mitigation/ in user-visible strings tglx: Dropped the fused off magic for now ] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Reviewed-by: Pawan Gupta Reviewed-by: Josh Poimboeuf Tested-by: Neelima Krishnan --- .../ABI/testing/sysfs-devices-system-cpu | 1 + .../admin-guide/kernel-parameters.txt | 20 ++++ arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/msr-index.h | 4 + arch/x86/kernel/cpu/bugs.c | 106 ++++++++++++++++++ arch/x86/kernel/cpu/common.c | 31 +++++ arch/x86/kernel/cpu/cpu.h | 1 + drivers/base/cpu.c | 8 ++ 8 files changed, 173 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 2e0e3b45d02a..b39531a3c5bc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -492,6 +492,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds + /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsx_async_abort /sys/devices/system/cpu/vulnerabilities/itlb_multihit Date: January 2018 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2a93c8679e8..f720463bd918 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4757,6 +4757,26 @@ the kernel will oops in either "warn" or "fatal" mode. + srbds= [X86,INTEL] + Control the Special Register Buffer Data Sampling + (SRBDS) mitigation. + + Certain CPUs are vulnerable to an MDS-like + exploit which can leak bits from the random + number generator. + + By default, this issue is mitigated by + microcode. However, the microcode fix can cause + the RDRAND and RDSEED instructions to become + much slower. Among other effects, this will + result in reduced throughput from /dev/urandom. + + The microcode mitigation can be disabled with + the following option: + + off: Disable mitigation and remove + performance impact to RDRAND and RDSEED + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index db189945e9b0..02dabc9e77b0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -362,6 +362,7 @@ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */ #define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ @@ -407,5 +408,6 @@ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 12c9684d59ba..3efde600a674 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -128,6 +128,10 @@ #define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ #define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ +/* SRBDS support */ +#define MSR_IA32_MCU_OPT_CTRL 0x00000123 +#define RNGDS_MITG_DIS BIT(0) + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..56978cb06149 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); +static void __init srbds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -108,6 +109,7 @@ void __init check_bugs(void) l1tf_select_mitigation(); mds_select_mitigation(); taa_select_mitigation(); + srbds_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -397,6 +399,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str) } early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting + * TSX that are only exposed to SRBDS when TSX is enabled. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt @@ -1528,6 +1621,11 @@ static char *ibpb_state(void) return ""; } +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1572,6 +1670,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); + case X86_BUG_SRBDS: + return srbds_show_state(buf); + default: break; } @@ -1618,4 +1719,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1131ae032bf2..8293ee514975 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1075,6 +1075,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define SRBDS BIT(0) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS), + {} +}; + static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { const struct x86_cpu_id *m = x86_match_cpu(table); @@ -1142,6 +1163,15 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. + */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1594,6 +1624,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); + update_srbds_msr(); } static __init int setup_noclflush(char *arg) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 37fdefd14f28..fb538fccd24c 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -77,6 +77,7 @@ extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); extern u64 x86_read_arch_cap_msr(void); diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 9a1c00fbbaef..d2136ab9b14a 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -562,6 +562,12 @@ ssize_t __weak cpu_show_itlb_multihit(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_srbds(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -570,6 +576,7 @@ static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); +static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -580,6 +587,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_mds.attr, &dev_attr_tsx_async_abort.attr, &dev_attr_itlb_multihit.attr, + &dev_attr_srbds.attr, NULL }; From 7222a1b5b87417f22265c92deea76a6aecd0fb0f Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 18:21:51 +0200 Subject: [PATCH 013/574] x86/speculation: Add SRBDS vulnerability and mitigation documentation Add documentation for the SRBDS vulnerability and its mitigation. [ bp: Massage. jpoimboe: sysfs table strings. ] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf --- Documentation/admin-guide/hw-vuln/index.rst | 1 + .../special-register-buffer-data-sampling.rst | 148 ++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 0795e3c2643f..ca4dbdd9016d 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -14,3 +14,4 @@ are configurable at compile, boot or run time. mds tsx_async_abort multihit.rst + special-register-buffer-data-sampling.rst diff --git a/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst new file mode 100644 index 000000000000..6a473da80b62 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst @@ -0,0 +1,148 @@ +.. SPDX-License-Identifier: GPL-2.0 + +SRBDS - Special Register Buffer Data Sampling +============================================= + +SRBDS is a hardware vulnerability that allows MDS :doc:`mds` techniques to +infer values returned from special register accesses. Special register +accesses are accesses to off core registers. According to Intel's evaluation, +the special register reads that have a security expectation of privacy are +RDRAND, RDSEED and SGX EGETKEY. + +When RDRAND, RDSEED and EGETKEY instructions are used, the data is moved +to the core through the special register mechanism that is susceptible +to MDS attacks. + +Affected processors +-------------------- +Core models (desktop, mobile, Xeon-E3) that implement RDRAND and/or RDSEED may +be affected. + +A processor is affected by SRBDS if its Family_Model and stepping is +in the following list, with the exception of the listed processors +exporting MDS_NO while Intel TSX is available yet not enabled. The +latter class of processors are only affected when Intel TSX is enabled +by software using TSX_CTRL_MSR otherwise they are not affected. + + ============= ============ ======== + common name Family_Model Stepping + ============= ============ ======== + Haswell 06_3CH All + Haswell_L 06_45H All + Haswell_G 06_46H All + + Broadwell_G 06_47H All + Broadwell 06_3DH All + + Skylake_L 06_4EH All + Skylake 06_5EH All + + Kabylake_L 06_8EH <=0xC + + Kabylake 06_9EH <=0xD + ============= ============ ======== + +Related CVEs +------------ + +The following CVE entry is related to this SRBDS issue: + + ============== ===== ===================================== + CVE-2020-0543 SRBDS Special Register Buffer Data Sampling + ============== ===== ===================================== + +Attack scenarios +---------------- +An unprivileged user can extract values returned from RDRAND and RDSEED +executed on another core or sibling thread using MDS techniques. + + +Mitigation mechanism +------------------- +Intel will release microcode updates that modify the RDRAND, RDSEED, and +EGETKEY instructions to overwrite secret special register data in the shared +staging buffer before the secret data can be accessed by another logical +processor. + +During execution of the RDRAND, RDSEED, or EGETKEY instructions, off-core +accesses from other logical processors will be delayed until the special +register read is complete and the secret data in the shared staging buffer is +overwritten. + +This has three effects on performance: + +#. RDRAND, RDSEED, or EGETKEY instructions have higher latency. + +#. Executing RDRAND at the same time on multiple logical processors will be + serialized, resulting in an overall reduction in the maximum RDRAND + bandwidth. + +#. Executing RDRAND, RDSEED or EGETKEY will delay memory accesses from other + logical processors that miss their core caches, with an impact similar to + legacy locked cache-line-split accesses. + +The microcode updates provide an opt-out mechanism (RNGDS_MITG_DIS) to disable +the mitigation for RDRAND and RDSEED instructions executed outside of Intel +Software Guard Extensions (Intel SGX) enclaves. On logical processors that +disable the mitigation using this opt-out mechanism, RDRAND and RDSEED do not +take longer to execute and do not impact performance of sibling logical +processors memory accesses. The opt-out mechanism does not affect Intel SGX +enclaves (including execution of RDRAND or RDSEED inside an enclave, as well +as EGETKEY execution). + +IA32_MCU_OPT_CTRL MSR Definition +-------------------------------- +Along with the mitigation for this issue, Intel added a new thread-scope +IA32_MCU_OPT_CTRL MSR, (address 0x123). The presence of this MSR and +RNGDS_MITG_DIS (bit 0) is enumerated by CPUID.(EAX=07H,ECX=0).EDX[SRBDS_CTRL = +9]==1. This MSR is introduced through the microcode update. + +Setting IA32_MCU_OPT_CTRL[0] (RNGDS_MITG_DIS) to 1 for a logical processor +disables the mitigation for RDRAND and RDSEED executed outside of an Intel SGX +enclave on that logical processor. Opting out of the mitigation for a +particular logical processor does not affect the RDRAND and RDSEED mitigations +for other logical processors. + +Note that inside of an Intel SGX enclave, the mitigation is applied regardless +of the value of RNGDS_MITG_DS. + +Mitigation control on the kernel command line +--------------------------------------------- +The kernel command line allows control over the SRBDS mitigation at boot time +with the option "srbds=". The option for this is: + + ============= ============================================================= + off This option disables SRBDS mitigation for RDRAND and RDSEED on + affected platforms. + ============= ============================================================= + +SRBDS System Information +----------------------- +The Linux kernel provides vulnerability status information through sysfs. For +SRBDS this can be accessed by the following sysfs file: +/sys/devices/system/cpu/vulnerabilities/srbds + +The possible values contained in this file are: + + ============================== ============================================= + Not affected Processor not vulnerable + Vulnerable Processor vulnerable and mitigation disabled + Vulnerable: No microcode Processor vulnerable and microcode is missing + mitigation + Mitigation: Microcode Processor is vulnerable and mitigation is in + effect. + Mitigation: TSX disabled Processor is only vulnerable when TSX is + enabled while this system was booted with TSX + disabled. + Unknown: Dependent on + hypervisor status Running on virtual guest processor that is + affected but with no way to know if host + processor is mitigated or vulnerable. + ============================== ============================================= + +SRBDS Default mitigation +------------------------ +This new microcode serializes processor access during execution of RDRAND, +RDSEED ensures that the shared buffer is overwritten before it is released for +reuse. Use the "srbds=off" kernel command line to disable the mitigation for +RDRAND and RDSEED. From 1487deda19c82d30d1867277e89bc2d515b9d2d4 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 20 Apr 2020 17:15:58 -0600 Subject: [PATCH 014/574] remoteproc: Use kstrdup_const() rather than kstrdup() For cases where @firmware is declared "const char *", use function kstrdup_const() to avoid needlessly creating another copy on the heap. Suggested-by: Bjorn Andersson Signed-off-by: Mathieu Poirier Reviewed-by: Alex Elder Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20200420231601.16781-2-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 8 ++++---- include/linux/remoteproc.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index d9e6949e4ac1..db8a15fc1e4a 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1996,7 +1996,7 @@ static void rproc_type_release(struct device *dev) if (rproc->index >= 0) ida_simple_remove(&rproc_dev_index, rproc->index); - kfree(rproc->firmware); + kfree_const(rproc->firmware); kfree(rproc->ops); kfree(rproc); } @@ -2009,7 +2009,7 @@ static const struct device_type rproc_type = { static int rproc_alloc_firmware(struct rproc *rproc, const char *name, const char *firmware) { - char *p; + const char *p; if (!firmware) /* @@ -2018,7 +2018,7 @@ static int rproc_alloc_firmware(struct rproc *rproc, */ p = kasprintf(GFP_KERNEL, "rproc-%s-fw", name); else - p = kstrdup(firmware, GFP_KERNEL); + p = kstrdup_const(firmware, GFP_KERNEL); if (!p) return -ENOMEM; @@ -2122,7 +2122,7 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, return rproc; free_firmware: - kfree(rproc->firmware); + kfree_const(rproc->firmware); free_rproc: kfree(rproc); return NULL; diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 0547676479d3..800b4f09dc98 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -489,7 +489,7 @@ struct rproc { struct list_head node; struct iommu_domain *domain; const char *name; - char *firmware; + const char *firmware; void *priv; struct rproc_ops *ops; struct device dev; From 9d5f82c8ba2471e34150a0e750ef54089e2a3740 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 20 Apr 2020 17:15:59 -0600 Subject: [PATCH 015/574] remoteproc: Restructure firmware name allocation Improve the readability of function rproc_alloc_firmware() by using a non-negated condition and moving the comment out of the conditional block Suggested-by: Alex Elder Signed-off-by: Mathieu Poirier Reviewed-by: Alex Elder Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20200420231601.16781-3-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index db8a15fc1e4a..45529d40342f 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2011,14 +2011,14 @@ static int rproc_alloc_firmware(struct rproc *rproc, { const char *p; - if (!firmware) - /* - * If the caller didn't pass in a firmware name then - * construct a default name. - */ - p = kasprintf(GFP_KERNEL, "rproc-%s-fw", name); - else + /* + * Allocate a firmware name if the caller gave us one to work + * with. Otherwise construct a new one using a default pattern. + */ + if (firmware) p = kstrdup_const(firmware, GFP_KERNEL); + else + p = kasprintf(GFP_KERNEL, "rproc-%s-fw", name); if (!p) return -ENOMEM; From bf860aa176d0104cfbaf863acbadf5548f1172c2 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 20 Apr 2020 17:16:00 -0600 Subject: [PATCH 016/574] remoteproc: Split rproc_ops allocation from rproc_alloc() Make the rproc_ops allocation a function on its own in an effort to clean up function rproc_alloc(). Signed-off-by: Mathieu Poirier Reviewed-by: Alex Elder Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20200420231601.16781-4-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 33 ++++++++++++++++++---------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 45529d40342f..15318507aedb 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2028,6 +2028,26 @@ static int rproc_alloc_firmware(struct rproc *rproc, return 0; } +static int rproc_alloc_ops(struct rproc *rproc, const struct rproc_ops *ops) +{ + rproc->ops = kmemdup(ops, sizeof(*ops), GFP_KERNEL); + if (!rproc->ops) + return -ENOMEM; + + if (rproc->ops->load) + return 0; + + /* Default to ELF loader if no load function is specified */ + rproc->ops->load = rproc_elf_load_segments; + rproc->ops->parse_fw = rproc_elf_load_rsc_table; + rproc->ops->find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table; + if (!rproc->ops->sanity_check) + rproc->ops->sanity_check = rproc_elf32_sanity_check; + rproc->ops->get_boot_addr = rproc_elf_get_boot_addr; + + return 0; +} + /** * rproc_alloc() - allocate a remote processor handle * @dev: the underlying device @@ -2067,8 +2087,7 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, if (rproc_alloc_firmware(rproc, name, firmware)) goto free_rproc; - rproc->ops = kmemdup(ops, sizeof(*ops), GFP_KERNEL); - if (!rproc->ops) + if (rproc_alloc_ops(rproc, ops)) goto free_firmware; rproc->name = name; @@ -2096,16 +2115,6 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, atomic_set(&rproc->power, 0); - /* Default to ELF loader if no load function is specified */ - if (!rproc->ops->load) { - rproc->ops->load = rproc_elf_load_segments; - rproc->ops->parse_fw = rproc_elf_load_rsc_table; - rproc->ops->find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table; - if (!rproc->ops->sanity_check) - rproc->ops->sanity_check = rproc_elf32_sanity_check; - rproc->ops->get_boot_addr = rproc_elf_get_boot_addr; - } - mutex_init(&rproc->lock); INIT_LIST_HEAD(&rproc->carveouts); From 226f5db4212438cdfe1a94652d74c6c01788a837 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Mon, 20 Apr 2020 17:16:01 -0600 Subject: [PATCH 017/574] remoteproc: Get rid of tedious error path Get rid of tedious error management by moving firmware and operation allocation after calling device_initialize(). That way we take advantage of the automatic call to rproc_type_release() to cleanup after ourselves when put_device() is called. Signed-off-by: Mathieu Poirier Reviewed-by: Alex Elder Reviewed-by: Bjorn Andersson Acked-by: Suman Anna Link: https://lore.kernel.org/r/20200420231601.16781-5-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 15318507aedb..6fca4e2c0dd7 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2084,12 +2084,6 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, if (!rproc) return NULL; - if (rproc_alloc_firmware(rproc, name, firmware)) - goto free_rproc; - - if (rproc_alloc_ops(rproc, ops)) - goto free_firmware; - rproc->name = name; rproc->priv = &rproc[1]; rproc->auto_boot = true; @@ -2103,12 +2097,17 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, rproc->dev.driver_data = rproc; idr_init(&rproc->notifyids); + if (rproc_alloc_firmware(rproc, name, firmware)) + goto put_device; + + if (rproc_alloc_ops(rproc, ops)) + goto put_device; + /* Assign a unique device index and name */ rproc->index = ida_simple_get(&rproc_dev_index, 0, 0, GFP_KERNEL); if (rproc->index < 0) { dev_err(dev, "ida_simple_get failed: %d\n", rproc->index); - put_device(&rproc->dev); - return NULL; + goto put_device; } dev_set_name(&rproc->dev, "remoteproc%d", rproc->index); @@ -2130,10 +2129,8 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, return rproc; -free_firmware: - kfree_const(rproc->firmware); -free_rproc: - kfree(rproc); +put_device: + put_device(&rproc->dev); return NULL; } EXPORT_SYMBOL(rproc_alloc); From db65527836156fe9c6abe6b9b4aa02fac49a67e3 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Thu, 16 Apr 2020 19:20:36 -0500 Subject: [PATCH 018/574] remoteproc: Use a local copy for the name field The current name field used in the remoteproc structure is simply a pointer to a name field supplied during the rproc_alloc() call. The pointer passed in by remoteproc drivers during registration is typically a dev_name pointer, but it is possible that the pointer will no longer remain valid if the devices themselves were created at runtime like in the case of of_platform_populate(), and were deleted upon any failures within the respective remoteproc driver probe function. So, allocate and maintain a local copy for this name field to keep it agnostic of the logic used in the remoteproc drivers. Reviewed-by: Mathieu Poirier Reviewed-by: Bjorn Andersson Signed-off-by: Suman Anna Link: https://lore.kernel.org/r/20200417002036.24359-3-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 6fca4e2c0dd7..a352362d8e48 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1997,6 +1997,7 @@ static void rproc_type_release(struct device *dev) ida_simple_remove(&rproc_dev_index, rproc->index); kfree_const(rproc->firmware); + kfree_const(rproc->name); kfree(rproc->ops); kfree(rproc); } @@ -2084,7 +2085,6 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, if (!rproc) return NULL; - rproc->name = name; rproc->priv = &rproc[1]; rproc->auto_boot = true; rproc->elf_class = ELFCLASSNONE; @@ -2097,6 +2097,10 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, rproc->dev.driver_data = rproc; idr_init(&rproc->notifyids); + rproc->name = kstrdup_const(name, GFP_KERNEL); + if (!rproc->name) + goto put_device; + if (rproc_alloc_firmware(rproc, name, firmware)) goto put_device; From 305ac5a766b1d0dd8a4052c8c92e5464888eaa10 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 17 Apr 2020 19:00:37 +0200 Subject: [PATCH 019/574] remoteproc: Add device-managed variants of rproc_alloc/rproc_add Add API functions devm_rproc_alloc() and devm_rproc_add(), which behave like rproc_alloc() and rproc_add() respectively, but register their respective cleanup function to be called on driver detach. Reviewed-by: Bjorn Andersson Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20200417170040.174319-2-paul@crapouillou.net Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 67 ++++++++++++++++++++++++++++ include/linux/remoteproc.h | 5 +++ 2 files changed, 72 insertions(+) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index a352362d8e48..448262470fc7 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1976,6 +1976,33 @@ int rproc_add(struct rproc *rproc) } EXPORT_SYMBOL(rproc_add); +static void devm_rproc_remove(void *rproc) +{ + rproc_del(rproc); +} + +/** + * devm_rproc_add() - resource managed rproc_add() + * @dev: the underlying device + * @rproc: the remote processor handle to register + * + * This function performs like rproc_add() but the registered rproc device will + * automatically be removed on driver detach. + * + * Returns: 0 on success, negative errno on failure + */ +int devm_rproc_add(struct device *dev, struct rproc *rproc) +{ + int err; + + err = rproc_add(rproc); + if (err) + return err; + + return devm_add_action_or_reset(dev, devm_rproc_remove, rproc); +} +EXPORT_SYMBOL(devm_rproc_add); + /** * rproc_type_release() - release a remote processor instance * @dev: the rproc's device @@ -2215,6 +2242,46 @@ int rproc_del(struct rproc *rproc) } EXPORT_SYMBOL(rproc_del); +static void devm_rproc_free(struct device *dev, void *res) +{ + rproc_free(*(struct rproc **)res); +} + +/** + * devm_rproc_alloc() - resource managed rproc_alloc() + * @dev: the underlying device + * @name: name of this remote processor + * @ops: platform-specific handlers (mainly start/stop) + * @firmware: name of firmware file to load, can be NULL + * @len: length of private data needed by the rproc driver (in bytes) + * + * This function performs like rproc_alloc() but the acquired rproc device will + * automatically be released on driver detach. + * + * Returns: new rproc instance, or NULL on failure + */ +struct rproc *devm_rproc_alloc(struct device *dev, const char *name, + const struct rproc_ops *ops, + const char *firmware, int len) +{ + struct rproc **ptr, *rproc; + + ptr = devres_alloc(devm_rproc_free, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + rproc = rproc_alloc(dev, name, ops, firmware, len); + if (rproc) { + *ptr = rproc; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return rproc; +} +EXPORT_SYMBOL(devm_rproc_alloc); + /** * rproc_add_subdev() - add a subdevice to a remoteproc * @rproc: rproc handle to add the subdevice to diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 800b4f09dc98..ac4082f12e8b 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -600,6 +600,11 @@ int rproc_add(struct rproc *rproc); int rproc_del(struct rproc *rproc); void rproc_free(struct rproc *rproc); +struct rproc *devm_rproc_alloc(struct device *dev, const char *name, + const struct rproc_ops *ops, + const char *firmware, int len); +int devm_rproc_add(struct device *dev, struct rproc *rproc); + void rproc_add_carveout(struct rproc *rproc, struct rproc_mem_entry *mem); struct rproc_mem_entry * From e29ff72b779426c7fe462ead93c7ad77fe562935 Mon Sep 17 00:00:00 2001 From: Clement Leger Date: Wed, 22 Apr 2020 11:30:17 +0200 Subject: [PATCH 020/574] remoteproc: remove rproc_elf32_sanity_check Since checks are present in the remoteproc elf loader before calling da_to_va, loading a elf64 will work on 32bits flavors of kernel. Indeed, if a segment size is larger than what size_t can hold, the loader will return an error so the functionality is equivalent to what exists today. Acked-by: Suman Anna Signed-off-by: Clement Leger Link: https://lore.kernel.org/r/20200422093017.10985-1-cleger@kalray.eu Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 3 +-- drivers/remoteproc/remoteproc_elf_loader.c | 21 --------------------- drivers/remoteproc/remoteproc_internal.h | 1 - drivers/remoteproc/st_remoteproc.c | 2 +- drivers/remoteproc/st_slim_rproc.c | 2 +- drivers/remoteproc/stm32_rproc.c | 2 +- 6 files changed, 4 insertions(+), 27 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 448262470fc7..206363723071 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2069,8 +2069,7 @@ static int rproc_alloc_ops(struct rproc *rproc, const struct rproc_ops *ops) rproc->ops->load = rproc_elf_load_segments; rproc->ops->parse_fw = rproc_elf_load_rsc_table; rproc->ops->find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table; - if (!rproc->ops->sanity_check) - rproc->ops->sanity_check = rproc_elf32_sanity_check; + rproc->ops->sanity_check = rproc_elf_sanity_check; rproc->ops->get_boot_addr = rproc_elf_get_boot_addr; return 0; diff --git a/drivers/remoteproc/remoteproc_elf_loader.c b/drivers/remoteproc/remoteproc_elf_loader.c index 4869fb7d8fe4..df68d87752e4 100644 --- a/drivers/remoteproc/remoteproc_elf_loader.c +++ b/drivers/remoteproc/remoteproc_elf_loader.c @@ -112,27 +112,6 @@ int rproc_elf_sanity_check(struct rproc *rproc, const struct firmware *fw) } EXPORT_SYMBOL(rproc_elf_sanity_check); -/** - * rproc_elf_sanity_check() - Sanity Check ELF32 firmware image - * @rproc: the remote processor handle - * @fw: the ELF32 firmware image - * - * Make sure this fw image is sane. - */ -int rproc_elf32_sanity_check(struct rproc *rproc, const struct firmware *fw) -{ - int ret = rproc_elf_sanity_check(rproc, fw); - - if (ret) - return ret; - - if (fw_elf_get_class(fw) == ELFCLASS32) - return 0; - - return -EINVAL; -} -EXPORT_SYMBOL(rproc_elf32_sanity_check); - /** * rproc_elf_get_boot_addr() - Get rproc's boot address. * @rproc: the remote processor handle diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h index b389dc79da81..31994715fd43 100644 --- a/drivers/remoteproc/remoteproc_internal.h +++ b/drivers/remoteproc/remoteproc_internal.h @@ -54,7 +54,6 @@ void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len); phys_addr_t rproc_va_to_pa(void *cpu_addr); int rproc_trigger_recovery(struct rproc *rproc); -int rproc_elf32_sanity_check(struct rproc *rproc, const struct firmware *fw); int rproc_elf_sanity_check(struct rproc *rproc, const struct firmware *fw); u64 rproc_elf_get_boot_addr(struct rproc *rproc, const struct firmware *fw); int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw); diff --git a/drivers/remoteproc/st_remoteproc.c b/drivers/remoteproc/st_remoteproc.c index a6cbfa452764..a3268d95a50e 100644 --- a/drivers/remoteproc/st_remoteproc.c +++ b/drivers/remoteproc/st_remoteproc.c @@ -233,7 +233,7 @@ static const struct rproc_ops st_rproc_ops = { .parse_fw = st_rproc_parse_fw, .load = rproc_elf_load_segments, .find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table, - .sanity_check = rproc_elf32_sanity_check, + .sanity_check = rproc_elf_sanity_check, .get_boot_addr = rproc_elf_get_boot_addr, }; diff --git a/drivers/remoteproc/st_slim_rproc.c b/drivers/remoteproc/st_slim_rproc.c index 3cca8b65a8db..09bcb4d8b9e0 100644 --- a/drivers/remoteproc/st_slim_rproc.c +++ b/drivers/remoteproc/st_slim_rproc.c @@ -203,7 +203,7 @@ static const struct rproc_ops slim_rproc_ops = { .da_to_va = slim_rproc_da_to_va, .get_boot_addr = rproc_elf_get_boot_addr, .load = rproc_elf_load_segments, - .sanity_check = rproc_elf32_sanity_check, + .sanity_check = rproc_elf_sanity_check, }; /** diff --git a/drivers/remoteproc/stm32_rproc.c b/drivers/remoteproc/stm32_rproc.c index 0f9d02ca4f5a..f45b8d597da0 100644 --- a/drivers/remoteproc/stm32_rproc.c +++ b/drivers/remoteproc/stm32_rproc.c @@ -505,7 +505,7 @@ static struct rproc_ops st_rproc_ops = { .load = rproc_elf_load_segments, .parse_fw = stm32_rproc_parse_fw, .find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table, - .sanity_check = rproc_elf32_sanity_check, + .sanity_check = rproc_elf_sanity_check, .get_boot_addr = rproc_elf_get_boot_addr, }; From 33467ac3c8dc80575e1b158e78d0c30ab33c1b08 Mon Sep 17 00:00:00 2001 From: Loic Pallardy Date: Thu, 16 Apr 2020 19:20:35 -0500 Subject: [PATCH 021/574] remoteproc: Add prepare and unprepare ops On some SoC architecture, it is needed to enable HW like clock, bus, regulator, memory region... before loading co-processor firmware. This patch introduces prepare and unprepare ops to execute platform specific function before firmware loading and after stop execution. Signed-off-by: Loic Pallardy Signed-off-by: Suman Anna Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200417002036.24359-2-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 15 ++++++++++++++- drivers/remoteproc/remoteproc_internal.h | 16 ++++++++++++++++ include/linux/remoteproc.h | 4 ++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 206363723071..4bd0f45a00c0 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1394,12 +1394,19 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw) return ret; } + /* Prepare rproc for firmware loading if needed */ + ret = rproc_prepare_device(rproc); + if (ret) { + dev_err(dev, "can't prepare rproc %s: %d\n", rproc->name, ret); + goto disable_iommu; + } + rproc->bootaddr = rproc_get_boot_addr(rproc, fw); /* Load resource table, core dump segment list etc from the firmware */ ret = rproc_parse_fw(rproc, fw); if (ret) - goto disable_iommu; + goto unprepare_rproc; /* reset max_notifyid */ rproc->max_notifyid = -1; @@ -1433,6 +1440,9 @@ clean_up_resources: kfree(rproc->cached_table); rproc->cached_table = NULL; rproc->table_ptr = NULL; +unprepare_rproc: + /* release HW resources if needed */ + rproc_unprepare_device(rproc); disable_iommu: rproc_disable_iommu(rproc); return ret; @@ -1865,6 +1875,9 @@ void rproc_shutdown(struct rproc *rproc) /* clean up all acquired resources */ rproc_resource_cleanup(rproc); + /* release HW resources if needed */ + rproc_unprepare_device(rproc); + rproc_disable_iommu(rproc); /* Free the copy of the resource table */ diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h index 31994715fd43..4ba7cb59d3e8 100644 --- a/drivers/remoteproc/remoteproc_internal.h +++ b/drivers/remoteproc/remoteproc_internal.h @@ -63,6 +63,22 @@ struct resource_table *rproc_elf_find_loaded_rsc_table(struct rproc *rproc, struct rproc_mem_entry * rproc_find_carveout_by_name(struct rproc *rproc, const char *name, ...); +static inline int rproc_prepare_device(struct rproc *rproc) +{ + if (rproc->ops->prepare) + return rproc->ops->prepare(rproc); + + return 0; +} + +static inline int rproc_unprepare_device(struct rproc *rproc) +{ + if (rproc->ops->unprepare) + return rproc->ops->unprepare(rproc); + + return 0; +} + static inline int rproc_fw_sanity_check(struct rproc *rproc, const struct firmware *fw) { diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index ac4082f12e8b..0468be4d02f4 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -355,6 +355,8 @@ enum rsc_handling_status { /** * struct rproc_ops - platform-specific device handlers + * @prepare: prepare device for code loading + * @unprepare: unprepare device after stop * @start: power on the device and boot it * @stop: power off the device * @kick: kick a virtqueue (virtqueue id given as a parameter) @@ -373,6 +375,8 @@ enum rsc_handling_status { * panic at least the returned number of milliseconds */ struct rproc_ops { + int (*prepare)(struct rproc *rproc); + int (*unprepare)(struct rproc *rproc); int (*start)(struct rproc *rproc); int (*stop)(struct rproc *rproc); void (*kick)(struct rproc *rproc, int vqid); From 2fb75ceaf71a4a198fc67b984aa95527c993fa1a Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 11 Apr 2020 18:07:50 +0200 Subject: [PATCH 022/574] remoteproc: Add missing '\n' in log messages Message logged by 'dev_xxx()' or 'pr_xxx()' should end with a '\n'. Fixes: 791c13b709dd ("remoteproc: Fix NULL pointer dereference in rproc_virtio_notify") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20200411160750.32573-1-christophe.jaillet@wanadoo.fr Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_virtio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c index e61d738d9b47..15ec5fc65257 100644 --- a/drivers/remoteproc/remoteproc_virtio.c +++ b/drivers/remoteproc/remoteproc_virtio.c @@ -337,8 +337,7 @@ int rproc_add_virtio_dev(struct rproc_vdev *rvdev, int id) if (rproc->ops->kick == NULL) { ret = -EINVAL; - dev_err(dev, ".kick method not defined for %s", - rproc->name); + dev_err(dev, ".kick method not defined for %s\n", rproc->name); goto out; } From 3798cc4d106e91382bfe016caa2edada27c2bb3f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 27 Apr 2020 20:46:13 +0200 Subject: [PATCH 023/574] x86/speculation: Add Ivy Bridge to affected list Make the docs match the code. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner --- .../hw-vuln/special-register-buffer-data-sampling.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst index 6a473da80b62..47b1b3afac99 100644 --- a/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst +++ b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst @@ -27,6 +27,8 @@ by software using TSX_CTRL_MSR otherwise they are not affected. ============= ============ ======== common name Family_Model Stepping ============= ============ ======== + IvyBridge 06_3AH All + Haswell 06_3CH All Haswell_L 06_45H All Haswell_G 06_46H All @@ -37,9 +39,8 @@ by software using TSX_CTRL_MSR otherwise they are not affected. Skylake_L 06_4EH All Skylake 06_5EH All - Kabylake_L 06_8EH <=0xC - - Kabylake 06_9EH <=0xD + Kabylake_L 06_8EH <= 0xC + Kabylake 06_9EH <= 0xD ============= ============ ======== Related CVEs From e6ab7490ffaed83d6581f512e66c7c8cc6f58c2d Mon Sep 17 00:00:00 2001 From: Alexander Schmidt Date: Fri, 28 Feb 2020 10:31:13 -0500 Subject: [PATCH 024/574] s390/pci: Expose new port attribute for PCIe functions Add SysFS attribute that provides the port number for PCI functions representing a single port of a multi-port device. Signed-off-by: Alexander Schmidt Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 1 + arch/s390/include/asm/pci_clp.h | 3 ++- arch/s390/pci/pci_clp.c | 1 + arch/s390/pci/pci_sysfs.c | 2 ++ 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 7485ee561fec..11560bfefe48 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -108,6 +108,7 @@ struct zpci_dev { u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ u16 domain; + u8 port; struct mutex lock; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index bd2cb4ea7d93..d8122f534f54 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -102,7 +102,8 @@ struct clp_rsp_query_pci { u16 pchid; __le32 bar[PCI_STD_NUM_BARS]; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ - u32 : 16; + u16 : 12; + u16 port : 4; u8 fmb_len; u8 pft; /* pci function type */ u64 sdma; /* start dma as */ diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index ea794ae755ae..f7bca8cfa92c 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -155,6 +155,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->pfgid = response->pfgid; zdev->pft = response->pft; zdev->vfn = response->vfn; + zdev->port = response->port; zdev->uid = response->uid; zdev->fmb_length = sizeof(u32) * response->fmb_len; diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index 215f17437a4f..8ea8d04ed86d 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -33,6 +33,7 @@ zpci_attr(pchid, "0x%04x\n", pchid); zpci_attr(pfgid, "0x%02x\n", pfgid); zpci_attr(vfn, "0x%04x\n", vfn); zpci_attr(pft, "0x%02x\n", pft); +zpci_attr(port, "%d\n", port); zpci_attr(uid, "0x%x\n", uid); zpci_attr(segment0, "0x%02x\n", pfip[0]); zpci_attr(segment1, "0x%02x\n", pfip[1]); @@ -142,6 +143,7 @@ static struct attribute *zpci_dev_attrs[] = { &dev_attr_pchid.attr, &dev_attr_pfgid.attr, &dev_attr_pft.attr, + &dev_attr_port.attr, &dev_attr_vfn.attr, &dev_attr_uid.attr, &dev_attr_recover.attr, From d08d6f5d75242ceb410efbdf650efecc40d68c2d Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Fri, 21 Feb 2020 17:20:46 +0100 Subject: [PATCH 025/574] s390/pci: adaptation of iommu to multifunction In the future the bus sysdata may not directly point to the zpci_dev. In preparation of upcoming patches let us abstract the access to the zpci_dev from the device inside the pci device. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 5 +++++ drivers/iommu/s390-iommu.c | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 11560bfefe48..0e519197ea2d 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -231,6 +231,11 @@ static inline struct zpci_dev *to_zpci(struct pci_dev *pdev) return pdev->sysdata; } +static inline struct zpci_dev *to_zpci_dev(struct device *dev) +{ + return to_zpci(to_pci_dev(dev)); +} + struct zpci_dev *get_zdev_by_fid(u32); /* DMA */ diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 1137f3ddcb85..c60d5c776fc6 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -87,7 +87,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct s390_domain *s390_domain = to_s390_domain(domain); - struct zpci_dev *zdev = to_pci_dev(dev)->sysdata; + struct zpci_dev *zdev = to_zpci_dev(dev); struct s390_domain_device *domain_device; unsigned long flags; int rc; @@ -139,7 +139,7 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, struct device *dev) { struct s390_domain *s390_domain = to_s390_domain(domain); - struct zpci_dev *zdev = to_pci_dev(dev)->sysdata; + struct zpci_dev *zdev = to_zpci_dev(dev); struct s390_domain_device *domain_device, *tmp; unsigned long flags; int found = 0; @@ -169,7 +169,7 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, static int s390_iommu_add_device(struct device *dev) { struct iommu_group *group = iommu_group_get_for_dev(dev); - struct zpci_dev *zdev = to_pci_dev(dev)->sysdata; + struct zpci_dev *zdev = to_zpci_dev(dev); if (IS_ERR(group)) return PTR_ERR(group); @@ -182,7 +182,7 @@ static int s390_iommu_add_device(struct device *dev) static void s390_iommu_remove_device(struct device *dev) { - struct zpci_dev *zdev = to_pci_dev(dev)->sysdata; + struct zpci_dev *zdev = to_zpci_dev(dev); struct iommu_domain *domain; /* From 6cf17f9a67c124aa4739b79709008d942635b975 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Fri, 7 Feb 2020 13:35:08 +0100 Subject: [PATCH 026/574] s390/pci: define kernel parameters for PCI multifunction Using PCI multifunctions in S390 is a new feature we may want to ignore to continue provide the same topology as in the past to userland even if the configuration supports exposing the topology of a multi-Function device. A new boolean parameters allows to overwrite the kernel pci configuration: - pci=norid when on, disallow the use a new firmware field, RID, which provides the PCI :. part of the PCI address. To be used in the following patches and satisfy the checkpatch.pl the variable is exposed in pci.h Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 0e519197ea2d..ec28bbc2d3d1 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -168,6 +168,7 @@ static inline bool zdev_enabled(struct zpci_dev *zdev) extern const struct attribute_group *zpci_attr_groups[]; extern unsigned int s390_pci_force_floating __initdata; +extern unsigned int s390_pci_no_rid; /* ----------------------------------------------------------------------------- Prototypes diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 94ca121933de..3386a46f6bde 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -48,6 +48,8 @@ static unsigned int zpci_num_domains_allocated; min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ ZPCI_IOMAP_MAX_ENTRIES) +unsigned int s390_pci_no_rid; + static DEFINE_SPINLOCK(zpci_iomap_lock); static unsigned long *zpci_iomap_bitmap; struct zpci_iomap_entry *zpci_iomap_start; @@ -844,6 +846,10 @@ char * __init pcibios_setup(char *str) s390_pci_force_floating = 1; return NULL; } + if (!strcmp(str, "norid")) { + s390_pci_no_rid = 1; + return NULL; + } return str; } From c9a1752b84f1a8f73187c116ff0514b2ab24d878 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Fri, 21 Feb 2020 10:06:38 +0100 Subject: [PATCH 027/574] s390/pci: define RID and RID available Firmware provides the bus/devfn part of the PCI addresses of a zPCI function inside the new field RID of the CLP query PCI function with a bit to know if this field is available to use. Let's add these fields to the clp_rsp_query_pci structure, add corresponding fields to zdev and initialize them. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 3 +++ arch/s390/include/asm/pci_clp.h | 9 +++++++-- arch/s390/pci/pci_clp.c | 3 +++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index ec28bbc2d3d1..4dbf8e17f0e8 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -109,6 +109,9 @@ struct zpci_dev { u8 pft; /* pci function type */ u16 domain; u8 port; + u8 rid_available : 1; + u8 reserved : 7; + unsigned int devfn; /* DEVFN part of the RID*/ struct mutex lock; u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index d8122f534f54..896ee41e23e3 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -93,7 +93,9 @@ struct clp_req_query_pci { struct clp_rsp_query_pci { struct clp_rsp_hdr hdr; u16 vfn; /* virtual fn number */ - u16 : 6; + u16 : 3; + u16 rid_avail : 1; + u16 : 2; u16 mio_addr_avail : 1; u16 util_str_avail : 1; /* utility string available? */ u16 pfgid : 8; /* pci function group id */ @@ -108,7 +110,10 @@ struct clp_rsp_query_pci { u8 pft; /* pci function type */ u64 sdma; /* start dma as */ u64 edma; /* end dma as */ - u32 reserved[11]; +#define ZPCI_RID_MASK_DEVFN 0x00ff + u16 rid; /* BUS/DEVFN PCI address */ + u16 reserved0; + u32 reserved[10]; u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ u32 reserved2[16]; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index f7bca8cfa92c..9b318824a134 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -158,6 +158,9 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->port = response->port; zdev->uid = response->uid; zdev->fmb_length = sizeof(u32) * response->fmb_len; + zdev->rid_available = response->rid_avail; + if (!s390_pci_no_rid && zdev->rid_available) + zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN; memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip)); if (response->util_str_avail) { From 05bc1be6db4b2683bbf5b9394a75d0fb3acfcede Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Mon, 23 Mar 2020 10:45:43 +0100 Subject: [PATCH 028/574] s390/pci: create zPCI bus The zPCI bus is in charge to handle common zPCI resources for zPCI devices. Creating the zPCI bus, the PCI bus, the zPCI devices and the PCI devices and hotplug slots done in a specific order: - PCI hotplug slot creation needs a PCI bus - PCI bus needs a PCI domain which is reported by the pci_domain_nr() when setting up the host bridge - PCI domain is set from the zPCI with devfn 0 this is necessary to have a reproducible enumeration Therefore we can not create devices or hotplug slots for any PCI device associated with a zPCI device before having discovered the function zero of the bus. The discovery and initialization of devices can be done at several points in the code: - On Events, serialized in a thread context - On initialization, in the kernel init thread context - When powering on the hotplug slot, in a user thread context The removal of devices and their parent bus may also be done on events or for devices when powering down the slot. To guarantee the existence of the bus and devices until they are no more needed we use kref in zPCI bus and introduce a reference count in the zPCI devices. In this patch the zPCI bus still only accept a device with a devfn 0. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 26 ++++- arch/s390/pci/Makefile | 3 +- arch/s390/pci/pci.c | 171 ++++++++++++----------------- arch/s390/pci/pci_bus.c | 147 +++++++++++++++++++++++++ arch/s390/pci/pci_bus.h | 30 +++++ arch/s390/pci/pci_event.c | 17 ++- arch/s390/pci/pci_sysfs.c | 2 +- drivers/pci/hotplug/s390_pci_hpc.c | 11 +- 8 files changed, 288 insertions(+), 119 deletions(-) create mode 100644 arch/s390/pci/pci_bus.c create mode 100644 arch/s390/pci/pci_bus.h diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 4dbf8e17f0e8..686a44218456 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -28,6 +28,12 @@ int pci_proc_domain(struct pci_bus *); #define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS #define ZPCI_DOMAIN_BITMAP_SIZE (1 << 16) +#ifdef PCI +#if (ZPCI_NR_DEVICES > ZPCI_DOMAIN_BITMAP_SIZE) +# error ZPCI_NR_DEVICES can not be bigger than ZPCI_DOMAIN_BITMAP_SIZE +#endif +#endif /* PCI */ + /* PCI Function Controls */ #define ZPCI_FC_FN_ENABLED 0x80 #define ZPCI_FC_ERROR 0x40 @@ -94,10 +100,23 @@ struct zpci_bar_struct { struct s390_domain; +#define ZPCI_FUNCTIONS_PER_BUS 256 +struct zpci_bus { + struct kref kref; + struct pci_bus *bus; + struct zpci_dev *function[ZPCI_FUNCTIONS_PER_BUS]; + struct list_head resources; + struct list_head bus_next; + int pchid; + int domain_nr; + enum pci_bus_speed max_bus_speed; +}; + /* Private data per function */ struct zpci_dev { - struct pci_bus *bus; + struct zpci_bus *zbus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ + struct kref kref; struct hotplug_slot hotplug_slot; enum zpci_state state; @@ -107,7 +126,6 @@ struct zpci_dev { u16 pchid; /* physical channel ID */ u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ - u16 domain; u8 port; u8 rid_available : 1; u8 reserved : 7; @@ -232,7 +250,9 @@ static inline void zpci_exit_slot(struct zpci_dev *zdev) {} /* Helpers */ static inline struct zpci_dev *to_zpci(struct pci_dev *pdev) { - return pdev->sysdata; + struct zpci_bus *zbus = pdev->sysdata; + + return zbus->function[ZPCI_DEVFN]; } static inline struct zpci_dev *to_zpci_dev(struct device *dev) diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index 748626a33028..b4e3c84772a1 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -4,4 +4,5 @@ # obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ - pci_event.o pci_debug.o pci_insn.o pci_mmio.o + pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ + pci_bus.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 3386a46f6bde..41423dad881c 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -36,13 +36,14 @@ #include #include +#include "pci_bus.h" + /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); static DECLARE_BITMAP(zpci_domain, ZPCI_DOMAIN_BITMAP_SIZE); static DEFINE_SPINLOCK(zpci_domain_lock); -static unsigned int zpci_num_domains_allocated; #define ZPCI_IOMAP_ENTRIES \ min(((unsigned long) ZPCI_NR_DEVICES * PCI_STD_NUM_BARS / 2), \ @@ -90,17 +91,12 @@ void zpci_remove_reserved_devices(void) spin_unlock(&zpci_list_lock); list_for_each_entry_safe(zdev, tmp, &remove, entry) - zpci_remove_device(zdev); -} - -static struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus) -{ - return (bus && bus->sysdata) ? (struct zpci_dev *) bus->sysdata : NULL; + zpci_zdev_put(zdev); } int pci_domain_nr(struct pci_bus *bus) { - return ((struct zpci_dev *) bus->sysdata)->domain; + return ((struct zpci_bus *) bus->sysdata)->domain_nr; } EXPORT_SYMBOL_GPL(pci_domain_nr); @@ -508,15 +504,15 @@ static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, return r; } -static int zpci_setup_bus_resources(struct zpci_dev *zdev, - struct list_head *resources) +int zpci_setup_bus_resources(struct zpci_dev *zdev, + struct list_head *resources) { unsigned long addr, size, flags; struct resource *res; int i, entry; snprintf(zdev->res_name, sizeof(zdev->res_name), - "PCI Bus %04x:%02x", zdev->domain, ZPCI_BUS_NR); + "PCI Bus %04x:%02x", zdev->uid, ZPCI_BUS_NR); for (i = 0; i < PCI_STD_NUM_BARS; i++) { if (!zdev->bars[i].size) @@ -610,98 +606,53 @@ void pcibios_disable_device(struct pci_dev *pdev) zpci_debug_exit_device(zdev); } -static int zpci_alloc_domain(struct zpci_dev *zdev) +static int __zpci_register_domain(int domain) { spin_lock(&zpci_domain_lock); - if (zpci_num_domains_allocated > (ZPCI_NR_DEVICES - 1)) { + if (test_bit(domain, zpci_domain)) { spin_unlock(&zpci_domain_lock); - pr_err("Adding PCI function %08x failed because the configured limit of %d is reached\n", - zdev->fid, ZPCI_NR_DEVICES); - return -ENOSPC; + pr_err("Domain %04x is already assigned\n", domain); + return -EEXIST; } + set_bit(domain, zpci_domain); + spin_unlock(&zpci_domain_lock); + return domain; +} - if (zpci_unique_uid) { - zdev->domain = (u16) zdev->uid; - if (zdev->domain == 0) { - pr_warn("UID checking is active but no UID is set for PCI function %08x, so automatic domain allocation is used instead\n", - zdev->fid); - update_uid_checking(false); - goto auto_allocate; - } +static int __zpci_alloc_domain(void) +{ + int domain; - if (test_bit(zdev->domain, zpci_domain)) { - spin_unlock(&zpci_domain_lock); - pr_err("Adding PCI function %08x failed because domain %04x is already assigned\n", - zdev->fid, zdev->domain); - return -EEXIST; - } - set_bit(zdev->domain, zpci_domain); - zpci_num_domains_allocated++; - spin_unlock(&zpci_domain_lock); - return 0; - } -auto_allocate: + spin_lock(&zpci_domain_lock); /* * We can always auto allocate domains below ZPCI_NR_DEVICES. * There is either a free domain or we have reached the maximum in * which case we would have bailed earlier. */ - zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); - set_bit(zdev->domain, zpci_domain); - zpci_num_domains_allocated++; + domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); + set_bit(domain, zpci_domain); spin_unlock(&zpci_domain_lock); - return 0; + return domain; } -static void zpci_free_domain(struct zpci_dev *zdev) +int zpci_alloc_domain(int domain) +{ + if (zpci_unique_uid) { + if (domain) + return __zpci_register_domain(domain); + pr_warn("UID checking was active but no UID is provided: switching to automatic domain allocation\n"); + update_uid_checking(false); + } + return __zpci_alloc_domain(); +} + +void zpci_free_domain(int domain) { spin_lock(&zpci_domain_lock); - clear_bit(zdev->domain, zpci_domain); - zpci_num_domains_allocated--; + clear_bit(domain, zpci_domain); spin_unlock(&zpci_domain_lock); } -void pcibios_remove_bus(struct pci_bus *bus) -{ - struct zpci_dev *zdev = get_zdev_by_bus(bus); - - zpci_exit_slot(zdev); - zpci_cleanup_bus_resources(zdev); - zpci_destroy_iommu(zdev); - zpci_free_domain(zdev); - - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); - - zpci_dbg(3, "rem fid:%x\n", zdev->fid); - kfree(zdev); -} - -static int zpci_scan_bus(struct zpci_dev *zdev) -{ - LIST_HEAD(resources); - int ret; - - ret = zpci_setup_bus_resources(zdev, &resources); - if (ret) - goto error; - - zdev->bus = pci_scan_root_bus(NULL, ZPCI_BUS_NR, &pci_root_ops, - zdev, &resources); - if (!zdev->bus) { - ret = -EIO; - goto error; - } - zdev->bus->max_bus_speed = zdev->max_bus_speed; - pci_bus_add_devices(zdev->bus); - return 0; - -error: - zpci_cleanup_bus_resources(zdev); - pci_free_resource_list(&resources); - return ret; -} int zpci_enable_device(struct zpci_dev *zdev) { @@ -736,13 +687,15 @@ int zpci_create_device(struct zpci_dev *zdev) { int rc; - rc = zpci_alloc_domain(zdev); - if (rc) - goto out; + kref_init(&zdev->kref); + + spin_lock(&zpci_list_lock); + list_add_tail(&zdev->entry, &zpci_list); + spin_unlock(&zpci_list_lock); rc = zpci_init_iommu(zdev); if (rc) - goto out_free; + goto out; mutex_init(&zdev->lock); if (zdev->state == ZPCI_FN_STATE_CONFIGURED) { @@ -750,16 +703,12 @@ int zpci_create_device(struct zpci_dev *zdev) if (rc) goto out_destroy_iommu; } - rc = zpci_scan_bus(zdev); + + rc = zpci_bus_device_register(zdev, &pci_root_ops); if (rc) goto out_disable; - spin_lock(&zpci_list_lock); - list_add_tail(&zdev->entry, &zpci_list); - spin_unlock(&zpci_list_lock); - zpci_init_slot(zdev); - return 0; out_disable: @@ -767,19 +716,39 @@ out_disable: zpci_disable_device(zdev); out_destroy_iommu: zpci_destroy_iommu(zdev); -out_free: - zpci_free_domain(zdev); out: + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); return rc; } -void zpci_remove_device(struct zpci_dev *zdev) +void zpci_release_device(struct kref *kref) { - if (!zdev->bus) - return; + struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); - pci_stop_root_bus(zdev->bus); - pci_remove_root_bus(zdev->bus); + switch (zdev->state) { + case ZPCI_FN_STATE_ONLINE: + case ZPCI_FN_STATE_CONFIGURED: + zpci_disable_device(zdev); + fallthrough; + case ZPCI_FN_STATE_STANDBY: + if (zdev->zbus) { + zpci_exit_slot(zdev); + zpci_cleanup_bus_resources(zdev); + zpci_bus_device_unregister(zdev); + zpci_destroy_iommu(zdev); + } + fallthrough; + default: + break; + } + + spin_lock(&zpci_list_lock); + list_del(&zdev->entry); + spin_unlock(&zpci_list_lock); + zpci_dbg(3, "rem fid:%x\n", zdev->fid); + kfree(zdev); } int zpci_report_error(struct pci_dev *pdev, diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c new file mode 100644 index 000000000000..e1565b8537de --- /dev/null +++ b/arch/s390/pci/pci_bus.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Pierre Morel + * + */ + +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "pci_bus.h" + +static LIST_HEAD(zbus_list); +static DEFINE_SPINLOCK(zbus_list_lock); +static int zpci_nb_devices; + +/* zpci_bus_scan + * @zbus: the zbus holding the zdevices + * @ops: the pci operations + * + * The domain number must be set before pci_scan_root_bus is called. + * This function can be called once the domain is known, hence + * when the function_0 is dicovered. + */ +static int zpci_bus_scan(struct zpci_bus *zbus, int domain, struct pci_ops *ops) +{ + struct pci_bus *bus; + int rc; + + rc = zpci_alloc_domain(domain); + if (rc < 0) + return rc; + zbus->domain_nr = rc; + + bus = pci_scan_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources); + if (!bus) { + zpci_free_domain(zbus->domain_nr); + return -EFAULT; + } + + zbus->bus = bus; + pci_bus_add_devices(bus); + return 0; +} + +static void zpci_bus_release(struct kref *kref) +{ + struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); + + pci_lock_rescan_remove(); + pci_stop_root_bus(zbus->bus); + + zpci_free_domain(zbus->domain_nr); + pci_free_resource_list(&zbus->resources); + + pci_remove_root_bus(zbus->bus); + pci_unlock_rescan_remove(); + + spin_lock(&zbus_list_lock); + list_del(&zbus->bus_next); + spin_unlock(&zbus_list_lock); + kfree(zbus); +} + +static void zpci_bus_put(struct zpci_bus *zbus) +{ + kref_put(&zbus->kref, zpci_bus_release); +} + +static struct zpci_bus *zpci_bus_alloc(int pchid) +{ + struct zpci_bus *zbus; + + zbus = kzalloc(sizeof(*zbus), GFP_KERNEL); + if (!zbus) + return NULL; + + zbus->pchid = pchid; + INIT_LIST_HEAD(&zbus->bus_next); + spin_lock(&zbus_list_lock); + list_add_tail(&zbus->bus_next, &zbus_list); + spin_unlock(&zbus_list_lock); + + kref_init(&zbus->kref); + INIT_LIST_HEAD(&zbus->resources); + + return zbus; +} + +int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) +{ + struct zpci_bus *zbus; + int rc; + + if (zpci_nb_devices == ZPCI_NR_DEVICES) { + pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n", + zdev->fid, ZPCI_NR_DEVICES); + return -ENOSPC; + } + zpci_nb_devices++; + + if (zdev->devfn != ZPCI_DEVFN) + return -EINVAL; + + zbus = zpci_bus_alloc(zdev->pchid); + if (!zbus) + return -ENOMEM; + + zdev->zbus = zbus; + zbus->function[ZPCI_DEVFN] = zdev; + + zpci_setup_bus_resources(zdev, &zbus->resources); + zbus->max_bus_speed = zdev->max_bus_speed; + + rc = zpci_bus_scan(zbus, (u16)zdev->uid, ops); + if (!rc) + return 0; + + pr_err("Adding PCI function %08x failed\n", zdev->fid); + zdev->zbus = NULL; + zpci_bus_put(zbus); + return rc; +} + +void zpci_bus_device_unregister(struct zpci_dev *zdev) +{ + struct zpci_bus *zbus = zdev->zbus; + + zpci_nb_devices--; + zbus->function[ZPCI_DEVFN] = NULL; + zpci_bus_put(zbus); +} diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h new file mode 100644 index 000000000000..c6aff42cc2cf --- /dev/null +++ b/arch/s390/pci/pci_bus.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2020 + * + * Author(s): + * Pierre Morel + * + */ + +int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops); +void zpci_bus_device_unregister(struct zpci_dev *zdev); +int zpci_bus_init(void); + +void zpci_release_device(struct kref *kref); +static inline void zpci_zdev_put(struct zpci_dev *zdev) +{ + kref_put(&zdev->kref, zpci_release_device); +} + +int zpci_alloc_domain(int domain); +void zpci_free_domain(int domain); +int zpci_setup_bus_resources(struct zpci_dev *zdev, + struct list_head *resources); + +static inline struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus) +{ + struct zpci_bus *zbus = bus->sysdata; + + return zbus->function[ZPCI_DEVFN]; +} diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 8d6ee4af4230..d39e9299e133 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -14,6 +14,8 @@ #include #include +#include "pci_bus.h" + /* Content Code Description for PCI Function Error */ struct zpci_ccdf_err { u32 reserved1; @@ -53,7 +55,7 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) zpci_err_hex(ccdf, sizeof(*ccdf)); if (zdev) - pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN); + pdev = pci_get_slot(zdev->zbus->bus, ZPCI_DEVFN); pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n", pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); @@ -78,11 +80,9 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) enum zpci_state state; int ret; - if (zdev) - pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN); + if (zdev && zdev->zbus && zdev->zbus->bus) + pdev = pci_get_slot(zdev->zbus->bus, ZPCI_DEVFN); - pr_info("%s: Event 0x%x reconfigured PCI function 0x%x\n", - pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); zpci_err("avail CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); @@ -102,7 +102,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) if (ret) break; pci_lock_rescan_remove(); - pci_rescan_bus(zdev->bus); + pci_rescan_bus(zdev->zbus->bus); pci_unlock_rescan_remove(); break; case 0x0302: /* Reserved -> Standby */ @@ -140,7 +140,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zdev->state = ZPCI_FN_STATE_STANDBY; if (!clp_get_state(ccdf->fid, &state) && state == ZPCI_FN_STATE_RESERVED) { - zpci_remove_device(zdev); + zpci_zdev_put(zdev); } break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ @@ -149,12 +149,11 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) case 0x0308: /* Standby -> Reserved */ if (!zdev) break; - zpci_remove_device(zdev); + zpci_zdev_put(zdev); break; default: break; } - pci_dev_put(pdev); } void zpci_event_availability(void *data) diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index 8ea8d04ed86d..5c028bee91b9 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -89,7 +89,7 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, ret = zpci_enable_device(zdev); if (ret) goto out; - pci_rescan_bus(zdev->bus); + pci_rescan_bus(zdev->zbus->bus); } out: pci_unlock_rescan_remove(); diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index 39295d88f670..a9c9f05fe54b 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -52,6 +52,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) { struct zpci_dev *zdev = container_of(hotplug_slot, struct zpci_dev, hotplug_slot); + struct zpci_bus *zbus = zdev->zbus; int rc; if (zdev->state != ZPCI_FN_STATE_STANDBY) @@ -65,9 +66,9 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) if (rc) goto out_deconfigure; - pci_scan_slot(zdev->bus, ZPCI_DEVFN); + pci_scan_slot(zbus->bus, ZPCI_DEVFN); pci_lock_rescan_remove(); - pci_bus_add_devices(zdev->bus); + pci_bus_add_devices(zbus->bus); pci_unlock_rescan_remove(); return rc; @@ -82,12 +83,13 @@ static int disable_slot(struct hotplug_slot *hotplug_slot) struct zpci_dev *zdev = container_of(hotplug_slot, struct zpci_dev, hotplug_slot); struct pci_dev *pdev; + struct zpci_bus *zbus = zdev->zbus; int rc; if (!zpci_fn_configured(zdev->state)) return -EIO; - pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN); + pdev = pci_get_slot(zbus->bus, ZPCI_DEVFN); if (pdev) { pci_stop_and_remove_bus_device_locked(pdev); pci_dev_put(pdev); @@ -133,11 +135,12 @@ static const struct hotplug_slot_ops s390_hotplug_slot_ops = { int zpci_init_slot(struct zpci_dev *zdev) { char name[SLOT_NAME_SIZE]; + struct zpci_bus *zbus = zdev->zbus; zdev->hotplug_slot.ops = &s390_hotplug_slot_ops; snprintf(name, SLOT_NAME_SIZE, "%08x", zdev->fid); - return pci_hp_register(&zdev->hotplug_slot, zdev->bus, + return pci_hp_register(&zdev->hotplug_slot, zbus->bus, ZPCI_DEVFN, name); } From f606b3ef47c9f874af605323099663a10f691b24 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Wed, 25 Mar 2020 17:55:55 +0100 Subject: [PATCH 029/574] s390/pci: adapt events for zbus Simplify the event handling. Set the zpci state explicitly. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_event.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index d39e9299e133..c296214f0a19 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -89,25 +89,19 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) switch (ccdf->pec) { case 0x0301: /* Reserved|Standby -> Configured */ if (!zdev) { - ret = clp_add_pci_device(ccdf->fid, ccdf->fh, 0); - if (ret) - break; - zdev = get_zdev_by_fid(ccdf->fid); + ret = clp_add_pci_device(ccdf->fid, ccdf->fh, 1); + break; } - if (!zdev || zdev->state != ZPCI_FN_STATE_STANDBY) - break; - zdev->state = ZPCI_FN_STATE_CONFIGURED; zdev->fh = ccdf->fh; - ret = zpci_enable_device(zdev); - if (ret) - break; - pci_lock_rescan_remove(); - pci_rescan_bus(zdev->zbus->bus); - pci_unlock_rescan_remove(); + zdev->state = ZPCI_FN_STATE_CONFIGURED; + zpci_create_device(zdev); break; case 0x0302: /* Reserved -> Standby */ - if (!zdev) + if (!zdev) { clp_add_pci_device(ccdf->fid, ccdf->fh, 0); + break; + } + zdev->fh = ccdf->fh; break; case 0x0303: /* Deconfiguration requested */ if (!zdev) @@ -135,8 +129,6 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) pci_stop_and_remove_bus_device_locked(pdev); } - zdev->fh = ccdf->fh; - zpci_disable_device(zdev); zdev->state = ZPCI_FN_STATE_STANDBY; if (!clp_get_state(ccdf->fid, &state) && state == ZPCI_FN_STATE_RESERVED) { From 65e450a9f9adabf3de1305a4c616f1313df402a3 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Mon, 23 Mar 2020 12:29:37 +0100 Subject: [PATCH 030/574] s390/pci: Adding bus resource The current PCI implementation do not provide a bus resource. This leads to a notice being print at boot. Let's do it more nicely and provide the bus resource. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci_bus.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 686a44218456..7d99ab35833c 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -107,6 +107,7 @@ struct zpci_bus { struct zpci_dev *function[ZPCI_FUNCTIONS_PER_BUS]; struct list_head resources; struct list_head bus_next; + struct resource bus_resource; int pchid; int domain_nr; enum pci_bus_speed max_bus_speed; diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index e1565b8537de..b4fefc69c461 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -99,6 +99,11 @@ static struct zpci_bus *zpci_bus_alloc(int pchid) kref_init(&zbus->kref); INIT_LIST_HEAD(&zbus->resources); + zbus->bus_resource.start = 0; + zbus->bus_resource.end = ZPCI_BUS_NR; + zbus->bus_resource.flags = IORESOURCE_BUS; + pci_add_resource(&zbus->resources, &zbus->bus_resource); + return zbus; } From 44510d6fa0c00aa90b80075caa6b313b25927475 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Wed, 22 Apr 2020 15:15:23 +0200 Subject: [PATCH 031/574] s390/pci: Handling multifunctions We allow multiple functions on a single bus. We suppress the ZPCI_DEVFN definition and replace its occurences with zpci->devfn. We verify the number of device during the registration. There can never be more domains in use than existing devices, so we do not need to verify the count of domain after having verified the count of devices. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 8 +- arch/s390/pci/pci.c | 39 ++++---- arch/s390/pci/pci_bus.c | 148 +++++++++++++++++++++++++---- arch/s390/pci/pci_bus.h | 5 +- arch/s390/pci/pci_event.c | 4 +- drivers/pci/hotplug/s390_pci_hpc.c | 6 +- 6 files changed, 159 insertions(+), 51 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 7d99ab35833c..c1558cf071b8 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -22,7 +22,6 @@ int pci_domain_nr(struct pci_bus *); int pci_proc_domain(struct pci_bus *); #define ZPCI_BUS_NR 0 /* default bus number */ -#define ZPCI_DEVFN 0 /* default device number */ #define ZPCI_NR_DMA_SPACES 1 #define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS @@ -110,6 +109,7 @@ struct zpci_bus { struct resource bus_resource; int pchid; int domain_nr; + bool multifunction; enum pci_bus_speed max_bus_speed; }; @@ -117,6 +117,7 @@ struct zpci_bus { struct zpci_dev { struct zpci_bus *zbus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ + struct list_head bus_next; struct kref kref; struct hotplug_slot hotplug_slot; @@ -129,7 +130,8 @@ struct zpci_dev { u8 pft; /* pci function type */ u8 port; u8 rid_available : 1; - u8 reserved : 7; + u8 has_hp_slot : 1; + u8 reserved : 6; unsigned int devfn; /* DEVFN part of the RID*/ struct mutex lock; @@ -253,7 +255,7 @@ static inline struct zpci_dev *to_zpci(struct pci_dev *pdev) { struct zpci_bus *zbus = pdev->sysdata; - return zbus->function[ZPCI_DEVFN]; + return zbus->function[pdev->devfn]; } static inline struct zpci_dev *to_zpci_dev(struct device *dev) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 41423dad881c..3f6670613c57 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -371,29 +371,17 @@ EXPORT_SYMBOL(pci_iounmap); static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { - struct zpci_dev *zdev = get_zdev_by_bus(bus); - int ret; + struct zpci_dev *zdev = get_zdev_by_bus(bus, devfn); - if (!zdev || devfn != ZPCI_DEVFN) - ret = -ENODEV; - else - ret = zpci_cfg_load(zdev, where, val, size); - - return ret; + return (zdev) ? zpci_cfg_load(zdev, where, val, size) : -ENODEV; } static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { - struct zpci_dev *zdev = get_zdev_by_bus(bus); - int ret; + struct zpci_dev *zdev = get_zdev_by_bus(bus, devfn); - if (!zdev || devfn != ZPCI_DEVFN) - ret = -ENODEV; - else - ret = zpci_cfg_store(zdev, where, val, size); - - return ret; + return (zdev) ? zpci_cfg_store(zdev, where, val, size) : -ENODEV; } static struct pci_ops pci_root_ops = { @@ -708,12 +696,12 @@ int zpci_create_device(struct zpci_dev *zdev) if (rc) goto out_disable; - zpci_init_slot(zdev); return 0; out_disable: if (zdev->state == ZPCI_FN_STATE_ONLINE) zpci_disable_device(zdev); + out_destroy_iommu: zpci_destroy_iommu(zdev); out: @@ -727,18 +715,25 @@ void zpci_release_device(struct kref *kref) { struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); + if (zdev->zbus->bus) { + struct pci_dev *pdev; + + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); + if (pdev) + pci_stop_and_remove_bus_device_locked(pdev); + } + switch (zdev->state) { case ZPCI_FN_STATE_ONLINE: case ZPCI_FN_STATE_CONFIGURED: zpci_disable_device(zdev); fallthrough; case ZPCI_FN_STATE_STANDBY: - if (zdev->zbus) { + if (zdev->has_hp_slot) zpci_exit_slot(zdev); - zpci_cleanup_bus_resources(zdev); - zpci_bus_device_unregister(zdev); - zpci_destroy_iommu(zdev); - } + zpci_cleanup_bus_resources(zdev); + zpci_bus_device_unregister(zdev); + zpci_destroy_iommu(zdev); fallthrough; default: break; diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index b4fefc69c461..542c6b8f56df 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -62,14 +62,16 @@ static void zpci_bus_release(struct kref *kref) { struct zpci_bus *zbus = container_of(kref, struct zpci_bus, kref); - pci_lock_rescan_remove(); - pci_stop_root_bus(zbus->bus); + if (zbus->bus) { + pci_lock_rescan_remove(); + pci_stop_root_bus(zbus->bus); - zpci_free_domain(zbus->domain_nr); - pci_free_resource_list(&zbus->resources); + zpci_free_domain(zbus->domain_nr); + pci_free_resource_list(&zbus->resources); - pci_remove_root_bus(zbus->bus); - pci_unlock_rescan_remove(); + pci_remove_root_bus(zbus->bus); + pci_unlock_rescan_remove(); + } spin_lock(&zbus_list_lock); list_del(&zbus->bus_next); @@ -82,6 +84,23 @@ static void zpci_bus_put(struct zpci_bus *zbus) kref_put(&zbus->kref, zpci_bus_release); } +static struct zpci_bus *zpci_bus_get(int pchid) +{ + struct zpci_bus *zbus; + + spin_lock(&zbus_list_lock); + list_for_each_entry(zbus, &zbus_list, bus_next) { + if (pchid == zbus->pchid) { + kref_get(&zbus->kref); + goto out_unlock; + } + } + zbus = NULL; +out_unlock: + spin_unlock(&zbus_list_lock); + return zbus; +} + static struct zpci_bus *zpci_bus_alloc(int pchid) { struct zpci_bus *zbus; @@ -107,10 +126,61 @@ static struct zpci_bus *zpci_bus_alloc(int pchid) return zbus; } +static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + struct pci_bus *bus; + struct resource_entry *window, *n; + struct resource *res; + struct pci_dev *pdev; + int rc; + + bus = zbus->bus; + if (!bus) + return -EINVAL; + + pdev = pci_get_slot(bus, zdev->devfn); + if (pdev) { + /* Device is already known. */ + pci_dev_put(pdev); + return 0; + } + + rc = zpci_init_slot(zdev); + if (rc) + return rc; + zdev->has_hp_slot = 1; + + resource_list_for_each_entry_safe(window, n, &zbus->resources) { + res = window->res; + pci_bus_add_resource(bus, res, 0); + } + + pdev = pci_scan_single_device(bus, zdev->devfn); + if (pdev) { + pdev->multifunction = 1; + pci_bus_add_device(pdev); + } + + return 0; +} + +static void zpci_bus_add_devices(struct zpci_bus *zbus) +{ + int i; + + for (i = 1; i < ZPCI_FUNCTIONS_PER_BUS; i++) + if (zbus->function[i]) + zpci_bus_add_device(zbus, zbus->function[i]); + + pci_lock_rescan_remove(); + pci_bus_add_devices(zbus->bus); + pci_unlock_rescan_remove(); +} + int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) { - struct zpci_bus *zbus; - int rc; + struct zpci_bus *zbus = NULL; + int rc = -EBADF; if (zpci_nb_devices == ZPCI_NR_DEVICES) { pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n", @@ -119,25 +189,65 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) } zpci_nb_devices++; - if (zdev->devfn != ZPCI_DEVFN) + if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS) return -EINVAL; - zbus = zpci_bus_alloc(zdev->pchid); - if (!zbus) - return -ENOMEM; + if (!s390_pci_no_rid && zdev->rid_available) + zbus = zpci_bus_get(zdev->pchid); + + if (!zbus) { + zbus = zpci_bus_alloc(zdev->pchid); + if (!zbus) + return -ENOMEM; + } zdev->zbus = zbus; - zbus->function[ZPCI_DEVFN] = zdev; + if (zbus->function[zdev->devfn]) { + pr_err("devfn %04x is already assigned\n", zdev->devfn); + goto error; /* rc already set */ + } + zbus->function[zdev->devfn] = zdev; zpci_setup_bus_resources(zdev, &zbus->resources); - zbus->max_bus_speed = zdev->max_bus_speed; - rc = zpci_bus_scan(zbus, (u16)zdev->uid, ops); - if (!rc) - return 0; + if (zbus->bus) { + if (!zbus->multifunction) { + WARN_ONCE(1, "zbus is not multifunction\n"); + goto error_bus; + } + if (!zdev->rid_available) { + WARN_ONCE(1, "rid_available not set for multifunction\n"); + goto error_bus; + } + rc = zpci_bus_add_device(zbus, zdev); + if (rc) + goto error_bus; + } else if (zdev->devfn == 0) { + if (zbus->multifunction && !zdev->rid_available) { + WARN_ONCE(1, "rid_available not set on function 0 for multifunction\n"); + goto error_bus; + } + rc = zpci_bus_scan(zbus, (u16)zdev->uid, ops); + if (rc) + goto error_bus; + zpci_bus_add_devices(zbus); + rc = zpci_init_slot(zdev); + if (rc) + goto error_bus; + zdev->has_hp_slot = 1; + zbus->multifunction = zdev->rid_available; + zbus->max_bus_speed = zdev->max_bus_speed; + } else { + zbus->multifunction = 1; + } + return 0; + +error_bus: + zpci_nb_devices--; + zbus->function[zdev->devfn] = NULL; +error: pr_err("Adding PCI function %08x failed\n", zdev->fid); - zdev->zbus = NULL; zpci_bus_put(zbus); return rc; } @@ -147,6 +257,6 @@ void zpci_bus_device_unregister(struct zpci_dev *zdev) struct zpci_bus *zbus = zdev->zbus; zpci_nb_devices--; - zbus->function[ZPCI_DEVFN] = NULL; + zbus->function[zdev->devfn] = NULL; zpci_bus_put(zbus); } diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h index c6aff42cc2cf..89be3c354b7b 100644 --- a/arch/s390/pci/pci_bus.h +++ b/arch/s390/pci/pci_bus.h @@ -22,9 +22,10 @@ void zpci_free_domain(int domain); int zpci_setup_bus_resources(struct zpci_dev *zdev, struct list_head *resources); -static inline struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus) +static inline struct zpci_dev *get_zdev_by_bus(struct pci_bus *bus, + unsigned int devfn) { struct zpci_bus *zbus = bus->sysdata; - return zbus->function[ZPCI_DEVFN]; + return (devfn >= ZPCI_FUNCTIONS_PER_BUS) ? NULL : zbus->function[devfn]; } diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index c296214f0a19..08e1d619398e 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -55,7 +55,7 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) zpci_err_hex(ccdf, sizeof(*ccdf)); if (zdev) - pdev = pci_get_slot(zdev->zbus->bus, ZPCI_DEVFN); + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n", pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); @@ -81,7 +81,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) int ret; if (zdev && zdev->zbus && zdev->zbus->bus) - pdev = pci_get_slot(zdev->zbus->bus, ZPCI_DEVFN); + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); zpci_err("avail CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index a9c9f05fe54b..1579ba895edf 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -66,7 +66,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) if (rc) goto out_deconfigure; - pci_scan_slot(zbus->bus, ZPCI_DEVFN); + pci_scan_slot(zbus->bus, zdev->devfn); pci_lock_rescan_remove(); pci_bus_add_devices(zbus->bus); pci_unlock_rescan_remove(); @@ -89,7 +89,7 @@ static int disable_slot(struct hotplug_slot *hotplug_slot) if (!zpci_fn_configured(zdev->state)) return -EIO; - pdev = pci_get_slot(zbus->bus, ZPCI_DEVFN); + pdev = pci_get_slot(zbus->bus, zdev->devfn); if (pdev) { pci_stop_and_remove_bus_device_locked(pdev); pci_dev_put(pdev); @@ -141,7 +141,7 @@ int zpci_init_slot(struct zpci_dev *zdev) snprintf(name, SLOT_NAME_SIZE, "%08x", zdev->fid); return pci_hp_register(&zdev->hotplug_slot, zbus->bus, - ZPCI_DEVFN, name); + zdev->devfn, name); } void zpci_exit_slot(struct zpci_dev *zdev) From 53dd462ac4dc3fc61ee90ad03d96202e17589156 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Thu, 26 Mar 2020 12:07:03 -0400 Subject: [PATCH 032/574] s390/pci: Do not disable PF when VFs exist The Physical function should not be disabled until no virtual functions depends on it. Let's force the user to first use echo 0 > sriov_numfs before allowing to disable the PF with echo 0 > power. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- drivers/pci/hotplug/s390_pci_hpc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index 1579ba895edf..b59f84918fe0 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -91,6 +91,9 @@ static int disable_slot(struct hotplug_slot *hotplug_slot) pdev = pci_get_slot(zbus->bus, zdev->devfn); if (pdev) { + if (pci_num_vf(pdev)) + return -EBUSY; + pci_stop_and_remove_bus_device_locked(pdev); pci_dev_put(pdev); } From de267a7c71ba6be7857da0185871759067513d9c Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Wed, 1 Apr 2020 11:12:24 +0200 Subject: [PATCH 033/574] s390/pci: Documentation for zPCI There are changes in the usage of PCI for the user: - new kernel parameter - modification of the way functions are enumerated Let's document these. Signed-off-by: Pierre Morel Signed-off-by: Vasily Gorbik --- .../admin-guide/kernel-parameters.txt | 2 + Documentation/s390/index.rst | 1 + Documentation/s390/pci.rst | 126 ++++++++++++++++++ MAINTAINERS | 1 + 4 files changed, 130 insertions(+) create mode 100644 Documentation/s390/pci.rst diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2a93c8679e8..e6d4fb2e32d0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3669,6 +3669,8 @@ may put more devices in an IOMMU group. force_floating [S390] Force usage of floating interrupts. nomio [S390] Do not use MIO instructions. + norid [S390] ignore the RID field and force use of + one PCI domain per PCI function pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/Documentation/s390/index.rst b/Documentation/s390/index.rst index f7af2061e406..cf71df5776b4 100644 --- a/Documentation/s390/index.rst +++ b/Documentation/s390/index.rst @@ -15,6 +15,7 @@ s390 Architecture vfio-ccw zfcpdump common_io + pci text_files diff --git a/Documentation/s390/pci.rst b/Documentation/s390/pci.rst new file mode 100644 index 000000000000..75e043d4da85 --- /dev/null +++ b/Documentation/s390/pci.rst @@ -0,0 +1,126 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +S/390 PCI +========= + +Authors: + - Pierre Morel + +Copyright, IBM Corp. 2020 + + +command line parameters and debugfs entries +=========================================== + +Command line parameters +----------------------- + +* nomio + + Do not use MIO instructions. + +* norid + + Ignore the RID field and force use of one PCI domain per PCI function. + +debugfs entries +--------------- + +* /sys/kernel/debug/s390dbf/pci_*/ (S/390 debug feature) + + Some views generated by the debug feature to hold various debug outputs. + + - /sys/kernel/debug/s390dbf/pci_msg/sprintf + Messages from the processing of PCI events like machine check handling + and setting of global functionality like UID checking. + + The level of logging can be changed to be more or less verbose by piping to + /sys/kernel/debug/s390dbf/pci_*/level a number between 0 and 6; see the + documentation on the S/390 debug feature (Documentation/s390/s390dbf.rst) + for details. + +Sysfs entries +============= + +Specific entries, or entries specificities for zPCI functions. + +* /sys/bus/pci/slots/XXXXXXXX + + The slot entries are setup using the FID (Function Identifier) of the + PCI function. + + - /sys/bus/pci/slots/XXXXXXXX/power + + A physical function currently supporting virtual function can not be + powered-off until all virtual-function have been removed with + echo 0 > /sys/bus/pci/devices/XXXX:XX:XX.X/sriov_numvf + +* /sys/bus/pci/devices/XXXX:XX:XX.X/ + + - function_id + zPCI function identifier unique for the complete Z System. + It define uniquely a function in the system. + + - function_handle + Low level identifier used for a configured PCI function. + It may be useful for debuging. + + - pchid + Model dependent location of the I/O adapter. + + - pfgid + PCI Function Group ID, functions sharing identical functionality + are using a common identifier. + A PCI group defines interrupts, IOMMU, IOTLB and DMA specifics. + + - vfn + The Virtual Function Number, from 1 to N for virtual functions. + 0 for physical functions. + + - pft + PCI function type specifies the type of the PCI function. + + - port + The port correspond to the physical port the function is attached to. + It also gives an indication on the physical function a virtual function + is attached to. + + - uid + The UID, Unique Identifier is defined when configuring a LPAR and is + unique inside an LPAR. + + - pfip/segmentX + The segments are used to determine the isolation of a function. + They corresponds to the physical path to the function. + The more the segment are different the more the functions are isolated. + +Enumeration and hotplug +======================= + +The PCI address is made of 4 parts: domain, bus, device and function, +like in DDDD:BB:dd.f + +* When not using multi-functions (norid is set or firmware does not support + multi-functions) + + - There is only one function per domain. + + - the domain is set from the zPCI function's UID as defined during the + LPAR creation. + + - Addresses look like DDDD:00:00.0 + +* When using multi-functions (norid parameter is not set), there are some + change in the way zPCI functions are addressed: + + - There is still only one bus per domain. + + - There can be up to 256 functions per bus. + + - The domain part of the address of all functions of all functions for + a multi-Function device is set from the zPCI function's UID as defined + in the LPAR creation for the function zero. + + - New functions will only be ready to be used after the function zero + (the function with devfn 0) has been enumerated. diff --git a/MAINTAINERS b/MAINTAINERS index b816a453b10e..1df86d6cd6c2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14662,6 +14662,7 @@ S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ F: arch/s390/pci/ F: drivers/pci/hotplug/s390_pci_hpc.c +F: Documentation/s390/pci.rst S390 VFIO AP DRIVER M: Tony Krowiak From 7b942b4be971d49cb185ce4690d7fbf94636e88a Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 9 Apr 2020 10:55:16 +0200 Subject: [PATCH 034/574] s390/qdio: consistently restore the IRQ handler For rolling back after an error, qdio_establish() calls qdio_shutdown(). If the error occurs early enough, then the qdio_irq's state still is QDIO_IRQ_STATE_INACTIVE and qdio_shutdown() does nothing. But at _any_ point where qdio_establish() bails out in this way, qdio_setup_irq() will have already replaced the IRQ handler. This then won't be restored after an early error, and the device can end up being returned to the device driver with qdio's IRQ handler still installed. Slightly reorder qdio_setup_irq() so we can be 100% sure that the IRQ handler was replaced. Then fix the bug in qdio_establish() by calling a helper that rolls back only the IRQ handler modification. Also use the new helper in qdio_shutdown() to keep things in sync, and slightly clean up the locking while doing so. This makes minor semantical changes, but holding setup_mutex gives us sufficient leeway to eg. pull qdio_shutdown_thinint() outside of the ccwdev lock's scope. Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.") Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio.h | 1 + drivers/s390/cio/qdio_main.c | 18 +++++------------- drivers/s390/cio/qdio_setup.c | 22 +++++++++++++++++----- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index b8453b594679..3cf223bc1d5f 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -389,6 +389,7 @@ int qdio_setup_get_ssqd(struct qdio_irq *irq_ptr, struct subchannel_id *schid, struct qdio_ssqd_desc *data); int qdio_setup_irq(struct qdio_irq *irq_ptr, struct qdio_initialize *init_data); +void qdio_shutdown_irq(struct qdio_irq *irq); void qdio_print_subchannel_info(struct qdio_irq *irq_ptr); void qdio_release_memory(struct qdio_irq *irq_ptr); int qdio_setup_init(void); diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index bcc3ab14e72d..da5a11138020 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1154,35 +1154,27 @@ int qdio_shutdown(struct ccw_device *cdev, int how) /* cleanup subchannel */ spin_lock_irq(get_ccwdev_lock(cdev)); - + qdio_set_state(irq_ptr, QDIO_IRQ_STATE_CLEANUP); if (how & QDIO_FLAG_CLEANUP_USING_CLEAR) rc = ccw_device_clear(cdev, QDIO_DOING_CLEANUP); else /* default behaviour is halt */ rc = ccw_device_halt(cdev, QDIO_DOING_CLEANUP); + spin_unlock_irq(get_ccwdev_lock(cdev)); if (rc) { DBF_ERROR("%4x SHUTD ERR", irq_ptr->schid.sch_no); DBF_ERROR("rc:%4d", rc); goto no_cleanup; } - qdio_set_state(irq_ptr, QDIO_IRQ_STATE_CLEANUP); - spin_unlock_irq(get_ccwdev_lock(cdev)); wait_event_interruptible_timeout(cdev->private->wait_q, irq_ptr->state == QDIO_IRQ_STATE_INACTIVE || irq_ptr->state == QDIO_IRQ_STATE_ERR, 10 * HZ); - spin_lock_irq(get_ccwdev_lock(cdev)); no_cleanup: qdio_shutdown_thinint(irq_ptr); - - /* restore interrupt handler */ - if ((void *)cdev->handler == (void *)qdio_int_handler) { - cdev->handler = irq_ptr->orig_handler; - cdev->private->intparm = 0; - } - spin_unlock_irq(get_ccwdev_lock(cdev)); + qdio_shutdown_irq(irq_ptr); qdio_set_state(irq_ptr, QDIO_IRQ_STATE_INACTIVE); mutex_unlock(&irq_ptr->setup_mutex); @@ -1352,8 +1344,8 @@ int qdio_establish(struct ccw_device *cdev, rc = qdio_establish_thinint(irq_ptr); if (rc) { + qdio_shutdown_irq(irq_ptr); mutex_unlock(&irq_ptr->setup_mutex); - qdio_shutdown(cdev, QDIO_FLAG_CLEANUP_USING_CLEAR); return rc; } @@ -1371,8 +1363,8 @@ int qdio_establish(struct ccw_device *cdev, if (rc) { DBF_ERROR("%4x est IO ERR", irq_ptr->schid.sch_no); DBF_ERROR("rc:%4x", rc); + qdio_shutdown_irq(irq_ptr); mutex_unlock(&irq_ptr->setup_mutex); - qdio_shutdown(cdev, QDIO_FLAG_CLEANUP_USING_CLEAR); return rc; } diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index 3083edd61f0c..d12f094db056 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -491,6 +491,12 @@ int qdio_setup_irq(struct qdio_irq *irq_ptr, struct qdio_initialize *init_data) /* qdr, qib, sls, slsbs, slibs, sbales are filled now */ + /* set our IRQ handler */ + spin_lock_irq(get_ccwdev_lock(cdev)); + irq_ptr->orig_handler = cdev->handler; + cdev->handler = qdio_int_handler; + spin_unlock_irq(get_ccwdev_lock(cdev)); + /* get qdio commands */ ciw = ccw_device_get_ciw(cdev, CIW_TYPE_EQUEUE); if (!ciw) { @@ -506,14 +512,20 @@ int qdio_setup_irq(struct qdio_irq *irq_ptr, struct qdio_initialize *init_data) } irq_ptr->aqueue = *ciw; - /* set new interrupt handler */ - spin_lock_irq(get_ccwdev_lock(cdev)); - irq_ptr->orig_handler = cdev->handler; - cdev->handler = qdio_int_handler; - spin_unlock_irq(get_ccwdev_lock(cdev)); return 0; } +void qdio_shutdown_irq(struct qdio_irq *irq) +{ + struct ccw_device *cdev = irq->cdev; + + /* restore IRQ handler */ + spin_lock_irq(get_ccwdev_lock(cdev)); + cdev->handler = irq->orig_handler; + cdev->private->intparm = 0; + spin_unlock_irq(get_ccwdev_lock(cdev)); +} + void qdio_print_subchannel_info(struct qdio_irq *irq_ptr) { char s[80]; From 68a381746f20e5435206173e22d0a011ef78790e Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 9 Apr 2020 09:55:05 +0200 Subject: [PATCH 035/574] s390/qdio: tear down thinint indicator after early error qdio_establish() calls qdio_establish_thinint(), but later has an error exit path that doesn't roll this call back. Fix it. Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.") Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index da5a11138020..80cc811bd2e0 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1363,6 +1363,7 @@ int qdio_establish(struct ccw_device *cdev, if (rc) { DBF_ERROR("%4x est IO ERR", irq_ptr->schid.sch_no); DBF_ERROR("rc:%4x", rc); + qdio_shutdown_thinint(irq_ptr); qdio_shutdown_irq(irq_ptr); mutex_unlock(&irq_ptr->setup_mutex); return rc; From 75e82bec6b2622c6f455b7a543fb5476a5d0eed7 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 9 Apr 2020 09:59:39 +0200 Subject: [PATCH 036/574] s390/qdio: put thinint indicator after early error qdio_establish() calls qdio_setup_thinint() via qdio_setup_irq(). If the subsequent qdio_establish_thinint() fails, we miss to put the DSCI again. Thus the DSCI isn't available for re-use. Given enough of such errors, we could end up with having only the shared DSCI available. Merge qdio_setup_thinint() into qdio_establish_thinint(), and deal with such an error internally. Fixes: 779e6e1c724d ("[S390] qdio: new qdio driver.") Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio.h | 1 - drivers/s390/cio/qdio_setup.c | 1 - drivers/s390/cio/qdio_thinint.c | 14 ++++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index 3cf223bc1d5f..a2afd7bc100b 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -364,7 +364,6 @@ static inline int multicast_outbound(struct qdio_q *q) extern u64 last_ai_time; /* prototypes for thin interrupt */ -void qdio_setup_thinint(struct qdio_irq *irq_ptr); int qdio_establish_thinint(struct qdio_irq *irq_ptr); void qdio_shutdown_thinint(struct qdio_irq *irq_ptr); void tiqdio_add_device(struct qdio_irq *irq_ptr); diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index d12f094db056..8edfa0982221 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -480,7 +480,6 @@ int qdio_setup_irq(struct qdio_irq *irq_ptr, struct qdio_initialize *init_data) } setup_qib(irq_ptr, init_data); - qdio_setup_thinint(irq_ptr); set_impl_params(irq_ptr, init_data->qib_param_field_format, init_data->qib_param_field, init_data->input_slib_elements, diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index ae50373617cd..0faa0ad21732 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -227,17 +227,19 @@ int __init tiqdio_register_thinints(void) int qdio_establish_thinint(struct qdio_irq *irq_ptr) { + int rc; + if (!is_thinint_irq(irq_ptr)) return 0; - return set_subchannel_ind(irq_ptr, 0); -} -void qdio_setup_thinint(struct qdio_irq *irq_ptr) -{ - if (!is_thinint_irq(irq_ptr)) - return; irq_ptr->dsci = get_indicator(); DBF_HEX(&irq_ptr->dsci, sizeof(void *)); + + rc = set_subchannel_ind(irq_ptr, 0); + if (rc) + put_indicator(irq_ptr->dsci); + + return rc; } void qdio_shutdown_thinint(struct qdio_irq *irq_ptr) From 3050f022df6a08cb3c968b9be01c163092c568d3 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 2 Apr 2020 11:37:50 +0200 Subject: [PATCH 037/574] s390/qdio: consolidate thinint init/exit Wrap the init/exit steps for thinint into a single helper that follows the established naming scheme. Signed-off-by: Julian Wiedmann Reviewed-by: Steffen Maier Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio.h | 6 ++-- drivers/s390/cio/qdio_main.c | 10 ++----- drivers/s390/cio/qdio_thinint.c | 49 ++++++++++++++------------------- 3 files changed, 24 insertions(+), 41 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index a2afd7bc100b..291b63ccdaf7 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -369,10 +369,8 @@ void qdio_shutdown_thinint(struct qdio_irq *irq_ptr); void tiqdio_add_device(struct qdio_irq *irq_ptr); void tiqdio_remove_device(struct qdio_irq *irq_ptr); void tiqdio_inbound_processing(unsigned long q); -int tiqdio_allocate_memory(void); -void tiqdio_free_memory(void); -int tiqdio_register_thinints(void); -void tiqdio_unregister_thinints(void); +int qdio_thinint_init(void); +void qdio_thinint_exit(void); int test_nonshared_ind(struct qdio_irq *); /* prototypes for setup */ diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 80cc811bd2e0..2abb1d2a0629 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1861,16 +1861,11 @@ static int __init init_QDIO(void) rc = qdio_setup_init(); if (rc) goto out_debug; - rc = tiqdio_allocate_memory(); + rc = qdio_thinint_init(); if (rc) goto out_cache; - rc = tiqdio_register_thinints(); - if (rc) - goto out_ti; return 0; -out_ti: - tiqdio_free_memory(); out_cache: qdio_setup_exit(); out_debug: @@ -1880,8 +1875,7 @@ out_debug: static void __exit exit_QDIO(void) { - tiqdio_unregister_thinints(); - tiqdio_free_memory(); + qdio_thinint_exit(); qdio_setup_exit(); qdio_debug_exit(); } diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 0faa0ad21732..7a440e4328cd 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -197,34 +197,6 @@ out: return rc; } -/* allocate non-shared indicators and shared indicator */ -int __init tiqdio_allocate_memory(void) -{ - q_indicators = kcalloc(TIQDIO_NR_INDICATORS, - sizeof(struct indicator_t), - GFP_KERNEL); - if (!q_indicators) - return -ENOMEM; - return 0; -} - -void tiqdio_free_memory(void) -{ - kfree(q_indicators); -} - -int __init tiqdio_register_thinints(void) -{ - int rc; - - rc = register_adapter_interrupt(&tiqdio_airq); - if (rc) { - DBF_EVENT("RTI:%x", rc); - return rc; - } - return 0; -} - int qdio_establish_thinint(struct qdio_irq *irq_ptr) { int rc; @@ -252,8 +224,27 @@ void qdio_shutdown_thinint(struct qdio_irq *irq_ptr) put_indicator(irq_ptr->dsci); } -void __exit tiqdio_unregister_thinints(void) +int __init qdio_thinint_init(void) +{ + int rc; + + q_indicators = kcalloc(TIQDIO_NR_INDICATORS, sizeof(struct indicator_t), + GFP_KERNEL); + if (!q_indicators) + return -ENOMEM; + + rc = register_adapter_interrupt(&tiqdio_airq); + if (rc) { + DBF_EVENT("RTI:%x", rc); + kfree(q_indicators); + return rc; + } + return 0; +} + +void __exit qdio_thinint_exit(void) { WARN_ON(!list_empty(&tiq_list)); unregister_adapter_interrupt(&tiqdio_airq); + kfree(q_indicators); } From edbf3b2a87db6357fba54520c1baf605e08557b3 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 2 Apr 2020 23:22:18 +0200 Subject: [PATCH 038/574] s390/qdio: do more fine-grained allocation roll-back Instead of having a catch-all qdio_release_memory() helper, free the individual allocations from the respective error path. Signed-off-by: Julian Wiedmann Reviewed-by: Steffen Maier Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio.h | 2 +- drivers/s390/cio/qdio_main.c | 33 ++++++++++++++++++++++----------- drivers/s390/cio/qdio_setup.c | 5 +---- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index 291b63ccdaf7..effab60c1b98 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -388,7 +388,7 @@ int qdio_setup_get_ssqd(struct qdio_irq *irq_ptr, int qdio_setup_irq(struct qdio_irq *irq_ptr, struct qdio_initialize *init_data); void qdio_shutdown_irq(struct qdio_irq *irq); void qdio_print_subchannel_info(struct qdio_irq *irq_ptr); -void qdio_release_memory(struct qdio_irq *irq_ptr); +void qdio_free_queues(struct qdio_irq *irq_ptr); int qdio_setup_init(void); void qdio_setup_exit(void); int qdio_enable_async_operation(struct qdio_output_q *q); diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 2abb1d2a0629..579caba8ea93 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1205,7 +1205,10 @@ int qdio_free(struct ccw_device *cdev) cdev->private->qdio_data = NULL; mutex_unlock(&irq_ptr->setup_mutex); - qdio_release_memory(irq_ptr); + qdio_free_queues(irq_ptr); + free_page((unsigned long) irq_ptr->qdr); + free_page(irq_ptr->chsc_page); + free_page((unsigned long) irq_ptr); return 0; } EXPORT_SYMBOL_GPL(qdio_free); @@ -1221,6 +1224,7 @@ int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs, { struct subchannel_id schid; struct qdio_irq *irq_ptr; + int rc = -ENOMEM; ccw_device_get_schid(cdev, &schid); DBF_EVENT("qallocate:%4x", schid.sch_no); @@ -1232,12 +1236,12 @@ int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs, /* irq_ptr must be in GFP_DMA since it contains ccw1.cda */ irq_ptr = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!irq_ptr) - goto out_err; + return -ENOMEM; irq_ptr->cdev = cdev; mutex_init(&irq_ptr->setup_mutex); if (qdio_allocate_dbf(irq_ptr)) - goto out_rel; + goto err_dbf; DBF_DEV_EVENT(DBF_ERR, irq_ptr, "alloc niq:%1u noq:%1u", no_input_qs, no_output_qs); @@ -1250,24 +1254,31 @@ int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs, */ irq_ptr->chsc_page = get_zeroed_page(GFP_KERNEL); if (!irq_ptr->chsc_page) - goto out_rel; + goto err_chsc; /* qdr is used in ccw1.cda which is u32 */ irq_ptr->qdr = (struct qdr *) get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!irq_ptr->qdr) - goto out_rel; + goto err_qdr; - if (qdio_allocate_qs(irq_ptr, no_input_qs, no_output_qs)) - goto out_rel; + rc = qdio_allocate_qs(irq_ptr, no_input_qs, no_output_qs); + if (rc) + goto err_queues; INIT_LIST_HEAD(&irq_ptr->entry); cdev->private->qdio_data = irq_ptr; qdio_set_state(irq_ptr, QDIO_IRQ_STATE_INACTIVE); return 0; -out_rel: - qdio_release_memory(irq_ptr); -out_err: - return -ENOMEM; + +err_queues: + qdio_free_queues(irq_ptr); + free_page((unsigned long) irq_ptr->qdr); +err_qdr: + free_page(irq_ptr->chsc_page); +err_chsc: +err_dbf: + free_page((unsigned long) irq_ptr); + return rc; } EXPORT_SYMBOL_GPL(qdio_allocate); diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index 8edfa0982221..51dc9a41555a 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -347,7 +347,7 @@ void qdio_setup_ssqd_info(struct qdio_irq *irq_ptr) DBF_EVENT("3:%4x qib:%4x", irq_ptr->ssqd_desc.qdioac3, irq_ptr->qib.ac); } -void qdio_release_memory(struct qdio_irq *irq_ptr) +void qdio_free_queues(struct qdio_irq *irq_ptr) { struct qdio_q *q; int i; @@ -383,9 +383,6 @@ void qdio_release_memory(struct qdio_irq *irq_ptr) kmem_cache_free(qdio_q_cache, q); } } - free_page((unsigned long) irq_ptr->qdr); - free_page(irq_ptr->chsc_page); - free_page((unsigned long) irq_ptr); } static void __qdio_allocate_fill_qdr(struct qdio_irq *irq_ptr, From 2a7cf35c4056facd35c952e8000519034376eef7 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 2 Apr 2020 23:30:41 +0200 Subject: [PATCH 039/574] s390/qdio: roll-back after queue allocation error When qdio_allocate_qs() fails, have it deal with its previous allocations. This way qdio_allocate() doesn't need to clean up afterwards. Signed-off-by: Julian Wiedmann Reviewed-by: Steffen Maier Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_main.c | 1 - drivers/s390/cio/qdio_setup.c | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 579caba8ea93..09bb69028d67 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1271,7 +1271,6 @@ int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs, return 0; err_queues: - qdio_free_queues(irq_ptr); free_page((unsigned long) irq_ptr->qdr); err_qdr: free_page(irq_ptr->chsc_page); diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index 51dc9a41555a..ebe61cbed443 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -135,6 +135,18 @@ output: } } +static void __qdio_free_queues(struct qdio_q **queues, unsigned int count) +{ + struct qdio_q *q; + unsigned int i; + + for (i = 0; i < count; i++) { + q = queues[i]; + free_page((unsigned long) q->slib); + kmem_cache_free(qdio_q_cache, q); + } +} + static int __qdio_allocate_qs(struct qdio_q **irq_ptr_qs, int nr_queues) { struct qdio_q *q; @@ -142,12 +154,15 @@ static int __qdio_allocate_qs(struct qdio_q **irq_ptr_qs, int nr_queues) for (i = 0; i < nr_queues; i++) { q = kmem_cache_zalloc(qdio_q_cache, GFP_KERNEL); - if (!q) + if (!q) { + __qdio_free_queues(irq_ptr_qs, i); return -ENOMEM; + } q->slib = (struct slib *) __get_free_page(GFP_KERNEL); if (!q->slib) { kmem_cache_free(qdio_q_cache, q); + __qdio_free_queues(irq_ptr_qs, i); return -ENOMEM; } irq_ptr_qs[i] = q; @@ -162,7 +177,11 @@ int qdio_allocate_qs(struct qdio_irq *irq_ptr, int nr_input_qs, int nr_output_qs rc = __qdio_allocate_qs(irq_ptr->input_qs, nr_input_qs); if (rc) return rc; + rc = __qdio_allocate_qs(irq_ptr->output_qs, nr_output_qs); + if (rc) + __qdio_free_queues(irq_ptr->input_qs, nr_input_qs); + return rc; } From d188cac39753c8419d2b168436a632711896ea4e Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 2 Apr 2020 23:48:00 +0200 Subject: [PATCH 040/574] s390/qdio: keep track of allocated queue count Knowing how many queues we initially allocated allows us to 1) sanity-check a subsequent qdio_establish() request, and 2) walk the queue arrays without further checks. Apply this while cleanly splitting qdio_free_queues() into two separate helpers. Signed-off-by: Julian Wiedmann Reviewed-by: Steffen Maier Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio.h | 3 ++ drivers/s390/cio/qdio_main.c | 5 ++++ drivers/s390/cio/qdio_setup.c | 55 +++++++++++++++++------------------ 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index effab60c1b98..66191e864b0b 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -292,6 +292,8 @@ struct qdio_irq { struct qdio_q *input_qs[QDIO_MAX_QUEUES_PER_IRQ]; struct qdio_q *output_qs[QDIO_MAX_QUEUES_PER_IRQ]; + unsigned int max_input_qs; + unsigned int max_output_qs; void (*irq_poll)(struct ccw_device *cdev, unsigned long data); unsigned long poll_state; @@ -389,6 +391,7 @@ int qdio_setup_irq(struct qdio_irq *irq_ptr, struct qdio_initialize *init_data); void qdio_shutdown_irq(struct qdio_irq *irq); void qdio_print_subchannel_info(struct qdio_irq *irq_ptr); void qdio_free_queues(struct qdio_irq *irq_ptr); +void qdio_free_async_data(struct qdio_irq *irq_ptr); int qdio_setup_init(void); void qdio_setup_exit(void); int qdio_enable_async_operation(struct qdio_output_q *q); diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 09bb69028d67..ef0be46f3e04 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1205,6 +1205,7 @@ int qdio_free(struct ccw_device *cdev) cdev->private->qdio_data = NULL; mutex_unlock(&irq_ptr->setup_mutex); + qdio_free_async_data(irq_ptr); qdio_free_queues(irq_ptr); free_page((unsigned long) irq_ptr->qdr); free_page(irq_ptr->chsc_page); @@ -1340,6 +1341,10 @@ int qdio_establish(struct ccw_device *cdev, if (!irq_ptr) return -ENODEV; + if (init_data->no_input_qs > irq_ptr->max_input_qs || + init_data->no_output_qs > irq_ptr->max_output_qs) + return -EINVAL; + if ((init_data->no_input_qs && !init_data->input_handler) || (init_data->no_output_qs && !init_data->output_handler)) return -EINVAL; diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index ebe61cbed443..2c5cc6ec668e 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -147,6 +147,15 @@ static void __qdio_free_queues(struct qdio_q **queues, unsigned int count) } } +void qdio_free_queues(struct qdio_irq *irq_ptr) +{ + __qdio_free_queues(irq_ptr->input_qs, irq_ptr->max_input_qs); + irq_ptr->max_input_qs = 0; + + __qdio_free_queues(irq_ptr->output_qs, irq_ptr->max_output_qs); + irq_ptr->max_output_qs = 0; +} + static int __qdio_allocate_qs(struct qdio_q **irq_ptr_qs, int nr_queues) { struct qdio_q *q; @@ -179,10 +188,14 @@ int qdio_allocate_qs(struct qdio_irq *irq_ptr, int nr_input_qs, int nr_output_qs return rc; rc = __qdio_allocate_qs(irq_ptr->output_qs, nr_output_qs); - if (rc) + if (rc) { __qdio_free_queues(irq_ptr->input_qs, nr_input_qs); + return rc; + } - return rc; + irq_ptr->max_input_qs = nr_input_qs; + irq_ptr->max_output_qs = nr_output_qs; + return 0; } static void setup_queues_misc(struct qdio_q *q, struct qdio_irq *irq_ptr, @@ -366,40 +379,26 @@ void qdio_setup_ssqd_info(struct qdio_irq *irq_ptr) DBF_EVENT("3:%4x qib:%4x", irq_ptr->ssqd_desc.qdioac3, irq_ptr->qib.ac); } -void qdio_free_queues(struct qdio_irq *irq_ptr) +void qdio_free_async_data(struct qdio_irq *irq_ptr) { struct qdio_q *q; int i; - /* - * Must check queue array manually since irq_ptr->nr_input_queues / - * irq_ptr->nr_input_queues may not yet be set. - */ - for (i = 0; i < QDIO_MAX_QUEUES_PER_IRQ; i++) { - q = irq_ptr->input_qs[i]; - if (q) { - free_page((unsigned long) q->slib); - kmem_cache_free(qdio_q_cache, q); - } - } - for (i = 0; i < QDIO_MAX_QUEUES_PER_IRQ; i++) { + for (i = 0; i < irq_ptr->max_output_qs; i++) { q = irq_ptr->output_qs[i]; - if (q) { - if (q->u.out.use_cq) { - int n; + if (q->u.out.use_cq) { + unsigned int n; - for (n = 0; n < QDIO_MAX_BUFFERS_PER_Q; ++n) { - struct qaob *aob = q->u.out.aobs[n]; - if (aob) { - qdio_release_aob(aob); - q->u.out.aobs[n] = NULL; - } + for (n = 0; n < QDIO_MAX_BUFFERS_PER_Q; n++) { + struct qaob *aob = q->u.out.aobs[n]; + + if (aob) { + qdio_release_aob(aob); + q->u.out.aobs[n] = NULL; } - - qdio_disable_async_operation(&q->u.out); } - free_page((unsigned long) q->slib); - kmem_cache_free(qdio_q_cache, q); + + qdio_disable_async_operation(&q->u.out); } } } From 9b7012dfc33db9c44add10ab58033fe167a15f13 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Fri, 17 Apr 2020 14:20:00 +0200 Subject: [PATCH 041/574] s390/qdio: de-duplicate tiqdio_inbound_processing() Except for some initial thinint-only steps, the processing is identical to the non-thinint case. So re-use the existing helper. Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_main.c | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index ef0be46f3e04..68985871b6bf 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -880,47 +880,17 @@ static inline void qdio_check_outbound_pci_queues(struct qdio_irq *irq) qdio_tasklet_schedule(out); } -static void __tiqdio_inbound_processing(struct qdio_q *q) +void tiqdio_inbound_processing(unsigned long data) { - unsigned int start = q->first_to_check; - int count; + struct qdio_q *q = (struct qdio_q *)data; - qperf_inc(q, tasklet_inbound); if (need_siga_sync(q) && need_siga_sync_after_ai(q)) qdio_sync_queues(q); /* The interrupt could be caused by a PCI request: */ qdio_check_outbound_pci_queues(q->irq_ptr); - count = qdio_inbound_q_moved(q, start); - if (count == 0) - return; - - start = add_buf(start, count); - q->first_to_check = start; - qdio_kick_handler(q, count); - - if (!qdio_inbound_q_done(q, start)) { - qperf_inc(q, tasklet_inbound_resched); - if (!qdio_tasklet_schedule(q)) - return; - } - - qdio_stop_polling(q); - /* - * We need to check again to not lose initiative after - * resetting the ACK state. - */ - if (!qdio_inbound_q_done(q, start)) { - qperf_inc(q, tasklet_inbound_resched2); - qdio_tasklet_schedule(q); - } -} - -void tiqdio_inbound_processing(unsigned long data) -{ - struct qdio_q *q = (struct qdio_q *)data; - __tiqdio_inbound_processing(q); + __qdio_inbound_processing(q); } static inline void qdio_set_state(struct qdio_irq *irq_ptr, From 19d4c761c1d2354e6c1deb6e1d31c949028b92d6 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 21 Apr 2020 17:20:00 +0200 Subject: [PATCH 042/574] s390/qdio: remove always-true condition buf_in_between() gets passed q->u.in.ack_start as 'bufnr' parameter. The ack_start always ranges between 0 and QDIO_MAX_BUFFERS_PER_Q - 1, so the subsequent check will always return true. Remove it. Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 68985871b6bf..4bd6dbfe8387 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1450,8 +1450,7 @@ static inline int buf_in_between(int bufnr, int start, int count) } /* wrap-around case */ - if ((bufnr >= start && bufnr <= QDIO_MAX_BUFFERS_PER_Q) || - (bufnr < end)) + if (bufnr >= start || bufnr < end) return 1; else return 0; From ff2a08b39bcede1b08d84d8b5c8ee1336a39c5df Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:39 +0200 Subject: [PATCH 043/574] iommu: Move default domain allocation to separate function Move the code out of iommu_group_get_for_dev() into a separate function. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-2-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 74 ++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2b471419e26c..bfe011760ed1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1361,6 +1361,41 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); +static int iommu_alloc_default_domain(struct device *dev, + struct iommu_group *group) +{ + struct iommu_domain *dom; + + if (group->default_domain) + return 0; + + dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type); + if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) { + dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); + if (dom) { + dev_warn(dev, + "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", + iommu_def_domain_type); + } + } + + if (!dom) + return -ENOMEM; + + group->default_domain = dom; + if (!group->domain) + group->domain = dom; + + if (!iommu_dma_strict) { + int attr = 1; + iommu_domain_set_attr(dom, + DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, + &attr); + } + + return 0; +} + /** * iommu_group_get_for_dev - Find or create the IOMMU group for a device * @dev: target device @@ -1393,40 +1428,21 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) /* * Try to allocate a default domain - needs support from the - * IOMMU driver. + * IOMMU driver. There are still some drivers which don't support + * default domains, so the return value is not yet checked. */ - if (!group->default_domain) { - struct iommu_domain *dom; - - dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type); - if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) { - dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); - if (dom) { - dev_warn(dev, - "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", - iommu_def_domain_type); - } - } - - group->default_domain = dom; - if (!group->domain) - group->domain = dom; - - if (dom && !iommu_dma_strict) { - int attr = 1; - iommu_domain_set_attr(dom, - DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, - &attr); - } - } + iommu_alloc_default_domain(dev, group); ret = iommu_group_add_device(group, dev); - if (ret) { - iommu_group_put(group); - return ERR_PTR(ret); - } + if (ret) + goto out_put_group; return group; + +out_put_group: + iommu_group_put(group); + + return ERR_PTR(ret); } EXPORT_SYMBOL(iommu_group_get_for_dev); From 4cbf38511a007867def958872203ae8adb8e2351 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 29 Apr 2020 15:36:40 +0200 Subject: [PATCH 044/574] iommu: Add def_domain_type() callback in iommu_ops Some devices are reqired to use a specific type (identity or dma) of default domain when they are used with a vendor iommu. When the system level default domain type is different from it, the vendor iommu driver has to request a new default domain with iommu_request_dma_domain_for_dev() and iommu_request_dm_for_dev() in the add_dev() callback. Unfortunately, these two helpers only work when the group hasn't been assigned to any other devices, hence, some vendor iommu driver has to use a private domain if it fails to request a new default one. This adds def_domain_type() callback in the iommu_ops, so that any special requirement of default domain for a device could be aware by the iommu generic layer. Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Lu Baolu [ jroedel@suse.de: Added iommu_get_def_domain_type() function and use it to allocate the default domain ] Co-developed-by: Joerg Roedel Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-3-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 20 +++++++++++++++++--- include/linux/iommu.h | 6 ++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index bfe011760ed1..5877abd9b693 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1361,21 +1361,35 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); +static int iommu_get_def_domain_type(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + unsigned int type = 0; + + if (ops->def_domain_type) + type = ops->def_domain_type(dev); + + return (type == 0) ? iommu_def_domain_type : type; +} + static int iommu_alloc_default_domain(struct device *dev, struct iommu_group *group) { struct iommu_domain *dom; + unsigned int type; if (group->default_domain) return 0; - dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type); - if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) { + type = iommu_get_def_domain_type(dev); + + dom = __iommu_domain_alloc(dev->bus, type); + if (!dom && type != IOMMU_DOMAIN_DMA) { dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); if (dom) { dev_warn(dev, "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", - iommu_def_domain_type); + type); } } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ef8b0bda695..1f027b07e499 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -248,6 +248,10 @@ struct iommu_iotlb_gather { * @cache_invalidate: invalidate translation caches * @sva_bind_gpasid: bind guest pasid and mm * @sva_unbind_gpasid: unbind guest pasid and mm + * @def_domain_type: device default domain type, return value: + * - IOMMU_DOMAIN_IDENTITY: must use an identity domain + * - IOMMU_DOMAIN_DMA: must use a dma domain + * - 0: use the default setting * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops */ @@ -318,6 +322,8 @@ struct iommu_ops { int (*sva_unbind_gpasid)(struct device *dev, int pasid); + int (*def_domain_type)(struct device *dev); + unsigned long pgsize_bitmap; struct module *owner; }; From bdf4a7c4c77dcb91bd64b53b70d9faf3184e88d8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:41 +0200 Subject: [PATCH 045/574] iommu/amd: Implement iommu_ops->def_domain_type call-back Implement the new def_domain_type call-back for the AMD IOMMU driver. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-4-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 20cce366e951..73b4f84cf449 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2661,6 +2661,20 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, amd_iommu_flush_iotlb_all(domain); } +static int amd_iommu_def_domain_type(struct device *dev) +{ + struct iommu_dev_data *dev_data; + + dev_data = get_dev_data(dev); + if (!dev_data) + return 0; + + if (dev_data->iommu_v2) + return IOMMU_DOMAIN_IDENTITY; + + return 0; +} + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, @@ -2680,6 +2694,7 @@ const struct iommu_ops amd_iommu_ops = { .pgsize_bitmap = AMD_IOMMU_PGSIZES, .flush_iotlb_all = amd_iommu_flush_iotlb_all, .iotlb_sync = amd_iommu_iotlb_sync, + .def_domain_type = amd_iommu_def_domain_type, }; /***************************************************************************** From 7039d11b3e4af77cf5ac1f689ae395b2a183bd25 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:42 +0200 Subject: [PATCH 046/574] iommu/vt-d: Wire up iommu_ops->def_domain_type The Intel VT-d driver already has a matching function to determine the default domain type for a device. Wire it up in intel_iommu_ops. Signed-off-by: Joerg Roedel Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20200429133712.31431-5-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ef0a5246700e..b9f905a55dda 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -6209,6 +6209,7 @@ const struct iommu_ops intel_iommu_ops = { .dev_enable_feat = intel_iommu_dev_enable_feat, .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, + .def_domain_type = device_def_domain_type, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, }; From c0da9b9f5afdb56ae2aaccf79f36a358ac8454c8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:43 +0200 Subject: [PATCH 047/574] iommu/amd: Remove dma_mask check from check_device() The check was only needed for the DMA-API implementation in the AMD IOMMU driver, which no longer exists. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-6-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 73b4f84cf449..504f2db75eda 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -326,7 +326,7 @@ static bool check_device(struct device *dev) { int devid; - if (!dev || !dev->dma_mask) + if (!dev) return false; devid = get_device_id(dev); From 57bd2c24ba1673bd47828f6ac5eaaa6292a03582 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:44 +0200 Subject: [PATCH 048/574] iommu/amd: Return -ENODEV in add_device when device is not handled by IOMMU When check_device() fails on the device, it is not handled by the IOMMU and amd_iommu_add_device() needs to return -ENODEV. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-7-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 504f2db75eda..3e0d27f7622e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2157,9 +2157,12 @@ static int amd_iommu_add_device(struct device *dev) struct amd_iommu *iommu; int ret, devid; - if (!check_device(dev) || get_dev_data(dev)) + if (get_dev_data(dev)) return 0; + if (!check_device(dev)) + return -ENODEV; + devid = get_device_id(dev); if (devid < 0) return devid; From a6a4c7e2c5b8b981d1c546a393ff21f2112468c3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:45 +0200 Subject: [PATCH 049/574] iommu: Add probe_device() and release_device() call-backs Add call-backs to 'struct iommu_ops' as an alternative to the add_device() and remove_device() call-backs, which will be removed when all drivers are converted. The new call-backs will not setup IOMMU groups and domains anymore, so also add a probe_finalize() call-back where the IOMMU driver can do per-device setup work which require the device to be set up with a group and a domain. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-8-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 63 ++++++++++++++++++++++++++++++++++++++----- include/linux/iommu.h | 9 +++++++ 2 files changed, 66 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5877abd9b693..6cfe7799dc8c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -174,6 +174,36 @@ static void dev_iommu_free(struct device *dev) dev->iommu = NULL; } +static int __iommu_probe_device(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_device *iommu_dev; + struct iommu_group *group; + int ret; + + iommu_dev = ops->probe_device(dev); + if (IS_ERR(iommu_dev)) + return PTR_ERR(iommu_dev); + + dev->iommu->iommu_dev = iommu_dev; + + group = iommu_group_get_for_dev(dev); + if (!IS_ERR(group)) { + ret = PTR_ERR(group); + goto out_release; + } + iommu_group_put(group); + + iommu_device_link(iommu_dev, dev); + + return 0; + +out_release: + ops->release_device(dev); + + return ret; +} + int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -191,10 +221,17 @@ int iommu_probe_device(struct device *dev) goto err_free_dev_param; } - ret = ops->add_device(dev); + if (ops->probe_device) + ret = __iommu_probe_device(dev); + else + ret = ops->add_device(dev); + if (ret) goto err_module_put; + if (ops->probe_finalize) + ops->probe_finalize(dev); + return 0; err_module_put: @@ -204,17 +241,31 @@ err_free_dev_param: return ret; } +static void __iommu_release_device(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + + iommu_device_unlink(dev->iommu->iommu_dev, dev); + + iommu_group_remove_device(dev); + + ops->release_device(dev); +} + void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; - if (dev->iommu_group) + if (!dev->iommu) + return; + + if (ops->release_device) + __iommu_release_device(dev); + else if (dev->iommu_group) ops->remove_device(dev); - if (dev->iommu) { - module_put(ops->owner); - dev_iommu_free(dev); - } + module_put(ops->owner); + dev_iommu_free(dev); } static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1f027b07e499..30170d191e5e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -225,6 +225,10 @@ struct iommu_iotlb_gather { * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping * @remove_device: remove device from iommu grouping + * @probe_device: Add device to iommu driver handling + * @release_device: Remove device from iommu driver handling + * @probe_finalize: Do final setup work after the device is added to an IOMMU + * group and attached to the groups domain * @device_group: find iommu group for a particular device * @domain_get_attr: Query domain attributes * @domain_set_attr: Change domain attributes @@ -275,6 +279,9 @@ struct iommu_ops { phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); int (*add_device)(struct device *dev); void (*remove_device)(struct device *dev); + struct iommu_device *(*probe_device)(struct device *dev); + void (*release_device)(struct device *dev); + void (*probe_finalize)(struct device *dev); struct iommu_group *(*device_group)(struct device *dev); int (*domain_get_attr)(struct iommu_domain *domain, enum iommu_attr attr, void *data); @@ -375,6 +382,7 @@ struct iommu_fault_param { * * @fault_param: IOMMU detected device fault reporting data * @fwspec: IOMMU fwspec data + * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. @@ -384,6 +392,7 @@ struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; struct iommu_fwspec *fwspec; + struct iommu_device *iommu_dev; void *priv; }; From 6e1aa2049154d7462968c968b20f985859308267 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:46 +0200 Subject: [PATCH 050/574] iommu: Move default domain allocation to iommu_probe_device() Well, not really. The call to iommu_alloc_default_domain() in iommu_group_get_for_dev() has to stay around as long as there are IOMMU drivers using the add/remove_device() call-backs instead of probe/release_device(). Those drivers expect that iommu_group_get_for_dev() returns the device attached to a group and the group set up with a default domain (and the device attached to the groups current domain). But when all drivers are converted this compatability mess can be removed. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-9-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 106 +++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6cfe7799dc8c..7a385c18e1a5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -79,6 +79,16 @@ static bool iommu_cmd_line_dma_api(void) return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API); } +static int iommu_alloc_default_domain(struct device *dev); +static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, + unsigned type); +static int __iommu_attach_device(struct iommu_domain *domain, + struct device *dev); +static int __iommu_attach_group(struct iommu_domain *domain, + struct iommu_group *group); +static void __iommu_detach_group(struct iommu_domain *domain, + struct iommu_group *group); + #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ __ATTR(_name, _mode, _show, _store) @@ -221,10 +231,29 @@ int iommu_probe_device(struct device *dev) goto err_free_dev_param; } - if (ops->probe_device) + if (ops->probe_device) { + struct iommu_group *group; + ret = __iommu_probe_device(dev); - else + + /* + * Try to allocate a default domain - needs support from the + * IOMMU driver. There are still some drivers which don't + * support default domains, so the return value is not yet + * checked. + */ + if (!ret) + iommu_alloc_default_domain(dev); + + group = iommu_group_get(dev); + if (group && group->default_domain) { + ret = __iommu_attach_device(group->default_domain, dev); + iommu_group_put(group); + } + + } else { ret = ops->add_device(dev); + } if (ret) goto err_module_put; @@ -268,15 +297,6 @@ void iommu_release_device(struct device *dev) dev_iommu_free(dev); } -static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, - unsigned type); -static int __iommu_attach_device(struct iommu_domain *domain, - struct device *dev); -static int __iommu_attach_group(struct iommu_domain *domain, - struct iommu_group *group); -static void __iommu_detach_group(struct iommu_domain *domain, - struct iommu_group *group); - static int __init iommu_set_def_domain_type(char *str) { bool pt; @@ -1423,25 +1443,18 @@ static int iommu_get_def_domain_type(struct device *dev) return (type == 0) ? iommu_def_domain_type : type; } -static int iommu_alloc_default_domain(struct device *dev, - struct iommu_group *group) +static int iommu_group_alloc_default_domain(struct bus_type *bus, + struct iommu_group *group, + unsigned int type) { struct iommu_domain *dom; - unsigned int type; - if (group->default_domain) - return 0; - - type = iommu_get_def_domain_type(dev); - - dom = __iommu_domain_alloc(dev->bus, type); + dom = __iommu_domain_alloc(bus, type); if (!dom && type != IOMMU_DOMAIN_DMA) { - dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA); - if (dom) { - dev_warn(dev, - "failed to allocate default IOMMU domain of type %u; falling back to IOMMU_DOMAIN_DMA", - type); - } + dom = __iommu_domain_alloc(bus, IOMMU_DOMAIN_DMA); + if (dom) + pr_warn("Failed to allocate default IOMMU domain of type %u for group %s - Falling back to IOMMU_DOMAIN_DMA", + type, group->name); } if (!dom) @@ -1461,6 +1474,23 @@ static int iommu_alloc_default_domain(struct device *dev, return 0; } +static int iommu_alloc_default_domain(struct device *dev) +{ + struct iommu_group *group; + unsigned int type; + + group = iommu_group_get(dev); + if (!group) + return -ENODEV; + + if (group->default_domain) + return 0; + + type = iommu_get_def_domain_type(dev); + + return iommu_group_alloc_default_domain(dev->bus, group, type); +} + /** * iommu_group_get_for_dev - Find or create the IOMMU group for a device * @dev: target device @@ -1491,17 +1521,27 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) if (IS_ERR(group)) return group; - /* - * Try to allocate a default domain - needs support from the - * IOMMU driver. There are still some drivers which don't support - * default domains, so the return value is not yet checked. - */ - iommu_alloc_default_domain(dev, group); - ret = iommu_group_add_device(group, dev); if (ret) goto out_put_group; + /* + * Try to allocate a default domain - needs support from the + * IOMMU driver. There are still some drivers which don't support + * default domains, so the return value is not yet checked. Only + * allocate the domain here when the driver still has the + * add_device/remove_device call-backs implemented. + */ + if (!ops->probe_device) { + iommu_alloc_default_domain(dev); + + if (group->default_domain) + ret = __iommu_attach_device(group->default_domain, dev); + + if (ret) + goto out_put_group; + } + return group; out_put_group: From 41df6dcc0a3ff4fb654c3d969ab96ba9c4f0e796 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:47 +0200 Subject: [PATCH 051/574] iommu: Keep a list of allocated groups in __iommu_probe_device() This is needed to defer default_domain allocation for new IOMMU groups until all devices have been added to the group. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-10-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7a385c18e1a5..18eb3623bd00 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -44,6 +44,7 @@ struct iommu_group { int id; struct iommu_domain *default_domain; struct iommu_domain *domain; + struct list_head entry; }; struct group_device { @@ -184,7 +185,7 @@ static void dev_iommu_free(struct device *dev) dev->iommu = NULL; } -static int __iommu_probe_device(struct device *dev) +static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_device *iommu_dev; @@ -204,6 +205,9 @@ static int __iommu_probe_device(struct device *dev) } iommu_group_put(group); + if (group_list && !group->default_domain && list_empty(&group->entry)) + list_add_tail(&group->entry, group_list); + iommu_device_link(iommu_dev, dev); return 0; @@ -234,7 +238,7 @@ int iommu_probe_device(struct device *dev) if (ops->probe_device) { struct iommu_group *group; - ret = __iommu_probe_device(dev); + ret = __iommu_probe_device(dev, NULL); /* * Try to allocate a default domain - needs support from the @@ -567,6 +571,7 @@ struct iommu_group *iommu_group_alloc(void) group->kobj.kset = iommu_group_kset; mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); + INIT_LIST_HEAD(&group->entry); BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); ret = ida_simple_get(&iommu_group_ida, 0, 0, GFP_KERNEL); From cf193888bfbd3d57e03a511e49d26f7d9c6f76df Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:48 +0200 Subject: [PATCH 052/574] iommu: Move new probe_device path to separate function This makes it easier to remove to old code-path when all drivers are converted. As a side effect that it also fixes the error cleanup path. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-11-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 69 ++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 18eb3623bd00..8be047a4808f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -218,12 +218,55 @@ out_release: return ret; } +static int __iommu_probe_device_helper(struct device *dev) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_group *group; + int ret; + + ret = __iommu_probe_device(dev, NULL); + if (ret) + goto err_out; + + /* + * Try to allocate a default domain - needs support from the + * IOMMU driver. There are still some drivers which don't + * support default domains, so the return value is not yet + * checked. + */ + iommu_alloc_default_domain(dev); + + group = iommu_group_get(dev); + if (!group) + goto err_release; + + if (group->default_domain) + ret = __iommu_attach_device(group->default_domain, dev); + + iommu_group_put(group); + + if (ret) + goto err_release; + + if (ops->probe_finalize) + ops->probe_finalize(dev); + + return 0; + +err_release: + iommu_release_device(dev); +err_out: + return ret; + +} + int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; int ret; WARN_ON(dev->iommu_group); + if (!ops) return -EINVAL; @@ -235,30 +278,10 @@ int iommu_probe_device(struct device *dev) goto err_free_dev_param; } - if (ops->probe_device) { - struct iommu_group *group; - - ret = __iommu_probe_device(dev, NULL); - - /* - * Try to allocate a default domain - needs support from the - * IOMMU driver. There are still some drivers which don't - * support default domains, so the return value is not yet - * checked. - */ - if (!ret) - iommu_alloc_default_domain(dev); - - group = iommu_group_get(dev); - if (group && group->default_domain) { - ret = __iommu_attach_device(group->default_domain, dev); - iommu_group_put(group); - } - - } else { - ret = ops->add_device(dev); - } + if (ops->probe_device) + return __iommu_probe_device_helper(dev); + ret = ops->add_device(dev); if (ret) goto err_module_put; From deac0b3bed26bb5d04486696b1071d8ec3851100 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:49 +0200 Subject: [PATCH 053/574] iommu: Split off default domain allocation from group assignment When a bus is initialized with iommu-ops, all devices on the bus are scanned and iommu-groups are allocated for them, and each groups will also get a default domain allocated. Until now this happened as soon as the group was created and the first device added to it. When other devices with different default domain requirements were added to the group later on, the default domain was re-allocated, if possible. This resulted in some back and forth and unnecessary allocations, so change the flow to defer default domain allocation until all devices have been added to their respective IOMMU groups. The default domains are allocated for newly allocated groups after each device on the bus is handled and was probed by the IOMMU driver. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-12-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 154 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 151 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8be047a4808f..7de0e29db333 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -199,7 +199,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list dev->iommu->iommu_dev = iommu_dev; group = iommu_group_get_for_dev(dev); - if (!IS_ERR(group)) { + if (IS_ERR(group)) { ret = PTR_ERR(group); goto out_release; } @@ -1599,6 +1599,37 @@ static int add_iommu_group(struct device *dev, void *data) return ret; } +static int probe_iommu_group(struct device *dev, void *data) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct list_head *group_list = data; + int ret; + + if (!dev_iommu_get(dev)) + return -ENOMEM; + + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_free_dev_iommu; + } + + ret = __iommu_probe_device(dev, group_list); + if (ret) + goto err_module_put; + + return 0; + +err_module_put: + module_put(ops->owner); +err_free_dev_iommu: + dev_iommu_free(dev); + + if (ret == -ENODEV) + ret = 0; + + return ret; +} + static int remove_iommu_group(struct device *dev, void *data) { iommu_release_device(dev); @@ -1658,10 +1689,127 @@ static int iommu_bus_notifier(struct notifier_block *nb, return 0; } +struct __group_domain_type { + struct device *dev; + unsigned int type; +}; + +static int probe_get_default_domain_type(struct device *dev, void *data) +{ + const struct iommu_ops *ops = dev->bus->iommu_ops; + struct __group_domain_type *gtype = data; + unsigned int type = 0; + + if (ops->def_domain_type) + type = ops->def_domain_type(dev); + + if (type) { + if (gtype->type && gtype->type != type) { + dev_warn(dev, "Device needs domain type %s, but device %s in the same iommu group requires type %s - using default\n", + iommu_domain_type_str(type), + dev_name(gtype->dev), + iommu_domain_type_str(gtype->type)); + gtype->type = 0; + } + + if (!gtype->dev) { + gtype->dev = dev; + gtype->type = type; + } + } + + return 0; +} + +static void probe_alloc_default_domain(struct bus_type *bus, + struct iommu_group *group) +{ + struct __group_domain_type gtype; + + memset(>ype, 0, sizeof(gtype)); + + /* Ask for default domain requirements of all devices in the group */ + __iommu_group_for_each_dev(group, >ype, + probe_get_default_domain_type); + + if (!gtype.type) + gtype.type = iommu_def_domain_type; + + iommu_group_alloc_default_domain(bus, group, gtype.type); +} + +static int iommu_group_do_dma_attach(struct device *dev, void *data) +{ + struct iommu_domain *domain = data; + const struct iommu_ops *ops; + int ret; + + ret = __iommu_attach_device(domain, dev); + + ops = domain->ops; + + if (ret == 0 && ops->probe_finalize) + ops->probe_finalize(dev); + + return ret; +} + +static int __iommu_group_dma_attach(struct iommu_group *group) +{ + return __iommu_group_for_each_dev(group, group->default_domain, + iommu_group_do_dma_attach); +} + +static int bus_iommu_probe(struct bus_type *bus) +{ + const struct iommu_ops *ops = bus->iommu_ops; + int ret; + + if (ops->probe_device) { + struct iommu_group *group, *next; + LIST_HEAD(group_list); + + /* + * This code-path does not allocate the default domain when + * creating the iommu group, so do it after the groups are + * created. + */ + ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); + if (ret) + return ret; + + list_for_each_entry_safe(group, next, &group_list, entry) { + /* Remove item from the list */ + list_del_init(&group->entry); + + mutex_lock(&group->mutex); + + /* Try to allocate default domain */ + probe_alloc_default_domain(bus, group); + + if (!group->default_domain) { + mutex_unlock(&group->mutex); + continue; + } + + ret = __iommu_group_dma_attach(group); + + mutex_unlock(&group->mutex); + + if (ret) + break; + } + } else { + ret = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + } + + return ret; +} + static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops) { - int err; struct notifier_block *nb; + int err; nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); if (!nb) @@ -1673,7 +1821,7 @@ static int iommu_bus_init(struct bus_type *bus, const struct iommu_ops *ops) if (err) goto out_free; - err = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + err = bus_iommu_probe(bus); if (err) goto out_err; From ce574c27ae275bc51b6437883fc9cd1c46b498e5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:50 +0200 Subject: [PATCH 054/574] iommu: Move iommu_group_create_direct_mappings() out of iommu_group_add_device() After the previous changes the iommu group may not have a default domain when iommu_group_add_device() is called. With no default domain iommu_group_create_direct_mappings() will do nothing and no direct mappings will be created. Rename iommu_group_create_direct_mappings() to iommu_create_device_direct_mappings() to better reflect that the function creates direct mappings only for one device and not for all devices in the group. Then move the call to the places where a default domain actually exists. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-13-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7de0e29db333..834a45da0ed0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -89,6 +89,8 @@ static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); static void __iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group); +static int iommu_create_device_direct_mappings(struct iommu_group *group, + struct device *dev); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -243,6 +245,8 @@ static int __iommu_probe_device_helper(struct device *dev) if (group->default_domain) ret = __iommu_attach_device(group->default_domain, dev); + iommu_create_device_direct_mappings(group, dev); + iommu_group_put(group); if (ret) @@ -263,6 +267,7 @@ err_out: int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; + struct iommu_group *group; int ret; WARN_ON(dev->iommu_group); @@ -285,6 +290,10 @@ int iommu_probe_device(struct device *dev) if (ret) goto err_module_put; + group = iommu_group_get(dev); + iommu_create_device_direct_mappings(group, dev); + iommu_group_put(group); + if (ops->probe_finalize) ops->probe_finalize(dev); @@ -736,8 +745,8 @@ int iommu_group_set_name(struct iommu_group *group, const char *name) } EXPORT_SYMBOL_GPL(iommu_group_set_name); -static int iommu_group_create_direct_mappings(struct iommu_group *group, - struct device *dev) +static int iommu_create_device_direct_mappings(struct iommu_group *group, + struct device *dev) { struct iommu_domain *domain = group->default_domain; struct iommu_resv_region *entry; @@ -841,8 +850,6 @@ rename: dev->iommu_group = group; - iommu_group_create_direct_mappings(group, dev); - mutex_lock(&group->mutex); list_add_tail(&device->list, &group->devices); if (group->domain) @@ -1736,6 +1743,7 @@ static void probe_alloc_default_domain(struct bus_type *bus, gtype.type = iommu_def_domain_type; iommu_group_alloc_default_domain(bus, group, gtype.type); + } static int iommu_group_do_dma_attach(struct device *dev, void *data) @@ -1760,6 +1768,21 @@ static int __iommu_group_dma_attach(struct iommu_group *group) iommu_group_do_dma_attach); } +static int iommu_do_create_direct_mappings(struct device *dev, void *data) +{ + struct iommu_group *group = data; + + iommu_create_device_direct_mappings(group, dev); + + return 0; +} + +static int iommu_group_create_direct_mappings(struct iommu_group *group) +{ + return __iommu_group_for_each_dev(group, group, + iommu_do_create_direct_mappings); +} + static int bus_iommu_probe(struct bus_type *bus) { const struct iommu_ops *ops = bus->iommu_ops; @@ -1792,6 +1815,8 @@ static int bus_iommu_probe(struct bus_type *bus) continue; } + iommu_group_create_direct_mappings(group); + ret = __iommu_group_dma_attach(group); mutex_unlock(&group->mutex); @@ -2632,7 +2657,7 @@ request_default_domain_for_dev(struct device *dev, unsigned long type) iommu_domain_free(group->default_domain); group->default_domain = domain; - iommu_group_create_direct_mappings(group, dev); + iommu_create_device_direct_mappings(group, dev); dev_info(dev, "Using iommu %s mapping\n", type == IOMMU_DOMAIN_DMA ? "dma" : "direct"); From 5012c3968537e2ffecbdb2eba3479bf9fb9e5597 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:51 +0200 Subject: [PATCH 055/574] iommu: Export bus_iommu_probe() and make is safe for re-probing Add a check to the bus_iommu_probe() call-path to make sure it ignores devices which have already been successfully probed. Then export the bus_iommu_probe() function so it can be used by IOMMU drivers. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-14-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 10 +++++++++- include/linux/iommu.h | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 834a45da0ed0..397fd4fd0c32 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1610,11 +1610,19 @@ static int probe_iommu_group(struct device *dev, void *data) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct list_head *group_list = data; + struct iommu_group *group; int ret; if (!dev_iommu_get(dev)) return -ENOMEM; + /* Device is probed already if in a group */ + group = iommu_group_get(dev); + if (group) { + iommu_group_put(group); + return 0; + } + if (!try_module_get(ops->owner)) { ret = -EINVAL; goto err_free_dev_iommu; @@ -1783,7 +1791,7 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group) iommu_do_create_direct_mappings); } -static int bus_iommu_probe(struct bus_type *bus) +int bus_iommu_probe(struct bus_type *bus) { const struct iommu_ops *ops = bus->iommu_ops; int ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 30170d191e5e..fea1622408ad 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -445,6 +445,7 @@ static inline void iommu_iotlb_gather_init(struct iommu_iotlb_gather *gather) #define IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER 6 /* Post Driver unbind */ extern int bus_set_iommu(struct bus_type *bus, const struct iommu_ops *ops); +extern int bus_iommu_probe(struct bus_type *bus); extern bool iommu_present(struct bus_type *bus); extern bool iommu_capable(struct bus_type *bus, enum iommu_cap cap); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); From 57f9842e48840684f596db1de936d7c6d44cd087 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:52 +0200 Subject: [PATCH 056/574] iommu/amd: Remove dev_data->passthrough Make use of generic IOMMU infrastructure to gather the same information carried in dev_data->passthrough and remove the struct member. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-15-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 10 +++++----- drivers/iommu/amd_iommu_types.h | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 3e0d27f7622e..0b4b4faa876d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2047,8 +2047,8 @@ out_err: static int attach_device(struct device *dev, struct protection_domain *domain) { - struct pci_dev *pdev; struct iommu_dev_data *dev_data; + struct pci_dev *pdev; unsigned long flags; int ret; @@ -2067,8 +2067,10 @@ static int attach_device(struct device *dev, pdev = to_pci_dev(dev); if (domain->flags & PD_IOMMUV2_MASK) { + struct iommu_domain *def_domain = iommu_get_dma_domain(dev); + ret = -EINVAL; - if (!dev_data->passthrough) + if (def_domain->type != IOMMU_DOMAIN_IDENTITY) goto out; if (dev_data->iommu_v2) { @@ -2189,9 +2191,7 @@ static int amd_iommu_add_device(struct device *dev) /* Domains are initialized for this device - have a look what we ended up with */ domain = iommu_get_domain_for_dev(dev); - if (domain->type == IOMMU_DOMAIN_IDENTITY) - dev_data->passthrough = true; - else if (domain->type == IOMMU_DOMAIN_DMA) + if (domain->type == IOMMU_DOMAIN_DMA) iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0); out: diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index ca8c4522045b..d0d7b6a0c3d8 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -640,7 +640,6 @@ struct iommu_dev_data { struct pci_dev *pdev; u16 devid; /* PCI Device ID */ bool iommu_v2; /* Device can make use of IOMMUv2 */ - bool passthrough; /* Device is identity mapped */ struct { bool enabled; int qdep; From dce8d6964ebdb333383bacf5e7ab8c27df151218 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:53 +0200 Subject: [PATCH 057/574] iommu/amd: Convert to probe/release_device() call-backs Convert the AMD IOMMU Driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-16-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 71 ++++++++++++--------------------------- 1 file changed, 22 insertions(+), 49 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 0b4b4faa876d..c30367413683 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -343,21 +343,9 @@ static bool check_device(struct device *dev) return true; } -static void init_iommu_group(struct device *dev) -{ - struct iommu_group *group; - - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return; - - iommu_group_put(group); -} - static int iommu_init_device(struct device *dev) { struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; int devid; if (dev->archdata.iommu) @@ -367,8 +355,6 @@ static int iommu_init_device(struct device *dev) if (devid < 0) return devid; - iommu = amd_iommu_rlookup_table[devid]; - dev_data = find_dev_data(devid); if (!dev_data) return -ENOMEM; @@ -391,8 +377,6 @@ static int iommu_init_device(struct device *dev) dev->archdata.iommu = dev_data; - iommu_device_link(&iommu->iommu, dev); - return 0; } @@ -410,7 +394,7 @@ static void iommu_ignore_device(struct device *dev) setup_aliases(dev); } -static void iommu_uninit_device(struct device *dev) +static void amd_iommu_uninit_device(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; @@ -429,13 +413,6 @@ static void iommu_uninit_device(struct device *dev) if (dev_data->domain) detach_device(dev); - iommu_device_unlink(&iommu->iommu, dev); - - iommu_group_remove_device(dev); - - /* Remove dma-ops */ - dev->dma_ops = NULL; - /* * We keep dev_data around for unplugged devices and reuse it when the * device is re-plugged - not doing so would introduce a ton of races. @@ -2152,55 +2129,50 @@ out: spin_unlock_irqrestore(&domain->lock, flags); } -static int amd_iommu_add_device(struct device *dev) +static struct iommu_device *amd_iommu_probe_device(struct device *dev) { - struct iommu_dev_data *dev_data; - struct iommu_domain *domain; + struct iommu_device *iommu_dev; struct amd_iommu *iommu; int ret, devid; - if (get_dev_data(dev)) - return 0; - if (!check_device(dev)) - return -ENODEV; + return ERR_PTR(-ENODEV); devid = get_device_id(dev); if (devid < 0) - return devid; + return ERR_PTR(devid); iommu = amd_iommu_rlookup_table[devid]; + if (get_dev_data(dev)) + return &iommu->iommu; + ret = iommu_init_device(dev); if (ret) { if (ret != -ENOTSUPP) dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); - + iommu_dev = ERR_PTR(ret); iommu_ignore_device(dev); - dev->dma_ops = NULL; - goto out; + } else { + iommu_dev = &iommu->iommu; } - init_iommu_group(dev); - dev_data = get_dev_data(dev); + iommu_completion_wait(iommu); - BUG_ON(!dev_data); + return iommu_dev; +} - if (dev_data->iommu_v2) - iommu_request_dm_for_dev(dev); +static void amd_iommu_probe_finalize(struct device *dev) +{ + struct iommu_domain *domain; /* Domains are initialized for this device - have a look what we ended up with */ domain = iommu_get_domain_for_dev(dev); if (domain->type == IOMMU_DOMAIN_DMA) iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0); - -out: - iommu_completion_wait(iommu); - - return 0; } -static void amd_iommu_remove_device(struct device *dev) +static void amd_iommu_release_device(struct device *dev) { struct amd_iommu *iommu; int devid; @@ -2214,7 +2186,7 @@ static void amd_iommu_remove_device(struct device *dev) iommu = amd_iommu_rlookup_table[devid]; - iommu_uninit_device(dev); + amd_iommu_uninit_device(dev); iommu_completion_wait(iommu); } @@ -2687,8 +2659,9 @@ const struct iommu_ops amd_iommu_ops = { .map = amd_iommu_map, .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, - .add_device = amd_iommu_add_device, - .remove_device = amd_iommu_remove_device, + .probe_device = amd_iommu_probe_device, + .release_device = amd_iommu_release_device, + .probe_finalize = amd_iommu_probe_finalize, .device_group = amd_iommu_device_group, .domain_get_attr = amd_iommu_domain_get_attr, .get_resv_regions = amd_iommu_get_resv_regions, From e5d1841f18b2401c8b449c024817cd243e363934 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:54 +0200 Subject: [PATCH 058/574] iommu/vt-d: Convert to probe/release_device() call-backs Convert the Intel IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20200429133712.31431-17-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 67 ++++--------------------------------- 1 file changed, 6 insertions(+), 61 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index b9f905a55dda..b906727f5b85 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5781,78 +5781,27 @@ static bool intel_iommu_capable(enum iommu_cap cap) return false; } -static int intel_iommu_add_device(struct device *dev) +static struct iommu_device *intel_iommu_probe_device(struct device *dev) { - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; struct intel_iommu *iommu; - struct iommu_group *group; u8 bus, devfn; - int ret; iommu = device_to_iommu(dev, &bus, &devfn); if (!iommu) - return -ENODEV; - - iommu_device_link(&iommu->iommu, dev); + return ERR_PTR(-ENODEV); if (translation_pre_enabled(iommu)) dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO; - group = iommu_group_get_for_dev(dev); - - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto unlink; - } - - iommu_group_put(group); - - domain = iommu_get_domain_for_dev(dev); - dmar_domain = to_dmar_domain(domain); - if (domain->type == IOMMU_DOMAIN_DMA) { - if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) { - ret = iommu_request_dm_for_dev(dev); - if (ret) { - dmar_remove_one_dev_info(dev); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - domain_add_dev_info(si_domain, dev); - dev_info(dev, - "Device uses a private identity domain.\n"); - } - } - } else { - if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) { - ret = iommu_request_dma_domain_for_dev(dev); - if (ret) { - dmar_remove_one_dev_info(dev); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - if (!get_private_domain_for_dev(dev)) { - dev_warn(dev, - "Failed to get a private domain.\n"); - ret = -ENOMEM; - goto unlink; - } - - dev_info(dev, - "Device uses a private dma domain.\n"); - } - } - } - if (device_needs_bounce(dev)) { dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n"); set_dma_ops(dev, &bounce_dma_ops); } - return 0; - -unlink: - iommu_device_unlink(&iommu->iommu, dev); - return ret; + return &iommu->iommu; } -static void intel_iommu_remove_device(struct device *dev) +static void intel_iommu_release_device(struct device *dev) { struct intel_iommu *iommu; u8 bus, devfn; @@ -5863,10 +5812,6 @@ static void intel_iommu_remove_device(struct device *dev) dmar_remove_one_dev_info(dev); - iommu_group_remove_device(dev); - - iommu_device_unlink(&iommu->iommu, dev); - if (device_needs_bounce(dev)) set_dma_ops(dev, NULL); } @@ -6198,8 +6143,8 @@ const struct iommu_ops intel_iommu_ops = { .map = intel_iommu_map, .unmap = intel_iommu_unmap, .iova_to_phys = intel_iommu_iova_to_phys, - .add_device = intel_iommu_add_device, - .remove_device = intel_iommu_remove_device, + .probe_device = intel_iommu_probe_device, + .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, .apply_resv_region = intel_iommu_apply_resv_region, From cefa0d55da3753e969764fb4b161052a1cb4ddfb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:55 +0200 Subject: [PATCH 059/574] iommu/arm-smmu: Convert to probe/release_device() call-backs Convert the arm-smmu and arm-smmu-v3 drivers to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-18-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/arm-smmu-v3.c | 38 ++++++++++-------------------------- drivers/iommu/arm-smmu.c | 39 ++++++++++++++----------------------- 2 files changed, 25 insertions(+), 52 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 82508730feb7..42e1ee7e5197 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2914,27 +2914,26 @@ static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid) static struct iommu_ops arm_smmu_ops; -static int arm_smmu_add_device(struct device *dev) +static struct iommu_device *arm_smmu_probe_device(struct device *dev) { int i, ret; struct arm_smmu_device *smmu; struct arm_smmu_master *master; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct iommu_group *group; if (!fwspec || fwspec->ops != &arm_smmu_ops) - return -ENODEV; + return ERR_PTR(-ENODEV); if (WARN_ON_ONCE(dev_iommu_priv_get(dev))) - return -EBUSY; + return ERR_PTR(-EBUSY); smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); if (!smmu) - return -ENODEV; + return ERR_PTR(-ENODEV); master = kzalloc(sizeof(*master), GFP_KERNEL); if (!master) - return -ENOMEM; + return ERR_PTR(-ENOMEM); master->dev = dev; master->smmu = smmu; @@ -2975,30 +2974,15 @@ static int arm_smmu_add_device(struct device *dev) master->ssid_bits = min_t(u8, master->ssid_bits, CTXDESC_LINEAR_CDMAX); - ret = iommu_device_link(&smmu->iommu, dev); - if (ret) - goto err_disable_pasid; + return &smmu->iommu; - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto err_unlink; - } - - iommu_group_put(group); - return 0; - -err_unlink: - iommu_device_unlink(&smmu->iommu, dev); -err_disable_pasid: - arm_smmu_disable_pasid(master); err_free_master: kfree(master); dev_iommu_priv_set(dev, NULL); - return ret; + return ERR_PTR(ret); } -static void arm_smmu_remove_device(struct device *dev) +static void arm_smmu_release_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master *master; @@ -3010,8 +2994,6 @@ static void arm_smmu_remove_device(struct device *dev) master = dev_iommu_priv_get(dev); smmu = master->smmu; arm_smmu_detach_dev(master); - iommu_group_remove_device(dev); - iommu_device_unlink(&smmu->iommu, dev); arm_smmu_disable_pasid(master); kfree(master); iommu_fwspec_free(dev); @@ -3138,8 +3120,8 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .add_device = arm_smmu_add_device, - .remove_device = arm_smmu_remove_device, + .probe_device = arm_smmu_probe_device, + .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, .domain_get_attr = arm_smmu_domain_get_attr, .domain_set_attr = arm_smmu_domain_set_attr, diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index a6a5796e9c41..e622f4e33379 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -220,7 +220,7 @@ static int arm_smmu_register_legacy_master(struct device *dev, * With the legacy DT binding in play, we have no guarantees about * probe order, but then we're also not doing default domains, so we can * delay setting bus ops until we're sure every possible SMMU is ready, - * and that way ensure that no add_device() calls get missed. + * and that way ensure that no probe_device() calls get missed. */ static int arm_smmu_legacy_bus_init(void) { @@ -1062,7 +1062,6 @@ static int arm_smmu_master_alloc_smes(struct device *dev) struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); struct arm_smmu_device *smmu = cfg->smmu; struct arm_smmu_smr *smrs = smmu->smrs; - struct iommu_group *group; int i, idx, ret; mutex_lock(&smmu->stream_map_mutex); @@ -1090,18 +1089,9 @@ static int arm_smmu_master_alloc_smes(struct device *dev) cfg->smendx[i] = (s16)idx; } - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto out_err; - } - iommu_group_put(group); - /* It worked! Now, poke the actual hardware */ - for_each_cfg_sme(cfg, fwspec, i, idx) { + for_each_cfg_sme(cfg, fwspec, i, idx) arm_smmu_write_sme(smmu, idx); - smmu->s2crs[idx].group = group; - } mutex_unlock(&smmu->stream_map_mutex); return 0; @@ -1172,7 +1162,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) /* * FIXME: The arch/arm DMA API code tries to attach devices to its own - * domains between of_xlate() and add_device() - we have no way to cope + * domains between of_xlate() and probe_device() - we have no way to cope * with that, so until ARM gets converted to rely on groups and default * domains, just say no (but more politely than by dereferencing NULL). * This should be at least a WARN_ON once that's sorted. @@ -1382,7 +1372,7 @@ struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) return dev ? dev_get_drvdata(dev) : NULL; } -static int arm_smmu_add_device(struct device *dev) +static struct iommu_device *arm_smmu_probe_device(struct device *dev) { struct arm_smmu_device *smmu = NULL; struct arm_smmu_master_cfg *cfg; @@ -1403,7 +1393,7 @@ static int arm_smmu_add_device(struct device *dev) } else if (fwspec && fwspec->ops == &arm_smmu_ops) { smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); } else { - return -ENODEV; + return ERR_PTR(-ENODEV); } ret = -EINVAL; @@ -1444,21 +1434,19 @@ static int arm_smmu_add_device(struct device *dev) if (ret) goto out_cfg_free; - iommu_device_link(&smmu->iommu, dev); - device_link_add(dev, smmu->dev, DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_SUPPLIER); - return 0; + return &smmu->iommu; out_cfg_free: kfree(cfg); out_free: iommu_fwspec_free(dev); - return ret; + return ERR_PTR(ret); } -static void arm_smmu_remove_device(struct device *dev) +static void arm_smmu_release_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master_cfg *cfg; @@ -1475,13 +1463,11 @@ static void arm_smmu_remove_device(struct device *dev) if (ret < 0) return; - iommu_device_unlink(&smmu->iommu, dev); arm_smmu_master_free_smes(cfg, fwspec); arm_smmu_rpm_put(smmu); dev_iommu_priv_set(dev, NULL); - iommu_group_remove_device(dev); kfree(cfg); iommu_fwspec_free(dev); } @@ -1512,6 +1498,11 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) else group = generic_device_group(dev); + /* Remember group for faster lookups */ + if (!IS_ERR(group)) + for_each_cfg_sme(cfg, fwspec, i, idx) + smmu->s2crs[idx].group = group; + return group; } @@ -1628,8 +1619,8 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .add_device = arm_smmu_add_device, - .remove_device = arm_smmu_remove_device, + .probe_device = arm_smmu_probe_device, + .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, .domain_get_attr = arm_smmu_domain_get_attr, .domain_set_attr = arm_smmu_domain_set_attr, From 52dd3ca4176f7f87452ee4b9d9a6d2e57ac4ccb7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:56 +0200 Subject: [PATCH 060/574] iommu/pamu: Convert to probe/release_device() call-backs Convert the PAMU IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-19-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/fsl_pamu_domain.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 06828e2698d5..928d37771ece 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -1016,25 +1016,13 @@ static struct iommu_group *fsl_pamu_device_group(struct device *dev) return group; } -static int fsl_pamu_add_device(struct device *dev) +static struct iommu_device *fsl_pamu_probe_device(struct device *dev) { - struct iommu_group *group; - - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); - - iommu_group_put(group); - - iommu_device_link(&pamu_iommu, dev); - - return 0; + return &pamu_iommu; } -static void fsl_pamu_remove_device(struct device *dev) +static void fsl_pamu_release_device(struct device *dev) { - iommu_device_unlink(&pamu_iommu, dev); - iommu_group_remove_device(dev); } static const struct iommu_ops fsl_pamu_ops = { @@ -1048,8 +1036,8 @@ static const struct iommu_ops fsl_pamu_ops = { .iova_to_phys = fsl_pamu_iova_to_phys, .domain_set_attr = fsl_pamu_set_domain_attr, .domain_get_attr = fsl_pamu_get_domain_attr, - .add_device = fsl_pamu_add_device, - .remove_device = fsl_pamu_remove_device, + .probe_device = fsl_pamu_probe_device, + .release_device = fsl_pamu_release_device, .device_group = fsl_pamu_device_group, }; From 522af649e57b3e3a347a8261f609f73ab4143f20 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:57 +0200 Subject: [PATCH 061/574] iommu/s390: Convert to probe/release_device() call-backs Convert the S390 IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-20-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 1137f3ddcb85..610f0828f22d 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -166,21 +166,14 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, } } -static int s390_iommu_add_device(struct device *dev) +static struct iommu_device *s390_iommu_probe_device(struct device *dev) { - struct iommu_group *group = iommu_group_get_for_dev(dev); struct zpci_dev *zdev = to_pci_dev(dev)->sysdata; - if (IS_ERR(group)) - return PTR_ERR(group); - - iommu_group_put(group); - iommu_device_link(&zdev->iommu_dev, dev); - - return 0; + return &zdev->iommu_dev; } -static void s390_iommu_remove_device(struct device *dev) +static void s390_iommu_release_device(struct device *dev) { struct zpci_dev *zdev = to_pci_dev(dev)->sysdata; struct iommu_domain *domain; @@ -191,7 +184,7 @@ static void s390_iommu_remove_device(struct device *dev) * to vfio-pci and completing the VFIO_SET_IOMMU ioctl (which triggers * the attach_dev), removing the device via * "echo 1 > /sys/bus/pci/devices/.../remove" won't trigger detach_dev, - * only remove_device will be called via the BUS_NOTIFY_REMOVED_DEVICE + * only release_device will be called via the BUS_NOTIFY_REMOVED_DEVICE * notifier. * * So let's call detach_dev from here if it hasn't been called before. @@ -201,9 +194,6 @@ static void s390_iommu_remove_device(struct device *dev) if (domain) s390_iommu_detach_device(domain, dev); } - - iommu_device_unlink(&zdev->iommu_dev, dev); - iommu_group_remove_device(dev); } static int s390_iommu_update_trans(struct s390_domain *s390_domain, @@ -373,8 +363,8 @@ static const struct iommu_ops s390_iommu_ops = { .map = s390_iommu_map, .unmap = s390_iommu_unmap, .iova_to_phys = s390_iommu_iova_to_phys, - .add_device = s390_iommu_add_device, - .remove_device = s390_iommu_remove_device, + .probe_device = s390_iommu_probe_device, + .release_device = s390_iommu_release_device, .device_group = generic_device_group, .pgsize_bitmap = S390_IOMMU_PGSIZES, }; From 21acf6599cfb4407e9745b36f69c93cf99a3d189 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:58 +0200 Subject: [PATCH 062/574] iommu/virtio: Convert to probe/release_device() call-backs Convert the VirtIO IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-21-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/virtio-iommu.c | 41 +++++++++--------------------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index d5cac4f46ca5..bda300c2a438 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -865,24 +865,23 @@ static struct viommu_dev *viommu_get_by_fwnode(struct fwnode_handle *fwnode) return dev ? dev_to_virtio(dev)->priv : NULL; } -static int viommu_add_device(struct device *dev) +static struct iommu_device *viommu_probe_device(struct device *dev) { int ret; - struct iommu_group *group; struct viommu_endpoint *vdev; struct viommu_dev *viommu = NULL; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); if (!fwspec || fwspec->ops != &viommu_ops) - return -ENODEV; + return ERR_PTR(-ENODEV); viommu = viommu_get_by_fwnode(fwspec->iommu_fwnode); if (!viommu) - return -ENODEV; + return ERR_PTR(-ENODEV); vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) - return -ENOMEM; + return ERR_PTR(-ENOMEM); vdev->dev = dev; vdev->viommu = viommu; @@ -896,45 +895,25 @@ static int viommu_add_device(struct device *dev) goto err_free_dev; } - ret = iommu_device_link(&viommu->iommu, dev); - if (ret) - goto err_free_dev; + return &viommu->iommu; - /* - * Last step creates a default domain and attaches to it. Everything - * must be ready. - */ - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto err_unlink_dev; - } - - iommu_group_put(group); - - return PTR_ERR_OR_ZERO(group); - -err_unlink_dev: - iommu_device_unlink(&viommu->iommu, dev); err_free_dev: generic_iommu_put_resv_regions(dev, &vdev->resv_regions); kfree(vdev); - return ret; + return ERR_PTR(ret); } -static void viommu_remove_device(struct device *dev) +static void viommu_release_device(struct device *dev) { - struct viommu_endpoint *vdev; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct viommu_endpoint *vdev; if (!fwspec || fwspec->ops != &viommu_ops) return; vdev = dev_iommu_priv_get(dev); - iommu_group_remove_device(dev); - iommu_device_unlink(&vdev->viommu->iommu, dev); generic_iommu_put_resv_regions(dev, &vdev->resv_regions); kfree(vdev); } @@ -960,8 +939,8 @@ static struct iommu_ops viommu_ops = { .unmap = viommu_unmap, .iova_to_phys = viommu_iova_to_phys, .iotlb_sync = viommu_iotlb_sync, - .add_device = viommu_add_device, - .remove_device = viommu_remove_device, + .probe_device = viommu_probe_device, + .release_device = viommu_release_device, .device_group = viommu_device_group, .get_resv_regions = viommu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, From dea74f1c37fb6d9a8f7e0cb85beaa89f5eaea908 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:36:59 +0200 Subject: [PATCH 063/574] iommu/msm: Convert to probe/release_device() call-backs Convert the MSM IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-22-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/msm_iommu.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 94a6df1bddd6..10cd4db0710a 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -388,43 +388,23 @@ static struct msm_iommu_dev *find_iommu_for_dev(struct device *dev) return ret; } -static int msm_iommu_add_device(struct device *dev) +static struct iommu_device *msm_iommu_probe_device(struct device *dev) { struct msm_iommu_dev *iommu; - struct iommu_group *group; unsigned long flags; spin_lock_irqsave(&msm_iommu_lock, flags); iommu = find_iommu_for_dev(dev); spin_unlock_irqrestore(&msm_iommu_lock, flags); - if (iommu) - iommu_device_link(&iommu->iommu, dev); - else - return -ENODEV; + if (!iommu) + return ERR_PTR(-ENODEV); - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); - - iommu_group_put(group); - - return 0; + return &iommu->iommu; } -static void msm_iommu_remove_device(struct device *dev) +static void msm_iommu_release_device(struct device *dev) { - struct msm_iommu_dev *iommu; - unsigned long flags; - - spin_lock_irqsave(&msm_iommu_lock, flags); - iommu = find_iommu_for_dev(dev); - spin_unlock_irqrestore(&msm_iommu_lock, flags); - - if (iommu) - iommu_device_unlink(&iommu->iommu, dev); - - iommu_group_remove_device(dev); } static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -708,8 +688,8 @@ static struct iommu_ops msm_iommu_ops = { */ .iotlb_sync = NULL, .iova_to_phys = msm_iommu_iova_to_phys, - .add_device = msm_iommu_add_device, - .remove_device = msm_iommu_remove_device, + .probe_device = msm_iommu_probe_device, + .release_device = msm_iommu_release_device, .device_group = generic_device_group, .pgsize_bitmap = MSM_IOMMU_PGSIZES, .of_xlate = qcom_iommu_of_xlate, From 80e4592a77e4baa33035eaeac646f63f1f5d3d57 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:00 +0200 Subject: [PATCH 064/574] iommu/mediatek: Convert to probe/release_device() call-backs Convert the Mediatek IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-23-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 5f4d6df59cf6..2be96f1cdbd2 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -441,38 +441,26 @@ static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain, return pa; } -static int mtk_iommu_add_device(struct device *dev) +static struct iommu_device *mtk_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; - struct iommu_group *group; if (!fwspec || fwspec->ops != &mtk_iommu_ops) - return -ENODEV; /* Not a iommu client device */ + return ERR_PTR(-ENODEV); /* Not a iommu client device */ data = dev_iommu_priv_get(dev); - iommu_device_link(&data->iommu, dev); - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); - - iommu_group_put(group); - return 0; + return &data->iommu; } -static void mtk_iommu_remove_device(struct device *dev) +static void mtk_iommu_release_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct mtk_iommu_data *data; if (!fwspec || fwspec->ops != &mtk_iommu_ops) return; - data = dev_iommu_priv_get(dev); - iommu_device_unlink(&data->iommu, dev); - - iommu_group_remove_device(dev); iommu_fwspec_free(dev); } @@ -526,8 +514,8 @@ static const struct iommu_ops mtk_iommu_ops = { .flush_iotlb_all = mtk_iommu_flush_iotlb_all, .iotlb_sync = mtk_iommu_iotlb_sync, .iova_to_phys = mtk_iommu_iova_to_phys, - .add_device = mtk_iommu_add_device, - .remove_device = mtk_iommu_remove_device, + .probe_device = mtk_iommu_probe_device, + .release_device = mtk_iommu_release_device, .device_group = mtk_iommu_device_group, .of_xlate = mtk_iommu_of_xlate, .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M, From 57dbf81f50c82a0ad895a57828ad1ab539785a25 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:01 +0200 Subject: [PATCH 065/574] iommu/mediatek-v1 Convert to probe/release_device() call-backs Convert the Mediatek-v1 IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-24-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu_v1.c | 54 +++++++++++++++--------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index a31be05601c9..7bdd74c7cb9f 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -416,14 +416,12 @@ static int mtk_iommu_create_mapping(struct device *dev, return 0; } -static int mtk_iommu_add_device(struct device *dev) +static struct iommu_device *mtk_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct dma_iommu_mapping *mtk_mapping; struct of_phandle_args iommu_spec; struct of_phandle_iterator it; struct mtk_iommu_data *data; - struct iommu_group *group; int err; of_for_each_phandle(&it, err, dev->of_node, "iommus", @@ -442,35 +440,28 @@ static int mtk_iommu_add_device(struct device *dev) } if (!fwspec || fwspec->ops != &mtk_iommu_ops) - return -ENODEV; /* Not a iommu client device */ - - /* - * This is a short-term bodge because the ARM DMA code doesn't - * understand multi-device groups, but we have to call into it - * successfully (and not just rely on a normal IOMMU API attach - * here) in order to set the correct DMA API ops on @dev. - */ - group = iommu_group_alloc(); - if (IS_ERR(group)) - return PTR_ERR(group); - - err = iommu_group_add_device(group, dev); - iommu_group_put(group); - if (err) - return err; + return ERR_PTR(-ENODEV); /* Not a iommu client device */ data = dev_iommu_priv_get(dev); - mtk_mapping = data->dev->archdata.iommu; - err = arm_iommu_attach_device(dev, mtk_mapping); - if (err) { - iommu_group_remove_device(dev); - return err; - } - return iommu_device_link(&data->iommu, dev); + return &data->iommu; } -static void mtk_iommu_remove_device(struct device *dev) +static void mtk_iommu_probe_finalize(struct device *dev) +{ + struct dma_iommu_mapping *mtk_mapping; + struct mtk_iommu_data *data; + int err; + + data = dev_iommu_priv_get(dev); + mtk_mapping = data->dev->archdata.iommu; + + err = arm_iommu_attach_device(dev, mtk_mapping); + if (err) + dev_err(dev, "Can't create IOMMU mapping - DMA-OPS will not work\n"); +} + +static void mtk_iommu_release_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_data *data; @@ -479,9 +470,6 @@ static void mtk_iommu_remove_device(struct device *dev) return; data = dev_iommu_priv_get(dev); - iommu_device_unlink(&data->iommu, dev); - - iommu_group_remove_device(dev); iommu_fwspec_free(dev); } @@ -534,8 +522,10 @@ static const struct iommu_ops mtk_iommu_ops = { .map = mtk_iommu_map, .unmap = mtk_iommu_unmap, .iova_to_phys = mtk_iommu_iova_to_phys, - .add_device = mtk_iommu_add_device, - .remove_device = mtk_iommu_remove_device, + .probe_device = mtk_iommu_probe_device, + .probe_finalize = mtk_iommu_probe_finalize, + .release_device = mtk_iommu_release_device, + .device_group = generic_device_group, .pgsize_bitmap = ~0UL << MT2701_IOMMU_PAGE_SHIFT, }; From bfe3bd493b3d3f9a08d1751338ccc711840e4590 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:02 +0200 Subject: [PATCH 066/574] iommu/qcom: Convert to probe/release_device() call-backs Convert the QCOM IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-25-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/qcom_iommu.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 0e2a96467767..054e476ebd49 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -524,14 +524,13 @@ static bool qcom_iommu_capable(enum iommu_cap cap) } } -static int qcom_iommu_add_device(struct device *dev) +static struct iommu_device *qcom_iommu_probe_device(struct device *dev) { struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); - struct iommu_group *group; struct device_link *link; if (!qcom_iommu) - return -ENODEV; + return ERR_PTR(-ENODEV); /* * Establish the link between iommu and master, so that the @@ -542,28 +541,19 @@ static int qcom_iommu_add_device(struct device *dev) if (!link) { dev_err(qcom_iommu->dev, "Unable to create device link between %s and %s\n", dev_name(qcom_iommu->dev), dev_name(dev)); - return -ENODEV; + return ERR_PTR(-ENODEV); } - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); - - iommu_group_put(group); - iommu_device_link(&qcom_iommu->iommu, dev); - - return 0; + return &qcom_iommu->iommu; } -static void qcom_iommu_remove_device(struct device *dev) +static void qcom_iommu_release_device(struct device *dev) { struct qcom_iommu_dev *qcom_iommu = to_iommu(dev); if (!qcom_iommu) return; - iommu_device_unlink(&qcom_iommu->iommu, dev); - iommu_group_remove_device(dev); iommu_fwspec_free(dev); } @@ -619,8 +609,8 @@ static const struct iommu_ops qcom_iommu_ops = { .flush_iotlb_all = qcom_iommu_flush_iotlb_all, .iotlb_sync = qcom_iommu_iotlb_sync, .iova_to_phys = qcom_iommu_iova_to_phys, - .add_device = qcom_iommu_add_device, - .remove_device = qcom_iommu_remove_device, + .probe_device = qcom_iommu_probe_device, + .release_device = qcom_iommu_release_device, .device_group = generic_device_group, .of_xlate = qcom_iommu_of_xlate, .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M, From d8260443245dc8e504c3ecde5e86972f5be2b43e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:03 +0200 Subject: [PATCH 067/574] iommu/rockchip: Convert to probe/release_device() call-backs Convert the Rockchip IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-26-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/rockchip-iommu.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index b33cdd5aad81..d25c2486ca07 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -1054,40 +1054,28 @@ static void rk_iommu_domain_free(struct iommu_domain *domain) kfree(rk_domain); } -static int rk_iommu_add_device(struct device *dev) +static struct iommu_device *rk_iommu_probe_device(struct device *dev) { - struct iommu_group *group; - struct rk_iommu *iommu; struct rk_iommudata *data; + struct rk_iommu *iommu; data = dev->archdata.iommu; if (!data) - return -ENODEV; + return ERR_PTR(-ENODEV); iommu = rk_iommu_from_dev(dev); - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); - iommu_group_put(group); - - iommu_device_link(&iommu->iommu, dev); data->link = device_link_add(dev, iommu->dev, DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME); - return 0; + return &iommu->iommu; } -static void rk_iommu_remove_device(struct device *dev) +static void rk_iommu_release_device(struct device *dev) { - struct rk_iommu *iommu; struct rk_iommudata *data = dev->archdata.iommu; - iommu = rk_iommu_from_dev(dev); - device_link_del(data->link); - iommu_device_unlink(&iommu->iommu, dev); - iommu_group_remove_device(dev); } static struct iommu_group *rk_iommu_device_group(struct device *dev) @@ -1126,8 +1114,8 @@ static const struct iommu_ops rk_iommu_ops = { .detach_dev = rk_iommu_detach_device, .map = rk_iommu_map, .unmap = rk_iommu_unmap, - .add_device = rk_iommu_add_device, - .remove_device = rk_iommu_remove_device, + .probe_device = rk_iommu_probe_device, + .release_device = rk_iommu_release_device, .iova_to_phys = rk_iommu_iova_to_phys, .device_group = rk_iommu_device_group, .pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP, From b287ba73789906fc4f4ee821e1014b13e2814849 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:04 +0200 Subject: [PATCH 068/574] iommu/tegra: Convert to probe/release_device() call-backs Convert the Tegra IOMMU drivers to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-27-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/tegra-gart.c | 24 ++++++------------------ drivers/iommu/tegra-smmu.c | 31 ++++++++----------------------- 2 files changed, 14 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index db6559e8336f..5fbdff6ff41a 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -243,28 +243,16 @@ static bool gart_iommu_capable(enum iommu_cap cap) return false; } -static int gart_iommu_add_device(struct device *dev) +static struct iommu_device *gart_iommu_probe_device(struct device *dev) { - struct iommu_group *group; - if (!dev_iommu_fwspec_get(dev)) - return -ENODEV; + return ERR_PTR(-ENODEV); - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); - - iommu_group_put(group); - - iommu_device_link(&gart_handle->iommu, dev); - - return 0; + return &gart_handle->iommu; } -static void gart_iommu_remove_device(struct device *dev) +static void gart_iommu_release_device(struct device *dev) { - iommu_group_remove_device(dev); - iommu_device_unlink(&gart_handle->iommu, dev); } static int gart_iommu_of_xlate(struct device *dev, @@ -290,8 +278,8 @@ static const struct iommu_ops gart_iommu_ops = { .domain_free = gart_iommu_domain_free, .attach_dev = gart_iommu_attach_dev, .detach_dev = gart_iommu_detach_dev, - .add_device = gart_iommu_add_device, - .remove_device = gart_iommu_remove_device, + .probe_device = gart_iommu_probe_device, + .release_device = gart_iommu_release_device, .device_group = generic_device_group, .map = gart_iommu_map, .unmap = gart_iommu_unmap, diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 63a147b623e6..7426b7666e2b 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -757,11 +757,10 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev, return 0; } -static int tegra_smmu_add_device(struct device *dev) +static struct iommu_device *tegra_smmu_probe_device(struct device *dev) { struct device_node *np = dev->of_node; struct tegra_smmu *smmu = NULL; - struct iommu_group *group; struct of_phandle_args args; unsigned int index = 0; int err; @@ -774,7 +773,7 @@ static int tegra_smmu_add_device(struct device *dev) of_node_put(args.np); if (err < 0) - return err; + return ERR_PTR(err); /* * Only a single IOMMU master interface is currently @@ -783,8 +782,6 @@ static int tegra_smmu_add_device(struct device *dev) */ dev->archdata.iommu = smmu; - iommu_device_link(&smmu->iommu, dev); - break; } @@ -793,26 +790,14 @@ static int tegra_smmu_add_device(struct device *dev) } if (!smmu) - return -ENODEV; + return ERR_PTR(-ENODEV); - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); - - iommu_group_put(group); - - return 0; + return &smmu->iommu; } -static void tegra_smmu_remove_device(struct device *dev) +static void tegra_smmu_release_device(struct device *dev) { - struct tegra_smmu *smmu = dev->archdata.iommu; - - if (smmu) - iommu_device_unlink(&smmu->iommu, dev); - dev->archdata.iommu = NULL; - iommu_group_remove_device(dev); } static const struct tegra_smmu_group_soc * @@ -895,8 +880,8 @@ static const struct iommu_ops tegra_smmu_ops = { .domain_free = tegra_smmu_domain_free, .attach_dev = tegra_smmu_attach_dev, .detach_dev = tegra_smmu_detach_dev, - .add_device = tegra_smmu_add_device, - .remove_device = tegra_smmu_remove_device, + .probe_device = tegra_smmu_probe_device, + .release_device = tegra_smmu_release_device, .device_group = tegra_smmu_device_group, .map = tegra_smmu_map, .unmap = tegra_smmu_unmap, @@ -1015,7 +1000,7 @@ struct tegra_smmu *tegra_smmu_probe(struct device *dev, * value. However the IOMMU registration process will attempt to add * all devices to the IOMMU when bus_set_iommu() is called. In order * not to rely on global variables to track the IOMMU instance, we - * set it here so that it can be looked up from the .add_device() + * set it here so that it can be looked up from the .probe_device() * callback via the IOMMU device's .drvdata field. */ mc->smmu = smmu; From 6580c8a78424fb1a0d2bd9134355563c0398fe8e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:05 +0200 Subject: [PATCH 069/574] iommu/renesas: Convert to probe/release_device() call-backs Convert the Renesas IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-28-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 62 +++++++++++++------------------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 310cf09feea3..fb7e702dee23 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -805,24 +805,8 @@ static int ipmmu_of_xlate(struct device *dev, static int ipmmu_init_arm_mapping(struct device *dev) { struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); - struct iommu_group *group; int ret; - /* Create a device group and add the device to it. */ - group = iommu_group_alloc(); - if (IS_ERR(group)) { - dev_err(dev, "Failed to allocate IOMMU group\n"); - return PTR_ERR(group); - } - - ret = iommu_group_add_device(group, dev); - iommu_group_put(group); - - if (ret < 0) { - dev_err(dev, "Failed to add device to IPMMU group\n"); - return ret; - } - /* * Create the ARM mapping, used by the ARM DMA mapping core to allocate * VAs. This will allocate a corresponding IOMMU domain. @@ -856,48 +840,39 @@ static int ipmmu_init_arm_mapping(struct device *dev) return 0; error: - iommu_group_remove_device(dev); if (mmu->mapping) arm_iommu_release_mapping(mmu->mapping); return ret; } -static int ipmmu_add_device(struct device *dev) +static struct iommu_device *ipmmu_probe_device(struct device *dev) { struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); - struct iommu_group *group; - int ret; /* * Only let through devices that have been verified in xlate() */ if (!mmu) - return -ENODEV; + return ERR_PTR(-ENODEV); - if (IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA)) { - ret = ipmmu_init_arm_mapping(dev); - if (ret) - return ret; - } else { - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) - return PTR_ERR(group); - - iommu_group_put(group); - } - - iommu_device_link(&mmu->iommu, dev); - return 0; + return &mmu->iommu; } -static void ipmmu_remove_device(struct device *dev) +static void ipmmu_probe_finalize(struct device *dev) { - struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); + int ret = 0; - iommu_device_unlink(&mmu->iommu, dev); + if (IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA)) + ret = ipmmu_init_arm_mapping(dev); + + if (ret) + dev_err(dev, "Can't create IOMMU mapping - DMA-OPS will not work\n"); +} + +static void ipmmu_release_device(struct device *dev) +{ arm_iommu_detach_device(dev); - iommu_group_remove_device(dev); } static struct iommu_group *ipmmu_find_group(struct device *dev) @@ -925,9 +900,14 @@ static const struct iommu_ops ipmmu_ops = { .flush_iotlb_all = ipmmu_flush_iotlb_all, .iotlb_sync = ipmmu_iotlb_sync, .iova_to_phys = ipmmu_iova_to_phys, - .add_device = ipmmu_add_device, - .remove_device = ipmmu_remove_device, + .probe_device = ipmmu_probe_device, + .release_device = ipmmu_release_device, + .probe_finalize = ipmmu_probe_finalize, +#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA) + .device_group = generic_device_group, +#else .device_group = ipmmu_find_group, +#endif .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K, .of_xlate = ipmmu_of_xlate, }; From c822b37cac48ea0e4c8202a42fdc480ace099b12 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:06 +0200 Subject: [PATCH 070/574] iommu/omap: Remove orphan_dev tracking Remove the tracking of device which could not be probed because their IOMMU is not probed yet. Replace it with a call to bus_iommu_probe() when a new IOMMU is probed. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-29-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 54 +++----------------------------------- 1 file changed, 4 insertions(+), 50 deletions(-) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 887fefcb03b4..ecc9d0829a91 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -35,15 +35,6 @@ static const struct iommu_ops omap_iommu_ops; -struct orphan_dev { - struct device *dev; - struct list_head node; -}; - -static LIST_HEAD(orphan_dev_list); - -static DEFINE_SPINLOCK(orphan_lock); - #define to_iommu(dev) ((struct omap_iommu *)dev_get_drvdata(dev)) /* bitmap of the page sizes currently supported */ @@ -62,8 +53,6 @@ static DEFINE_SPINLOCK(orphan_lock); static struct platform_driver omap_iommu_driver; static struct kmem_cache *iopte_cachep; -static int _omap_iommu_add_device(struct device *dev); - /** * to_omap_domain - Get struct omap_iommu_domain from generic iommu_domain * @dom: generic iommu domain handle @@ -1177,7 +1166,6 @@ static int omap_iommu_probe(struct platform_device *pdev) struct omap_iommu *obj; struct resource *res; struct device_node *of = pdev->dev.of_node; - struct orphan_dev *orphan_dev, *tmp; if (!of) { pr_err("%s: only DT-based devices are supported\n", __func__); @@ -1260,13 +1248,8 @@ static int omap_iommu_probe(struct platform_device *pdev) dev_info(&pdev->dev, "%s registered\n", obj->name); - list_for_each_entry_safe(orphan_dev, tmp, &orphan_dev_list, node) { - err = _omap_iommu_add_device(orphan_dev->dev); - if (!err) { - list_del(&orphan_dev->node); - kfree(orphan_dev); - } - } + /* Re-probe bus to probe device attached to this IOMMU */ + bus_iommu_probe(&platform_bus_type); return 0; @@ -1657,7 +1640,7 @@ static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain, return ret; } -static int _omap_iommu_add_device(struct device *dev) +static int omap_iommu_add_device(struct device *dev) { struct omap_iommu_arch_data *arch_data, *tmp; struct omap_iommu *oiommu; @@ -1666,8 +1649,6 @@ static int _omap_iommu_add_device(struct device *dev) struct platform_device *pdev; int num_iommus, i; int ret; - struct orphan_dev *orphan_dev; - unsigned long flags; /* * Allocate the archdata iommu structure for DT-based devices. @@ -1702,23 +1683,7 @@ static int _omap_iommu_add_device(struct device *dev) if (!pdev) { of_node_put(np); kfree(arch_data); - spin_lock_irqsave(&orphan_lock, flags); - list_for_each_entry(orphan_dev, &orphan_dev_list, - node) { - if (orphan_dev->dev == dev) - break; - } - spin_unlock_irqrestore(&orphan_lock, flags); - - if (orphan_dev && orphan_dev->dev == dev) - return -EPROBE_DEFER; - - orphan_dev = kzalloc(sizeof(*orphan_dev), GFP_KERNEL); - orphan_dev->dev = dev; - spin_lock_irqsave(&orphan_lock, flags); - list_add(&orphan_dev->node, &orphan_dev_list); - spin_unlock_irqrestore(&orphan_lock, flags); - return -EPROBE_DEFER; + return -ENODEV; } oiommu = platform_get_drvdata(pdev); @@ -1764,17 +1729,6 @@ static int _omap_iommu_add_device(struct device *dev) return 0; } -static int omap_iommu_add_device(struct device *dev) -{ - int ret; - - ret = _omap_iommu_add_device(dev); - if (ret == -EPROBE_DEFER) - return 0; - - return ret; -} - static void omap_iommu_remove_device(struct device *dev) { struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; From 6785eb9105e3363aa51408c700a55e8b5f88fcf6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:07 +0200 Subject: [PATCH 071/574] iommu/omap: Convert to probe/release_device() call-backs Convert the OMAP IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20200429133712.31431-30-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 51 +++++++++++--------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index ecc9d0829a91..6699fe6d9e06 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1640,15 +1640,13 @@ static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain, return ret; } -static int omap_iommu_add_device(struct device *dev) +static struct iommu_device *omap_iommu_probe_device(struct device *dev) { struct omap_iommu_arch_data *arch_data, *tmp; - struct omap_iommu *oiommu; - struct iommu_group *group; - struct device_node *np; struct platform_device *pdev; + struct omap_iommu *oiommu; + struct device_node *np; int num_iommus, i; - int ret; /* * Allocate the archdata iommu structure for DT-based devices. @@ -1657,7 +1655,7 @@ static int omap_iommu_add_device(struct device *dev) * IOMMU users. */ if (!dev->of_node) - return 0; + return ERR_PTR(-ENODEV); /* * retrieve the count of IOMMU nodes using phandle size as element size @@ -1670,27 +1668,27 @@ static int omap_iommu_add_device(struct device *dev) arch_data = kcalloc(num_iommus + 1, sizeof(*arch_data), GFP_KERNEL); if (!arch_data) - return -ENOMEM; + return ERR_PTR(-ENOMEM); for (i = 0, tmp = arch_data; i < num_iommus; i++, tmp++) { np = of_parse_phandle(dev->of_node, "iommus", i); if (!np) { kfree(arch_data); - return -EINVAL; + return ERR_PTR(-EINVAL); } pdev = of_find_device_by_node(np); if (!pdev) { of_node_put(np); kfree(arch_data); - return -ENODEV; + return ERR_PTR(-ENODEV); } oiommu = platform_get_drvdata(pdev); if (!oiommu) { of_node_put(np); kfree(arch_data); - return -EINVAL; + return ERR_PTR(-EINVAL); } tmp->iommu_dev = oiommu; @@ -1699,46 +1697,25 @@ static int omap_iommu_add_device(struct device *dev) of_node_put(np); } + dev->archdata.iommu = arch_data; + /* * use the first IOMMU alone for the sysfs device linking. * TODO: Evaluate if a single iommu_group needs to be * maintained for both IOMMUs */ oiommu = arch_data->iommu_dev; - ret = iommu_device_link(&oiommu->iommu, dev); - if (ret) { - kfree(arch_data); - return ret; - } - dev->archdata.iommu = arch_data; - - /* - * IOMMU group initialization calls into omap_iommu_device_group, which - * needs a valid dev->archdata.iommu pointer - */ - group = iommu_group_get_for_dev(dev); - if (IS_ERR(group)) { - iommu_device_unlink(&oiommu->iommu, dev); - dev->archdata.iommu = NULL; - kfree(arch_data); - return PTR_ERR(group); - } - iommu_group_put(group); - - return 0; + return &oiommu->iommu; } -static void omap_iommu_remove_device(struct device *dev) +static void omap_iommu_release_device(struct device *dev) { struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; if (!dev->of_node || !arch_data) return; - iommu_device_unlink(&arch_data->iommu_dev->iommu, dev); - iommu_group_remove_device(dev); - dev->archdata.iommu = NULL; kfree(arch_data); @@ -1763,8 +1740,8 @@ static const struct iommu_ops omap_iommu_ops = { .map = omap_iommu_map, .unmap = omap_iommu_unmap, .iova_to_phys = omap_iommu_iova_to_phys, - .add_device = omap_iommu_add_device, - .remove_device = omap_iommu_remove_device, + .probe_device = omap_iommu_probe_device, + .release_device = omap_iommu_release_device, .device_group = omap_iommu_device_group, .pgsize_bitmap = OMAP_IOMMU_PGSIZES, }; From 66ae88e71ecb93bafaacaeef233971eacd10e749 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:08 +0200 Subject: [PATCH 072/574] iommu/exynos: Use first SYSMMU in controllers list for IOMMU core On Exynos platforms there can be more than one SYSMMU (IOMMU) for one DMA master device. Since the IOMMU core code expects only one hardware IOMMU, use the first SYSMMU in the list. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-31-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/exynos-iommu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 186ff5cc975c..09cdd163560a 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -1261,6 +1261,11 @@ static int exynos_iommu_add_device(struct device *dev) } iommu_group_put(group); + /* There is always at least one entry, see exynos_iommu_of_xlate() */ + data = list_first_entry(&owner->controllers, + struct sysmmu_drvdata, owner_node); + iommu_device_link(&data->iommu, dev); + return 0; } @@ -1286,6 +1291,11 @@ static void exynos_iommu_remove_device(struct device *dev) list_for_each_entry(data, &owner->controllers, owner_node) device_link_del(data->link); + + /* There is always at least one entry, see exynos_iommu_of_xlate() */ + data = list_first_entry(&owner->controllers, + struct sysmmu_drvdata, owner_node); + iommu_device_unlink(&data->iommu, dev); } static int exynos_iommu_of_xlate(struct device *dev, From 3c51c05479c577ea4235d46366d6181a2d6aff2d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:09 +0200 Subject: [PATCH 073/574] iommu/exynos: Convert to probe/release_device() call-backs Convert the Exynos IOMMU driver to use the probe_device() and release_device() call-backs of iommu_ops, so that the iommu core code does the group and sysfs setup. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-32-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/exynos-iommu.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 09cdd163560a..60c8a56e4a3f 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -1235,19 +1235,13 @@ static phys_addr_t exynos_iommu_iova_to_phys(struct iommu_domain *iommu_domain, return phys; } -static int exynos_iommu_add_device(struct device *dev) +static struct iommu_device *exynos_iommu_probe_device(struct device *dev) { struct exynos_iommu_owner *owner = dev->archdata.iommu; struct sysmmu_drvdata *data; - struct iommu_group *group; if (!has_sysmmu(dev)) - return -ENODEV; - - group = iommu_group_get_for_dev(dev); - - if (IS_ERR(group)) - return PTR_ERR(group); + return ERR_PTR(-ENODEV); list_for_each_entry(data, &owner->controllers, owner_node) { /* @@ -1259,17 +1253,15 @@ static int exynos_iommu_add_device(struct device *dev) DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME); } - iommu_group_put(group); /* There is always at least one entry, see exynos_iommu_of_xlate() */ data = list_first_entry(&owner->controllers, struct sysmmu_drvdata, owner_node); - iommu_device_link(&data->iommu, dev); - return 0; + return &data->iommu; } -static void exynos_iommu_remove_device(struct device *dev) +static void exynos_iommu_release_device(struct device *dev) { struct exynos_iommu_owner *owner = dev->archdata.iommu; struct sysmmu_drvdata *data; @@ -1287,15 +1279,9 @@ static void exynos_iommu_remove_device(struct device *dev) iommu_group_put(group); } } - iommu_group_remove_device(dev); list_for_each_entry(data, &owner->controllers, owner_node) device_link_del(data->link); - - /* There is always at least one entry, see exynos_iommu_of_xlate() */ - data = list_first_entry(&owner->controllers, - struct sysmmu_drvdata, owner_node); - iommu_device_unlink(&data->iommu, dev); } static int exynos_iommu_of_xlate(struct device *dev, @@ -1341,8 +1327,8 @@ static const struct iommu_ops exynos_iommu_ops = { .unmap = exynos_iommu_unmap, .iova_to_phys = exynos_iommu_iova_to_phys, .device_group = generic_device_group, - .add_device = exynos_iommu_add_device, - .remove_device = exynos_iommu_remove_device, + .probe_device = exynos_iommu_probe_device, + .release_device = exynos_iommu_release_device, .pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE, .of_xlate = exynos_iommu_of_xlate, }; From 3eeeb45c6d0444b368cdeba9bdafa8bbcf5370d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:10 +0200 Subject: [PATCH 074/574] iommu: Remove add_device()/remove_device() code-paths All drivers are converted to use the probe/release_device() call-backs, so the add_device/remove_device() pointers are unused and the code using them can be removed. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-33-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 164 +++++++++++------------------------------- include/linux/iommu.h | 4 -- 2 files changed, 41 insertions(+), 127 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 397fd4fd0c32..7f99e5ae432c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -220,12 +220,20 @@ out_release: return ret; } -static int __iommu_probe_device_helper(struct device *dev) +int iommu_probe_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; int ret; + if (!dev_iommu_get(dev)) + return -ENOMEM; + + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_out; + } + ret = __iommu_probe_device(dev, NULL); if (ret) goto err_out; @@ -259,64 +267,12 @@ static int __iommu_probe_device_helper(struct device *dev) err_release: iommu_release_device(dev); + err_out: return ret; } -int iommu_probe_device(struct device *dev) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; - struct iommu_group *group; - int ret; - - WARN_ON(dev->iommu_group); - - if (!ops) - return -EINVAL; - - if (!dev_iommu_get(dev)) - return -ENOMEM; - - if (!try_module_get(ops->owner)) { - ret = -EINVAL; - goto err_free_dev_param; - } - - if (ops->probe_device) - return __iommu_probe_device_helper(dev); - - ret = ops->add_device(dev); - if (ret) - goto err_module_put; - - group = iommu_group_get(dev); - iommu_create_device_direct_mappings(group, dev); - iommu_group_put(group); - - if (ops->probe_finalize) - ops->probe_finalize(dev); - - return 0; - -err_module_put: - module_put(ops->owner); -err_free_dev_param: - dev_iommu_free(dev); - return ret; -} - -static void __iommu_release_device(struct device *dev) -{ - const struct iommu_ops *ops = dev->bus->iommu_ops; - - iommu_device_unlink(dev->iommu->iommu_dev, dev); - - iommu_group_remove_device(dev); - - ops->release_device(dev); -} - void iommu_release_device(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -324,10 +280,10 @@ void iommu_release_device(struct device *dev) if (!dev->iommu) return; - if (ops->release_device) - __iommu_release_device(dev); - else if (dev->iommu_group) - ops->remove_device(dev); + iommu_device_unlink(dev->iommu->iommu_dev, dev); + iommu_group_remove_device(dev); + + ops->release_device(dev); module_put(ops->owner); dev_iommu_free(dev); @@ -1560,23 +1516,6 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev) if (ret) goto out_put_group; - /* - * Try to allocate a default domain - needs support from the - * IOMMU driver. There are still some drivers which don't support - * default domains, so the return value is not yet checked. Only - * allocate the domain here when the driver still has the - * add_device/remove_device call-backs implemented. - */ - if (!ops->probe_device) { - iommu_alloc_default_domain(dev); - - if (group->default_domain) - ret = __iommu_attach_device(group->default_domain, dev); - - if (ret) - goto out_put_group; - } - return group; out_put_group: @@ -1591,21 +1530,6 @@ struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) return group->default_domain; } -static int add_iommu_group(struct device *dev, void *data) -{ - int ret = iommu_probe_device(dev); - - /* - * We ignore -ENODEV errors for now, as they just mean that the - * device is not translated by an IOMMU. We still care about - * other errors and fail to initialize when they happen. - */ - if (ret == -ENODEV) - ret = 0; - - return ret; -} - static int probe_iommu_group(struct device *dev, void *data) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -1793,47 +1717,41 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group) int bus_iommu_probe(struct bus_type *bus) { - const struct iommu_ops *ops = bus->iommu_ops; + struct iommu_group *group, *next; + LIST_HEAD(group_list); int ret; - if (ops->probe_device) { - struct iommu_group *group, *next; - LIST_HEAD(group_list); + /* + * This code-path does not allocate the default domain when + * creating the iommu group, so do it after the groups are + * created. + */ + ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); + if (ret) + return ret; - /* - * This code-path does not allocate the default domain when - * creating the iommu group, so do it after the groups are - * created. - */ - ret = bus_for_each_dev(bus, NULL, &group_list, probe_iommu_group); - if (ret) - return ret; + list_for_each_entry_safe(group, next, &group_list, entry) { + /* Remove item from the list */ + list_del_init(&group->entry); - list_for_each_entry_safe(group, next, &group_list, entry) { - /* Remove item from the list */ - list_del_init(&group->entry); + mutex_lock(&group->mutex); - mutex_lock(&group->mutex); - - /* Try to allocate default domain */ - probe_alloc_default_domain(bus, group); - - if (!group->default_domain) { - mutex_unlock(&group->mutex); - continue; - } - - iommu_group_create_direct_mappings(group); - - ret = __iommu_group_dma_attach(group); + /* Try to allocate default domain */ + probe_alloc_default_domain(bus, group); + if (!group->default_domain) { mutex_unlock(&group->mutex); - - if (ret) - break; + continue; } - } else { - ret = bus_for_each_dev(bus, NULL, NULL, add_iommu_group); + + iommu_group_create_direct_mappings(group); + + ret = __iommu_group_dma_attach(group); + + mutex_unlock(&group->mutex); + + if (ret) + break; } return ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index fea1622408ad..dd076366383f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,8 +223,6 @@ struct iommu_iotlb_gather { * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue * @iova_to_phys: translate iova to physical address - * @add_device: add device to iommu grouping - * @remove_device: remove device from iommu grouping * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -277,8 +275,6 @@ struct iommu_ops { void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); - int (*add_device)(struct device *dev); - void (*remove_device)(struct device *dev); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); void (*probe_finalize)(struct device *dev); From 4e8906f0d84d1a7d3cf82a30a701b0fb5d48977c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:11 +0200 Subject: [PATCH 075/574] iommu: Move more initialization to __iommu_probe_device() Move the calls to dev_iommu_get() and try_module_get() into __iommu_probe_device(), so that the callers don't have to do it on their own. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-34-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 47 +++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7f99e5ae432c..48a95f7d7999 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -194,9 +194,19 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list struct iommu_group *group; int ret; + if (!dev_iommu_get(dev)) + return -ENOMEM; + + if (!try_module_get(ops->owner)) { + ret = -EINVAL; + goto err_free; + } + iommu_dev = ops->probe_device(dev); - if (IS_ERR(iommu_dev)) - return PTR_ERR(iommu_dev); + if (IS_ERR(iommu_dev)) { + ret = PTR_ERR(iommu_dev); + goto out_module_put; + } dev->iommu->iommu_dev = iommu_dev; @@ -217,6 +227,12 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list out_release: ops->release_device(dev); +out_module_put: + module_put(ops->owner); + +err_free: + dev_iommu_free(dev); + return ret; } @@ -226,14 +242,6 @@ int iommu_probe_device(struct device *dev) struct iommu_group *group; int ret; - if (!dev_iommu_get(dev)) - return -ENOMEM; - - if (!try_module_get(ops->owner)) { - ret = -EINVAL; - goto err_out; - } - ret = __iommu_probe_device(dev, NULL); if (ret) goto err_out; @@ -1532,14 +1540,10 @@ struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) static int probe_iommu_group(struct device *dev, void *data) { - const struct iommu_ops *ops = dev->bus->iommu_ops; struct list_head *group_list = data; struct iommu_group *group; int ret; - if (!dev_iommu_get(dev)) - return -ENOMEM; - /* Device is probed already if in a group */ group = iommu_group_get(dev); if (group) { @@ -1547,22 +1551,7 @@ static int probe_iommu_group(struct device *dev, void *data) return 0; } - if (!try_module_get(ops->owner)) { - ret = -EINVAL; - goto err_free_dev_iommu; - } - ret = __iommu_probe_device(dev, group_list); - if (ret) - goto err_module_put; - - return 0; - -err_module_put: - module_put(ops->owner); -err_free_dev_iommu: - dev_iommu_free(dev); - if (ret == -ENODEV) ret = 0; From 1b032ec1ecbce6047af7d11c9db432e237cb17d8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Apr 2020 15:37:12 +0200 Subject: [PATCH 076/574] iommu: Unexport iommu_group_get_for_dev() The function is now only used in IOMMU core code and shouldn't be used outside of it anyway, so remove the export for it. Signed-off-by: Joerg Roedel Tested-by: Marek Szyprowski Acked-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200429133712.31431-35-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 4 ++-- include/linux/iommu.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 48a95f7d7999..a9e5618cde80 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -91,6 +91,7 @@ static void __iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group); static int iommu_create_device_direct_mappings(struct iommu_group *group, struct device *dev); +static struct iommu_group *iommu_group_get_for_dev(struct device *dev); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -1500,7 +1501,7 @@ static int iommu_alloc_default_domain(struct device *dev) * to the returned IOMMU group, which will already include the provided * device. The reference should be released with iommu_group_put(). */ -struct iommu_group *iommu_group_get_for_dev(struct device *dev) +static struct iommu_group *iommu_group_get_for_dev(struct device *dev) { const struct iommu_ops *ops = dev->bus->iommu_ops; struct iommu_group *group; @@ -1531,7 +1532,6 @@ out_put_group: return ERR_PTR(ret); } -EXPORT_SYMBOL(iommu_group_get_for_dev); struct iommu_domain *iommu_group_default_domain(struct iommu_group *group) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index dd076366383f..7cfd2dddb49d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -527,7 +527,6 @@ extern int iommu_page_response(struct device *dev, struct iommu_page_response *msg); extern int iommu_group_id(struct iommu_group *group); -extern struct iommu_group *iommu_group_get_for_dev(struct device *dev); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr, From 0ba577802b0b183a38a5606e2c67504aba8b6f9d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 13 Apr 2020 10:31:13 +0900 Subject: [PATCH 077/574] s390: ptrace: hard-code "s390x" instead of UTS_MACHINE s390 uses the UTS_MACHINE defined arch/s390/Makefile as follows: UTS_MACHINE := s390x We do not need to pass the fixed string from the command line. Hard-code user_regset_view::name, like many other architectures do. Link: https://lkml.kernel.org/r/20200413013113.8529-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Vasily Gorbik --- arch/s390/kernel/Makefile | 5 ----- arch/s390/kernel/ptrace.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 75f26d775027..a8f136943deb 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -33,11 +33,6 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -# -# Pass UTS_MACHINE for user_regset definition -# -CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' - obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 58faa12542a1..994a8b86edae 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -1416,7 +1416,7 @@ static const struct user_regset s390_regsets[] = { }; static const struct user_regset_view user_s390_view = { - .name = UTS_MACHINE, + .name = "s390x", .e_machine = EM_S390, .regsets = s390_regsets, .n = ARRAY_SIZE(s390_regsets) From d1379279f2d6b407bd08324a170cb21928e69854 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Tue, 28 Apr 2020 11:25:56 -0400 Subject: [PATCH 078/574] s390/pci: removes wrong PCI multifunction assignment The assignment of the PCI device multifunction attribute is set during the PCI device probe. There is no need to set it here. Let's do it right and remove this assignment. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_bus.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 542c6b8f56df..ada571d1c630 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -156,10 +156,8 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) } pdev = pci_scan_single_device(bus, zdev->devfn); - if (pdev) { - pdev->multifunction = 1; + if (pdev) pci_bus_add_device(pdev); - } return 0; } From 9056754f65052bed370df04523dc3e8628948f9c Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Thu, 30 Apr 2020 04:02:17 -0400 Subject: [PATCH 079/574] s390/pci: Documentation update for s390 PCI Clarify the documentation. Signed-off-by: Pierre Morel Reviewed-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- Documentation/s390/pci.rst | 83 +++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/Documentation/s390/pci.rst b/Documentation/s390/pci.rst index 75e043d4da85..492850bff316 100644 --- a/Documentation/s390/pci.rst +++ b/Documentation/s390/pci.rst @@ -10,7 +10,7 @@ Authors: Copyright, IBM Corp. 2020 -command line parameters and debugfs entries +Command line parameters and debugfs entries =========================================== Command line parameters @@ -18,7 +18,7 @@ Command line parameters * nomio - Do not use MIO instructions. + Do not use PCI Mapped I/O (MIO) instructions. * norid @@ -27,100 +27,99 @@ Command line parameters debugfs entries --------------- -* /sys/kernel/debug/s390dbf/pci_*/ (S/390 debug feature) +The S/390 debug feature (s390dbf) generates views to hold various debug results in sysfs directories of the form: - Some views generated by the debug feature to hold various debug outputs. + * /sys/kernel/debug/s390dbf/pci_*/ + +For example: - /sys/kernel/debug/s390dbf/pci_msg/sprintf - Messages from the processing of PCI events like machine check handling - and setting of global functionality like UID checking. + Holds messages from the processing of PCI events, like machine check handling + and setting of global functionality, like UID checking. - The level of logging can be changed to be more or less verbose by piping to - /sys/kernel/debug/s390dbf/pci_*/level a number between 0 and 6; see the - documentation on the S/390 debug feature (Documentation/s390/s390dbf.rst) - for details. + Change the level of logging to be more or less verbose by piping + a number between 0 and 6 to /sys/kernel/debug/s390dbf/pci_*/level. For + details, see the documentation on the S/390 debug feature at + Documentation/s390/s390dbf.rst. Sysfs entries ============= -Specific entries, or entries specificities for zPCI functions. +Entries specific to zPCI functions and entries that hold zPCI information. * /sys/bus/pci/slots/XXXXXXXX - The slot entries are setup using the FID (Function Identifier) of the + The slot entries are set up using the function identifier (FID) of the PCI function. - /sys/bus/pci/slots/XXXXXXXX/power - A physical function currently supporting virtual function can not be - powered-off until all virtual-function have been removed with + A physical function that currently supports a virtual function cannot be + powered off until all virtual functions are removed with: echo 0 > /sys/bus/pci/devices/XXXX:XX:XX.X/sriov_numvf * /sys/bus/pci/devices/XXXX:XX:XX.X/ - function_id - zPCI function identifier unique for the complete Z System. - It define uniquely a function in the system. + A zPCI function identifier that uniquely identifies the function in the Z server. - function_handle - Low level identifier used for a configured PCI function. - It may be useful for debuging. + Low-level identifier used for a configured PCI function. + It might be useful for debuging. - pchid - Model dependent location of the I/O adapter. + Model-dependent location of the I/O adapter. - pfgid - PCI Function Group ID, functions sharing identical functionality - are using a common identifier. - A PCI group defines interrupts, IOMMU, IOTLB and DMA specifics. + PCI function group ID, functions that share identical functionality + use a common identifier. + A PCI group defines interrupts, IOMMU, IOTLB, and DMA specifics. - vfn - The Virtual Function Number, from 1 to N for virtual functions. + The virtual function number, from 1 to N for virtual functions, 0 for physical functions. - pft - PCI function type specifies the type of the PCI function. + The PCI function type - port - The port correspond to the physical port the function is attached to. - It also gives an indication on the physical function a virtual function + The port corresponds to the physical port the function is attached to. + It also gives an indication of the physical function a virtual function is attached to. - uid - The UID, Unique Identifier is defined when configuring a LPAR and is - unique inside an LPAR. + The unique identifier (UID) is defined when configuring an LPAR and is + unique in the LPAR. - pfip/segmentX - The segments are used to determine the isolation of a function. - They corresponds to the physical path to the function. - The more the segment are different the more the functions are isolated. + The segments determine the isolation of a function. + They correspond to the physical path to the function. + The more the segments are different, the more the functions are isolated. Enumeration and hotplug ======================= -The PCI address is made of 4 parts: domain, bus, device and function, -like in DDDD:BB:dd.f +The PCI address consists of four parts: domain, bus, device and function, +and is of this form: DDDD:BB:dd.f -* When not using multi-functions (norid is set or firmware does not support - multi-functions) +* When not using multi-functions (norid is set, or the firmware does not + support multi-functions): - There is only one function per domain. - - the domain is set from the zPCI function's UID as defined during the + - The domain is set from the zPCI function's UID as defined during the LPAR creation. - - Addresses look like DDDD:00:00.0 - -* When using multi-functions (norid parameter is not set), there are some - change in the way zPCI functions are addressed: +* When using multi-functions (norid parameter is not set), + zPCI functions are addressed differently: - There is still only one bus per domain. - There can be up to 256 functions per bus. - - The domain part of the address of all functions of all functions for + - The domain part of the address of all functions for a multi-Function device is set from the zPCI function's UID as defined in the LPAR creation for the function zero. - - New functions will only be ready to be used after the function zero + - New functions will only be ready for use after the function zero (the function with devfn 0) has been enumerated. From 3737e8ee4f2fc7e77994d1a8bd618a9dda5a5514 Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Mon, 2 Mar 2020 14:03:37 -0500 Subject: [PATCH 080/574] s390: nvme ipl Recognize IPL Block's Ipl Type of "nvme". Populate related structs and sysfs entries. Signed-off-by: Jason J. Herne Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ipl.h | 11 ++++++ arch/s390/include/uapi/asm/ipl.h | 25 +++++++++++++ arch/s390/kernel/ipl.c | 63 ++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index b63bd66404b8..7d5cfdda5277 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -21,6 +21,7 @@ struct ipl_parameter_block { struct ipl_pb0_common common; struct ipl_pb0_fcp fcp; struct ipl_pb0_ccw ccw; + struct ipl_pb0_nvme nvme; char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)]; }; } __packed __aligned(PAGE_SIZE); @@ -30,6 +31,11 @@ struct ipl_parameter_block { #define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \ sizeof(struct ipl_pb0_fcp)) #define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp)) + +#define IPL_BP_NVME_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_nvme)) +#define IPL_BP0_NVME_LEN (sizeof(struct ipl_pb0_nvme)) + #define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \ sizeof(struct ipl_pb0_ccw)) #define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw)) @@ -59,6 +65,7 @@ enum ipl_type { IPL_TYPE_FCP = 4, IPL_TYPE_FCP_DUMP = 8, IPL_TYPE_NSS = 16, + IPL_TYPE_NVME = 32, }; struct ipl_info @@ -73,6 +80,10 @@ struct ipl_info u64 wwpn; u64 lun; } fcp; + struct { + u32 fid; + u32 nsid; + } nvme; struct { char name[NSS_NAME_SIZE + 1]; } nss; diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index 451ba7d08905..d1ecd5d722a0 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -27,6 +27,7 @@ enum ipl_pbt { IPL_PBT_FCP = 0, IPL_PBT_SCP_DATA = 1, IPL_PBT_CCW = 2, + IPL_PBT_NVME = 4, }; /* IPL Parameter Block 0 with common fields */ @@ -67,6 +68,30 @@ struct ipl_pb0_fcp { #define IPL_PB0_FCP_OPT_IPL 0x10 #define IPL_PB0_FCP_OPT_DUMP 0x20 +/* IPL Parameter Block 0 for NVMe */ +struct ipl_pb0_nvme { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u8 loadparm[8]; + __u8 reserved2[304]; + __u8 opt; + __u8 reserved3[3]; + __u32 fid; + __u8 reserved4[12]; + __u32 nsid; + __u8 reserved5[4]; + __u32 bootprog; + __u8 reserved6[12]; + __u64 br_lba; + __u32 scp_data_len; + __u8 reserved7[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_NVME_OPT_IPL 0x10 +#define IPL_PB0_NVME_OPT_DUMP 0x20 + /* IPL Parameter Block 0 for CCW */ struct ipl_pb0_ccw { __u32 len; diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 4a71061974fd..939d421017fd 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -39,6 +39,7 @@ #define IPL_CCW_STR "ccw" #define IPL_FCP_STR "fcp" #define IPL_FCP_DUMP_STR "fcp_dump" +#define IPL_NVME_STR "nvme" #define IPL_NSS_STR "nss" #define DUMP_CCW_STR "ccw" @@ -93,6 +94,8 @@ static char *ipl_type_str(enum ipl_type type) return IPL_FCP_DUMP_STR; case IPL_TYPE_NSS: return IPL_NSS_STR; + case IPL_TYPE_NVME: + return IPL_NVME_STR; case IPL_TYPE_UNKNOWN: default: return IPL_UNKNOWN_STR; @@ -261,6 +264,8 @@ static __init enum ipl_type get_ipl_type(void) return IPL_TYPE_FCP_DUMP; else return IPL_TYPE_FCP; + case IPL_PBT_NVME: + return IPL_TYPE_NVME; } return IPL_TYPE_UNKNOWN; } @@ -317,6 +322,8 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); + case IPL_TYPE_NVME: + return sprintf(page, "%08ux\n", ipl_block.nvme.fid); default: return 0; } @@ -345,15 +352,35 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, return memory_read_from_buffer(buf, count, &off, scp_data, size); } + +static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + unsigned int size = ipl_block.nvme.scp_data_len; + void *scp_data = &ipl_block.nvme.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + static struct bin_attribute ipl_scp_data_attr = __BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE); +static struct bin_attribute ipl_nvme_scp_data_attr = + __BIN_ATTR(scp_data, S_IRUGO, ipl_nvme_scp_data_read, NULL, PAGE_SIZE); + static struct bin_attribute *ipl_fcp_bin_attrs[] = { &ipl_parameter_attr, &ipl_scp_data_attr, NULL, }; +static struct bin_attribute *ipl_nvme_bin_attrs[] = { + &ipl_parameter_attr, + &ipl_nvme_scp_data_attr, + NULL, +}; + /* FCP ipl device attributes */ DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n", @@ -365,6 +392,16 @@ DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n", DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", (unsigned long long)ipl_block.fcp.br_lba); +/* NVMe ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_nvme, fid, "0x%08llx\n", + (unsigned long long)ipl_block.nvme.fid); +DEFINE_IPL_ATTR_RO(ipl_nvme, nsid, "0x%08llx\n", + (unsigned long long)ipl_block.nvme.nsid); +DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n", + (unsigned long long)ipl_block.nvme.bootprog); +DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n", + (unsigned long long)ipl_block.nvme.br_lba); + static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -399,6 +436,24 @@ static struct attribute_group ipl_fcp_attr_group = { .bin_attrs = ipl_fcp_bin_attrs, }; +static struct attribute *ipl_nvme_attrs[] = { + &sys_ipl_type_attr.attr, + &sys_ipl_nvme_fid_attr.attr, + &sys_ipl_nvme_nsid_attr.attr, + &sys_ipl_nvme_bootprog_attr.attr, + &sys_ipl_nvme_br_lba_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, + NULL, +}; + +static struct attribute_group ipl_nvme_attr_group = { + .attrs = ipl_nvme_attrs, + .bin_attrs = ipl_nvme_bin_attrs, +}; + + /* CCW ipl device attributes */ static struct attribute *ipl_ccw_attrs_vm[] = { @@ -474,6 +529,9 @@ static int __init ipl_init(void) case IPL_TYPE_FCP_DUMP: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); break; + case IPL_TYPE_NVME: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group); + break; default: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_unknown_attr_group); @@ -949,6 +1007,7 @@ static void __reipl_run(void *unused) diag308(DIAG308_SET, reipl_block_nss); diag308(DIAG308_LOAD_CLEAR, NULL); break; + case IPL_TYPE_NVME: case IPL_TYPE_UNKNOWN: diag308(DIAG308_LOAD_CLEAR, NULL); break; @@ -1750,6 +1809,10 @@ void __init setup_ipl(void) ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn; ipl_info.data.fcp.lun = ipl_block.fcp.lun; break; + case IPL_TYPE_NVME: + ipl_info.data.nvme.fid = ipl_block.nvme.fid; + ipl_info.data.nvme.nsid = ipl_block.nvme.nsid; + break; case IPL_TYPE_NSS: case IPL_TYPE_UNKNOWN: /* We have no info to copy */ From 23a457b8d57dc8d0cc1dbd1882993dd2fcc4b0c0 Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Tue, 3 Mar 2020 14:11:19 -0500 Subject: [PATCH 081/574] s390: nvme reipl Populate sysfs and structs with reipl entries for nvme ipl type. This allows specifying a target nvme device when rebooting/reipling. Signed-off-by: Jason J. Herne Signed-off-by: Vasily Gorbik --- arch/s390/kernel/ipl.c | 148 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 147 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 939d421017fd..ccea9a245867 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -136,6 +136,7 @@ static int reipl_capabilities = IPL_TYPE_UNKNOWN; static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN; static struct ipl_parameter_block *reipl_block_fcp; +static struct ipl_parameter_block *reipl_block_nvme; static struct ipl_parameter_block *reipl_block_ccw; static struct ipl_parameter_block *reipl_block_nss; static struct ipl_parameter_block *reipl_block_actual; @@ -785,6 +786,93 @@ static struct attribute_group reipl_fcp_attr_group = { static struct kobj_attribute sys_reipl_fcp_clear_attr = __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store); +/* NVME reipl device attributes */ + +static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t size = reipl_block_nvme->nvme.scp_data_len; + void *scp_data = reipl_block_nvme->nvme.scp_data; + + return memory_read_from_buffer(buf, count, &off, scp_data, size); +} + +static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + size_t scpdata_len = count; + size_t padding; + + if (off) + return -EINVAL; + + memcpy(reipl_block_nvme->nvme.scp_data, buf, count); + if (scpdata_len % 8) { + padding = 8 - (scpdata_len % 8); + memset(reipl_block_nvme->nvme.scp_data + scpdata_len, + 0, padding); + scpdata_len += padding; + } + + reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len; + reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len; + reipl_block_nvme->nvme.scp_data_len = scpdata_len; + + return count; +} + +static struct bin_attribute sys_reipl_nvme_scp_data_attr = + __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_nvme_scpdata_read, + reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE); + +static struct bin_attribute *reipl_nvme_bin_attrs[] = { + &sys_reipl_nvme_scp_data_attr, + NULL, +}; + +DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n", + reipl_block_nvme->nvme.fid); +DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n", + reipl_block_nvme->nvme.nsid); +DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", + reipl_block_nvme->nvme.bootprog); +DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", + reipl_block_nvme->nvme.br_lba); + +/* nvme wrapper */ +static ssize_t reipl_nvme_loadparm_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return reipl_generic_loadparm_show(reipl_block_nvme, page); +} + +static ssize_t reipl_nvme_loadparm_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return reipl_generic_loadparm_store(reipl_block_nvme, buf, len); +} + +static struct kobj_attribute sys_reipl_nvme_loadparm_attr = + __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nvme_loadparm_show, + reipl_nvme_loadparm_store); + +static struct attribute *reipl_nvme_attrs[] = { + &sys_reipl_nvme_fid_attr.attr, + &sys_reipl_nvme_nsid_attr.attr, + &sys_reipl_nvme_bootprog_attr.attr, + &sys_reipl_nvme_br_lba_attr.attr, + &sys_reipl_nvme_loadparm_attr.attr, + NULL, +}; + +static struct attribute_group reipl_nvme_attr_group = { + .attrs = reipl_nvme_attrs, + .bin_attrs = reipl_nvme_bin_attrs +}; + /* CCW reipl device attributes */ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); @@ -949,6 +1037,9 @@ static int reipl_set_type(enum ipl_type type) case IPL_TYPE_FCP: reipl_block_actual = reipl_block_fcp; break; + case IPL_TYPE_NVME: + reipl_block_actual = reipl_block_nvme; + break; case IPL_TYPE_NSS: reipl_block_actual = reipl_block_nss; break; @@ -975,6 +1066,8 @@ static ssize_t reipl_type_store(struct kobject *kobj, rc = reipl_set_type(IPL_TYPE_CCW); else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0) rc = reipl_set_type(IPL_TYPE_FCP); + else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_NVME); else if (strncmp(buf, IPL_NSS_STR, strlen(IPL_NSS_STR)) == 0) rc = reipl_set_type(IPL_TYPE_NSS); return (rc != 0) ? rc : len; @@ -985,6 +1078,7 @@ static struct kobj_attribute reipl_type_attr = static struct kset *reipl_kset; static struct kset *reipl_fcp_kset; +static struct kset *reipl_nvme_kset; static void __reipl_run(void *unused) { @@ -1003,11 +1097,14 @@ static void __reipl_run(void *unused) else diag308(DIAG308_LOAD_NORMAL, NULL); break; + case IPL_TYPE_NVME: + diag308(DIAG308_SET, reipl_block_nvme); + diag308(DIAG308_LOAD_CLEAR, NULL); + break; case IPL_TYPE_NSS: diag308(DIAG308_SET, reipl_block_nss); diag308(DIAG308_LOAD_CLEAR, NULL); break; - case IPL_TYPE_NVME: case IPL_TYPE_UNKNOWN: diag308(DIAG308_LOAD_CLEAR, NULL); break; @@ -1152,6 +1249,49 @@ out1: return rc; } +static int __init reipl_nvme_init(void) +{ + int rc; + + reipl_block_nvme = (void *) get_zeroed_page(GFP_KERNEL); + if (!reipl_block_nvme) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_nvme_kset = kset_create_and_add(IPL_NVME_STR, NULL, + &reipl_kset->kobj); + if (!reipl_nvme_kset) { + free_page((unsigned long) reipl_block_nvme); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group); + if (rc) { + kset_unregister(reipl_nvme_kset); + free_page((unsigned long) reipl_block_nvme); + return rc; + } + + if (ipl_info.type == IPL_TYPE_NVME) { + memcpy(reipl_block_nvme, &ipl_block, sizeof(ipl_block)); + /* + * Fix loadparm: There are systems where the (SCSI) LOADPARM + * is invalid in the IPL parameter block, so take it + * always from sclp_ipl_info. + */ + memcpy(reipl_block_nvme->nvme.loadparm, sclp_ipl_info.loadparm, + LOADPARM_LEN); + } else { + reipl_block_nvme->hdr.len = IPL_BP_NVME_LEN; + reipl_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_nvme->nvme.len = IPL_BP0_NVME_LEN; + reipl_block_nvme->nvme.pbt = IPL_PBT_NVME; + reipl_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_NVME; + return 0; +} + static int __init reipl_type_init(void) { enum ipl_type reipl_type = ipl_info.type; @@ -1167,6 +1307,9 @@ static int __init reipl_type_init(void) if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) { memcpy(reipl_block_fcp, reipl_block, size); reipl_type = IPL_TYPE_FCP; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_NVME) { + memcpy(reipl_block_nvme, reipl_block, size); + reipl_type = IPL_TYPE_NVME; } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { memcpy(reipl_block_ccw, reipl_block, size); reipl_type = IPL_TYPE_CCW; @@ -1191,6 +1334,9 @@ static int __init reipl_init(void) if (rc) return rc; rc = reipl_fcp_init(); + if (rc) + return rc; + rc = reipl_nvme_init(); if (rc) return rc; rc = reipl_nss_init(); From 57829ea46875770c919ae491dcabf4e2aa5cb385 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 6 May 2020 14:16:59 +0800 Subject: [PATCH 082/574] sparc: mm: return true,false in kern_addr_valid() This function's return type is bool and returns both true/false and 0/1. This fixes the following coccicheck warning: arch/sparc/mm/init_64.c:1652:9-10: WARNING: return of 0/1 in function 'kern_addr_valid' with return type bool Signed-off-by: Jason Yan Signed-off-by: David S. Miller --- arch/sparc/mm/init_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 1cf0d666dea3..b488d4587793 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1674,29 +1674,29 @@ bool kern_addr_valid(unsigned long addr) pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) - return 0; + return false; p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) - return 0; + return false; pud = pud_offset(p4d, addr); if (pud_none(*pud)) - return 0; + return false; if (pud_large(*pud)) return pfn_valid(pud_pfn(*pud)); pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) - return 0; + return false; if (pmd_large(*pmd)) return pfn_valid(pmd_pfn(*pmd)); pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) - return 0; + return false; return pfn_valid(pte_pfn(*pte)); } From 02782f3d60f197f02efd153b43ef64214dc00c01 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Thu, 23 Apr 2020 15:25:31 +0530 Subject: [PATCH 083/574] iommu/arm-smmu: Make remove callback message more informative Currently on reboot/shutdown, the following messages are displayed on the console as error messages before the system reboots/shutdown as part of remove callback. On SC7180: arm-smmu 15000000.iommu: removing device with active domains! arm-smmu 5040000.iommu: removing device with active domains! Make this error message more informative and less scary. Reported-by: Douglas Anderson Suggested-by: Robin Murphy Signed-off-by: Sai Prakash Ranjan Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20200423095531.9868-1-saiprakash.ranjan@codeaurora.org [will: use dev_notice() as per Robin] Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index e622f4e33379..befe605f9c43 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -2244,7 +2244,7 @@ static int arm_smmu_device_remove(struct platform_device *pdev) return -ENODEV; if (!bitmap_empty(smmu->context_map, ARM_SMMU_MAX_CBS)) - dev_err(&pdev->dev, "removing device with active domains!\n"); + dev_notice(&pdev->dev, "disabling translation\n"); arm_smmu_bus_init(NULL); iommu_device_unregister(&smmu->iommu); From 64510ede36845500584485f3ad30dbcbf83091a6 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Tue, 21 Apr 2020 00:03:49 +0530 Subject: [PATCH 084/574] iommu: arm-smmu-impl: Convert to a generic reset implementation Currently the QCOM specific smmu reset implementation is very specific to SDM845 SoC and has a wait-for-safe logic which may not be required for other SoCs. So move the SDM845 specific logic to its specific reset function. Also add SC7180 SMMU compatible for calling into QCOM specific implementation. Signed-off-by: Sai Prakash Ranjan Reviewed-by: Bjorn Andersson Reviewed-by: Stephen Boyd Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/d24a0278021bc0b2732636c5728efe55e7318a8b.1587407458.git.saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-impl.c | 8 +++++--- drivers/iommu/arm-smmu-qcom.c | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm-smmu-impl.c b/drivers/iommu/arm-smmu-impl.c index 74d97a886e93..c75b9d957b70 100644 --- a/drivers/iommu/arm-smmu-impl.c +++ b/drivers/iommu/arm-smmu-impl.c @@ -150,6 +150,8 @@ static const struct arm_smmu_impl arm_mmu500_impl = { struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu) { + const struct device_node *np = smmu->dev->of_node; + /* * We will inevitably have to combine model-specific implementation * quirks with platform-specific integration quirks, but everything @@ -166,11 +168,11 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu) break; } - if (of_property_read_bool(smmu->dev->of_node, - "calxeda,smmu-secure-config-access")) + if (of_property_read_bool(np, "calxeda,smmu-secure-config-access")) smmu->impl = &calxeda_impl; - if (of_device_is_compatible(smmu->dev->of_node, "qcom,sdm845-smmu-500")) + if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") || + of_device_is_compatible(np, "qcom,sc7180-smmu-500")) return qcom_smmu_impl_init(smmu); return smmu; diff --git a/drivers/iommu/arm-smmu-qcom.c b/drivers/iommu/arm-smmu-qcom.c index 24c071c1d8b0..64a4ab270ab7 100644 --- a/drivers/iommu/arm-smmu-qcom.c +++ b/drivers/iommu/arm-smmu-qcom.c @@ -15,8 +15,6 @@ static int qcom_sdm845_smmu500_reset(struct arm_smmu_device *smmu) { int ret; - arm_mmu500_reset(smmu); - /* * To address performance degradation in non-real time clients, * such as USB and UFS, turn off wait-for-safe on sdm845 based boards, @@ -30,8 +28,20 @@ static int qcom_sdm845_smmu500_reset(struct arm_smmu_device *smmu) return ret; } +static int qcom_smmu500_reset(struct arm_smmu_device *smmu) +{ + const struct device_node *np = smmu->dev->of_node; + + arm_mmu500_reset(smmu); + + if (of_device_is_compatible(np, "qcom,sdm845-smmu-500")) + return qcom_sdm845_smmu500_reset(smmu); + + return 0; +} + static const struct arm_smmu_impl qcom_smmu_impl = { - .reset = qcom_sdm845_smmu500_reset, + .reset = qcom_smmu500_reset, }; struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) From 232c5ae8a3614f112712d43e1dbbd8dd6f8453c1 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Tue, 21 Apr 2020 00:03:50 +0530 Subject: [PATCH 085/574] iommu/arm-smmu: Implement iommu_ops->def_domain_type call-back Implement the new def_domain_type call-back for the ARM SMMU driver. We need this to support requesting the domain type by the client devices. Signed-off-by: Sai Prakash Ranjan Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/28c5d101cc4ac29aff3553ecec7cf256d0907ed7.1587407458.git.saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu.c | 12 ++++++++++++ drivers/iommu/arm-smmu.h | 1 + 2 files changed, 13 insertions(+) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index befe605f9c43..243bc4cb2705 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1609,6 +1609,17 @@ static void arm_smmu_get_resv_regions(struct device *dev, iommu_dma_get_resv_regions(dev, head); } +static int arm_smmu_def_domain_type(struct device *dev) +{ + struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev); + const struct arm_smmu_impl *impl = cfg->smmu->impl; + + if (impl && impl->def_domain_type) + return impl->def_domain_type(dev); + + return 0; +} + static struct iommu_ops arm_smmu_ops = { .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, @@ -1627,6 +1638,7 @@ static struct iommu_ops arm_smmu_ops = { .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, + .def_domain_type = arm_smmu_def_domain_type, .pgsize_bitmap = -1UL, /* Restricted during device attach */ }; diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h index 8d1cd54d82a6..d172c024be61 100644 --- a/drivers/iommu/arm-smmu.h +++ b/drivers/iommu/arm-smmu.h @@ -386,6 +386,7 @@ struct arm_smmu_impl { int (*init_context)(struct arm_smmu_domain *smmu_domain); void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync, int status); + int (*def_domain_type)(struct device *dev); }; static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n) From 0e764a01015dfebff8a8ffd297d74663772e248a Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 21 Apr 2020 00:03:51 +0530 Subject: [PATCH 086/574] iommu/arm-smmu: Allow client devices to select direct mapping Some client devices want to directly map the IOMMU themselves instead of using the DMA domain. Allow those devices to opt in to direct mapping by way of a list of compatible strings. Co-developed-by: Sai Prakash Ranjan Signed-off-by: Jordan Crouse Signed-off-by: Sai Prakash Ranjan Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/7cf1f64167b5545b7f42275395be1f1e2ea3a6ac.1587407458.git.saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-qcom.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/arm-smmu-qcom.c b/drivers/iommu/arm-smmu-qcom.c index 64a4ab270ab7..5bedf21587a5 100644 --- a/drivers/iommu/arm-smmu-qcom.c +++ b/drivers/iommu/arm-smmu-qcom.c @@ -3,6 +3,7 @@ * Copyright (c) 2019, The Linux Foundation. All rights reserved. */ +#include #include #include "arm-smmu.h" @@ -11,6 +12,23 @@ struct qcom_smmu { struct arm_smmu_device smmu; }; +static const struct of_device_id qcom_smmu_client_of_match[] = { + { .compatible = "qcom,adreno" }, + { .compatible = "qcom,mdp4" }, + { .compatible = "qcom,mdss" }, + { .compatible = "qcom,sc7180-mdss" }, + { .compatible = "qcom,sdm845-mdss" }, + { } +}; + +static int qcom_smmu_def_domain_type(struct device *dev) +{ + const struct of_device_id *match = + of_match_device(qcom_smmu_client_of_match, dev); + + return match ? IOMMU_DOMAIN_IDENTITY : 0; +} + static int qcom_sdm845_smmu500_reset(struct arm_smmu_device *smmu) { int ret; @@ -41,6 +59,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, }; From 23cf515c604277f6a44ef2ba415d7f6327abcf05 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 21 Apr 2020 00:03:53 +0530 Subject: [PATCH 087/574] dt-bindings: remoteproc: qcom: Add iommus property Add iommus property to allow Q6 modem to boot on platforms which do not have trustZone. Signed-off-by: Sibi Sankar Signed-off-by: Sai Prakash Ranjan Acked-by: Rob Herring Link: https://lore.kernel.org/r/561e9b42b8665fc9712fdb40a525ab8871fcbdac.1587407458.git.saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt index 88dfa3fc15f7..130e50aab741 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt @@ -184,6 +184,9 @@ For the compatible strings below the following phandle references are required: followed by the offset within syscon for conn_box_spare0 register. +The Hexagon node must contain iommus property as described in ../iommu/iommu.txt +on platforms which do not have TrustZone. + = SUBNODES: The Hexagon node must contain two subnodes, named "mba" and "mpss" representing the memory regions used by the Hexagon firmware. Each sub-node must contain: From 68aee4af5f62030869021d81c5d59ebd99528d4b Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 21 Apr 2020 00:03:54 +0530 Subject: [PATCH 088/574] arm64: dts: qcom: sdm845-cheza: Add iommus property Add iommus property to remoteproc modem node. Following SMMU global faults are seen without it. arm-smmu 15000000.iommu: Unexpected global fault, this could be serious arm-smmu 15000000.iommu: GFSR 0x80000002, GFSYNR0 0x00000000, GFSYNR1 0x00000781, GFSYNR2 0x00000000 arm-smmu 15000000.iommu: Unexpected global fault, this could be serious arm-smmu 15000000.iommu: GFSR 0x80000002, GFSYNR0 0x00000000, GFSYNR1 0x00000461, GFSYNR2 0x00000000 Signed-off-by: Sibi Sankar Signed-off-by: Sai Prakash Ranjan Link: https://lore.kernel.org/r/38c607841e81664a2db69a27260cd7dfbd653458.1587407458.git.saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi index 9070be43a309..07081da2c83e 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi @@ -631,6 +631,11 @@ ap_ts_i2c: &i2c14 { status = "okay"; }; +&mss_pil { + iommus = <&apps_smmu 0x780 0x1>, + <&apps_smmu 0x724 0x3>; +}; + &pm8998_pwrkey { status = "disabled"; }; From bd0d696023cb381608e9c3d59dddb25b3816eee7 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 7 May 2020 12:24:14 +0100 Subject: [PATCH 089/574] dt-bindings: arm-smmu: Allow mmu-400, smmu-v1 compatible The Arm SMMUv1 DT binding only allows combining arm,mmu-401 with arm,smmu-v1, even though the MMU-400 is compatible as well. Allow this combination as well to let the Arm Juno board pass the test. Signed-off-by: Andre Przywara Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20200507112430.183940-2-andre.przywara@arm.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/iommu/arm,smmu.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml index 6515dbe47508..e3ef1c69d132 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml @@ -41,7 +41,9 @@ properties: - const: arm,mmu-500 - const: arm,smmu-v2 - items: - - const: arm,mmu-401 + - enum: + - arm,mmu-400 + - arm,mmu-401 - const: arm,smmu-v1 - enum: - arm,smmu-v1 From cd9fc8f1b35bd75e0d33470a01baff2848a9443a Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 22 Apr 2020 17:37:33 -0700 Subject: [PATCH 090/574] remoteproc: qcom: Pass ssr_name to glink subdevice Pass ssr_name to glink subdevice in preparation for tying glink_ssr to the glink subdevice, rather than having its own "ssr subdevice". Acked-by: Chris Lew Acked-by: Mathieu Poirier Acked-by: Rishabh Bhatnagar Link: https://lore.kernel.org/r/20200423003736.2027371-2-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_common.c | 9 ++++++++- drivers/remoteproc/qcom_common.h | 5 ++++- drivers/remoteproc/qcom_q6v5_adsp.c | 2 +- drivers/remoteproc/qcom_q6v5_mss.c | 2 +- drivers/remoteproc/qcom_q6v5_pas.c | 2 +- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/remoteproc/qcom_common.c b/drivers/remoteproc/qcom_common.c index 60650bcc8c67..ff26f2b68752 100644 --- a/drivers/remoteproc/qcom_common.c +++ b/drivers/remoteproc/qcom_common.c @@ -46,8 +46,10 @@ static void glink_subdev_stop(struct rproc_subdev *subdev, bool crashed) * qcom_add_glink_subdev() - try to add a GLINK subdevice to rproc * @rproc: rproc handle to parent the subdevice * @glink: reference to a GLINK subdev context + * @ssr_name: identifier of the associated remoteproc for ssr notifications */ -void qcom_add_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glink) +void qcom_add_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glink, + const char *ssr_name) { struct device *dev = &rproc->dev; @@ -55,6 +57,10 @@ void qcom_add_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glink) if (!glink->node) return; + glink->ssr_name = kstrdup_const(ssr_name, GFP_KERNEL); + if (!glink->ssr_name) + return; + glink->dev = dev; glink->subdev.start = glink_subdev_start; glink->subdev.stop = glink_subdev_stop; @@ -74,6 +80,7 @@ void qcom_remove_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glin return; rproc_remove_subdev(rproc, &glink->subdev); + kfree_const(glink->ssr_name); of_node_put(glink->node); } EXPORT_SYMBOL_GPL(qcom_remove_glink_subdev); diff --git a/drivers/remoteproc/qcom_common.h b/drivers/remoteproc/qcom_common.h index 58de71e4781c..34e5188187dc 100644 --- a/drivers/remoteproc/qcom_common.h +++ b/drivers/remoteproc/qcom_common.h @@ -11,6 +11,8 @@ struct qcom_sysmon; struct qcom_rproc_glink { struct rproc_subdev subdev; + const char *ssr_name; + struct device *dev; struct device_node *node; struct qcom_glink *edge; @@ -30,7 +32,8 @@ struct qcom_rproc_ssr { const char *name; }; -void qcom_add_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glink); +void qcom_add_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glink, + const char *ssr_name); void qcom_remove_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glink); int qcom_register_dump_segments(struct rproc *rproc, const struct firmware *fw); diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c b/drivers/remoteproc/qcom_q6v5_adsp.c index c60dabc6939e..d2a2574dcf35 100644 --- a/drivers/remoteproc/qcom_q6v5_adsp.c +++ b/drivers/remoteproc/qcom_q6v5_adsp.c @@ -461,7 +461,7 @@ static int adsp_probe(struct platform_device *pdev) if (ret) goto disable_pm; - qcom_add_glink_subdev(rproc, &adsp->glink_subdev); + qcom_add_glink_subdev(rproc, &adsp->glink_subdev, desc->ssr_name); qcom_add_ssr_subdev(rproc, &adsp->ssr_subdev, desc->ssr_name); adsp->sysmon = qcom_add_sysmon_subdev(rproc, desc->sysmon_name, diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index a335d2eebe97..38d2b21735da 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -1762,7 +1762,7 @@ static int q6v5_probe(struct platform_device *pdev) qproc->mpss_perm = BIT(QCOM_SCM_VMID_HLOS); qproc->mba_perm = BIT(QCOM_SCM_VMID_HLOS); - qcom_add_glink_subdev(rproc, &qproc->glink_subdev); + qcom_add_glink_subdev(rproc, &qproc->glink_subdev, "mpss"); qcom_add_smd_subdev(rproc, &qproc->smd_subdev); qcom_add_ssr_subdev(rproc, &qproc->ssr_subdev, "mpss"); qcom_add_ipa_notify_subdev(rproc, &qproc->ipa_notify_subdev); diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 8ecc157f1ed1..fc6658b523b6 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -436,7 +436,7 @@ static int adsp_probe(struct platform_device *pdev) if (ret) goto detach_proxy_pds; - qcom_add_glink_subdev(rproc, &adsp->glink_subdev); + qcom_add_glink_subdev(rproc, &adsp->glink_subdev, desc->ssr_name); qcom_add_smd_subdev(rproc, &adsp->smd_subdev); qcom_add_ssr_subdev(rproc, &adsp->ssr_subdev, desc->ssr_name); adsp->sysmon = qcom_add_sysmon_subdev(rproc, From 5d1f2e3c8090c0769ee4a1b920e82277613327fc Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 22 Apr 2020 17:37:34 -0700 Subject: [PATCH 091/574] soc: qcom: glink_ssr: Internalize ssr_notifiers Rather than carrying a special purpose blocking notifier for glink_ssr in remoteproc's qcom_common.c, move it into glink_ssr so allow wider reuse of the common one. The rpmsg glink header file is used in preparation for the next patch. Acked-by: Chris Lew Acked-by: Rishabh Bhatnagar Link: https://lore.kernel.org/r/20200423003736.2027371-3-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_common.c | 8 ++++++++ drivers/soc/qcom/glink_ssr.c | 24 +++++++++++++++++++----- include/linux/rpmsg/qcom_glink.h | 6 ++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/drivers/remoteproc/qcom_common.c b/drivers/remoteproc/qcom_common.c index ff26f2b68752..9028cea2d81e 100644 --- a/drivers/remoteproc/qcom_common.c +++ b/drivers/remoteproc/qcom_common.c @@ -42,6 +42,13 @@ static void glink_subdev_stop(struct rproc_subdev *subdev, bool crashed) glink->edge = NULL; } +static void glink_subdev_unprepare(struct rproc_subdev *subdev) +{ + struct qcom_rproc_glink *glink = to_glink_subdev(subdev); + + qcom_glink_ssr_notify(glink->ssr_name); +} + /** * qcom_add_glink_subdev() - try to add a GLINK subdevice to rproc * @rproc: rproc handle to parent the subdevice @@ -64,6 +71,7 @@ void qcom_add_glink_subdev(struct rproc *rproc, struct qcom_rproc_glink *glink, glink->dev = dev; glink->subdev.start = glink_subdev_start; glink->subdev.stop = glink_subdev_stop; + glink->subdev.unprepare = glink_subdev_unprepare; rproc_add_subdev(rproc, &glink->subdev); } diff --git a/drivers/soc/qcom/glink_ssr.c b/drivers/soc/qcom/glink_ssr.c index d7babe3d67bc..847d79c935f1 100644 --- a/drivers/soc/qcom/glink_ssr.c +++ b/drivers/soc/qcom/glink_ssr.c @@ -54,6 +54,19 @@ struct glink_ssr { struct completion completion; }; +/* Notifier list for all registered glink_ssr instances */ +static BLOCKING_NOTIFIER_HEAD(ssr_notifiers); + +/** + * qcom_glink_ssr_notify() - notify GLINK SSR about stopped remoteproc + * @ssr_name: name of the remoteproc that has been stopped + */ +void qcom_glink_ssr_notify(const char *ssr_name) +{ + blocking_notifier_call_chain(&ssr_notifiers, 0, (void *)ssr_name); +} +EXPORT_SYMBOL_GPL(qcom_glink_ssr_notify); + static int qcom_glink_ssr_callback(struct rpmsg_device *rpdev, void *data, int len, void *priv, u32 addr) { @@ -81,8 +94,9 @@ static int qcom_glink_ssr_callback(struct rpmsg_device *rpdev, return 0; } -static int qcom_glink_ssr_notify(struct notifier_block *nb, unsigned long event, - void *data) +static int qcom_glink_ssr_notifier_call(struct notifier_block *nb, + unsigned long event, + void *data) { struct glink_ssr *ssr = container_of(nb, struct glink_ssr, nb); struct do_cleanup_msg msg; @@ -121,18 +135,18 @@ static int qcom_glink_ssr_probe(struct rpmsg_device *rpdev) ssr->dev = &rpdev->dev; ssr->ept = rpdev->ept; - ssr->nb.notifier_call = qcom_glink_ssr_notify; + ssr->nb.notifier_call = qcom_glink_ssr_notifier_call; dev_set_drvdata(&rpdev->dev, ssr); - return qcom_register_ssr_notifier(&ssr->nb); + return blocking_notifier_chain_register(&ssr_notifiers, &ssr->nb); } static void qcom_glink_ssr_remove(struct rpmsg_device *rpdev) { struct glink_ssr *ssr = dev_get_drvdata(&rpdev->dev); - qcom_unregister_ssr_notifier(&ssr->nb); + blocking_notifier_chain_unregister(&ssr_notifiers, &ssr->nb); } static const struct rpmsg_device_id qcom_glink_ssr_match[] = { diff --git a/include/linux/rpmsg/qcom_glink.h b/include/linux/rpmsg/qcom_glink.h index 96e26d94719f..09daa0acde2c 100644 --- a/include/linux/rpmsg/qcom_glink.h +++ b/include/linux/rpmsg/qcom_glink.h @@ -26,4 +26,10 @@ static inline void qcom_glink_smem_unregister(struct qcom_glink *glink) {} #endif +#if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK_SSR) +void qcom_glink_ssr_notify(const char *ssr_name); +#else +static inline void qcom_glink_ssr_notify(const char *ssr_name) {} +#endif + #endif From 93bc3feee8bd5fbe29ad27779c5e7b369fd7c80b Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 22 Apr 2020 17:37:35 -0700 Subject: [PATCH 092/574] rpmsg: glink: Integrate glink_ssr in qcom_glink In all but the very special case of a system with _only_ glink_rpm, GLINK is dependent on glink_ssr, so move it to rpmsg and combine it with qcom_glink_native in the new qcom_glink kernel module. Acked-by: Chris Lew Acked-by: Rishabh Bhatnagar Link: https://lore.kernel.org/r/20200423003736.2027371-4-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/rpmsg/Kconfig | 6 +++--- drivers/rpmsg/Makefile | 3 ++- drivers/{soc/qcom/glink_ssr.c => rpmsg/qcom_glink_ssr.c} | 4 ---- drivers/soc/qcom/Kconfig | 9 --------- drivers/soc/qcom/Makefile | 1 - include/linux/rpmsg/qcom_glink.h | 7 +------ 6 files changed, 6 insertions(+), 24 deletions(-) rename drivers/{soc/qcom/glink_ssr.c => rpmsg/qcom_glink_ssr.c} (97%) diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig index a9108ff563dc..f96716893c2a 100644 --- a/drivers/rpmsg/Kconfig +++ b/drivers/rpmsg/Kconfig @@ -24,13 +24,13 @@ config RPMSG_MTK_SCP remote processors in MediaTek platforms. This use IPI and IPC to communicate with remote processors. -config RPMSG_QCOM_GLINK_NATIVE +config RPMSG_QCOM_GLINK tristate select RPMSG config RPMSG_QCOM_GLINK_RPM tristate "Qualcomm RPM Glink driver" - select RPMSG_QCOM_GLINK_NATIVE + select RPMSG_QCOM_GLINK depends on HAS_IOMEM depends on MAILBOX help @@ -40,7 +40,7 @@ config RPMSG_QCOM_GLINK_RPM config RPMSG_QCOM_GLINK_SMEM tristate "Qualcomm SMEM Glink driver" - select RPMSG_QCOM_GLINK_NATIVE + select RPMSG_QCOM_GLINK depends on MAILBOX depends on QCOM_SMEM help diff --git a/drivers/rpmsg/Makefile b/drivers/rpmsg/Makefile index ae92a7fb08f6..ffe932ef6050 100644 --- a/drivers/rpmsg/Makefile +++ b/drivers/rpmsg/Makefile @@ -2,8 +2,9 @@ obj-$(CONFIG_RPMSG) += rpmsg_core.o obj-$(CONFIG_RPMSG_CHAR) += rpmsg_char.o obj-$(CONFIG_RPMSG_MTK_SCP) += mtk_rpmsg.o +qcom_glink-objs := qcom_glink_native.o qcom_glink_ssr.o +obj-$(CONFIG_RPMSG_QCOM_GLINK) += qcom_glink.o obj-$(CONFIG_RPMSG_QCOM_GLINK_RPM) += qcom_glink_rpm.o -obj-$(CONFIG_RPMSG_QCOM_GLINK_NATIVE) += qcom_glink_native.o obj-$(CONFIG_RPMSG_QCOM_GLINK_SMEM) += qcom_glink_smem.o obj-$(CONFIG_RPMSG_QCOM_SMD) += qcom_smd.o obj-$(CONFIG_RPMSG_VIRTIO) += virtio_rpmsg_bus.o diff --git a/drivers/soc/qcom/glink_ssr.c b/drivers/rpmsg/qcom_glink_ssr.c similarity index 97% rename from drivers/soc/qcom/glink_ssr.c rename to drivers/rpmsg/qcom_glink_ssr.c index 847d79c935f1..dcd1ce616974 100644 --- a/drivers/soc/qcom/glink_ssr.c +++ b/drivers/rpmsg/qcom_glink_ssr.c @@ -164,7 +164,3 @@ static struct rpmsg_driver qcom_glink_ssr_driver = { }, }; module_rpmsg_driver(qcom_glink_ssr_driver); - -MODULE_ALIAS("rpmsg:glink_ssr"); -MODULE_DESCRIPTION("Qualcomm GLINK SSR notifier"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index bf42a17a45de..811c91289cbf 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -35,15 +35,6 @@ config QCOM_GENI_SE driver is also used to manage the common aspects of multiple Serial Engines present in the QUP. -config QCOM_GLINK_SSR - tristate "Qualcomm Glink SSR driver" - depends on RPMSG - depends on QCOM_RPROC_COMMON - help - Say y here to enable GLINK SSR support. The GLINK SSR driver - implements the SSR protocol for notifying the remote processor about - neighboring subsystems going up or down. - config QCOM_GSBI tristate "QCOM General Serial Bus Interface" depends on ARCH_QCOM || COMPILE_TEST diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile index 5d6b83dc58e8..e9cacc9ad401 100644 --- a/drivers/soc/qcom/Makefile +++ b/drivers/soc/qcom/Makefile @@ -3,7 +3,6 @@ CFLAGS_rpmh-rsc.o := -I$(src) obj-$(CONFIG_QCOM_AOSS_QMP) += qcom_aoss.o obj-$(CONFIG_QCOM_GENI_SE) += qcom-geni-se.o obj-$(CONFIG_QCOM_COMMAND_DB) += cmd-db.o -obj-$(CONFIG_QCOM_GLINK_SSR) += glink_ssr.o obj-$(CONFIG_QCOM_GSBI) += qcom_gsbi.o obj-$(CONFIG_QCOM_MDT_LOADER) += mdt_loader.o obj-$(CONFIG_QCOM_OCMEM) += ocmem.o diff --git a/include/linux/rpmsg/qcom_glink.h b/include/linux/rpmsg/qcom_glink.h index 09daa0acde2c..daded9fddf36 100644 --- a/include/linux/rpmsg/qcom_glink.h +++ b/include/linux/rpmsg/qcom_glink.h @@ -12,6 +12,7 @@ struct qcom_glink; struct qcom_glink *qcom_glink_smem_register(struct device *parent, struct device_node *node); void qcom_glink_smem_unregister(struct qcom_glink *glink); +void qcom_glink_ssr_notify(const char *ssr_name); #else @@ -23,12 +24,6 @@ qcom_glink_smem_register(struct device *parent, } static inline void qcom_glink_smem_unregister(struct qcom_glink *glink) {} - -#endif - -#if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK_SSR) -void qcom_glink_ssr_notify(const char *ssr_name); -#else static inline void qcom_glink_ssr_notify(const char *ssr_name) {} #endif From 60da7d0bc7482a30817c73fdc8152123d19919f8 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:23:02 -0500 Subject: [PATCH 093/574] sparc64: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- arch/sparc/kernel/cpumap.c | 2 +- arch/sparc/kernel/ds.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c index 1cb62bfeaa1f..f07ea88a83af 100644 --- a/arch/sparc/kernel/cpumap.c +++ b/arch/sparc/kernel/cpumap.c @@ -50,7 +50,7 @@ struct cpuinfo_tree { /* Offsets into nodes[] for each level of the tree */ struct cpuinfo_level level[CPUINFO_LVL_MAX]; - struct cpuinfo_node nodes[0]; + struct cpuinfo_node nodes[]; }; diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c index 75232cbd58bf..522e5b51050c 100644 --- a/arch/sparc/kernel/ds.c +++ b/arch/sparc/kernel/ds.c @@ -87,7 +87,7 @@ struct ds_reg_req { __u64 handle; __u16 major; __u16 minor; - char svc_id[0]; + char svc_id[]; }; struct ds_reg_ack { @@ -701,12 +701,12 @@ struct ds_var_hdr { struct ds_var_set_msg { struct ds_var_hdr hdr; - char name_and_value[0]; + char name_and_value[]; }; struct ds_var_delete_msg { struct ds_var_hdr hdr; - char name[0]; + char name[]; }; struct ds_var_resp { @@ -989,7 +989,7 @@ struct ds_queue_entry { struct ds_info *dp; int req_len; int __pad; - u64 req[0]; + u64 req[]; }; static void process_ds_work(void) From 8096f80a5c09b716be207eb042c4e40d6cdefbd2 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 9 May 2020 08:42:37 +0000 Subject: [PATCH 094/574] remoteproc/mediatek: fix invalid use of sizeof in scp_ipi_init() sizeof() when applied to a pointer typed expression gives the size of the pointer, not that of the pointed data. Reviewed-by: Mathieu Poirier Fixes: 63c13d61eafe ("remoteproc/mediatek: add SCP support for mt8183") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20200509084237.36293-1-weiyongjun1@huawei.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/mtk_scp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c index ea3743e7e794..1ebad7084c47 100644 --- a/drivers/remoteproc/mtk_scp.c +++ b/drivers/remoteproc/mtk_scp.c @@ -132,8 +132,8 @@ static int scp_ipi_init(struct mtk_scp *scp) (struct mtk_share_obj __iomem *)(scp->sram_base + recv_offset); scp->send_buf = (struct mtk_share_obj __iomem *)(scp->sram_base + send_offset); - memset_io(scp->recv_buf, 0, sizeof(scp->recv_buf)); - memset_io(scp->send_buf, 0, sizeof(scp->send_buf)); + memset_io(scp->recv_buf, 0, sizeof(*scp->recv_buf)); + memset_io(scp->send_buf, 0, sizeof(*scp->send_buf)); return 0; } From 69acee2e4ee3d719e9424fbcb4528e88a5e695e8 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 21 Apr 2020 20:02:22 +0530 Subject: [PATCH 095/574] dt-bindings: remoteproc: qcom: Add SC7180 MPSS support Add MPSS PAS support for SC7180 SoCs. Acked-by: Rob Herring Reviewed-by: Bjorn Andersson Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200421143228.8981-2-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt index 9938918b2fea..22604d2cd3f8 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt @@ -15,6 +15,7 @@ on the Qualcomm ADSP Hexagon core. "qcom,qcs404-adsp-pas" "qcom,qcs404-cdsp-pas" "qcom,qcs404-wcss-pas" + "qcom,sc7180-mpss-pas" "qcom,sdm845-adsp-pas" "qcom,sdm845-cdsp-pas" "qcom,sm8150-adsp-pas" @@ -46,6 +47,7 @@ on the Qualcomm ADSP Hexagon core. qcom,sm8150-slpi-pas: must be "wdog", "fatal", "ready", "handover", "stop-ack" qcom,qcs404-wcss-pas: + qcom,sc7180-mpss-pas: qcom,sm8150-mpss-pas: must be "wdog", "fatal", "ready", "handover", "stop-ack", "shutdown-ack" @@ -106,6 +108,7 @@ on the Qualcomm ADSP Hexagon core. qcom,sm8150-adsp-pas: qcom,sm8150-cdsp-pas: must be "cx", "load_state" + qcom,sc7180-mpss-pas: qcom,sm8150-mpss-pas: must be "cx", "load_state", "mss" qcom,sm8150-slpi-pas: From 620d70b04d43b8659b099b125626325333bc887f Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 21 Apr 2020 20:02:23 +0530 Subject: [PATCH 096/574] remoteproc: qcom: pas: Add SC7180 Modem support Add support for booting the Modem DSP found on Qualcomm's SC7180 SoCs. Tested-by: Evan Green Reviewed-by: Bjorn Andersson Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200421143228.8981-3-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_pas.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index fc6658b523b6..5a33318e8c68 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -638,6 +638,7 @@ static const struct of_device_id adsp_of_match[] = { { .compatible = "qcom,qcs404-adsp-pas", .data = &adsp_resource_init }, { .compatible = "qcom,qcs404-cdsp-pas", .data = &cdsp_resource_init }, { .compatible = "qcom,qcs404-wcss-pas", .data = &wcss_resource_init }, + { .compatible = "qcom,sc7180-mpss-pas", .data = &mpss_resource_init}, { .compatible = "qcom,sdm845-adsp-pas", .data = &adsp_resource_init}, { .compatible = "qcom,sdm845-cdsp-pas", .data = &cdsp_resource_init}, { .compatible = "qcom,sm8150-adsp-pas", .data = &sm8150_adsp_resource}, From d964b0b1a863cceaa694a2db485b1347e49a8082 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 21 Apr 2020 20:02:24 +0530 Subject: [PATCH 097/574] dt-bindings: remoteproc: qcom: Use memory-region to reference memory Use memory-region property to reference mba and mpss memory regions. Reviewed-by: Rob Herring Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200421143228.8981-4-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/remoteproc/qcom,q6v5.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt index 88dfa3fc15f7..ea2869c3210f 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt @@ -102,6 +102,14 @@ on the Qualcomm Hexagon core. must be "mss_restart", "pdc_reset" for the modem sub-system on SC7180, SDM845 SoCs +For devices where the mba and mpss sub-nodes are not specified, mba/mpss region +should be referenced as follows: +- memory-region: + Usage: required + Value type: + Definition: reference to the reserved-memory for the mba region followed + by the mpss region + For the compatible strings below the following supplies are required: "qcom,q6v5-pil" "qcom,msm8916-mss-pil", From 6663ce6facf93727a3a08c5a6408405cd6094c48 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Tue, 21 Apr 2020 20:02:25 +0530 Subject: [PATCH 098/574] remoteproc: qcom_q6v5_mss: Extract mba/mpss from memory-region In the absence of mba and mpss sub-child extract the mba/mpss regions from the memory-region property. Tested-by: Evan Green Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200421143228.8981-5-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_mss.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index 38d2b21735da..3a7fcd6485eb 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -1568,8 +1568,17 @@ static int q6v5_alloc_memory_region(struct q6v5 *qproc) struct resource r; int ret; + /* + * In the absence of mba/mpss sub-child, extract the mba and mpss + * reserved memory regions from device's memory-region property. + */ child = of_get_child_by_name(qproc->dev->of_node, "mba"); - node = of_parse_phandle(child, "memory-region", 0); + if (!child) + node = of_parse_phandle(qproc->dev->of_node, + "memory-region", 0); + else + node = of_parse_phandle(child, "memory-region", 0); + ret = of_address_to_resource(node, 0, &r); if (ret) { dev_err(qproc->dev, "unable to resolve mba region\n"); @@ -1586,8 +1595,14 @@ static int q6v5_alloc_memory_region(struct q6v5 *qproc) return -EBUSY; } - child = of_get_child_by_name(qproc->dev->of_node, "mpss"); - node = of_parse_phandle(child, "memory-region", 0); + if (!child) { + node = of_parse_phandle(qproc->dev->of_node, + "memory-region", 1); + } else { + child = of_get_child_by_name(qproc->dev->of_node, "mpss"); + node = of_parse_phandle(child, "memory-region", 0); + } + ret = of_address_to_resource(node, 0, &r); if (ret) { dev_err(qproc->dev, "unable to resolve mpss region\n"); From 45ba7a893ad89114e773b3dc32f6431354c465d6 Mon Sep 17 00:00:00 2001 From: David Gow Date: Thu, 30 Apr 2020 21:27:01 -0700 Subject: [PATCH 099/574] kunit: kunit_tool: Separate out config/build/exec/parse Add new subcommands to kunit.py to allow stages of the existing 'run' subcommand to be run independently: - 'config': Verifies that .config is a subset of .kunitconfig - 'build': Compiles a UML kernel for KUnit - 'exec': Runs the kernel, and outputs the test results. - 'parse': Parses test results from a file or stdin The 'run' command continues to behave as before. Signed-off-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 287 ++++++++++++++++++++----- tools/testing/kunit/kunit_tool_test.py | 63 +++++- 2 files changed, 293 insertions(+), 57 deletions(-) diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index 7dca74774dd2..b01838b6f5f9 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -20,8 +20,17 @@ import kunit_config import kunit_kernel import kunit_parser -KunitResult = namedtuple('KunitResult', ['status','result']) +KunitResult = namedtuple('KunitResult', ['status','result','elapsed_time']) +KunitConfigRequest = namedtuple('KunitConfigRequest', + ['build_dir', 'defconfig', 'make_options']) +KunitBuildRequest = namedtuple('KunitBuildRequest', + ['jobs', 'build_dir', 'alltests', + 'make_options']) +KunitExecRequest = namedtuple('KunitExecRequest', + ['timeout', 'build_dir', 'alltests']) +KunitParseRequest = namedtuple('KunitParseRequest', + ['raw_output', 'input_data']) KunitRequest = namedtuple('KunitRequest', ['raw_output','timeout', 'jobs', 'build_dir', 'defconfig', 'alltests', 'make_options']) @@ -46,14 +55,25 @@ def get_kernel_root_path(): sys.exit(1) return parts[0] -def run_tests(linux: kunit_kernel.LinuxSourceTree, - request: KunitRequest) -> KunitResult: +def config_tests(linux: kunit_kernel.LinuxSourceTree, + request: KunitConfigRequest) -> KunitResult: + kunit_parser.print_with_timestamp('Configuring KUnit Kernel ...') + config_start = time.time() + if request.defconfig: + create_default_kunitconfig() success = linux.build_reconfig(request.build_dir, request.make_options) config_end = time.time() if not success: - return KunitResult(KunitStatus.CONFIG_FAILURE, 'could not configure kernel') + return KunitResult(KunitStatus.CONFIG_FAILURE, + 'could not configure kernel', + config_end - config_start) + return KunitResult(KunitStatus.SUCCESS, + 'configured kernel successfully', + config_end - config_start) +def build_tests(linux: kunit_kernel.LinuxSourceTree, + request: KunitBuildRequest) -> KunitResult: kunit_parser.print_with_timestamp('Building KUnit Kernel ...') build_start = time.time() @@ -64,78 +84,167 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree, build_end = time.time() if not success: return KunitResult(KunitStatus.BUILD_FAILURE, 'could not build kernel') + if not success: + return KunitResult(KunitStatus.BUILD_FAILURE, + 'could not build kernel', + build_end - build_start) + return KunitResult(KunitStatus.SUCCESS, + 'built kernel successfully', + build_end - build_start) +def exec_tests(linux: kunit_kernel.LinuxSourceTree, + request: KunitExecRequest) -> KunitResult: kunit_parser.print_with_timestamp('Starting KUnit Kernel ...') test_start = time.time() - kunit_output = linux.run_kernel( + result = linux.run_kernel( timeout=None if request.alltests else request.timeout, build_dir=request.build_dir) - if request.raw_output: - raw_output = kunit_parser.raw_output(kunit_output) - isolated = list(kunit_parser.isolate_kunit_output(raw_output)) - test_result = kunit_parser.parse_test_result(isolated) - else: - test_result = kunit_parser.parse_run_tests(kunit_output) + test_end = time.time() + return KunitResult(KunitStatus.SUCCESS, + result, + test_end - test_start) + +def parse_tests(request: KunitParseRequest) -> KunitResult: + parse_start = time.time() + + test_result = kunit_parser.TestResult(kunit_parser.TestStatus.SUCCESS, + [], + 'Tests not Parsed.') + if request.raw_output: + kunit_parser.raw_output(request.input_data) + else: + test_result = kunit_parser.parse_run_tests(request.input_data) + parse_end = time.time() + + if test_result.status != kunit_parser.TestStatus.SUCCESS: + return KunitResult(KunitStatus.TEST_FAILURE, test_result, + parse_end - parse_start) + + return KunitResult(KunitStatus.SUCCESS, test_result, + parse_end - parse_start) + + +def run_tests(linux: kunit_kernel.LinuxSourceTree, + request: KunitRequest) -> KunitResult: + run_start = time.time() + + config_request = KunitConfigRequest(request.build_dir, + request.defconfig, + request.make_options) + config_result = config_tests(linux, config_request) + if config_result.status != KunitStatus.SUCCESS: + return config_result + + build_request = KunitBuildRequest(request.jobs, request.build_dir, + request.alltests, + request.make_options) + build_result = build_tests(linux, build_request) + if build_result.status != KunitStatus.SUCCESS: + return build_result + + exec_request = KunitExecRequest(request.timeout, request.build_dir, + request.alltests) + exec_result = exec_tests(linux, exec_request) + if exec_result.status != KunitStatus.SUCCESS: + return exec_result + + parse_request = KunitParseRequest(request.raw_output, + exec_result.result) + parse_result = parse_tests(parse_request) + + run_end = time.time() + kunit_parser.print_with_timestamp(( 'Elapsed time: %.3fs total, %.3fs configuring, %.3fs ' + 'building, %.3fs running\n') % ( - test_end - config_start, - config_end - config_start, - build_end - build_start, - test_end - test_start)) + run_end - run_start, + config_result.elapsed_time, + build_result.elapsed_time, + exec_result.elapsed_time)) + return parse_result + +def add_common_opts(parser): + parser.add_argument('--build_dir', + help='As in the make command, it specifies the build ' + 'directory.', + type=str, default='', metavar='build_dir') + parser.add_argument('--make_options', + help='X=Y make option, can be repeated.', + action='append') + parser.add_argument('--alltests', + help='Run all KUnit tests through allyesconfig', + action='store_true') + +def add_config_opts(parser): + parser.add_argument('--defconfig', + help='Uses a default .kunitconfig.', + action='store_true') + +def add_build_opts(parser): + parser.add_argument('--jobs', + help='As in the make command, "Specifies the number of ' + 'jobs (commands) to run simultaneously."', + type=int, default=8, metavar='jobs') + +def add_exec_opts(parser): + parser.add_argument('--timeout', + help='maximum number of seconds to allow for all tests ' + 'to run. This does not include time taken to build the ' + 'tests.', + type=int, + default=300, + metavar='timeout') + +def add_parse_opts(parser): + parser.add_argument('--raw_output', help='don\'t format output from kernel', + action='store_true') - if test_result.status != kunit_parser.TestStatus.SUCCESS: - return KunitResult(KunitStatus.TEST_FAILURE, test_result) - else: - return KunitResult(KunitStatus.SUCCESS, test_result) def main(argv, linux=None): parser = argparse.ArgumentParser( description='Helps writing and running KUnit tests.') subparser = parser.add_subparsers(dest='subcommand') + # The 'run' command will config, build, exec, and parse in one go. run_parser = subparser.add_parser('run', help='Runs KUnit tests.') - run_parser.add_argument('--raw_output', help='don\'t format output from kernel', - action='store_true') + add_common_opts(run_parser) + add_config_opts(run_parser) + add_build_opts(run_parser) + add_exec_opts(run_parser) + add_parse_opts(run_parser) - run_parser.add_argument('--timeout', - help='maximum number of seconds to allow for all tests ' - 'to run. This does not include time taken to build the ' - 'tests.', - type=int, - default=300, - metavar='timeout') + config_parser = subparser.add_parser('config', + help='Ensures that .config contains all of ' + 'the options in .kunitconfig') + add_common_opts(config_parser) + add_config_opts(config_parser) - run_parser.add_argument('--jobs', - help='As in the make command, "Specifies the number of ' - 'jobs (commands) to run simultaneously."', - type=int, default=8, metavar='jobs') + build_parser = subparser.add_parser('build', help='Builds a kernel with KUnit tests') + add_common_opts(build_parser) + add_build_opts(build_parser) - run_parser.add_argument('--build_dir', - help='As in the make command, it specifies the build ' - 'directory.', - type=str, default='', metavar='build_dir') + exec_parser = subparser.add_parser('exec', help='Run a kernel with KUnit tests') + add_common_opts(exec_parser) + add_exec_opts(exec_parser) + add_parse_opts(exec_parser) - run_parser.add_argument('--defconfig', - help='Uses a default .kunitconfig.', - action='store_true') - - run_parser.add_argument('--alltests', - help='Run all KUnit tests through allyesconfig', - action='store_true') - - run_parser.add_argument('--make_options', - help='X=Y make option, can be repeated.', - action='append') + # The 'parse' option is special, as it doesn't need the kernel source + # (therefore there is no need for a build_dir, hence no add_common_opts) + # and the '--file' argument is not relevant to 'run', so isn't in + # add_parse_opts() + parse_parser = subparser.add_parser('parse', + help='Parses KUnit results from a file, ' + 'and parses formatted results.') + add_parse_opts(parse_parser) + parse_parser.add_argument('file', + help='Specifies the file to read results from.', + type=str, nargs='?', metavar='input_file') cli_args = parser.parse_args(argv) if cli_args.subcommand == 'run': - if get_kernel_root_path(): - os.chdir(get_kernel_root_path()) - if cli_args.build_dir: if not os.path.exists(cli_args.build_dir): os.mkdir(cli_args.build_dir) @@ -143,9 +252,6 @@ def main(argv, linux=None): cli_args.build_dir, kunit_kernel.kunitconfig_path) - if cli_args.defconfig: - create_default_kunitconfig() - if not linux: linux = kunit_kernel.LinuxSourceTree() @@ -159,6 +265,81 @@ def main(argv, linux=None): result = run_tests(linux, request) if result.status != KunitStatus.SUCCESS: sys.exit(1) + elif cli_args.subcommand == 'config': + if cli_args.build_dir: + if not os.path.exists(cli_args.build_dir): + os.mkdir(cli_args.build_dir) + kunit_kernel.kunitconfig_path = os.path.join( + cli_args.build_dir, + kunit_kernel.kunitconfig_path) + + if not linux: + linux = kunit_kernel.LinuxSourceTree() + + request = KunitConfigRequest(cli_args.build_dir, + cli_args.defconfig, + cli_args.make_options) + result = config_tests(linux, request) + kunit_parser.print_with_timestamp(( + 'Elapsed time: %.3fs\n') % ( + result.elapsed_time)) + if result.status != KunitStatus.SUCCESS: + sys.exit(1) + elif cli_args.subcommand == 'build': + if cli_args.build_dir: + if not os.path.exists(cli_args.build_dir): + os.mkdir(cli_args.build_dir) + kunit_kernel.kunitconfig_path = os.path.join( + cli_args.build_dir, + kunit_kernel.kunitconfig_path) + + if not linux: + linux = kunit_kernel.LinuxSourceTree() + + request = KunitBuildRequest(cli_args.jobs, + cli_args.build_dir, + cli_args.alltests, + cli_args.make_options) + result = build_tests(linux, request) + kunit_parser.print_with_timestamp(( + 'Elapsed time: %.3fs\n') % ( + result.elapsed_time)) + if result.status != KunitStatus.SUCCESS: + sys.exit(1) + elif cli_args.subcommand == 'exec': + if cli_args.build_dir: + if not os.path.exists(cli_args.build_dir): + os.mkdir(cli_args.build_dir) + kunit_kernel.kunitconfig_path = os.path.join( + cli_args.build_dir, + kunit_kernel.kunitconfig_path) + + if not linux: + linux = kunit_kernel.LinuxSourceTree() + + exec_request = KunitExecRequest(cli_args.timeout, + cli_args.build_dir, + cli_args.alltests) + exec_result = exec_tests(linux, exec_request) + parse_request = KunitParseRequest(cli_args.raw_output, + exec_result.result) + result = parse_tests(parse_request) + kunit_parser.print_with_timestamp(( + 'Elapsed time: %.3fs\n') % ( + exec_result.elapsed_time)) + if result.status != KunitStatus.SUCCESS: + sys.exit(1) + elif cli_args.subcommand == 'parse': + if cli_args.file == None: + kunit_output = sys.stdin + else: + with open(cli_args.file, 'r') as f: + kunit_output = f.read().splitlines() + request = KunitParseRequest(cli_args.raw_output, + kunit_output) + result = parse_tests(request) + if result.status != KunitStatus.SUCCESS: + sys.exit(1) else: parser.print_help() diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 984588d6ba95..5bb7b118ebd9 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -239,6 +239,24 @@ class KUnitMainTest(unittest.TestCase): self.print_patch.stop() pass + def test_config_passes_args_pass(self): + kunit.main(['config'], self.linux_source_mock) + assert self.linux_source_mock.build_reconfig.call_count == 1 + assert self.linux_source_mock.run_kernel.call_count == 0 + + def test_build_passes_args_pass(self): + kunit.main(['build'], self.linux_source_mock) + assert self.linux_source_mock.build_reconfig.call_count == 0 + self.linux_source_mock.build_um_kernel.assert_called_once_with(False, 8, '', None) + assert self.linux_source_mock.run_kernel.call_count == 0 + + def test_exec_passes_args_pass(self): + kunit.main(['exec'], self.linux_source_mock) + assert self.linux_source_mock.build_reconfig.call_count == 0 + assert self.linux_source_mock.run_kernel.call_count == 1 + self.linux_source_mock.run_kernel.assert_called_once_with(build_dir='', timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + def test_run_passes_args_pass(self): kunit.main(['run'], self.linux_source_mock) assert self.linux_source_mock.build_reconfig.call_count == 1 @@ -247,6 +265,13 @@ class KUnitMainTest(unittest.TestCase): build_dir='', timeout=300) self.print_mock.assert_any_call(StrContains('Testing complete.')) + def test_exec_passes_args_fail(self): + self.linux_source_mock.run_kernel = mock.Mock(return_value=[]) + with self.assertRaises(SystemExit) as e: + kunit.main(['exec'], self.linux_source_mock) + assert type(e.exception) == SystemExit + assert e.exception.code == 1 + def test_run_passes_args_fail(self): self.linux_source_mock.run_kernel = mock.Mock(return_value=[]) with self.assertRaises(SystemExit) as e: @@ -257,14 +282,28 @@ class KUnitMainTest(unittest.TestCase): assert self.linux_source_mock.run_kernel.call_count == 1 self.print_mock.assert_any_call(StrContains(' 0 tests run')) + def test_exec_raw_output(self): + self.linux_source_mock.run_kernel = mock.Mock(return_value=[]) + kunit.main(['exec', '--raw_output'], self.linux_source_mock) + assert self.linux_source_mock.run_kernel.call_count == 1 + for kall in self.print_mock.call_args_list: + assert kall != mock.call(StrContains('Testing complete.')) + assert kall != mock.call(StrContains(' 0 tests run')) + def test_run_raw_output(self): self.linux_source_mock.run_kernel = mock.Mock(return_value=[]) - with self.assertRaises(SystemExit) as e: - kunit.main(['run', '--raw_output'], self.linux_source_mock) - assert type(e.exception) == SystemExit - assert e.exception.code == 1 + kunit.main(['run', '--raw_output'], self.linux_source_mock) assert self.linux_source_mock.build_reconfig.call_count == 1 assert self.linux_source_mock.run_kernel.call_count == 1 + for kall in self.print_mock.call_args_list: + assert kall != mock.call(StrContains('Testing complete.')) + assert kall != mock.call(StrContains(' 0 tests run')) + + def test_exec_timeout(self): + timeout = 3453 + kunit.main(['exec', '--timeout', str(timeout)], self.linux_source_mock) + self.linux_source_mock.run_kernel.assert_called_once_with(build_dir='', timeout=timeout) + self.print_mock.assert_any_call(StrContains('Testing complete.')) def test_run_timeout(self): timeout = 3453 @@ -282,5 +321,21 @@ class KUnitMainTest(unittest.TestCase): build_dir=build_dir, timeout=300) self.print_mock.assert_any_call(StrContains('Testing complete.')) + def test_config_builddir(self): + build_dir = '.kunit' + kunit.main(['config', '--build_dir', build_dir], self.linux_source_mock) + assert self.linux_source_mock.build_reconfig.call_count == 1 + + def test_build_builddir(self): + build_dir = '.kunit' + kunit.main(['build', '--build_dir', build_dir], self.linux_source_mock) + self.linux_source_mock.build_um_kernel.assert_called_once_with(False, 8, build_dir, None) + + def test_exec_builddir(self): + build_dir = '.kunit' + kunit.main(['exec', '--build_dir', build_dir], self.linux_source_mock) + self.linux_source_mock.run_kernel.assert_called_once_with(build_dir=build_dir, timeout=300) + self.print_mock.assert_any_call(StrContains('Testing complete.')) + if __name__ == '__main__': unittest.main() From 4a995747049eea8ba61a3e356a1b7407e60efbd1 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 30 Apr 2020 11:00:50 -0700 Subject: [PATCH 100/574] dt-bindings: remoteproc: qcom: pas: Add SM8250 remoteprocs Add the SM8250 audio, compute and sensor remoteprocs to the PAS DT binding. Acked-by: Rob Herring Reviewed-by: Sibi Sankar Link: https://lore.kernel.org/r/20200430180051.3795305-1-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/remoteproc/qcom,adsp.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt index 22604d2cd3f8..54737024da20 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt @@ -22,6 +22,9 @@ on the Qualcomm ADSP Hexagon core. "qcom,sm8150-cdsp-pas" "qcom,sm8150-mpss-pas" "qcom,sm8150-slpi-pas" + "qcom,sm8250-adsp-pas" + "qcom,sm8250-cdsp-pas" + "qcom,sm8250-slpi-pas" - interrupts-extended: Usage: required @@ -45,6 +48,9 @@ on the Qualcomm ADSP Hexagon core. qcom,sm8150-adsp-pas: qcom,sm8150-cdsp-pas: qcom,sm8150-slpi-pas: + qcom,sm8250-adsp-pas: + qcom,sm8250-cdsp-pas: + qcom,sm8250-slpi-pas: must be "wdog", "fatal", "ready", "handover", "stop-ack" qcom,qcs404-wcss-pas: qcom,sc7180-mpss-pas: @@ -107,11 +113,14 @@ on the Qualcomm ADSP Hexagon core. qcom,sdm845-cdsp-pas: qcom,sm8150-adsp-pas: qcom,sm8150-cdsp-pas: + qcom,sm8250-cdsp-pas: must be "cx", "load_state" qcom,sc7180-mpss-pas: qcom,sm8150-mpss-pas: must be "cx", "load_state", "mss" + qcom,sm8250-adsp-pas: qcom,sm8150-slpi-pas: + qcom,sm8250-slpi-pas: must be "lcx", "lmx", "load_state" - memory-region: From f6da4831c55ae0c86c496e330f3b61ccbee1fcce Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 30 Apr 2020 11:00:51 -0700 Subject: [PATCH 101/574] remoteproc: qcom: pas: Add SM8250 PAS remoteprocs Add audio, compute and sensor DSP compatibles to the Qualcomm PAS binding and driver. Reviewed-by: Sibi Sankar Link: https://lore.kernel.org/r/20200430180051.3795305-2-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_pas.c | 62 ++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 5a33318e8c68..3bb69f58e086 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -508,6 +508,26 @@ static const struct adsp_data sm8150_adsp_resource = { .ssctl_id = 0x14, }; +static const struct adsp_data sm8250_adsp_resource = { + .crash_reason_smem = 423, + .firmware_name = "adsp.mdt", + .pas_id = 1, + .has_aggre2_clk = false, + .auto_boot = true, + .active_pd_names = (char*[]){ + "load_state", + NULL + }, + .proxy_pd_names = (char*[]){ + "lcx", + "lmx", + NULL + }, + .ssr_name = "lpass", + .sysmon_name = "adsp", + .ssctl_id = 0x14, +}; + static const struct adsp_data msm8998_adsp_resource = { .crash_reason_smem = 423, .firmware_name = "adsp.mdt", @@ -553,6 +573,25 @@ static const struct adsp_data sm8150_cdsp_resource = { .ssctl_id = 0x17, }; +static const struct adsp_data sm8250_cdsp_resource = { + .crash_reason_smem = 601, + .firmware_name = "cdsp.mdt", + .pas_id = 18, + .has_aggre2_clk = false, + .auto_boot = true, + .active_pd_names = (char*[]){ + "load_state", + NULL + }, + .proxy_pd_names = (char*[]){ + "cx", + NULL + }, + .ssr_name = "cdsp", + .sysmon_name = "cdsp", + .ssctl_id = 0x17, +}; + static const struct adsp_data mpss_resource_init = { .crash_reason_smem = 421, .firmware_name = "modem.mdt", @@ -604,6 +643,26 @@ static const struct adsp_data sm8150_slpi_resource = { .ssctl_id = 0x16, }; +static const struct adsp_data sm8250_slpi_resource = { + .crash_reason_smem = 424, + .firmware_name = "slpi.mdt", + .pas_id = 12, + .has_aggre2_clk = false, + .auto_boot = true, + .active_pd_names = (char*[]){ + "load_state", + NULL + }, + .proxy_pd_names = (char*[]){ + "lcx", + "lmx", + NULL + }, + .ssr_name = "dsps", + .sysmon_name = "slpi", + .ssctl_id = 0x16, +}; + static const struct adsp_data msm8998_slpi_resource = { .crash_reason_smem = 424, .firmware_name = "slpi.mdt", @@ -645,6 +704,9 @@ static const struct of_device_id adsp_of_match[] = { { .compatible = "qcom,sm8150-cdsp-pas", .data = &sm8150_cdsp_resource}, { .compatible = "qcom,sm8150-mpss-pas", .data = &mpss_resource_init}, { .compatible = "qcom,sm8150-slpi-pas", .data = &sm8150_slpi_resource}, + { .compatible = "qcom,sm8250-adsp-pas", .data = &sm8250_adsp_resource}, + { .compatible = "qcom,sm8250-cdsp-pas", .data = &sm8250_cdsp_resource}, + { .compatible = "qcom,sm8250-slpi-pas", .data = &sm8250_slpi_resource}, { }, }; MODULE_DEVICE_TABLE(of, adsp_of_match); From e62e3acd61d36b07878cd33a868a5797fe1e25b5 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Wed, 15 Apr 2020 20:21:09 +0530 Subject: [PATCH 102/574] dt-bindings: remoteproc: qcom: Replace halt-nav with spare-regs 7C retail devices using MSA based boot will result in a fuse combination which will prevent accesses to MSS PERPH register space where the mpss clocks and halt-nav reside. However accesses to conn_box_spare0 in TCSR register space is still permitted so rename the binding appropriately to qcom,spare-regs and drop all accesses to the MPSS PERPH register space. Tested-by: Evan Green Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200415145110.20624-2-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/remoteproc/qcom,q6v5.txt | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt index ea2869c3210f..6451cf4a7b7b 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt @@ -79,7 +79,7 @@ on the Qualcomm Hexagon core. "snoc_axi", "mnoc_axi", "qdss" qcom,sc7180-mss-pil: must be "iface", "bus", "xo", "snoc_axi", "mnoc_axi", - "mss_crypto", "mss_nav", "nav" + "nav" qcom,sdm845-mss-pil: must be "iface", "bus", "mem", "xo", "gpll0_mss", "snoc_axi", "mnoc_axi", "prng" @@ -181,16 +181,12 @@ For the compatible string below the following supplies are required: For the compatible strings below the following phandle references are required: "qcom,sc7180-mss-pil" -- qcom,halt-nav-regs: +- qcom,spare-regs: Usage: required Value type: - Definition: reference to a list of 2 phandles with one offset each for - the modem sub-system running on SC7180 SoC. The first - phandle reference is to the mss clock node followed by the - offset within register space for nav halt register. The - second phandle reference is to a syscon representing TCSR - followed by the offset within syscon for conn_box_spare0 - register. + Definition: a phandle reference to a syscon representing TCSR followed + by the offset within syscon for conn_box_spare0 register + used by the modem sub-system running on SC7180 SoC. = SUBNODES: The Hexagon node must contain two subnodes, named "mba" and "mpss" representing From a9fdc79d488623d36341f0f3d08f5aa1bedb9d53 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Wed, 15 Apr 2020 20:21:10 +0530 Subject: [PATCH 103/574] remoteproc: qcom_q6v5_mss: Drop accesses to MPSS PERPH register space 7C retail devices using MSA based boot will result in a fuse combination which will prevent accesses to MSS PERPH register space where the mpss clocks and halt-nav reside. So drop all accesses to the MPSS PERPH register space. Issuing HALT NAV request and turning on the mss clocks as part of SSR will no longer be required since the modem firmware will have the necessary fixes to ensure that there are no pending NAV DMA transactions. Tested-by: Evan Green Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200415145110.20624-3-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_mss.c | 102 +++++------------------------ 1 file changed, 18 insertions(+), 84 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index 3a7fcd6485eb..1ded14253a83 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -69,13 +69,9 @@ #define AXI_HALTREQ_REG 0x0 #define AXI_HALTACK_REG 0x4 #define AXI_IDLE_REG 0x8 -#define NAV_AXI_HALTREQ_BIT BIT(0) -#define NAV_AXI_HALTACK_BIT BIT(1) -#define NAV_AXI_IDLE_BIT BIT(2) #define AXI_GATING_VALID_OVERRIDE BIT(0) #define HALT_ACK_TIMEOUT_US 100000 -#define NAV_HALT_ACK_TIMEOUT_US 200 /* QDSP6SS_RESET */ #define Q6SS_STOP_CORE BIT(0) @@ -143,7 +139,7 @@ struct rproc_hexagon_res { int version; bool need_mem_protection; bool has_alt_reset; - bool has_halt_nav; + bool has_spare_reg; }; struct q6v5 { @@ -154,13 +150,11 @@ struct q6v5 { void __iomem *rmb_base; struct regmap *halt_map; - struct regmap *halt_nav_map; struct regmap *conn_map; u32 halt_q6; u32 halt_modem; u32 halt_nc; - u32 halt_nav; u32 conn_box; struct reset_control *mss_restart; @@ -206,7 +200,7 @@ struct q6v5 { struct qcom_sysmon *sysmon; bool need_mem_protection; bool has_alt_reset; - bool has_halt_nav; + bool has_spare_reg; int mpss_perm; int mba_perm; const char *hexagon_mdt_image; @@ -427,21 +421,19 @@ static int q6v5_reset_assert(struct q6v5 *qproc) reset_control_assert(qproc->pdc_reset); ret = reset_control_reset(qproc->mss_restart); reset_control_deassert(qproc->pdc_reset); - } else if (qproc->has_halt_nav) { + } else if (qproc->has_spare_reg) { /* * When the AXI pipeline is being reset with the Q6 modem partly * operational there is possibility of AXI valid signal to * glitch, leading to spurious transactions and Q6 hangs. A work * around is employed by asserting the AXI_GATING_VALID_OVERRIDE - * BIT before triggering Q6 MSS reset. Both the HALTREQ and - * AXI_GATING_VALID_OVERRIDE are withdrawn post MSS assert - * followed by a MSS deassert, while holding the PDC reset. + * BIT before triggering Q6 MSS reset. AXI_GATING_VALID_OVERRIDE + * is withdrawn post MSS assert followed by a MSS deassert, + * while holding the PDC reset. */ reset_control_assert(qproc->pdc_reset); regmap_update_bits(qproc->conn_map, qproc->conn_box, AXI_GATING_VALID_OVERRIDE, 1); - regmap_update_bits(qproc->halt_nav_map, qproc->halt_nav, - NAV_AXI_HALTREQ_BIT, 0); reset_control_assert(qproc->mss_restart); reset_control_deassert(qproc->pdc_reset); regmap_update_bits(qproc->conn_map, qproc->conn_box, @@ -464,7 +456,7 @@ static int q6v5_reset_deassert(struct q6v5 *qproc) ret = reset_control_reset(qproc->mss_restart); writel(0, qproc->rmb_base + RMB_MBA_ALT_RESET); reset_control_deassert(qproc->pdc_reset); - } else if (qproc->has_halt_nav) { + } else if (qproc->has_spare_reg) { ret = reset_control_reset(qproc->mss_restart); } else { ret = reset_control_deassert(qproc->mss_restart); @@ -761,32 +753,6 @@ static void q6v5proc_halt_axi_port(struct q6v5 *qproc, regmap_write(halt_map, offset + AXI_HALTREQ_REG, 0); } -static void q6v5proc_halt_nav_axi_port(struct q6v5 *qproc, - struct regmap *halt_map, - u32 offset) -{ - unsigned int val; - int ret; - - /* Check if we're already idle */ - ret = regmap_read(halt_map, offset, &val); - if (!ret && (val & NAV_AXI_IDLE_BIT)) - return; - - /* Assert halt request */ - regmap_update_bits(halt_map, offset, NAV_AXI_HALTREQ_BIT, - NAV_AXI_HALTREQ_BIT); - - /* Wait for halt ack*/ - regmap_read_poll_timeout(halt_map, offset, val, - (val & NAV_AXI_HALTACK_BIT), - 5, NAV_HALT_ACK_TIMEOUT_US); - - ret = regmap_read(halt_map, offset, &val); - if (ret || !(val & NAV_AXI_IDLE_BIT)) - dev_err(qproc->dev, "port failed halt\n"); -} - static int q6v5_mpss_init_image(struct q6v5 *qproc, const struct firmware *fw) { unsigned long dma_attrs = DMA_ATTR_FORCE_CONTIGUOUS; @@ -951,9 +917,6 @@ static int q6v5_mba_load(struct q6v5 *qproc) halt_axi_ports: q6v5proc_halt_axi_port(qproc, qproc->halt_map, qproc->halt_q6); q6v5proc_halt_axi_port(qproc, qproc->halt_map, qproc->halt_modem); - if (qproc->has_halt_nav) - q6v5proc_halt_nav_axi_port(qproc, qproc->halt_nav_map, - qproc->halt_nav); q6v5proc_halt_axi_port(qproc, qproc->halt_map, qproc->halt_nc); reclaim_mba: @@ -1001,9 +964,6 @@ static void q6v5_mba_reclaim(struct q6v5 *qproc) q6v5proc_halt_axi_port(qproc, qproc->halt_map, qproc->halt_q6); q6v5proc_halt_axi_port(qproc, qproc->halt_map, qproc->halt_modem); - if (qproc->has_halt_nav) - q6v5proc_halt_nav_axi_port(qproc, qproc->halt_nav_map, - qproc->halt_nav); q6v5proc_halt_axi_port(qproc, qproc->halt_map, qproc->halt_nc); if (qproc->version == MSS_MSM8996) { /* @@ -1434,36 +1394,12 @@ static int q6v5_init_mem(struct q6v5 *qproc, struct platform_device *pdev) qproc->halt_modem = args.args[1]; qproc->halt_nc = args.args[2]; - if (qproc->has_halt_nav) { - struct platform_device *nav_pdev; - + if (qproc->has_spare_reg) { ret = of_parse_phandle_with_fixed_args(pdev->dev.of_node, - "qcom,halt-nav-regs", + "qcom,spare-regs", 1, 0, &args); if (ret < 0) { - dev_err(&pdev->dev, "failed to parse halt-nav-regs\n"); - return -EINVAL; - } - - nav_pdev = of_find_device_by_node(args.np); - of_node_put(args.np); - if (!nav_pdev) { - dev_err(&pdev->dev, "failed to get mss clock device\n"); - return -EPROBE_DEFER; - } - - qproc->halt_nav_map = dev_get_regmap(&nav_pdev->dev, NULL); - if (!qproc->halt_nav_map) { - dev_err(&pdev->dev, "failed to get map from device\n"); - return -EINVAL; - } - qproc->halt_nav = args.args[0]; - - ret = of_parse_phandle_with_fixed_args(pdev->dev.of_node, - "qcom,halt-nav-regs", - 1, 1, &args); - if (ret < 0) { - dev_err(&pdev->dev, "failed to parse halt-nav-regs\n"); + dev_err(&pdev->dev, "failed to parse spare-regs\n"); return -EINVAL; } @@ -1549,7 +1485,7 @@ static int q6v5_init_reset(struct q6v5 *qproc) return PTR_ERR(qproc->mss_restart); } - if (qproc->has_alt_reset || qproc->has_halt_nav) { + if (qproc->has_alt_reset || qproc->has_spare_reg) { qproc->pdc_reset = devm_reset_control_get_exclusive(qproc->dev, "pdc_reset"); if (IS_ERR(qproc->pdc_reset)) { @@ -1697,7 +1633,7 @@ static int q6v5_probe(struct platform_device *pdev) platform_set_drvdata(pdev, qproc); - qproc->has_halt_nav = desc->has_halt_nav; + qproc->has_spare_reg = desc->has_spare_reg; ret = q6v5_init_mem(qproc, pdev); if (ret) goto free_rproc; @@ -1839,8 +1775,6 @@ static const struct rproc_hexagon_res sc7180_mss = { .active_clk_names = (char*[]){ "mnoc_axi", "nav", - "mss_nav", - "mss_crypto", NULL }, .active_pd_names = (char*[]){ @@ -1855,7 +1789,7 @@ static const struct rproc_hexagon_res sc7180_mss = { }, .need_mem_protection = true, .has_alt_reset = false, - .has_halt_nav = true, + .has_spare_reg = true, .version = MSS_SC7180, }; @@ -1890,7 +1824,7 @@ static const struct rproc_hexagon_res sdm845_mss = { }, .need_mem_protection = true, .has_alt_reset = true, - .has_halt_nav = false, + .has_spare_reg = false, .version = MSS_SDM845, }; @@ -1917,7 +1851,7 @@ static const struct rproc_hexagon_res msm8998_mss = { }, .need_mem_protection = true, .has_alt_reset = false, - .has_halt_nav = false, + .has_spare_reg = false, .version = MSS_MSM8998, }; @@ -1947,7 +1881,7 @@ static const struct rproc_hexagon_res msm8996_mss = { }, .need_mem_protection = true, .has_alt_reset = false, - .has_halt_nav = false, + .has_spare_reg = false, .version = MSS_MSM8996, }; @@ -1980,7 +1914,7 @@ static const struct rproc_hexagon_res msm8916_mss = { }, .need_mem_protection = false, .has_alt_reset = false, - .has_halt_nav = false, + .has_spare_reg = false, .version = MSS_MSM8916, }; @@ -2021,7 +1955,7 @@ static const struct rproc_hexagon_res msm8974_mss = { }, .need_mem_protection = false, .has_alt_reset = false, - .has_halt_nav = false, + .has_spare_reg = false, .version = MSS_MSM8974, }; From be050a3429f46ecf13eb2b80f299479f8bb823fb Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Wed, 15 Apr 2020 12:46:18 +0530 Subject: [PATCH 104/574] remoteproc: qcom_q6v5_mss: map/unmap mpss segments before/after use The application processor accessing the mpss region when the Q6 modem is running will lead to an XPU violation. Fix this by un-mapping the mpss segments post copy during mpss authentication and coredumps. Tested-by: Evan Green Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200415071619.6052-1-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_mss.c | 31 +++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index 1ded14253a83..30652b381c2c 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -1116,7 +1116,13 @@ static int q6v5_mpss_load(struct q6v5 *qproc) goto release_firmware; } - ptr = qproc->mpss_region + offset; + ptr = ioremap_wc(qproc->mpss_phys + offset, phdr->p_memsz); + if (!ptr) { + dev_err(qproc->dev, + "unable to map memory region: %pa+%zx-%x\n", + &qproc->mpss_phys, offset, phdr->p_memsz); + goto release_firmware; + } if (phdr->p_filesz && phdr->p_offset < fw->size) { /* Firmware is large enough to be non-split */ @@ -1125,6 +1131,7 @@ static int q6v5_mpss_load(struct q6v5 *qproc) "failed to load segment %d from truncated file %s\n", i, fw_name); ret = -EINVAL; + iounmap(ptr); goto release_firmware; } @@ -1135,6 +1142,7 @@ static int q6v5_mpss_load(struct q6v5 *qproc) ret = request_firmware(&seg_fw, fw_name, qproc->dev); if (ret) { dev_err(qproc->dev, "failed to load %s\n", fw_name); + iounmap(ptr); goto release_firmware; } @@ -1147,6 +1155,7 @@ static int q6v5_mpss_load(struct q6v5 *qproc) memset(ptr + phdr->p_filesz, 0, phdr->p_memsz - phdr->p_filesz); } + iounmap(ptr); size += phdr->p_memsz; code_length = readl(qproc->rmb_base + RMB_PMI_CODE_LENGTH_REG); @@ -1196,7 +1205,8 @@ static void qcom_q6v5_dump_segment(struct rproc *rproc, int ret = 0; struct q6v5 *qproc = rproc->priv; unsigned long mask = BIT((unsigned long)segment->priv); - void *ptr = rproc_da_to_va(rproc, segment->da, segment->size); + int offset = segment->da - qproc->mpss_reloc; + void *ptr = NULL; /* Unlock mba before copying segments */ if (!qproc->dump_mba_loaded) { @@ -1210,10 +1220,15 @@ static void qcom_q6v5_dump_segment(struct rproc *rproc, } } - if (!ptr || ret) - memset(dest, 0xff, segment->size); - else + if (!ret) + ptr = ioremap_wc(qproc->mpss_phys + offset, segment->size); + + if (ptr) { memcpy(dest, ptr, segment->size); + iounmap(ptr); + } else { + memset(dest, 0xff, segment->size); + } qproc->dump_segment_mask |= mask; @@ -1548,12 +1563,6 @@ static int q6v5_alloc_memory_region(struct q6v5 *qproc) qproc->mpss_phys = qproc->mpss_reloc = r.start; qproc->mpss_size = resource_size(&r); - qproc->mpss_region = devm_ioremap_wc(qproc->dev, qproc->mpss_phys, qproc->mpss_size); - if (!qproc->mpss_region) { - dev_err(qproc->dev, "unable to map memory region: %pa+%zx\n", - &r.start, qproc->mpss_size); - return -EBUSY; - } return 0; } From 9666174a4e1a7b1e32c214312678f8452275da6a Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Wed, 15 Apr 2020 12:46:19 +0530 Subject: [PATCH 105/574] remoteproc: qcom_q6v5_mss: Remove unused q6v5_da_to_va function Remove unsed q6v5_da_to_va function as the mss driver uses a per segment dump function. Tested-by: Evan Green Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200415071619.6052-2-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_mss.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index 30652b381c2c..7ca205c242d2 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -190,7 +190,6 @@ struct q6v5 { phys_addr_t mpss_phys; phys_addr_t mpss_reloc; - void *mpss_region; size_t mpss_size; struct qcom_rproc_glink glink_subdev; @@ -1302,18 +1301,6 @@ static int q6v5_stop(struct rproc *rproc) return 0; } -static void *q6v5_da_to_va(struct rproc *rproc, u64 da, size_t len) -{ - struct q6v5 *qproc = rproc->priv; - int offset; - - offset = da - qproc->mpss_reloc; - if (offset < 0 || offset + len > qproc->mpss_size) - return NULL; - - return qproc->mpss_region + offset; -} - static int qcom_q6v5_register_dump_segments(struct rproc *rproc, const struct firmware *mba_fw) { @@ -1361,7 +1348,6 @@ static int qcom_q6v5_register_dump_segments(struct rproc *rproc, static const struct rproc_ops q6v5_ops = { .start = q6v5_start, .stop = q6v5_stop, - .da_to_va = q6v5_da_to_va, .parse_fw = qcom_q6v5_register_dump_segments, .load = q6v5_load, }; From a781e5aa59110d002a56bd41a397c0c8892f0609 Mon Sep 17 00:00:00 2001 From: Rishabh Bhatnagar Date: Wed, 29 Apr 2020 11:04:42 -0700 Subject: [PATCH 106/574] remoteproc: core: Prevent system suspend during remoteproc recovery The system might go into suspend during recovery of any remoteproc. This will interrupt the recovery process in between increasing the recovery time. Make the platform device as wakeup capable and use pm_stay_wake/pm_relax APIs to avoid system from going into suspend during recovery. Signed-off-by: Siddharth Gupta Signed-off-by: Rishabh Bhatnagar Acked-by: Mathieu Poirier Link: https://lore.kernel.org/r/1588183482-21146-1-git-send-email-rishabhb@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_pas.c | 2 ++ drivers/remoteproc/remoteproc_core.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 3bb69f58e086..61791a03f648 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -407,6 +407,8 @@ static int adsp_probe(struct platform_device *pdev) adsp->has_aggre2_clk = desc->has_aggre2_clk; platform_set_drvdata(pdev, adsp); + device_wakeup_enable(adsp->dev); + ret = adsp_alloc_memory_region(adsp); if (ret) goto free_rproc; diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 4bd0f45a00c0..7350161a3ed0 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1766,6 +1766,8 @@ static void rproc_crash_handler_work(struct work_struct *work) if (!rproc->recovery_disabled) rproc_trigger_recovery(rproc); + + pm_relax(rproc->dev.parent); } /** @@ -2353,6 +2355,9 @@ void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type) return; } + /* Prevent suspend while the remoteproc is being recovered */ + pm_stay_awake(rproc->dev.parent); + dev_err(&rproc->dev, "crash detected in %s: type %s\n", rproc->name, rproc_crash_to_string(type)); From 8a226e2c71bb3763e27a063d36eac5fa4ea53c3f Mon Sep 17 00:00:00 2001 From: Sivaprakash Murugesan Date: Fri, 1 May 2020 21:58:12 +0530 Subject: [PATCH 107/574] remoteproc: wcss: add support for rpmsg communication add glink and ssr subdevices for wcss rproc to enable rpmsg communication. Signed-off-by: Sivaprakash Murugesan Link: https://lore.kernel.org/r/1588350492-4663-1-git-send-email-sivaprak@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_wcss.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index f1924b740a10..48d16d81f94d 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -91,6 +91,9 @@ struct q6v5_wcss { phys_addr_t mem_reloc; void *mem_region; size_t mem_size; + + struct qcom_rproc_glink glink_subdev; + struct qcom_rproc_ssr ssr_subdev; }; static int q6v5_wcss_reset(struct q6v5_wcss *wcss) @@ -557,6 +560,9 @@ static int q6v5_wcss_probe(struct platform_device *pdev) if (ret) goto free_rproc; + qcom_add_glink_subdev(rproc, &wcss->glink_subdev); + qcom_add_ssr_subdev(rproc, &wcss->ssr_subdev, "q6wcss"); + ret = rproc_add(rproc); if (ret) goto free_rproc; From 529798bae7c155d38eec211436df736349dca2ee Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:19:43 -0500 Subject: [PATCH 108/574] remoteproc: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507191943.GA16033@embeddedor Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 0468be4d02f4..e7b7bab8b235 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -73,7 +73,7 @@ struct resource_table { u32 ver; u32 num; u32 reserved[2]; - u32 offset[0]; + u32 offset[]; } __packed; /** @@ -87,7 +87,7 @@ struct resource_table { */ struct fw_rsc_hdr { u32 type; - u8 data[0]; + u8 data[]; } __packed; /** @@ -306,7 +306,7 @@ struct fw_rsc_vdev { u8 status; u8 num_of_vrings; u8 reserved[2]; - struct fw_rsc_vdev_vring vring[0]; + struct fw_rsc_vdev_vring vring[]; } __packed; struct rproc; From 4f05fc33bebdc7d69259c412dd21d09751827dbd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 14:19:48 -0500 Subject: [PATCH 109/574] rpmsg: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507191948.GA16053@embeddedor Signed-off-by: Bjorn Andersson --- drivers/rpmsg/virtio_rpmsg_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 376ebbf880d6..07d4f3374098 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -89,7 +89,7 @@ struct rpmsg_hdr { u32 reserved; u16 len; u16 flags; - u8 data[0]; + u8 data[]; } __packed; /** From db9178a4f8c4e523f824892cb8bab00961b07385 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 20 Apr 2020 11:05:59 -0500 Subject: [PATCH 110/574] remoteproc: Fall back to using parent memory pool if no dedicated available In some cases, like with OMAP remoteproc, we are not creating dedicated memory pool for the virtio device. Instead, we use the same memory pool for all shared memories. The current virtio memory pool handling forces a split between these two, as a separate device is created for it, causing memory to be allocated from bad location if the dedicated pool is not available. Fix this by falling back to using the parent device memory pool if dedicated is not available. Cc: stable@vger.kernel.org Reviewed-by: Mathieu Poirier Acked-by: Arnaud Pouliquen Fixes: 086d08725d34 ("remoteproc: create vdev subdevice with specific dma memory pool") Signed-off-by: Tero Kristo Signed-off-by: Suman Anna Link: https://lore.kernel.org/r/20200420160600.10467-2-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_virtio.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c index 15ec5fc65257..dfd3808c34fd 100644 --- a/drivers/remoteproc/remoteproc_virtio.c +++ b/drivers/remoteproc/remoteproc_virtio.c @@ -375,6 +375,18 @@ int rproc_add_virtio_dev(struct rproc_vdev *rvdev, int id) goto out; } } + } else { + struct device_node *np = rproc->dev.parent->of_node; + + /* + * If we don't have dedicated buffer, just attempt to re-assign + * the reserved memory from our parent. A default memory-region + * at index 0 from the parent's memory-regions is assigned for + * the rvdev dev to allocate from. Failure is non-critical and + * the allocations will fall back to global pools, so don't + * check return value either. + */ + of_reserved_mem_device_init_by_idx(dev, np, 0); } /* Allocate virtio device */ From c774ad010873bb89dcc0cdcb1e96aef6664d8caf Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Mon, 20 Apr 2020 11:06:00 -0500 Subject: [PATCH 111/574] remoteproc: Fix and restore the parenting hierarchy for vdev The commit 086d08725d34 ("remoteproc: create vdev subdevice with specific dma memory pool") has introduced a new vdev subdevice for each vdev declared in the firmware resource table and made it as the parent for the created virtio rpmsg devices instead of the previous remoteproc device. This changed the overall parenting hierarchy for the rpmsg devices, which were children of virtio devices, and does not allow the corresponding rpmsg drivers to retrieve the parent rproc device through the rproc_get_by_child() API. Fix this by restoring the remoteproc device as the parent. The new vdev subdevice can continue to inherit the DMA attributes from the remoteproc's parent device (actual platform device). Cc: stable@vger.kernel.org Fixes: 086d08725d34 ("remoteproc: create vdev subdevice with specific dma memory pool") Signed-off-by: Suman Anna Reviewed-by: Mathieu Poirier Acked-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20200420160600.10467-3-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 7350161a3ed0..9c6d9ff7a1fc 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -517,7 +517,7 @@ static int rproc_handle_vdev(struct rproc *rproc, struct fw_rsc_vdev *rsc, /* Initialise vdev subdevice */ snprintf(name, sizeof(name), "vdev%dbuffer", rvdev->index); - rvdev->dev.parent = rproc->dev.parent; + rvdev->dev.parent = &rproc->dev; rvdev->dev.dma_pfn_offset = rproc->dev.parent->dma_pfn_offset; rvdev->dev.release = rproc_rvdev_release; dev_set_name(&rvdev->dev, "%s#%s", dev_name(rvdev->dev.parent), name); From cfcccbe8879f79bc9f8a162bcb482c74b8768094 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 8 May 2020 21:56:45 -0400 Subject: [PATCH 112/574] iommu/amd: Fix variable "iommu" set but not used The commit dce8d6964ebd ("iommu/amd: Convert to probe/release_device() call-backs") introduced an unused variable, drivers/iommu/amd_iommu.c: In function 'amd_iommu_uninit_device': drivers/iommu/amd_iommu.c:422:20: warning: variable 'iommu' set but not used [-Wunused-but-set-variable] struct amd_iommu *iommu; ^~~~~ Signed-off-by: Qian Cai Link: https://lore.kernel.org/r/20200509015645.3236-1-cai@lca.pw Fixes: dce8d6964ebd ("iommu/amd: Convert to probe/release_device() call-backs") Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c30367413683..1880811cec33 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -397,15 +397,12 @@ static void iommu_ignore_device(struct device *dev) static void amd_iommu_uninit_device(struct device *dev) { struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; int devid; devid = get_device_id(dev); if (devid < 0) return; - iommu = amd_iommu_rlookup_table[devid]; - dev_data = search_dev_data(devid); if (!dev_data) return; From f38338cf0691b5fae5f9a46d188eef92ab9e6296 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 11 May 2020 18:10:00 +0200 Subject: [PATCH 113/574] iommu: Do not probe devices on IOMMU-less busses The host1x bus implemented on Tegra SoCs is primarily an abstraction to create logical device from multiple platform devices. Since the devices in such a setup are typically hierarchical, DMA setup still needs to be done so that DMA masks can be properly inherited, but we don't actually want to attach the host1x logical devices to any IOMMU. The platform devices that make up the logical device are responsible for memory bus transactions, so it is them that will need to be attached to the IOMMU. Add a check to __iommu_probe_device() that aborts IOMMU setup early for busses that don't have the IOMMU operations pointer set since they will cause a crash otherwise. Signed-off-by: Thierry Reding Link: https://lore.kernel.org/r/20200511161000.3853342-1-thierry.reding@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a9e5618cde80..9d1d917e1050 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -195,6 +195,9 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list struct iommu_group *group; int ret; + if (!ops) + return -ENODEV; + if (!dev_iommu_get(dev)) return -ENOMEM; From 2ba20b5a5b8f948d498f4b38ba910ff339e609ef Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 9 May 2020 00:02:16 +0200 Subject: [PATCH 114/574] iommu/renesas: Fix unused-function warning gcc warns because the only reference to ipmmu_find_group is inside of an #ifdef: drivers/iommu/ipmmu-vmsa.c:878:28: error: 'ipmmu_find_group' defined but not used [-Werror=unused-function] Change the #ifdef to an equivalent IS_ENABLED(). Fixes: 6580c8a78424 ("iommu/renesas: Convert to probe/release_device() call-backs") Signed-off-by: Arnd Bergmann Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20200508220224.688985-1-arnd@arndb.de Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index fb7e702dee23..4c2972f3153b 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -903,11 +903,8 @@ static const struct iommu_ops ipmmu_ops = { .probe_device = ipmmu_probe_device, .release_device = ipmmu_release_device, .probe_finalize = ipmmu_probe_finalize, -#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA) - .device_group = generic_device_group, -#else - .device_group = ipmmu_find_group, -#endif + .device_group = IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA) + ? generic_device_group : ipmmu_find_group, .pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K, .of_xlate = ipmmu_of_xlate, }; From 3a0ce12e3b8e3cb7d54569a42aec743cc93f4f0d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 May 2020 19:18:03 +0300 Subject: [PATCH 115/574] iommu/iova: Unify format of the printed messages Unify format of the printed messages, i.e. replace printk(LEVEL ... ) with pr_level(...). Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200507161804.13275-2-andriy.shevchenko@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 0e6a9536eca6..49fc01f2a28d 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -253,7 +253,7 @@ int iova_cache_get(void) SLAB_HWCACHE_ALIGN, NULL); if (!iova_cache) { mutex_unlock(&iova_cache_mutex); - printk(KERN_ERR "Couldn't create iova cache\n"); + pr_err("Couldn't create iova cache\n"); return -ENOMEM; } } @@ -718,8 +718,8 @@ copy_reserved_iova(struct iova_domain *from, struct iova_domain *to) new_iova = reserve_iova(to, iova->pfn_lo, iova->pfn_hi); if (!new_iova) - printk(KERN_ERR "Reserve iova range %lx@%lx failed\n", - iova->pfn_lo, iova->pfn_lo); + pr_err("Reserve iova range %lx@%lx failed\n", + iova->pfn_lo, iova->pfn_lo); } spin_unlock_irqrestore(&from->iova_rbtree_lock, flags); } From c4e0f3b240041bf504e3a25e300eadb88f6a68eb Mon Sep 17 00:00:00 2001 From: Samuel Zou Date: Tue, 12 May 2020 10:17:19 +0800 Subject: [PATCH 116/574] iommu/msm: Make msm_iommu_lock static Fix the following sparse warning: drivers/iommu/msm_iommu.c:37:1: warning: symbol 'msm_iommu_lock' was not declared. The msm_iommu_lock has only call site within msm_iommu.c It should be static Fixes: 0720d1f052dc ("msm: Add MSM IOMMU support") Reported-by: Hulk Robot Signed-off-by: Samuel Zou Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/1589249839-105820-1-git-send-email-zou_wei@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/msm_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 94a6df1bddd6..83414e226c9d 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -34,7 +34,7 @@ __asm__ __volatile__ ( \ /* bitmap of the page sizes currently supported */ #define MSM_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M) -DEFINE_SPINLOCK(msm_iommu_lock); +static DEFINE_SPINLOCK(msm_iommu_lock); static LIST_HEAD(qcom_iommu_devices); static struct iommu_ops msm_iommu_ops; From 327d5b2fee91c404a3956c324193892cf2cc9528 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 6 May 2020 09:59:45 +0800 Subject: [PATCH 117/574] iommu/vt-d: Allow 32bit devices to uses DMA domain Currently, if a 32bit device initially uses an identity domain, Intel IOMMU driver will convert it forcibly to a DMA one if its address capability is not enough for the whole system memory. The motivation was to overcome the overhead caused by possible bounced buffer. Unfortunately, this improvement has led to many problems. For example, some 32bit devices are required to use an identity domain, forcing them to use DMA domain will cause the device not to work anymore. On the other hand, the VMD sub-devices share a domain but each sub-device might have different address capability. Forcing a VMD sub-device to use DMA domain blindly will impact the operation of other sub-devices without any notification. Further more, PCI aliased devices (PCI bridge and all devices beneath it, VMD devices and various devices quirked with pci_add_dma_alias()) must use the same domain. Forcing one device to switch to DMA domain during runtime will cause in-fligh DMAs for other devices to abort or target to other memory which might cause undefind system behavior. With the last private domain usage in iommu_need_mapping() removed, all private domain helpers are also cleaned in this patch. Otherwise, the compiler will complain that some functions are defined but not used. Signed-off-by: Lu Baolu Tested-by: Daniel Drake Reviewed-by: Jon Derrick Reviewed-by: Jerry Snitselaar Cc: Daniel Drake Cc: Derrick Jonathan Cc: Jerry Snitselaar Link: https://lore.kernel.org/r/20200506015947.28662-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 291 +----------------------------------- 1 file changed, 1 insertion(+), 290 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 34e08fa2ce3a..16ba7add0f72 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -355,11 +355,6 @@ static void domain_exit(struct dmar_domain *domain); static void domain_remove_dev_info(struct dmar_domain *domain); static void dmar_remove_one_dev_info(struct device *dev); static void __dmar_remove_one_dev_info(struct device_domain_info *info); -static void domain_context_clear(struct intel_iommu *iommu, - struct device *dev); -static int domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu); -static bool device_is_rmrr_locked(struct device *dev); static int intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev); static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, @@ -1930,65 +1925,6 @@ static inline int guestwidth_to_adjustwidth(int gaw) return agaw; } -static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, - int guest_width) -{ - int adjust_width, agaw; - unsigned long sagaw; - int ret; - - init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - - if (!intel_iommu_strict) { - ret = init_iova_flush_queue(&domain->iovad, - iommu_flush_iova, iova_entry_free); - if (ret) - pr_info("iova flush queue initialization failed\n"); - } - - domain_reserve_special_ranges(domain); - - /* calculate AGAW */ - if (guest_width > cap_mgaw(iommu->cap)) - guest_width = cap_mgaw(iommu->cap); - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - agaw = width_to_agaw(adjust_width); - sagaw = cap_sagaw(iommu->cap); - if (!test_bit(agaw, &sagaw)) { - /* hardware doesn't support it, choose a bigger one */ - pr_debug("Hardware doesn't support agaw %d\n", agaw); - agaw = find_next_bit(&sagaw, 5, agaw); - if (agaw >= 5) - return -ENODEV; - } - domain->agaw = agaw; - - if (ecap_coherent(iommu->ecap)) - domain->iommu_coherency = 1; - else - domain->iommu_coherency = 0; - - if (ecap_sc_support(iommu->ecap)) - domain->iommu_snooping = 1; - else - domain->iommu_snooping = 0; - - if (intel_iommu_superpage) - domain->iommu_superpage = fls(cap_super_page_val(iommu->cap)); - else - domain->iommu_superpage = 0; - - domain->nid = iommu->node; - - /* always allocate the top pgd */ - domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); - if (!domain->pgd) - return -ENOMEM; - __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE); - return 0; -} - static void domain_exit(struct dmar_domain *domain) { @@ -2704,94 +2640,6 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, return domain; } -static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque) -{ - *(u16 *)opaque = alias; - return 0; -} - -static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw) -{ - struct device_domain_info *info; - struct dmar_domain *domain = NULL; - struct intel_iommu *iommu; - u16 dma_alias; - unsigned long flags; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return NULL; - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - - pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias); - - spin_lock_irqsave(&device_domain_lock, flags); - info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus), - PCI_BUS_NUM(dma_alias), - dma_alias & 0xff); - if (info) { - iommu = info->iommu; - domain = info->domain; - } - spin_unlock_irqrestore(&device_domain_lock, flags); - - /* DMA alias already has a domain, use it */ - if (info) - goto out; - } - - /* Allocate and initialize new domain for the device */ - domain = alloc_domain(0); - if (!domain) - return NULL; - if (domain_init(domain, iommu, gaw)) { - domain_exit(domain); - return NULL; - } - -out: - return domain; -} - -static struct dmar_domain *set_domain_for_dev(struct device *dev, - struct dmar_domain *domain) -{ - struct intel_iommu *iommu; - struct dmar_domain *tmp; - u16 req_id, dma_alias; - u8 bus, devfn; - - iommu = device_to_iommu(dev, &bus, &devfn); - if (!iommu) - return NULL; - - req_id = ((u16)bus << 8) | devfn; - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - - pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias); - - /* register PCI DMA alias device */ - if (req_id != dma_alias) { - tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias), - dma_alias & 0xff, NULL, domain); - - if (!tmp || tmp != domain) - return tmp; - } - } - - tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain); - if (!tmp || tmp != domain) - return tmp; - - return domain; -} - static int iommu_domain_identity_map(struct dmar_domain *domain, unsigned long long start, unsigned long long end) @@ -2817,45 +2665,6 @@ static int iommu_domain_identity_map(struct dmar_domain *domain, DMA_PTE_READ|DMA_PTE_WRITE); } -static int domain_prepare_identity_map(struct device *dev, - struct dmar_domain *domain, - unsigned long long start, - unsigned long long end) -{ - /* For _hardware_ passthrough, don't bother. But for software - passthrough, we do it anyway -- it may indicate a memory - range which is reserved in E820, so which didn't get set - up to start with in si_domain */ - if (domain == si_domain && hw_pass_through) { - dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n", - start, end); - return 0; - } - - dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end); - - if (end < start) { - WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n" - "BIOS vendor: %s; Ver: %s; Product Version: %s\n", - dmi_get_system_info(DMI_BIOS_VENDOR), - dmi_get_system_info(DMI_BIOS_VERSION), - dmi_get_system_info(DMI_PRODUCT_VERSION)); - return -EIO; - } - - if (end >> agaw_to_width(domain->agaw)) { - WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n" - "BIOS vendor: %s; Ver: %s; Product Version: %s\n", - agaw_to_width(domain->agaw), - dmi_get_system_info(DMI_BIOS_VENDOR), - dmi_get_system_info(DMI_BIOS_VERSION), - dmi_get_system_info(DMI_PRODUCT_VERSION)); - return -EIO; - } - - return iommu_domain_identity_map(domain, start, end); -} - static int md_domain_init(struct dmar_domain *domain, int guest_width); static int __init si_domain_init(int hw) @@ -3531,98 +3340,16 @@ static unsigned long intel_alloc_iova(struct device *dev, return iova_pfn; } -static struct dmar_domain *get_private_domain_for_dev(struct device *dev) -{ - struct dmar_domain *domain, *tmp; - struct dmar_rmrr_unit *rmrr; - struct device *i_dev; - int i, ret; - - /* Device shouldn't be attached by any domains. */ - domain = find_domain(dev); - if (domain) - return NULL; - - domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH); - if (!domain) - goto out; - - /* We have a new domain - setup possible RMRRs for the device */ - rcu_read_lock(); - for_each_rmrr_units(rmrr) { - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, i_dev) { - if (i_dev != dev) - continue; - - ret = domain_prepare_identity_map(dev, domain, - rmrr->base_address, - rmrr->end_address); - if (ret) - dev_err(dev, "Mapping reserved region failed\n"); - } - } - rcu_read_unlock(); - - tmp = set_domain_for_dev(dev, domain); - if (!tmp || domain != tmp) { - domain_exit(domain); - domain = tmp; - } - -out: - if (!domain) - dev_err(dev, "Allocating domain failed\n"); - else - domain->domain.type = IOMMU_DOMAIN_DMA; - - return domain; -} - /* Check if the dev needs to go through non-identity map and unmap process.*/ static bool iommu_need_mapping(struct device *dev) { - int ret; - if (iommu_dummy(dev)) return false; if (unlikely(attach_deferred(dev))) do_deferred_attach(dev); - ret = identity_mapping(dev); - if (ret) { - u64 dma_mask = *dev->dma_mask; - - if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask) - dma_mask = dev->coherent_dma_mask; - - if (dma_mask >= dma_direct_get_required_mask(dev)) - return false; - - /* - * 32 bit DMA is removed from si_domain and fall back to - * non-identity mapping. - */ - dmar_remove_one_dev_info(dev); - ret = iommu_request_dma_domain_for_dev(dev); - if (ret) { - struct iommu_domain *domain; - struct dmar_domain *dmar_domain; - - domain = iommu_get_domain_for_dev(dev); - if (domain) { - dmar_domain = to_dmar_domain(domain); - dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN; - } - dmar_remove_one_dev_info(dev); - get_private_domain_for_dev(dev); - } - - dev_info(dev, "32bit DMA uses non-identity mapping\n"); - } - - return true; + return !identity_mapping(dev); } static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, @@ -5186,16 +4913,6 @@ int __init intel_iommu_init(void) } up_write(&dmar_global_lock); -#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB) - /* - * If the system has no untrusted device or the user has decided - * to disable the bounce page mechanisms, we don't need swiotlb. - * Mark this and the pre-allocated bounce pages will be released - * later. - */ - if (!has_untrusted_dev() || intel_no_bounce) - swiotlb = 0; -#endif dma_ops = &intel_dma_ops; init_iommu_pm_ops(); @@ -5296,12 +5013,6 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) domain_detach_iommu(domain, iommu); spin_unlock_irqrestore(&iommu->lock, flags); - /* free the private domain */ - if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN && - !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) && - list_empty(&domain->devices)) - domain_exit(info->domain); - free_devinfo_mem(info); } From 14b3526d5909f01e1d1baa05f50952788bb7418e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 6 May 2020 09:59:46 +0800 Subject: [PATCH 118/574] iommu/vt-d: Allow PCI sub-hierarchy to use DMA domain Before commit fa954e6831789 ("iommu/vt-d: Delegate the dma domain to upper layer"), Intel IOMMU started off with all devices in the identity domain, and took them out later if it found they couldn't access all of memory. This required devices behind a PCI bridge to use a DMA domain at the beginning because all PCI devices behind the bridge use the same source-id in their transactions and the domain couldn't be changed at run-time. Intel IOMMU driver is now aligned with the default domain framework, there's no need to keep this requirement anymore. Signed-off-by: Lu Baolu Tested-by: Daniel Drake Reviewed-by: Jon Derrick Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20200506015947.28662-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 16ba7add0f72..af309e8fa6f5 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2857,31 +2857,6 @@ static int device_def_domain_type(struct device *dev) if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev)) return IOMMU_DOMAIN_IDENTITY; - - /* - * We want to start off with all devices in the 1:1 domain, and - * take them out later if we find they can't access all of memory. - * - * However, we can't do this for PCI devices behind bridges, - * because all PCI devices behind the same bridge will end up - * with the same source-id on their transactions. - * - * Practically speaking, we can't change things around for these - * devices at run-time, because we can't be sure there'll be no - * DMA transactions in flight for any of their siblings. - * - * So PCI devices (unless they're on the root bus) as well as - * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of - * the 1:1 domain, just in _case_ one of their siblings turns out - * not to be able to map all of memory. - */ - if (!pci_is_pcie(pdev)) { - if (!pci_is_root_bus(pdev->bus)) - return IOMMU_DOMAIN_DMA; - if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI) - return IOMMU_DOMAIN_DMA; - } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE) - return IOMMU_DOMAIN_DMA; } return 0; From 6fc7020cf298aaec343df423746b44d99c6efaa5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 6 May 2020 09:59:47 +0800 Subject: [PATCH 119/574] iommu/vt-d: Apply per-device dma_ops Current Intel IOMMU driver sets the system level dma_ops. This causes each dma API to go through the IOMMU driver even the devices are using identity mapped domains. This sets per-device dma_ops only if a device is using a DMA domain. Otherwise, use the default system level dma_ops for direct dma. Signed-off-by: Lu Baolu Tested-by: Daniel Drake Reviewed-by: Jon Derrick Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20200506015947.28662-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 82 ++++++++++++------------------------- 1 file changed, 26 insertions(+), 56 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index af309e8fa6f5..29d3940847d3 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2720,17 +2720,6 @@ static int __init si_domain_init(int hw) return 0; } -static int identity_mapping(struct device *dev) -{ - struct device_domain_info *info; - - info = dev->archdata.iommu; - if (info) - return (info->domain == si_domain); - - return 0; -} - static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) { struct dmar_domain *ndomain; @@ -3315,18 +3304,6 @@ static unsigned long intel_alloc_iova(struct device *dev, return iova_pfn; } -/* Check if the dev needs to go through non-identity map and unmap process.*/ -static bool iommu_need_mapping(struct device *dev) -{ - if (iommu_dummy(dev)) - return false; - - if (unlikely(attach_deferred(dev))) - do_deferred_attach(dev); - - return !identity_mapping(dev); -} - static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir, u64 dma_mask) { @@ -3340,6 +3317,9 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, BUG_ON(dir == DMA_NONE); + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); + domain = find_domain(dev); if (!domain) return DMA_MAPPING_ERROR; @@ -3391,20 +3371,15 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - if (iommu_need_mapping(dev)) - return __intel_map_single(dev, page_to_phys(page) + offset, - size, dir, *dev->dma_mask); - return dma_direct_map_page(dev, page, offset, size, dir, attrs); + return __intel_map_single(dev, page_to_phys(page) + offset, + size, dir, *dev->dma_mask); } static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (iommu_need_mapping(dev)) - return __intel_map_single(dev, phys_addr, size, dir, - *dev->dma_mask); - return dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask); } static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size) @@ -3455,17 +3430,13 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (iommu_need_mapping(dev)) - intel_unmap(dev, dev_addr, size); - else - dma_direct_unmap_page(dev, dev_addr, size, dir, attrs); + intel_unmap(dev, dev_addr, size); } static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (iommu_need_mapping(dev)) - intel_unmap(dev, dev_addr, size); + intel_unmap(dev, dev_addr, size); } static void *intel_alloc_coherent(struct device *dev, size_t size, @@ -3475,8 +3446,8 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, struct page *page = NULL; int order; - if (!iommu_need_mapping(dev)) - return dma_direct_alloc(dev, size, dma_handle, flags, attrs); + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); size = PAGE_ALIGN(size); order = get_order(size); @@ -3511,9 +3482,6 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, int order; struct page *page = virt_to_page(vaddr); - if (!iommu_need_mapping(dev)) - return dma_direct_free(dev, size, vaddr, dma_handle, attrs); - size = PAGE_ALIGN(size); order = get_order(size); @@ -3531,9 +3499,6 @@ static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, struct scatterlist *sg; int i; - if (!iommu_need_mapping(dev)) - return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs); - for_each_sg(sglist, sg, nelems, i) { nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg)); } @@ -3557,8 +3522,9 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele struct intel_iommu *iommu; BUG_ON(dir == DMA_NONE); - if (!iommu_need_mapping(dev)) - return dma_direct_map_sg(dev, sglist, nelems, dir, attrs); + + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); domain = find_domain(dev); if (!domain) @@ -3605,8 +3571,6 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele static u64 intel_get_required_mask(struct device *dev) { - if (!iommu_need_mapping(dev)) - return dma_direct_get_required_mask(dev); return DMA_BIT_MASK(32); } @@ -4888,8 +4852,6 @@ int __init intel_iommu_init(void) } up_write(&dmar_global_lock); - dma_ops = &intel_dma_ops; - init_iommu_pm_ops(); down_read(&dmar_global_lock); @@ -5479,11 +5441,6 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) if (translation_pre_enabled(iommu)) dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO; - if (device_needs_bounce(dev)) { - dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n"); - set_dma_ops(dev, &bounce_dma_ops); - } - return &iommu->iommu; } @@ -5498,7 +5455,19 @@ static void intel_iommu_release_device(struct device *dev) dmar_remove_one_dev_info(dev); + set_dma_ops(dev, NULL); +} + +static void intel_iommu_probe_finalize(struct device *dev) +{ + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev(dev); if (device_needs_bounce(dev)) + set_dma_ops(dev, &bounce_dma_ops); + else if (domain && domain->type == IOMMU_DOMAIN_DMA) + set_dma_ops(dev, &intel_dma_ops); + else set_dma_ops(dev, NULL); } @@ -5830,6 +5799,7 @@ const struct iommu_ops intel_iommu_ops = { .unmap = intel_iommu_unmap, .iova_to_phys = intel_iommu_iova_to_phys, .probe_device = intel_iommu_probe_device, + .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, From 8627892af6cb80f80acab066e26851d7b5fb3ca6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 May 2020 19:18:02 +0300 Subject: [PATCH 120/574] iommu/vt-d: Unify format of the printed messages Unify format of the printed messages, i.e. replace printk(LEVEL ... ) with pr_level(...). Signed-off-by: Andy Shevchenko Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20200507161804.13275-1-andriy.shevchenko@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 29d3940847d3..2ff8d69ce4f8 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -475,8 +475,7 @@ static int __init intel_iommu_setup(char *str) pr_info("Intel-IOMMU: scalable mode supported\n"); intel_iommu_sm = 1; } else if (!strncmp(str, "tboot_noforce", 13)) { - printk(KERN_INFO - "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); + pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; } else if (!strncmp(str, "nobounce", 8)) { pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n"); From 555fb5ae0f39962417d35e02d77ee9b2c14a5428 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 May 2020 19:18:04 +0300 Subject: [PATCH 121/574] iommu/amd: Unify format of the printed messages Unify format of the printed messages, i.e. replace printk(LEVEL ... ) with pr_level(...). Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200507161804.13275-3-andriy.shevchenko@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index d0d7b6a0c3d8..f55405ee83cf 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -395,10 +395,10 @@ #define PD_IOMMUV2_MASK (1UL << 3) /* domain has gcr3 table */ extern bool amd_iommu_dump; -#define DUMP_printk(format, arg...) \ - do { \ - if (amd_iommu_dump) \ - printk(KERN_INFO "AMD-Vi: " format, ## arg); \ +#define DUMP_printk(format, arg...) \ + do { \ + if (amd_iommu_dump) \ + pr_info("AMD-Vi: " format, ## arg); \ } while(0); /* global flag if IOMMUs cache non-present entries */ From ed894bf5a76357eb92045c79d6ae2f29814c6183 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Apr 2020 22:40:08 +0100 Subject: [PATCH 122/574] sparc32: mm: Fix argument checking in __srmmu_get_nocache() The 'size' argument to __srmmu_get_nocache() is a number of bytes not a shift value, so fix up the sanity checking to treat it properly. Cc: "David S. Miller" Cc: Peter Zijlstra Signed-off-by: Will Deacon Signed-off-by: David S. Miller --- arch/sparc/mm/srmmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index b7c94de70cca..cb9ded8a68b7 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -175,18 +175,18 @@ pte_t *pte_offset_kernel(pmd_t *dir, unsigned long address) */ static void *__srmmu_get_nocache(int size, int align) { - int offset; + int offset, minsz = 1 << SRMMU_NOCACHE_BITMAP_SHIFT; unsigned long addr; - if (size < SRMMU_NOCACHE_BITMAP_SHIFT) { + if (size < minsz) { printk(KERN_ERR "Size 0x%x too small for nocache request\n", size); - size = SRMMU_NOCACHE_BITMAP_SHIFT; + size = minsz; } - if (size & (SRMMU_NOCACHE_BITMAP_SHIFT - 1)) { - printk(KERN_ERR "Size 0x%x unaligned int nocache request\n", + if (size & (minsz - 1)) { + printk(KERN_ERR "Size 0x%x unaligned in nocache request\n", size); - size += SRMMU_NOCACHE_BITMAP_SHIFT - 1; + size += minsz - 1; } BUG_ON(align > SRMMU_NOCACHE_ALIGN_MAX); From 8e958839e4b9fb6ea4385ff2c52d1333a3a618de Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Apr 2020 22:40:09 +0100 Subject: [PATCH 123/574] sparc32: mm: Restructure sparc32 MMU page-table layout The "SRMMU" supports 4k pages using a fixed three-level walk with a 256-entry PGD and 64-entry PMD/PTE levels. In order to fill a page with a 'pgtable_t', the SRMMU code allocates four native PTE tables into a single PTE allocation and similarly for the PMD level, leading to an array of 16 physical pointers in a 'pmd_t' This breaks the generic code which assumes READ_ONCE(*pmd) will be word sized. In a manner similar to ef22d8abd876 ("m68k: mm: Restructure Motorola MMU page-table layout"), this patch implements the native page-table setup directly. This significantly increases the page-table memory overhead, but will be addresses in a subsequent patch. Cc: "David S. Miller" Cc: Peter Zijlstra Signed-off-by: Will Deacon Signed-off-by: David S. Miller --- arch/sparc/include/asm/page_32.h | 10 ++--- arch/sparc/include/asm/pgalloc_32.h | 5 ++- arch/sparc/include/asm/pgtable_32.h | 29 +++++++------- arch/sparc/include/asm/pgtsrmmu.h | 36 ++--------------- arch/sparc/include/asm/viking.h | 5 ++- arch/sparc/kernel/head_32.S | 8 ++-- arch/sparc/mm/hypersparc.S | 3 +- arch/sparc/mm/srmmu.c | 60 ++++++++++------------------- arch/sparc/mm/viking.S | 5 ++- 9 files changed, 58 insertions(+), 103 deletions(-) diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index 478260002836..da01c8c45412 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -54,7 +54,7 @@ extern struct sparc_phys_banks sp_banks[SPARC_PHYS_BANKS+1]; */ typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long iopte; } iopte_t; -typedef struct { unsigned long pmdv[16]; } pmd_t; +typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long ctxd; } ctxd_t; typedef struct { unsigned long pgprot; } pgprot_t; @@ -62,7 +62,7 @@ typedef struct { unsigned long iopgprot; } iopgprot_t; #define pte_val(x) ((x).pte) #define iopte_val(x) ((x).iopte) -#define pmd_val(x) ((x).pmdv[0]) +#define pmd_val(x) ((x).pmd) #define pgd_val(x) ((x).pgd) #define ctxd_val(x) ((x).ctxd) #define pgprot_val(x) ((x).pgprot) @@ -82,7 +82,7 @@ typedef struct { unsigned long iopgprot; } iopgprot_t; */ typedef unsigned long pte_t; typedef unsigned long iopte_t; -typedef struct { unsigned long pmdv[16]; } pmd_t; +typedef unsigned long pmd_t; typedef unsigned long pgd_t; typedef unsigned long ctxd_t; typedef unsigned long pgprot_t; @@ -90,14 +90,14 @@ typedef unsigned long iopgprot_t; #define pte_val(x) (x) #define iopte_val(x) (x) -#define pmd_val(x) ((x).pmdv[0]) +#define pmd_val(x) (x) #define pgd_val(x) (x) #define ctxd_val(x) (x) #define pgprot_val(x) (x) #define iopgprot_val(x) (x) #define __pte(x) (x) -#define __pmd(x) ((pmd_t) { { (x) }, }) +#define __pmd(x) (x) #define __iopte(x) (x) #define __pgd(x) (x) #define __ctxd(x) (x) diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index eae0c92ec422..99c032424946 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -60,13 +60,14 @@ pgtable_t pte_alloc_one(struct mm_struct *mm); static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return srmmu_get_nocache(PTE_SIZE, PTE_SIZE); + return srmmu_get_nocache(SRMMU_PTE_TABLE_SIZE, + SRMMU_PTE_TABLE_SIZE); } static inline void free_pte_fast(pte_t *pte) { - srmmu_free_nocache(pte, PTE_SIZE); + srmmu_free_nocache(pte, SRMMU_PTE_TABLE_SIZE); } #define pte_free_kernel(mm, pte) free_pte_fast(pte) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 0de659ae0ba4..3367e2ba89e0 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -11,6 +11,16 @@ #include +#define PMD_SHIFT 18 +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) +#define PMD_ALIGN(__addr) (((__addr) + ~PMD_MASK) & PMD_MASK) + +#define PGDIR_SHIFT 24 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) +#define PGDIR_ALIGN(__addr) (((__addr) + ~PGDIR_MASK) & PGDIR_MASK) + #ifndef __ASSEMBLY__ #include @@ -34,17 +44,10 @@ unsigned long __init bootmem_init(unsigned long *pages_avail); #define pmd_ERROR(e) __builtin_trap() #define pgd_ERROR(e) __builtin_trap() -#define PMD_SHIFT 22 -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PMD_ALIGN(__addr) (((__addr) + ~PMD_MASK) & PMD_MASK) -#define PGDIR_SHIFT SRMMU_PGDIR_SHIFT -#define PGDIR_SIZE SRMMU_PGDIR_SIZE -#define PGDIR_MASK SRMMU_PGDIR_MASK -#define PTRS_PER_PTE 1024 -#define PTRS_PER_PMD SRMMU_PTRS_PER_PMD -#define PTRS_PER_PGD SRMMU_PTRS_PER_PGD -#define USER_PTRS_PER_PGD PAGE_OFFSET / SRMMU_PGDIR_SIZE +#define PTRS_PER_PTE 64 +#define PTRS_PER_PMD 64 +#define PTRS_PER_PGD 256 +#define USER_PTRS_PER_PGD PAGE_OFFSET / PGDIR_SIZE #define FIRST_USER_ADDRESS 0UL #define PTE_SIZE (PTRS_PER_PTE*4) @@ -179,9 +182,7 @@ static inline int pmd_none(pmd_t pmd) static inline void pmd_clear(pmd_t *pmdp) { - int i; - for (i = 0; i < PTRS_PER_PTE/SRMMU_REAL_PTRS_PER_PTE; i++) - set_pte((pte_t *)&pmdp->pmdv[i], __pte(0)); + set_pte((pte_t *)&pmd_val(*pmdp), __pte(0)); } static inline int pud_none(pud_t pud) diff --git a/arch/sparc/include/asm/pgtsrmmu.h b/arch/sparc/include/asm/pgtsrmmu.h index 32a508897501..58ea8e8c6ee7 100644 --- a/arch/sparc/include/asm/pgtsrmmu.h +++ b/arch/sparc/include/asm/pgtsrmmu.h @@ -17,39 +17,9 @@ /* Number of contexts is implementation-dependent; 64k is the most we support */ #define SRMMU_MAX_CONTEXTS 65536 -/* PMD_SHIFT determines the size of the area a second-level page table entry can map */ -#define SRMMU_REAL_PMD_SHIFT 18 -#define SRMMU_REAL_PMD_SIZE (1UL << SRMMU_REAL_PMD_SHIFT) -#define SRMMU_REAL_PMD_MASK (~(SRMMU_REAL_PMD_SIZE-1)) -#define SRMMU_REAL_PMD_ALIGN(__addr) (((__addr)+SRMMU_REAL_PMD_SIZE-1)&SRMMU_REAL_PMD_MASK) - -/* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define SRMMU_PGDIR_SHIFT 24 -#define SRMMU_PGDIR_SIZE (1UL << SRMMU_PGDIR_SHIFT) -#define SRMMU_PGDIR_MASK (~(SRMMU_PGDIR_SIZE-1)) -#define SRMMU_PGDIR_ALIGN(addr) (((addr)+SRMMU_PGDIR_SIZE-1)&SRMMU_PGDIR_MASK) - -#define SRMMU_REAL_PTRS_PER_PTE 64 -#define SRMMU_REAL_PTRS_PER_PMD 64 -#define SRMMU_PTRS_PER_PGD 256 - -#define SRMMU_REAL_PTE_TABLE_SIZE (SRMMU_REAL_PTRS_PER_PTE*4) -#define SRMMU_PMD_TABLE_SIZE (SRMMU_REAL_PTRS_PER_PMD*4) -#define SRMMU_PGD_TABLE_SIZE (SRMMU_PTRS_PER_PGD*4) - -/* - * To support pagetables in highmem, Linux introduces APIs which - * return struct page* and generally manipulate page tables when - * they are not mapped into kernel space. Our hardware page tables - * are smaller than pages. We lump hardware tabes into big, page sized - * software tables. - * - * PMD_SHIFT determines the size of the area a second-level page table entry - * can map, and our pmd_t is 16 times larger than normal. The values which - * were once defined here are now generic for 4c and srmmu, so they're - * found in pgtable.h. - */ -#define SRMMU_PTRS_PER_PMD 4 +#define SRMMU_PTE_TABLE_SIZE (PAGE_SIZE) +#define SRMMU_PMD_TABLE_SIZE (PAGE_SIZE) +#define SRMMU_PGD_TABLE_SIZE (PTRS_PER_PGD*4) /* Definition of the values in the ET field of PTD's and PTE's */ #define SRMMU_ET_MASK 0x3 diff --git a/arch/sparc/include/asm/viking.h b/arch/sparc/include/asm/viking.h index 0bbefd184221..08ffc605035f 100644 --- a/arch/sparc/include/asm/viking.h +++ b/arch/sparc/include/asm/viking.h @@ -10,6 +10,7 @@ #include #include +#include #include /* Bits in the SRMMU control register for GNU/Viking modules. @@ -227,7 +228,7 @@ static inline unsigned long viking_hwprobe(unsigned long vaddr) : "=r" (val) : "r" (vaddr | 0x200), "i" (ASI_M_FLUSH_PROBE)); if ((val & SRMMU_ET_MASK) == SRMMU_ET_PTE) { - vaddr &= ~SRMMU_PGDIR_MASK; + vaddr &= ~PGDIR_MASK; vaddr >>= PAGE_SHIFT; return val | (vaddr << 8); } @@ -237,7 +238,7 @@ static inline unsigned long viking_hwprobe(unsigned long vaddr) : "=r" (val) : "r" (vaddr | 0x100), "i" (ASI_M_FLUSH_PROBE)); if ((val & SRMMU_ET_MASK) == SRMMU_ET_PTE) { - vaddr &= ~SRMMU_REAL_PMD_MASK; + vaddr &= ~PMD_MASK; vaddr >>= PAGE_SHIFT; return val | (vaddr << 8); } diff --git a/arch/sparc/kernel/head_32.S b/arch/sparc/kernel/head_32.S index e55f2c075165..be30c8d4cc73 100644 --- a/arch/sparc/kernel/head_32.S +++ b/arch/sparc/kernel/head_32.S @@ -24,7 +24,7 @@ #include #include /* TI_UWINMASK */ #include -#include /* SRMMU_PGDIR_SHIFT */ +#include /* PGDIR_SHIFT */ #include .data @@ -273,7 +273,7 @@ not_a_sun4: lda [%o1] ASI_M_BYPASS, %o2 ! This is the 0x0 16MB pgd /* Calculate to KERNBASE entry. */ - add %o1, KERNBASE >> (SRMMU_PGDIR_SHIFT - 2), %o3 + add %o1, KERNBASE >> (PGDIR_SHIFT - 2), %o3 /* Poke the entry into the calculated address. */ sta %o2, [%o3] ASI_M_BYPASS @@ -317,7 +317,7 @@ srmmu_not_viking: sll %g1, 0x8, %g1 ! make phys addr for l1 tbl lda [%g1] ASI_M_BYPASS, %g2 ! get level1 entry for 0x0 - add %g1, KERNBASE >> (SRMMU_PGDIR_SHIFT - 2), %g3 + add %g1, KERNBASE >> (PGDIR_SHIFT - 2), %g3 sta %g2, [%g3] ASI_M_BYPASS ! place at KERNBASE entry b go_to_highmem nop ! wheee.... @@ -341,7 +341,7 @@ leon_remap: sll %g1, 0x8, %g1 ! make phys addr for l1 tbl lda [%g1] ASI_M_BYPASS, %g2 ! get level1 entry for 0x0 - add %g1, KERNBASE >> (SRMMU_PGDIR_SHIFT - 2), %g3 + add %g1, KERNBASE >> (PGDIR_SHIFT - 2), %g3 sta %g2, [%g3] ASI_M_BYPASS ! place at KERNBASE entry b go_to_highmem nop ! wheee.... diff --git a/arch/sparc/mm/hypersparc.S b/arch/sparc/mm/hypersparc.S index 66885a8dc50a..6c2521e85a42 100644 --- a/arch/sparc/mm/hypersparc.S +++ b/arch/sparc/mm/hypersparc.S @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -293,7 +294,7 @@ hypersparc_flush_tlb_range: cmp %o3, -1 be hypersparc_flush_tlb_range_out #endif - sethi %hi(~((1 << SRMMU_PGDIR_SHIFT) - 1)), %o4 + sethi %hi(~((1 << PGDIR_SHIFT) - 1)), %o4 sta %o3, [%g1] ASI_M_MMUREGS and %o1, %o4, %o1 add %o1, 0x200, %o1 diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index cb9ded8a68b7..50da4bcdd6fa 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -136,26 +136,14 @@ static void msi_set_sync(void) void pmd_set(pmd_t *pmdp, pte_t *ptep) { - unsigned long ptp; /* Physical address, shifted right by 4 */ - int i; - - ptp = __nocache_pa(ptep) >> 4; - for (i = 0; i < PTRS_PER_PTE/SRMMU_REAL_PTRS_PER_PTE; i++) { - set_pte((pte_t *)&pmdp->pmdv[i], __pte(SRMMU_ET_PTD | ptp)); - ptp += (SRMMU_REAL_PTRS_PER_PTE * sizeof(pte_t) >> 4); - } + unsigned long ptp = __nocache_pa(ptep) >> 4; + set_pte((pte_t *)&pmd_val(*pmdp), __pte(SRMMU_ET_PTD | ptp)); } void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep) { - unsigned long ptp; /* Physical address, shifted right by 4 */ - int i; - - ptp = page_to_pfn(ptep) << (PAGE_SHIFT-4); /* watch for overflow */ - for (i = 0; i < PTRS_PER_PTE/SRMMU_REAL_PTRS_PER_PTE; i++) { - set_pte((pte_t *)&pmdp->pmdv[i], __pte(SRMMU_ET_PTD | ptp)); - ptp += (SRMMU_REAL_PTRS_PER_PTE * sizeof(pte_t) >> 4); - } + unsigned long ptp = page_to_pfn(ptep) << (PAGE_SHIFT-4); /* watch for overflow */ + set_pte((pte_t *)&pmd_val(*pmdp), __pte(SRMMU_ET_PTD | ptp)); } /* Find an entry in the third-level page table.. */ @@ -163,7 +151,7 @@ pte_t *pte_offset_kernel(pmd_t *dir, unsigned long address) { void *pte; - pte = __nocache_va((dir->pmdv[0] & SRMMU_PTD_PMASK) << 4); + pte = __nocache_va((pmd_val(*dir) & SRMMU_PTD_PMASK) << 4); return (pte_t *) pte + ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)); } @@ -400,7 +388,7 @@ void pte_free(struct mm_struct *mm, pgtable_t pte) p = page_to_pfn(pte) << PAGE_SHIFT; /* Physical address */ /* free non cached virtual address*/ - srmmu_free_nocache(__nocache_va(p), PTE_SIZE); + srmmu_free_nocache(__nocache_va(p), SRMMU_PTE_TABLE_SIZE); } /* context handling - a dynamically sized pool is used */ @@ -822,13 +810,13 @@ static void __init srmmu_inherit_prom_mappings(unsigned long start, what = 0; addr = start - PAGE_SIZE; - if (!(start & ~(SRMMU_REAL_PMD_MASK))) { - if (srmmu_probe(addr + SRMMU_REAL_PMD_SIZE) == probed) + if (!(start & ~(PMD_MASK))) { + if (srmmu_probe(addr + PMD_SIZE) == probed) what = 1; } - if (!(start & ~(SRMMU_PGDIR_MASK))) { - if (srmmu_probe(addr + SRMMU_PGDIR_SIZE) == probed) + if (!(start & ~(PGDIR_MASK))) { + if (srmmu_probe(addr + PGDIR_SIZE) == probed) what = 2; } @@ -837,7 +825,7 @@ static void __init srmmu_inherit_prom_mappings(unsigned long start, pudp = pud_offset(p4dp, start); if (what == 2) { *(pgd_t *)__nocache_fix(pgdp) = __pgd(probed); - start += SRMMU_PGDIR_SIZE; + start += PGDIR_SIZE; continue; } if (pud_none(*(pud_t *)__nocache_fix(pudp))) { @@ -849,6 +837,11 @@ static void __init srmmu_inherit_prom_mappings(unsigned long start, pud_set(__nocache_fix(pudp), pmdp); } pmdp = pmd_offset(__nocache_fix(pgdp), start); + if (what == 1) { + *(pmd_t *)__nocache_fix(pmdp) = __pmd(probed); + start += PMD_SIZE; + continue; + } if (srmmu_pmd_none(*(pmd_t *)__nocache_fix(pmdp))) { ptep = __srmmu_get_nocache(PTE_SIZE, PTE_SIZE); if (ptep == NULL) @@ -856,19 +849,6 @@ static void __init srmmu_inherit_prom_mappings(unsigned long start, memset(__nocache_fix(ptep), 0, PTE_SIZE); pmd_set(__nocache_fix(pmdp), ptep); } - if (what == 1) { - /* We bend the rule where all 16 PTPs in a pmd_t point - * inside the same PTE page, and we leak a perfectly - * good hardware PTE piece. Alternatives seem worse. - */ - unsigned int x; /* Index of HW PMD in soft cluster */ - unsigned long *val; - x = (start >> PMD_SHIFT) & 15; - val = &pmdp->pmdv[x]; - *(unsigned long *)__nocache_fix(val) = probed; - start += SRMMU_REAL_PMD_SIZE; - continue; - } ptep = pte_offset_kernel(__nocache_fix(pmdp), start); *(pte_t *)__nocache_fix(ptep) = __pte(probed); start += PAGE_SIZE; @@ -890,9 +870,9 @@ static void __init do_large_mapping(unsigned long vaddr, unsigned long phys_base /* Map sp_bank entry SP_ENTRY, starting at virtual address VBASE. */ static unsigned long __init map_spbank(unsigned long vbase, int sp_entry) { - unsigned long pstart = (sp_banks[sp_entry].base_addr & SRMMU_PGDIR_MASK); - unsigned long vstart = (vbase & SRMMU_PGDIR_MASK); - unsigned long vend = SRMMU_PGDIR_ALIGN(vbase + sp_banks[sp_entry].num_bytes); + unsigned long pstart = (sp_banks[sp_entry].base_addr & PGDIR_MASK); + unsigned long vstart = (vbase & PGDIR_MASK); + unsigned long vend = PGDIR_ALIGN(vbase + sp_banks[sp_entry].num_bytes); /* Map "low" memory only */ const unsigned long min_vaddr = PAGE_OFFSET; const unsigned long max_vaddr = PAGE_OFFSET + SRMMU_MAXMEM; @@ -905,7 +885,7 @@ static unsigned long __init map_spbank(unsigned long vbase, int sp_entry) while (vstart < vend) { do_large_mapping(vstart, pstart); - vstart += SRMMU_PGDIR_SIZE; pstart += SRMMU_PGDIR_SIZE; + vstart += PGDIR_SIZE; pstart += PGDIR_SIZE; } return vstart; } diff --git a/arch/sparc/mm/viking.S b/arch/sparc/mm/viking.S index adaef6e7b8cf..48f062de7a7f 100644 --- a/arch/sparc/mm/viking.S +++ b/arch/sparc/mm/viking.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -157,7 +158,7 @@ viking_flush_tlb_range: cmp %o3, -1 be 2f #endif - sethi %hi(~((1 << SRMMU_PGDIR_SHIFT) - 1)), %o4 + sethi %hi(~((1 << PGDIR_SHIFT) - 1)), %o4 sta %o3, [%g1] ASI_M_MMUREGS and %o1, %o4, %o1 add %o1, 0x200, %o1 @@ -243,7 +244,7 @@ sun4dsmp_flush_tlb_range: ld [%o0 + VMA_VM_MM], %o0 ld [%o0 + AOFF_mm_context], %o3 lda [%g1] ASI_M_MMUREGS, %g5 - sethi %hi(~((1 << SRMMU_PGDIR_SHIFT) - 1)), %o4 + sethi %hi(~((1 << PGDIR_SHIFT) - 1)), %o4 sta %o3, [%g1] ASI_M_MMUREGS and %o1, %o4, %o1 add %o1, 0x200, %o1 From 3f407976ac2953116cb8880a7a18b63bcc81829d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Apr 2020 22:40:10 +0100 Subject: [PATCH 124/574] sparc32: mm: Change pgtable_t type to pte_t * instead of struct page * Change the 'pgtable_t' type for sparc32 so that it represents the uncached virtual address of the PTE table, rather than the underlying 'struct page'. This allows us to free page table allocations smaller than a page. Cc: "David S. Miller" Cc: Peter Zijlstra Signed-off-by: Will Deacon Signed-off-by: David S. Miller --- arch/sparc/include/asm/page_32.h | 2 +- arch/sparc/include/asm/pgalloc_32.h | 6 +++--- arch/sparc/include/asm/pgtable_32.h | 11 +++++++++++ arch/sparc/mm/srmmu.c | 29 +++++++++-------------------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index da01c8c45412..fff8861df107 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -106,7 +106,7 @@ typedef unsigned long iopgprot_t; #endif -typedef struct page *pgtable_t; +typedef pte_t *pgtable_t; #define TASK_UNMAPPED_BASE 0x50000000 diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 99c032424946..b772384871e9 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -50,11 +50,11 @@ static inline void free_pmd_fast(pmd_t * pmd) #define pmd_free(mm, pmd) free_pmd_fast(pmd) #define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) -void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep); -#define pmd_pgtable(pmd) pmd_page(pmd) +#define pmd_populate(mm, pmd, pte) pmd_set(pmd, pte) +#define pmd_pgtable(pmd) (pgtable_t)__pmd_page(pmd) void pmd_set(pmd_t *pmdp, pte_t *ptep); -#define pmd_populate_kernel(MM, PMD, PTE) pmd_set(PMD, PTE) +#define pmd_populate_kernel pmd_populate pgtable_t pte_alloc_one(struct mm_struct *mm); diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 3367e2ba89e0..c5625b2aa331 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -135,6 +135,17 @@ static inline struct page *pmd_page(pmd_t pmd) return pfn_to_page((pmd_val(pmd) & SRMMU_PTD_PMASK) >> (PAGE_SHIFT-4)); } +static inline unsigned long __pmd_page(pmd_t pmd) +{ + unsigned long v; + + if (srmmu_device_memory(pmd_val(pmd))) + BUG(); + + v = pmd_val(pmd) & SRMMU_PTD_PMASK; + return (unsigned long)__nocache_va(v << 4); +} + static inline unsigned long pud_page_vaddr(pud_t pud) { if (srmmu_device_memory(pud_val(pud))) { diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 50da4bcdd6fa..c861c0f0df73 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -140,12 +140,6 @@ void pmd_set(pmd_t *pmdp, pte_t *ptep) set_pte((pte_t *)&pmd_val(*pmdp), __pte(SRMMU_ET_PTD | ptp)); } -void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep) -{ - unsigned long ptp = page_to_pfn(ptep) << (PAGE_SHIFT-4); /* watch for overflow */ - set_pte((pte_t *)&pmd_val(*pmdp), __pte(SRMMU_ET_PTD | ptp)); -} - /* Find an entry in the third-level page table.. */ pte_t *pte_offset_kernel(pmd_t *dir, unsigned long address) { @@ -364,31 +358,26 @@ pgd_t *get_pgd_fast(void) */ pgtable_t pte_alloc_one(struct mm_struct *mm) { - unsigned long pte; + pte_t *ptep; struct page *page; - if ((pte = (unsigned long)pte_alloc_one_kernel(mm)) == 0) + if ((ptep = pte_alloc_one_kernel(mm)) == 0) return NULL; - page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT); + page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } - return page; + return ptep; } -void pte_free(struct mm_struct *mm, pgtable_t pte) +void pte_free(struct mm_struct *mm, pgtable_t ptep) { - unsigned long p; + struct page *page; - pgtable_pte_page_dtor(pte); - p = (unsigned long)page_address(pte); /* Cached address (for test) */ - if (p == 0) - BUG(); - p = page_to_pfn(pte) << PAGE_SHIFT; /* Physical address */ - - /* free non cached virtual address*/ - srmmu_free_nocache(__nocache_va(p), SRMMU_PTE_TABLE_SIZE); + page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); + pgtable_pte_page_dtor(page); + srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE); } /* context handling - a dynamically sized pool is used */ From 8c8f3156dd40f8bdc58f2ac461374bc804c28e3b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Apr 2020 22:40:11 +0100 Subject: [PATCH 125/574] sparc32: mm: Reduce allocation size for PMD and PTE tables Now that the page table allocator can free page table allocations smaller than PAGE_SIZE, reduce the size of the PMD and PTE allocations to avoid needlessly wasting memory. Cc: "David S. Miller" Cc: Peter Zijlstra Signed-off-by: Will Deacon Signed-off-by: David S. Miller --- arch/sparc/include/asm/pgtsrmmu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/pgtsrmmu.h b/arch/sparc/include/asm/pgtsrmmu.h index 58ea8e8c6ee7..7708d015712b 100644 --- a/arch/sparc/include/asm/pgtsrmmu.h +++ b/arch/sparc/include/asm/pgtsrmmu.h @@ -17,8 +17,8 @@ /* Number of contexts is implementation-dependent; 64k is the most we support */ #define SRMMU_MAX_CONTEXTS 65536 -#define SRMMU_PTE_TABLE_SIZE (PAGE_SIZE) -#define SRMMU_PMD_TABLE_SIZE (PAGE_SIZE) +#define SRMMU_PTE_TABLE_SIZE (PTRS_PER_PTE*4) +#define SRMMU_PMD_TABLE_SIZE (PTRS_PER_PMD*4) #define SRMMU_PGD_TABLE_SIZE (PTRS_PER_PGD*4) /* Definition of the values in the ET field of PTD's and PTE's */ From 9ef391bdc29f0570c2be578210e220d15ea7a453 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 27 Apr 2020 13:22:20 +0000 Subject: [PATCH 126/574] tty: vcc: Fix error return code in vcc_probe() Fix to return negative error code -ENOMEM from the error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/tty/vcc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/vcc.c b/drivers/tty/vcc.c index d2a1e1228c82..9ffd42e333b8 100644 --- a/drivers/tty/vcc.c +++ b/drivers/tty/vcc.c @@ -605,6 +605,7 @@ static int vcc_probe(struct vio_dev *vdev, const struct vio_device_id *id) port->index = vcc_table_add(port); if (port->index == -1) { pr_err("VCC: no more TTY indices left for allocation\n"); + rv = -ENOMEM; goto free_ldc; } From 03a1b56f501ef4f8c54cb89a5f5ed7cc9ccac38f Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Sat, 9 May 2020 17:18:48 +0800 Subject: [PATCH 127/574] sparc: use scnprintf() in show_pciobppath_attr() in pci.c snprintf() returns the number of bytes that would be written, which may be greater than the the actual length to be written. show_pciobppath_attr() should return the number of bytes printed into the buffer. This is the return value of scnprintf(). Signed-off-by: Chen Zhou Signed-off-by: David S. Miller --- arch/sparc/kernel/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 5ed43828e078..a41ad562ed4e 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -593,7 +593,7 @@ show_pciobppath_attr(struct device * dev, struct device_attribute * attr, char * pdev = to_pci_dev(dev); dp = pdev->dev.of_node; - return snprintf (buf, PAGE_SIZE, "%pOF\n", dp); + return scnprintf(buf, PAGE_SIZE, "%pOF\n", dp); } static DEVICE_ATTR(obppath, S_IRUSR | S_IRGRP | S_IROTH, show_pciobppath_attr, NULL); From 117e2cb3eeee2940ba581d104951f6207cb46ce5 Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Sat, 9 May 2020 17:18:49 +0800 Subject: [PATCH 128/574] sparc: use scnprintf() in show_pciobppath_attr() in vio.c snprintf() returns the number of bytes that would be written, which may be greater than the the actual length to be written. show_pciobppath_attr() should return the number of bytes printed into the buffer. This is the return value of scnprintf(). Signed-off-by: Chen Zhou Signed-off-by: David S. Miller --- arch/sparc/kernel/vio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/kernel/vio.c b/arch/sparc/kernel/vio.c index c7cad9b7bba7..4f57056ed463 100644 --- a/arch/sparc/kernel/vio.c +++ b/arch/sparc/kernel/vio.c @@ -193,7 +193,7 @@ show_pciobppath_attr(struct device *dev, struct device_attribute *attr, vdev = to_vio_dev(dev); dp = vdev->dp; - return snprintf (buf, PAGE_SIZE, "%pOF\n", dp); + return scnprintf(buf, PAGE_SIZE, "%pOF\n", dp); } static DEVICE_ATTR(obppath, S_IRUSR | S_IRGRP | S_IROTH, From db7d3cbd6a99c5148eafd4685b33d48d118a317f Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 13 May 2020 16:07:20 +0200 Subject: [PATCH 129/574] dt-bindings: iommu: Add Allwinner H6 IOMMU bindings The Allwinner H6 has introduced an IOMMU. Let's add a device tree binding for it. Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/f3e1633677a9cf9cf36fe3582f0168fae94c1b3e.1589378833.git-series.maxime@cerno.tech Signed-off-by: Joerg Roedel --- .../iommu/allwinner,sun50i-h6-iommu.yaml | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml diff --git a/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml b/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml new file mode 100644 index 000000000000..5e125cf2a88b --- /dev/null +++ b/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iommu/allwinner,sun50i-h6-iommu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner H6 IOMMU Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#iommu-cells": + const: 1 + description: + The content of the cell is the master ID. + + compatible: + const: allwinner,sun50i-h6-iommu + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + +required: + - "#iommu-cells" + - compatible + - reg + - interrupts + - clocks + - resets + +additionalProperties: false + +examples: + - | + #include + #include + + #include + #include + + iommu: iommu@30f0000 { + compatible = "allwinner,sun50i-h6-iommu"; + reg = <0x030f0000 0x10000>; + interrupts = ; + clocks = <&ccu CLK_BUS_IOMMU>; + resets = <&ccu RST_BUS_IOMMU>; + #iommu-cells = <1>; + }; + +... From 4100b8c229b328358cc4a82f5042dbf22f1c1ccb Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 13 May 2020 16:07:22 +0200 Subject: [PATCH 130/574] iommu: Add Allwinner H6 IOMMU driver The Allwinner H6 has introduced an IOMMU for a few DMA controllers, mostly video related: the display engine, the video decoders / encoders, the camera capture controller, etc. The design is pretty simple compared to other IOMMUs found in SoCs: there's a single instance, controlling all the masters, with a single address space. It also features a performance monitoring unit that allows to retrieve various informations (per-master and global TLB accesses, hits and misses, access latency, etc) that isn't supported at the moment. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/d122a8670361e36fc26b4ce2674a2223d30dc4cc.1589378833.git-series.maxime@cerno.tech Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 9 + drivers/iommu/Makefile | 1 + drivers/iommu/sun50i-iommu.c | 1027 ++++++++++++++++++++++++++++++++++ 3 files changed, 1037 insertions(+) create mode 100644 drivers/iommu/sun50i-iommu.c diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 2ab07ce17abb..aca76383f201 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -303,6 +303,15 @@ config ROCKCHIP_IOMMU Say Y here if you are using a Rockchip SoC that includes an IOMMU device. +config SUN50I_IOMMU + bool "Allwinner H6 IOMMU Support" + depends on ARCH_SUNXI || COMPILE_TEST + select ARM_DMA_USE_IOMMU + select IOMMU_API + select IOMMU_DMA + help + Support for the IOMMU introduced in the Allwinner H6 SoCs. + config TEGRA_IOMMU_GART bool "Tegra GART IOMMU Support" depends on ARCH_TEGRA_2x_SOC diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 9f33fdb3bb05..57cf4ba5e27c 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o obj-$(CONFIG_ROCKCHIP_IOMMU) += rockchip-iommu.o +obj-$(CONFIG_SUN50I_IOMMU) += sun50i-iommu.o obj-$(CONFIG_TEGRA_IOMMU_GART) += tegra-gart.o obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c new file mode 100644 index 000000000000..989d87aa4426 --- /dev/null +++ b/drivers/iommu/sun50i-iommu.c @@ -0,0 +1,1027 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (C) 2016-2018, Allwinner Technology CO., LTD. +// Copyright (C) 2019-2020, Cerno + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IOMMU_RESET_REG 0x010 +#define IOMMU_ENABLE_REG 0x020 +#define IOMMU_ENABLE_ENABLE BIT(0) + +#define IOMMU_BYPASS_REG 0x030 +#define IOMMU_AUTO_GATING_REG 0x040 +#define IOMMU_AUTO_GATING_ENABLE BIT(0) + +#define IOMMU_WBUF_CTRL_REG 0x044 +#define IOMMU_OOO_CTRL_REG 0x048 +#define IOMMU_4KB_BDY_PRT_CTRL_REG 0x04c +#define IOMMU_TTB_REG 0x050 +#define IOMMU_TLB_ENABLE_REG 0x060 +#define IOMMU_TLB_PREFETCH_REG 0x070 +#define IOMMU_TLB_PREFETCH_MASTER_ENABLE(m) BIT(m) + +#define IOMMU_TLB_FLUSH_REG 0x080 +#define IOMMU_TLB_FLUSH_PTW_CACHE BIT(17) +#define IOMMU_TLB_FLUSH_MACRO_TLB BIT(16) +#define IOMMU_TLB_FLUSH_MICRO_TLB(i) (BIT(i) & GENMASK(5, 0)) + +#define IOMMU_TLB_IVLD_ADDR_REG 0x090 +#define IOMMU_TLB_IVLD_ADDR_MASK_REG 0x094 +#define IOMMU_TLB_IVLD_ENABLE_REG 0x098 +#define IOMMU_TLB_IVLD_ENABLE_ENABLE BIT(0) + +#define IOMMU_PC_IVLD_ADDR_REG 0x0a0 +#define IOMMU_PC_IVLD_ENABLE_REG 0x0a8 +#define IOMMU_PC_IVLD_ENABLE_ENABLE BIT(0) + +#define IOMMU_DM_AUT_CTRL_REG(d) (0x0b0 + ((d) / 2) * 4) +#define IOMMU_DM_AUT_CTRL_RD_UNAVAIL(d, m) (1 << (((d & 1) * 16) + ((m) * 2))) +#define IOMMU_DM_AUT_CTRL_WR_UNAVAIL(d, m) (1 << (((d & 1) * 16) + ((m) * 2) + 1)) + +#define IOMMU_DM_AUT_OVWT_REG 0x0d0 +#define IOMMU_INT_ENABLE_REG 0x100 +#define IOMMU_INT_CLR_REG 0x104 +#define IOMMU_INT_STA_REG 0x108 +#define IOMMU_INT_ERR_ADDR_REG(i) (0x110 + (i) * 4) +#define IOMMU_INT_ERR_ADDR_L1_REG 0x130 +#define IOMMU_INT_ERR_ADDR_L2_REG 0x134 +#define IOMMU_INT_ERR_DATA_REG(i) (0x150 + (i) * 4) +#define IOMMU_L1PG_INT_REG 0x0180 +#define IOMMU_L2PG_INT_REG 0x0184 + +#define IOMMU_INT_INVALID_L2PG BIT(17) +#define IOMMU_INT_INVALID_L1PG BIT(16) +#define IOMMU_INT_MASTER_PERMISSION(m) BIT(m) +#define IOMMU_INT_MASTER_MASK (IOMMU_INT_MASTER_PERMISSION(0) | \ + IOMMU_INT_MASTER_PERMISSION(1) | \ + IOMMU_INT_MASTER_PERMISSION(2) | \ + IOMMU_INT_MASTER_PERMISSION(3) | \ + IOMMU_INT_MASTER_PERMISSION(4) | \ + IOMMU_INT_MASTER_PERMISSION(5)) +#define IOMMU_INT_MASK (IOMMU_INT_INVALID_L1PG | \ + IOMMU_INT_INVALID_L2PG | \ + IOMMU_INT_MASTER_MASK) + +#define PT_ENTRY_SIZE sizeof(u32) + +#define NUM_DT_ENTRIES 4096 +#define DT_SIZE (NUM_DT_ENTRIES * PT_ENTRY_SIZE) + +#define NUM_PT_ENTRIES 256 +#define PT_SIZE (NUM_PT_ENTRIES * PT_ENTRY_SIZE) + +struct sun50i_iommu { + struct iommu_device iommu; + + /* Lock to modify the IOMMU registers */ + spinlock_t iommu_lock; + + struct device *dev; + void __iomem *base; + struct reset_control *reset; + struct clk *clk; + + struct iommu_domain *domain; + struct iommu_group *group; + struct kmem_cache *pt_pool; +}; + +struct sun50i_iommu_domain { + struct iommu_domain domain; + + /* Number of devices attached to the domain */ + refcount_t refcnt; + + /* L1 Page Table */ + u32 *dt; + dma_addr_t dt_dma; + + struct sun50i_iommu *iommu; +}; + +static struct sun50i_iommu_domain *to_sun50i_domain(struct iommu_domain *domain) +{ + return container_of(domain, struct sun50i_iommu_domain, domain); +} + +static struct sun50i_iommu *sun50i_iommu_from_dev(struct device *dev) +{ + return dev_iommu_priv_get(dev); +} + +static u32 iommu_read(struct sun50i_iommu *iommu, u32 offset) +{ + return readl(iommu->base + offset); +} + +static void iommu_write(struct sun50i_iommu *iommu, u32 offset, u32 value) +{ + writel(value, iommu->base + offset); +} + +/* + * The Allwinner H6 IOMMU uses a 2-level page table. + * + * The first level is the usual Directory Table (DT), that consists of + * 4096 4-bytes Directory Table Entries (DTE), each pointing to a Page + * Table (PT). + * + * Each PT consits of 256 4-bytes Page Table Entries (PTE), each + * pointing to a 4kB page of physical memory. + * + * The IOMMU supports a single DT, pointed by the IOMMU_TTB_REG + * register that contains its physical address. + */ + +#define SUN50I_IOVA_DTE_MASK GENMASK(31, 20) +#define SUN50I_IOVA_PTE_MASK GENMASK(19, 12) +#define SUN50I_IOVA_PAGE_MASK GENMASK(11, 0) + +static u32 sun50i_iova_get_dte_index(dma_addr_t iova) +{ + return FIELD_GET(SUN50I_IOVA_DTE_MASK, iova); +} + +static u32 sun50i_iova_get_pte_index(dma_addr_t iova) +{ + return FIELD_GET(SUN50I_IOVA_PTE_MASK, iova); +} + +static u32 sun50i_iova_get_page_offset(dma_addr_t iova) +{ + return FIELD_GET(SUN50I_IOVA_PAGE_MASK, iova); +} + +/* + * Each Directory Table Entry has a Page Table address and a valid + * bit: + + * +---------------------+-----------+-+ + * | PT address | Reserved |V| + * +---------------------+-----------+-+ + * 31:10 - Page Table address + * 9:2 - Reserved + * 1:0 - 1 if the entry is valid + */ + +#define SUN50I_DTE_PT_ADDRESS_MASK GENMASK(31, 10) +#define SUN50I_DTE_PT_ATTRS GENMASK(1, 0) +#define SUN50I_DTE_PT_VALID 1 + +static phys_addr_t sun50i_dte_get_pt_address(u32 dte) +{ + return (phys_addr_t)dte & SUN50I_DTE_PT_ADDRESS_MASK; +} + +static bool sun50i_dte_is_pt_valid(u32 dte) +{ + return (dte & SUN50I_DTE_PT_ATTRS) == SUN50I_DTE_PT_VALID; +} + +static u32 sun50i_mk_dte(dma_addr_t pt_dma) +{ + return (pt_dma & SUN50I_DTE_PT_ADDRESS_MASK) | SUN50I_DTE_PT_VALID; +} + +/* + * Each PTE has a Page address, an authority index and a valid bit: + * + * +----------------+-----+-----+-----+---+-----+ + * | Page address | Rsv | ACI | Rsv | V | Rsv | + * +----------------+-----+-----+-----+---+-----+ + * 31:12 - Page address + * 11:8 - Reserved + * 7:4 - Authority Control Index + * 3:2 - Reserved + * 1 - 1 if the entry is valid + * 0 - Reserved + * + * The way permissions work is that the IOMMU has 16 "domains" that + * can be configured to give each masters either read or write + * permissions through the IOMMU_DM_AUT_CTRL_REG registers. The domain + * 0 seems like the default domain, and its permissions in the + * IOMMU_DM_AUT_CTRL_REG are only read-only, so it's not really + * useful to enforce any particular permission. + * + * Each page entry will then have a reference to the domain they are + * affected to, so that we can actually enforce them on a per-page + * basis. + * + * In order to make it work with the IOMMU framework, we will be using + * 4 different domains, starting at 1: RD_WR, RD, WR and NONE + * depending on the permission we want to enforce. Each domain will + * have each master setup in the same way, since the IOMMU framework + * doesn't seem to restrict page access on a per-device basis. And + * then we will use the relevant domain index when generating the page + * table entry depending on the permissions we want to be enforced. + */ + +enum sun50i_iommu_aci { + SUN50I_IOMMU_ACI_DO_NOT_USE = 0, + SUN50I_IOMMU_ACI_NONE, + SUN50I_IOMMU_ACI_RD, + SUN50I_IOMMU_ACI_WR, + SUN50I_IOMMU_ACI_RD_WR, +}; + +#define SUN50I_PTE_PAGE_ADDRESS_MASK GENMASK(31, 12) +#define SUN50I_PTE_ACI_MASK GENMASK(7, 4) +#define SUN50I_PTE_PAGE_VALID BIT(1) + +static phys_addr_t sun50i_pte_get_page_address(u32 pte) +{ + return (phys_addr_t)pte & SUN50I_PTE_PAGE_ADDRESS_MASK; +} + +static enum sun50i_iommu_aci sun50i_get_pte_aci(u32 pte) +{ + return FIELD_GET(SUN50I_PTE_ACI_MASK, pte); +} + +static bool sun50i_pte_is_page_valid(u32 pte) +{ + return pte & SUN50I_PTE_PAGE_VALID; +} + +static u32 sun50i_mk_pte(phys_addr_t page, int prot) +{ + enum sun50i_iommu_aci aci; + u32 flags = 0; + + if (prot & (IOMMU_READ | IOMMU_WRITE)) + aci = SUN50I_IOMMU_ACI_RD_WR; + else if (prot & IOMMU_READ) + aci = SUN50I_IOMMU_ACI_RD; + else if (prot & IOMMU_WRITE) + aci = SUN50I_IOMMU_ACI_WR; + else + aci = SUN50I_IOMMU_ACI_NONE; + + flags |= FIELD_PREP(SUN50I_PTE_ACI_MASK, aci); + page &= SUN50I_PTE_PAGE_ADDRESS_MASK; + return page | flags | SUN50I_PTE_PAGE_VALID; +} + +static void sun50i_table_flush(struct sun50i_iommu_domain *sun50i_domain, + void *vaddr, unsigned int count) +{ + struct sun50i_iommu *iommu = sun50i_domain->iommu; + dma_addr_t dma = virt_to_phys(vaddr); + size_t size = count * PT_ENTRY_SIZE; + + dma_sync_single_for_device(iommu->dev, dma, size, DMA_TO_DEVICE); +} + +static int sun50i_iommu_flush_all_tlb(struct sun50i_iommu *iommu) +{ + u32 reg; + int ret; + + assert_spin_locked(&iommu->iommu_lock); + + iommu_write(iommu, + IOMMU_TLB_FLUSH_REG, + IOMMU_TLB_FLUSH_PTW_CACHE | + IOMMU_TLB_FLUSH_MACRO_TLB | + IOMMU_TLB_FLUSH_MICRO_TLB(5) | + IOMMU_TLB_FLUSH_MICRO_TLB(4) | + IOMMU_TLB_FLUSH_MICRO_TLB(3) | + IOMMU_TLB_FLUSH_MICRO_TLB(2) | + IOMMU_TLB_FLUSH_MICRO_TLB(1) | + IOMMU_TLB_FLUSH_MICRO_TLB(0)); + + ret = readl_poll_timeout(iommu->base + IOMMU_TLB_FLUSH_REG, + reg, !reg, + 1, 2000); + if (ret) + dev_warn(iommu->dev, "TLB Flush timed out!\n"); + + return ret; +} + +static void sun50i_iommu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); + struct sun50i_iommu *iommu = sun50i_domain->iommu; + unsigned long flags; + + /* + * At boot, we'll have a first call into .flush_iotlb_all right after + * .probe_device, and since we link our (single) domain to our iommu in + * the .attach_device callback, we don't have that pointer set. + * + * It shouldn't really be any trouble to ignore it though since we flush + * all caches as part of the device powerup. + */ + if (!iommu) + return; + + spin_lock_irqsave(&iommu->iommu_lock, flags); + sun50i_iommu_flush_all_tlb(iommu); + spin_unlock_irqrestore(&iommu->iommu_lock, flags); +} + +static void sun50i_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + sun50i_iommu_flush_iotlb_all(domain); +} + +static int sun50i_iommu_enable(struct sun50i_iommu *iommu) +{ + struct sun50i_iommu_domain *sun50i_domain; + unsigned long flags; + int ret; + + if (!iommu->domain) + return 0; + + sun50i_domain = to_sun50i_domain(iommu->domain); + + ret = reset_control_deassert(iommu->reset); + if (ret) + return ret; + + ret = clk_prepare_enable(iommu->clk); + if (ret) + goto err_reset_assert; + + spin_lock_irqsave(&iommu->iommu_lock, flags); + + iommu_write(iommu, IOMMU_TTB_REG, sun50i_domain->dt_dma); + iommu_write(iommu, IOMMU_TLB_PREFETCH_REG, + IOMMU_TLB_PREFETCH_MASTER_ENABLE(0) | + IOMMU_TLB_PREFETCH_MASTER_ENABLE(1) | + IOMMU_TLB_PREFETCH_MASTER_ENABLE(2) | + IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) | + IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) | + IOMMU_TLB_PREFETCH_MASTER_ENABLE(5)); + iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK); + iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_NONE), + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 0) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 0) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 1) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 1) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 2) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 2) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 3) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 3) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 4) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 4) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 5) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_NONE, 5)); + + iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_RD), + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 0) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 1) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 2) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 3) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 4) | + IOMMU_DM_AUT_CTRL_WR_UNAVAIL(SUN50I_IOMMU_ACI_RD, 5)); + + iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(SUN50I_IOMMU_ACI_WR), + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 0) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 1) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 2) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 3) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 4) | + IOMMU_DM_AUT_CTRL_RD_UNAVAIL(SUN50I_IOMMU_ACI_WR, 5)); + + ret = sun50i_iommu_flush_all_tlb(iommu); + if (ret) { + spin_unlock_irqrestore(&iommu->iommu_lock, flags); + goto err_clk_disable; + } + + iommu_write(iommu, IOMMU_AUTO_GATING_REG, IOMMU_AUTO_GATING_ENABLE); + iommu_write(iommu, IOMMU_ENABLE_REG, IOMMU_ENABLE_ENABLE); + + spin_unlock_irqrestore(&iommu->iommu_lock, flags); + + return 0; + +err_clk_disable: + clk_disable_unprepare(iommu->clk); + +err_reset_assert: + reset_control_assert(iommu->reset); + + return ret; +} + +static void sun50i_iommu_disable(struct sun50i_iommu *iommu) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu->iommu_lock, flags); + + iommu_write(iommu, IOMMU_ENABLE_REG, 0); + iommu_write(iommu, IOMMU_TTB_REG, 0); + + spin_unlock_irqrestore(&iommu->iommu_lock, flags); + + clk_disable_unprepare(iommu->clk); + reset_control_assert(iommu->reset); +} + +static void *sun50i_iommu_alloc_page_table(struct sun50i_iommu *iommu, + gfp_t gfp) +{ + dma_addr_t pt_dma; + u32 *page_table; + + page_table = kmem_cache_zalloc(iommu->pt_pool, gfp); + if (!page_table) + return ERR_PTR(-ENOMEM); + + pt_dma = dma_map_single(iommu->dev, page_table, PT_SIZE, DMA_TO_DEVICE); + if (dma_mapping_error(iommu->dev, pt_dma)) { + dev_err(iommu->dev, "Couldn't map L2 Page Table\n"); + kmem_cache_free(iommu->pt_pool, page_table); + return ERR_PTR(-ENOMEM); + } + + /* We rely on the physical address and DMA address being the same */ + WARN_ON(pt_dma != virt_to_phys(page_table)); + + return page_table; +} + +static void sun50i_iommu_free_page_table(struct sun50i_iommu *iommu, + u32 *page_table) +{ + phys_addr_t pt_phys = virt_to_phys(page_table); + + dma_unmap_single(iommu->dev, pt_phys, PT_SIZE, DMA_TO_DEVICE); + kmem_cache_free(iommu->pt_pool, page_table); +} + +static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain, + dma_addr_t iova, gfp_t gfp) +{ + struct sun50i_iommu *iommu = sun50i_domain->iommu; + unsigned long flags; + u32 *page_table; + u32 *dte_addr; + u32 old_dte; + u32 dte; + + dte_addr = &sun50i_domain->dt[sun50i_iova_get_dte_index(iova)]; + dte = *dte_addr; + if (sun50i_dte_is_pt_valid(dte)) { + phys_addr_t pt_phys = sun50i_dte_get_pt_address(dte); + return (u32 *)phys_to_virt(pt_phys); + } + + page_table = sun50i_iommu_alloc_page_table(iommu, gfp); + if (IS_ERR(page_table)) + return page_table; + + dte = sun50i_mk_dte(virt_to_phys(page_table)); + old_dte = cmpxchg(dte_addr, 0, dte); + if (old_dte) { + phys_addr_t installed_pt_phys = + sun50i_dte_get_pt_address(old_dte); + u32 *installed_pt = phys_to_virt(installed_pt_phys); + u32 *drop_pt = page_table; + + page_table = installed_pt; + dte = old_dte; + sun50i_iommu_free_page_table(iommu, drop_pt); + } + + sun50i_table_flush(sun50i_domain, page_table, PT_SIZE); + sun50i_table_flush(sun50i_domain, dte_addr, 1); + + return page_table; +} + +static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); + struct sun50i_iommu *iommu = sun50i_domain->iommu; + u32 pte_index; + u32 *page_table, *pte_addr; + int ret = 0; + + page_table = sun50i_dte_get_page_table(sun50i_domain, iova, gfp); + if (IS_ERR(page_table)) { + ret = PTR_ERR(page_table); + goto out; + } + + pte_index = sun50i_iova_get_pte_index(iova); + pte_addr = &page_table[pte_index]; + if (unlikely(sun50i_pte_is_page_valid(*pte_addr))) { + phys_addr_t page_phys = sun50i_pte_get_page_address(*pte_addr); + dev_err(iommu->dev, + "iova %pad already mapped to %pa cannot remap to %pa prot: %#x\n", + &iova, &page_phys, &paddr, prot); + ret = -EBUSY; + goto out; + } + + *pte_addr = sun50i_mk_pte(paddr, prot); + sun50i_table_flush(sun50i_domain, pte_addr, 1); + +out: + return ret; +} + +static size_t sun50i_iommu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size, struct iommu_iotlb_gather *gather) +{ + struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); + struct sun50i_iommu *iommu = sun50i_domain->iommu; + phys_addr_t pt_phys; + dma_addr_t pte_dma; + u32 *pte_addr; + u32 dte; + + dte = sun50i_domain->dt[sun50i_iova_get_dte_index(iova)]; + if (!sun50i_dte_is_pt_valid(dte)) + return 0; + + pt_phys = sun50i_dte_get_pt_address(dte); + pte_addr = (u32 *)phys_to_virt(pt_phys) + sun50i_iova_get_pte_index(iova); + pte_dma = pt_phys + sun50i_iova_get_pte_index(iova) * PT_ENTRY_SIZE; + + if (!sun50i_pte_is_page_valid(*pte_addr)) + return 0; + + memset(pte_addr, 0, sizeof(*pte_addr)); + sun50i_table_flush(sun50i_domain, pte_addr, 1); + + return SZ_4K; +} + +static phys_addr_t sun50i_iommu_iova_to_phys(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); + phys_addr_t pt_phys; + u32 *page_table; + u32 dte, pte; + + dte = sun50i_domain->dt[sun50i_iova_get_dte_index(iova)]; + if (!sun50i_dte_is_pt_valid(dte)) + return 0; + + pt_phys = sun50i_dte_get_pt_address(dte); + page_table = (u32 *)phys_to_virt(pt_phys); + pte = page_table[sun50i_iova_get_pte_index(iova)]; + if (!sun50i_pte_is_page_valid(pte)) + return 0; + + return sun50i_pte_get_page_address(pte) + + sun50i_iova_get_page_offset(iova); +} + +static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type) +{ + struct sun50i_iommu_domain *sun50i_domain; + + if (type != IOMMU_DOMAIN_DMA && + type != IOMMU_DOMAIN_IDENTITY && + type != IOMMU_DOMAIN_UNMANAGED) + return NULL; + + sun50i_domain = kzalloc(sizeof(*sun50i_domain), GFP_KERNEL); + if (!sun50i_domain) + return NULL; + + if (type == IOMMU_DOMAIN_DMA && + iommu_get_dma_cookie(&sun50i_domain->domain)) + goto err_free_domain; + + sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL, + get_order(DT_SIZE)); + if (!sun50i_domain->dt) + goto err_put_cookie; + memset(sun50i_domain->dt, 0, DT_SIZE); + + refcount_set(&sun50i_domain->refcnt, 1); + + sun50i_domain->domain.geometry.aperture_start = 0; + sun50i_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32); + sun50i_domain->domain.geometry.force_aperture = true; + + return &sun50i_domain->domain; + +err_put_cookie: + if (type == IOMMU_DOMAIN_DMA) + iommu_put_dma_cookie(&sun50i_domain->domain); + +err_free_domain: + kfree(sun50i_domain); + + return NULL; +} + +static void sun50i_iommu_domain_free(struct iommu_domain *domain) +{ + struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); + + free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE)); + sun50i_domain->dt = NULL; + + iommu_put_dma_cookie(domain); + + kfree(sun50i_domain); +} + +static int sun50i_iommu_attach_domain(struct sun50i_iommu *iommu, + struct sun50i_iommu_domain *sun50i_domain) +{ + iommu->domain = &sun50i_domain->domain; + sun50i_domain->iommu = iommu; + + sun50i_domain->dt_dma = dma_map_single(iommu->dev, sun50i_domain->dt, + DT_SIZE, DMA_TO_DEVICE); + if (dma_mapping_error(iommu->dev, sun50i_domain->dt_dma)) { + dev_err(iommu->dev, "Couldn't map L1 Page Table\n"); + return -ENOMEM; + } + + return sun50i_iommu_enable(iommu); +} + +static void sun50i_iommu_detach_domain(struct sun50i_iommu *iommu, + struct sun50i_iommu_domain *sun50i_domain) +{ + unsigned int i; + + for (i = 0; i < NUM_DT_ENTRIES; i++) { + phys_addr_t pt_phys; + u32 *page_table; + u32 *dte_addr; + u32 dte; + + dte_addr = &sun50i_domain->dt[i]; + dte = *dte_addr; + if (!sun50i_dte_is_pt_valid(dte)) + continue; + + memset(dte_addr, 0, sizeof(*dte_addr)); + sun50i_table_flush(sun50i_domain, dte_addr, 1); + + pt_phys = sun50i_dte_get_pt_address(dte); + page_table = phys_to_virt(pt_phys); + sun50i_iommu_free_page_table(iommu, page_table); + } + + + sun50i_iommu_disable(iommu); + + dma_unmap_single(iommu->dev, virt_to_phys(sun50i_domain->dt), + DT_SIZE, DMA_TO_DEVICE); + + iommu->domain = NULL; +} + +static void sun50i_iommu_detach_device(struct iommu_domain *domain, + struct device *dev) +{ + struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); + struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); + + dev_dbg(dev, "Detaching from IOMMU domain\n"); + + if (iommu->domain != domain) + return; + + if (refcount_dec_and_test(&sun50i_domain->refcnt)) + sun50i_iommu_detach_domain(iommu, sun50i_domain); +} + +static int sun50i_iommu_attach_device(struct iommu_domain *domain, + struct device *dev) +{ + struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); + struct sun50i_iommu *iommu; + + iommu = sun50i_iommu_from_dev(dev); + if (!iommu) + return -ENODEV; + + dev_dbg(dev, "Attaching to IOMMU domain\n"); + + refcount_inc(&sun50i_domain->refcnt); + + if (iommu->domain == domain) + return 0; + + if (iommu->domain) + sun50i_iommu_detach_device(iommu->domain, dev); + + sun50i_iommu_attach_domain(iommu, sun50i_domain); + + return 0; +} + +static struct iommu_device *sun50i_iommu_probe_device(struct device *dev) +{ + struct sun50i_iommu *iommu; + struct iommu_group *group; + + iommu = sun50i_iommu_from_dev(dev); + if (!iommu) + return ERR_PTR(-ENODEV); + + return &iommu->iommu; +} + +static void sun50i_iommu_release_device(struct device *dev) {} + +static struct iommu_group *sun50i_iommu_device_group(struct device *dev) +{ + struct sun50i_iommu *iommu = sun50i_iommu_from_dev(dev); + + return iommu_group_ref_get(iommu->group); +} + +static int sun50i_iommu_of_xlate(struct device *dev, + struct of_phandle_args *args) +{ + struct platform_device *iommu_pdev = of_find_device_by_node(args->np); + unsigned id = args->args[0]; + + dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev)); + + return iommu_fwspec_add_ids(dev, &id, 1); +} + +static struct iommu_ops sun50i_iommu_ops = { + .pgsize_bitmap = SZ_4K, + .attach_dev = sun50i_iommu_attach_device, + .detach_dev = sun50i_iommu_detach_device, + .device_group = sun50i_iommu_device_group, + .domain_alloc = sun50i_iommu_domain_alloc, + .domain_free = sun50i_iommu_domain_free, + .flush_iotlb_all = sun50i_iommu_flush_iotlb_all, + .iotlb_sync = sun50i_iommu_iotlb_sync, + .iova_to_phys = sun50i_iommu_iova_to_phys, + .map = sun50i_iommu_map, + .of_xlate = sun50i_iommu_of_xlate, + .probe_device = sun50i_iommu_probe_device, + .release_device = sun50i_iommu_release_device, + .unmap = sun50i_iommu_unmap, +}; + +static void sun50i_iommu_report_fault(struct sun50i_iommu *iommu, + unsigned master, phys_addr_t iova, + unsigned prot) +{ + dev_err(iommu->dev, "Page fault for %pad (master %d, dir %s)\n", + &iova, master, (prot == IOMMU_FAULT_WRITE) ? "wr" : "rd"); + + if (iommu->domain) + report_iommu_fault(iommu->domain, iommu->dev, iova, prot); + else + dev_err(iommu->dev, "Page fault while iommu not attached to any domain?\n"); +} + +static phys_addr_t sun50i_iommu_handle_pt_irq(struct sun50i_iommu *iommu, + unsigned addr_reg, + unsigned blame_reg) +{ + phys_addr_t iova; + unsigned master; + u32 blame; + + assert_spin_locked(&iommu->iommu_lock); + + iova = iommu_read(iommu, addr_reg); + blame = iommu_read(iommu, blame_reg); + master = ilog2(blame & IOMMU_INT_MASTER_MASK); + + /* + * If the address is not in the page table, we can't get what + * operation triggered the fault. Assume it's a read + * operation. + */ + sun50i_iommu_report_fault(iommu, master, iova, IOMMU_FAULT_READ); + + return iova; +} + +static phys_addr_t sun50i_iommu_handle_perm_irq(struct sun50i_iommu *iommu) +{ + enum sun50i_iommu_aci aci; + phys_addr_t iova; + unsigned master; + unsigned dir; + u32 blame; + + assert_spin_locked(&iommu->iommu_lock); + + blame = iommu_read(iommu, IOMMU_INT_STA_REG); + master = ilog2(blame & IOMMU_INT_MASTER_MASK); + iova = iommu_read(iommu, IOMMU_INT_ERR_ADDR_REG(master)); + aci = sun50i_get_pte_aci(iommu_read(iommu, + IOMMU_INT_ERR_DATA_REG(master))); + + switch (aci) { + /* + * If we are in the read-only domain, then it means we + * tried to write. + */ + case SUN50I_IOMMU_ACI_RD: + dir = IOMMU_FAULT_WRITE; + break; + + /* + * If we are in the write-only domain, then it means + * we tried to read. + */ + case SUN50I_IOMMU_ACI_WR: + + /* + * If we are in the domain without any permission, we + * can't really tell. Let's default to a read + * operation. + */ + case SUN50I_IOMMU_ACI_NONE: + + /* WTF? */ + case SUN50I_IOMMU_ACI_RD_WR: + default: + dir = IOMMU_FAULT_READ; + break; + } + + /* + * If the address is not in the page table, we can't get what + * operation triggered the fault. Assume it's a read + * operation. + */ + sun50i_iommu_report_fault(iommu, master, iova, dir); + + return iova; +} + +static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id) +{ + struct sun50i_iommu *iommu = dev_id; + phys_addr_t iova; + u32 status; + + spin_lock(&iommu->iommu_lock); + + status = iommu_read(iommu, IOMMU_INT_STA_REG); + if (!(status & IOMMU_INT_MASK)) { + spin_unlock(&iommu->iommu_lock); + return IRQ_NONE; + } + + if (status & IOMMU_INT_INVALID_L2PG) + iova = sun50i_iommu_handle_pt_irq(iommu, + IOMMU_INT_ERR_ADDR_L2_REG, + IOMMU_L2PG_INT_REG); + else if (status & IOMMU_INT_INVALID_L1PG) + iova = sun50i_iommu_handle_pt_irq(iommu, + IOMMU_INT_ERR_ADDR_L1_REG, + IOMMU_L1PG_INT_REG); + else + iova = sun50i_iommu_handle_perm_irq(iommu); + + iommu_write(iommu, IOMMU_INT_CLR_REG, status); + + iommu_write(iommu, IOMMU_RESET_REG, ~status); + iommu_write(iommu, IOMMU_RESET_REG, status); + + spin_unlock(&iommu->iommu_lock); + + return IRQ_HANDLED; +} + +static int sun50i_iommu_probe(struct platform_device *pdev) +{ + struct sun50i_iommu *iommu; + int ret, irq; + + iommu = devm_kzalloc(&pdev->dev, sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return -ENOMEM; + spin_lock_init(&iommu->iommu_lock); + platform_set_drvdata(pdev, iommu); + iommu->dev = &pdev->dev; + + iommu->pt_pool = kmem_cache_create(dev_name(&pdev->dev), + PT_SIZE, PT_SIZE, + SLAB_HWCACHE_ALIGN, + NULL); + if (!iommu->pt_pool) + return -ENOMEM; + + iommu->group = iommu_group_alloc(); + if (IS_ERR(iommu->group)) { + ret = PTR_ERR(iommu->group); + goto err_free_cache; + } + + iommu->base = devm_platform_ioremap_resource(pdev, 0); + if (!iommu->base) { + ret = PTR_ERR(iommu->base); + goto err_free_group; + } + + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + ret = irq; + goto err_free_group; + } + + iommu->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(iommu->clk)) { + dev_err(&pdev->dev, "Couldn't get our clock.\n"); + ret = PTR_ERR(iommu->clk); + goto err_free_group; + } + + iommu->reset = devm_reset_control_get(&pdev->dev, NULL); + if (IS_ERR(iommu->reset)) { + dev_err(&pdev->dev, "Couldn't get our reset line.\n"); + ret = PTR_ERR(iommu->reset); + goto err_free_group; + } + + ret = iommu_device_sysfs_add(&iommu->iommu, &pdev->dev, + NULL, dev_name(&pdev->dev)); + if (ret) + goto err_free_group; + + iommu_device_set_ops(&iommu->iommu, &sun50i_iommu_ops); + iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode); + + ret = iommu_device_register(&iommu->iommu); + if (ret) + goto err_remove_sysfs; + + ret = devm_request_irq(&pdev->dev, irq, sun50i_iommu_irq, 0, + dev_name(&pdev->dev), iommu); + if (ret < 0) + goto err_unregister; + + bus_set_iommu(&platform_bus_type, &sun50i_iommu_ops); + + return 0; + +err_unregister: + iommu_device_unregister(&iommu->iommu); + +err_remove_sysfs: + iommu_device_sysfs_remove(&iommu->iommu); + +err_free_group: + iommu_group_put(iommu->group); + +err_free_cache: + kmem_cache_destroy(iommu->pt_pool); + + return ret; +} + +static const struct of_device_id sun50i_iommu_dt[] = { + { .compatible = "allwinner,sun50i-h6-iommu", }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, sun50i_iommu_dt); + +static struct platform_driver sun50i_iommu_driver = { + .driver = { + .name = "sun50i-iommu", + .of_match_table = sun50i_iommu_dt, + .suppress_bind_attrs = true, + } +}; +builtin_platform_driver_probe(sun50i_iommu_driver, sun50i_iommu_probe); + +MODULE_DESCRIPTION("Allwinner H6 IOMMU driver"); +MODULE_AUTHOR("Maxime Ripard "); +MODULE_AUTHOR("zhuxianbin "); +MODULE_LICENSE("Dual BSD/GPL"); From ab785cfa5907b3fa141438baed898849b6b43b4e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 14 May 2020 14:46:20 +0200 Subject: [PATCH 131/574] iommu/sun50i: Fix compile warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A few compile warnings show up when building this driver: CC drivers/iommu/sun50i-iommu.o drivers/iommu/sun50i-iommu.c: In function ‘sun50i_dte_get_page_table’: drivers/iommu/sun50i-iommu.c:486:16: warning: unused variable ‘flags’ [-Wunused-variable] 486 | unsigned long flags; | ^~~~~ drivers/iommu/sun50i-iommu.c: In function ‘sun50i_iommu_unmap’: drivers/iommu/sun50i-iommu.c:559:23: warning: unused variable ‘iommu’ [-Wunused-variable] 559 | struct sun50i_iommu *iommu = sun50i_domain->iommu; | ^~~~~ drivers/iommu/sun50i-iommu.c: In function ‘sun50i_iommu_probe_device’: drivers/iommu/sun50i-iommu.c:749:22: warning: unused variable ‘group’ [-Wunused-variable] 749 | struct iommu_group *group; | ^~~~~ Remove the unused variables. Signed-off-by: Joerg Roedel Cc: Maxime Ripard Link: https://lore.kernel.org/r/20200514124621.25999-1-joro@8bytes.org --- drivers/iommu/sun50i-iommu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 989d87aa4426..a52f52eff7c8 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -483,7 +483,6 @@ static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain, dma_addr_t iova, gfp_t gfp) { struct sun50i_iommu *iommu = sun50i_domain->iommu; - unsigned long flags; u32 *page_table; u32 *dte_addr; u32 old_dte; @@ -556,7 +555,6 @@ static size_t sun50i_iommu_unmap(struct iommu_domain *domain, unsigned long iova size_t size, struct iommu_iotlb_gather *gather) { struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); - struct sun50i_iommu *iommu = sun50i_domain->iommu; phys_addr_t pt_phys; dma_addr_t pte_dma; u32 *pte_addr; @@ -746,7 +744,6 @@ static int sun50i_iommu_attach_device(struct iommu_domain *domain, static struct iommu_device *sun50i_iommu_probe_device(struct device *dev) { struct sun50i_iommu *iommu; - struct iommu_group *group; iommu = sun50i_iommu_from_dev(dev); if (!iommu) From 38b91f810b6873bcdfd6ef441e9a3794a9c69101 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 14 May 2020 14:46:21 +0200 Subject: [PATCH 132/574] iommu/sun50i: Use __GFP_ZERO instead of memset() Allocate zeroed memory so there is no need to memset it to 0 in the driver. Signed-off-by: Joerg Roedel Cc: Maxime Ripard Link: https://lore.kernel.org/r/20200514124621.25999-2-joro@8bytes.org --- drivers/iommu/sun50i-iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index a52f52eff7c8..9c763d4a8e2a 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -616,11 +616,10 @@ static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type) iommu_get_dma_cookie(&sun50i_domain->domain)) goto err_free_domain; - sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL, + sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(DT_SIZE)); if (!sun50i_domain->dt) goto err_put_cookie; - memset(sun50i_domain->dt, 0, DT_SIZE); refcount_set(&sun50i_domain->refcnt, 1); From 25f9f5a2107fdc6510463be4e55012d17d83ab2b Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 14 May 2020 11:58:56 -0700 Subject: [PATCH 133/574] remoteproc: wcss: Fix arguments passed to qcom_add_glink_subdev() Recently qcom_add_glink_subdev() was extended to also take the glink_ssr identifier as an argument and I missed this while applying '8a226e2c71bb ("remoteproc: wcss: add support for rpmsg communication")'. Fixes: 8a226e2c71bb ("remoteproc: wcss: add support for rpmsg communication") Reported-by: kbuild test robot Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20200514185856.1598945-1-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_wcss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index 48d16d81f94d..88c76b9417fa 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -560,7 +560,7 @@ static int q6v5_wcss_probe(struct platform_device *pdev) if (ret) goto free_rproc; - qcom_add_glink_subdev(rproc, &wcss->glink_subdev); + qcom_add_glink_subdev(rproc, &wcss->glink_subdev, "q6wcss"); qcom_add_ssr_subdev(rproc, &wcss->ssr_subdev, "q6wcss"); ret = rproc_add(rproc); From 69cf449166987d9a041020be6422ee7bf94a7228 Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 13 May 2020 15:47:21 -0700 Subject: [PATCH 134/574] iommu: Remove functions that support private domain After moving iommu_group setup to iommu core code [1][2] and removing private domain support in vt-d [3], there are no users for functions such as iommu_request_dm_for_dev(), iommu_request_dma_domain_for_dev() and request_default_domain_for_dev(). So, remove these functions. [1] commit dce8d6964ebd ("iommu/amd: Convert to probe/release_device() call-backs") [2] commit e5d1841f18b2 ("iommu/vt-d: Convert to probe/release_device() call-backs") [3] commit 327d5b2fee91 ("iommu/vt-d: Allow 32bit devices to uses DMA domain") Signed-off-by: Sai Praneeth Prakhya Cc: Joerg Roedel Cc: Lu Baolu Link: https://lore.kernel.org/r/20200513224721.20504-1-sai.praneeth.prakhya@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 65 ------------------------------------------- include/linux/iommu.h | 12 -------- 2 files changed, 77 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4050569188be..374b34fd6fac 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2536,71 +2536,6 @@ struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, } EXPORT_SYMBOL_GPL(iommu_alloc_resv_region); -static int -request_default_domain_for_dev(struct device *dev, unsigned long type) -{ - struct iommu_domain *domain; - struct iommu_group *group; - int ret; - - /* Device must already be in a group before calling this function */ - group = iommu_group_get(dev); - if (!group) - return -EINVAL; - - mutex_lock(&group->mutex); - - ret = 0; - if (group->default_domain && group->default_domain->type == type) - goto out; - - /* Don't change mappings of existing devices */ - ret = -EBUSY; - if (iommu_group_device_count(group) != 1) - goto out; - - ret = -ENOMEM; - domain = __iommu_domain_alloc(dev->bus, type); - if (!domain) - goto out; - - /* Attach the device to the domain */ - ret = __iommu_attach_group(domain, group); - if (ret) { - iommu_domain_free(domain); - goto out; - } - - /* Make the domain the default for this group */ - if (group->default_domain) - iommu_domain_free(group->default_domain); - group->default_domain = domain; - - iommu_create_device_direct_mappings(group, dev); - - dev_info(dev, "Using iommu %s mapping\n", - type == IOMMU_DOMAIN_DMA ? "dma" : "direct"); - - ret = 0; -out: - mutex_unlock(&group->mutex); - iommu_group_put(group); - - return ret; -} - -/* Request that a device is direct mapped by the IOMMU */ -int iommu_request_dm_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_IDENTITY); -} - -/* Request that a device can't be direct mapped by the IOMMU */ -int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return request_default_domain_for_dev(dev, IOMMU_DOMAIN_DMA); -} - void iommu_set_default_passthrough(bool cmd_line) { if (cmd_line) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7cfd2dddb49d..78a26ae5c2b6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -482,8 +482,6 @@ extern void iommu_get_resv_regions(struct device *dev, struct list_head *list); extern void iommu_put_resv_regions(struct device *dev, struct list_head *list); extern void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list); -extern int iommu_request_dm_for_dev(struct device *dev); -extern int iommu_request_dma_domain_for_dev(struct device *dev); extern void iommu_set_default_passthrough(bool cmd_line); extern void iommu_set_default_translated(bool cmd_line); extern bool iommu_default_passthrough(void); @@ -804,16 +802,6 @@ static inline int iommu_get_group_resv_regions(struct iommu_group *group, return -ENODEV; } -static inline int iommu_request_dm_for_dev(struct device *dev) -{ - return -ENODEV; -} - -static inline int iommu_request_dma_domain_for_dev(struct device *dev) -{ - return -ENODEV; -} - static inline void iommu_set_default_passthrough(bool cmd_line) { } From 0a2576dae0328f6cb50cef9eee9ba760ddcc4691 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sat, 16 May 2020 19:08:29 -0700 Subject: [PATCH 135/574] oradax: convert get_user_pages() --> pin_user_pages() This code was using get_user_pages_fast(), in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages_fast() + put_page() calls to pin_user_pages_fast() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Cc: David S. Miller Cc: sparclinux@vger.kernel.org Signed-off-by: John Hubbard Signed-off-by: David S. Miller --- drivers/sbus/char/oradax.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index 8af216287a84..21b7cb6e7e70 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -410,9 +410,7 @@ static void dax_unlock_pages(struct dax_ctx *ctx, int ccb_index, int nelem) if (p) { dax_dbg("freeing page %p", p); - if (j == OUT) - set_page_dirty(p); - put_page(p); + unpin_user_pages_dirty_lock(&p, 1, j == OUT); ctx->pages[i][j] = NULL; } } @@ -425,13 +423,13 @@ static int dax_lock_page(void *va, struct page **p) dax_dbg("uva %p", va); - ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p); + ret = pin_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p); if (ret == 1) { dax_dbg("locked page %p, for VA %p", *p, va); return 0; } - dax_dbg("get_user_pages failed, va=%p, ret=%d", va, ret); + dax_dbg("pin_user_pages failed, va=%p, ret=%d", va, ret); return -1; } From 142cd25293f6a7ecbdff4fb0af17de6438d46433 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 17 May 2020 15:37:50 -0400 Subject: [PATCH 136/574] sparc64: fix misuses of access_process_vm() in genregs32_[sg]et() We do need access_process_vm() to access the target's reg_window. However, access to caller's memory (storing the result in genregs32_get(), fetching the new values in case of genregs32_set()) should be done by normal uaccess primitives. Fixes: ad4f95764040 ([SPARC64]: Fix user accesses in regset code.) Cc: stable@kernel.org Signed-off-by: Al Viro --- arch/sparc/kernel/ptrace_64.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index c9d41a96468f..3f5930bfab06 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -572,19 +572,13 @@ static int genregs32_get(struct task_struct *target, for (; count > 0 && pos < 32; count--) { if (access_process_vm(target, (unsigned long) - ®_window[pos], + ®_window[pos++], ®, sizeof(reg), FOLL_FORCE) != sizeof(reg)) return -EFAULT; - if (access_process_vm(target, - (unsigned long) u, - ®, sizeof(reg), - FOLL_FORCE | FOLL_WRITE) - != sizeof(reg)) + if (put_user(reg, u++)) return -EFAULT; - pos++; - u++; } } } @@ -684,12 +678,7 @@ static int genregs32_set(struct task_struct *target, } } else { for (; count > 0 && pos < 32; count--) { - if (access_process_vm(target, - (unsigned long) - u, - ®, sizeof(reg), - FOLL_FORCE) - != sizeof(reg)) + if (get_user(reg, u++)) return -EFAULT; if (access_process_vm(target, (unsigned long) From 46b14fc61bfa409efe95a7752a5dba361d753c66 Mon Sep 17 00:00:00 2001 From: Tero Kristo via iommu Date: Mon, 18 May 2020 14:10:57 +0300 Subject: [PATCH 137/574] iommu/omap: Add check for iommu group when no IOMMU in use Most of the devices in OMAP family of SoCs are not using IOMMU. The patch for converting the OMAP IOMMU to use generic IOMMU bus probe functionality failed to add a check for this, so add it here. Fixes: c822b37cac48 ("iommu/omap: Remove orphan_dev tracking") Reported-by: Tomi Valkeinen Signed-off-by: Tero Kristo Link: https://lore.kernel.org/r/20200518111057.23140-1-t-kristo@ti.com Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 6699fe6d9e06..5a9ba815863b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1726,6 +1726,9 @@ static struct iommu_group *omap_iommu_device_group(struct device *dev) struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; struct iommu_group *group = ERR_PTR(-EINVAL); + if (!arch_data) + return ERR_PTR(-ENODEV); + if (arch_data->iommu_dev) group = iommu_group_ref_get(arch_data->iommu_dev->group); From 3db9983e4327f773c490de2a8c66d6000561d88a Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:44 +0800 Subject: [PATCH 138/574] iommu/vt-d: Move domain helper to header Move domain helper to header to be used by SVA code. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 6 ------ include/linux/intel-iommu.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 2ff8d69ce4f8..8027f21073eb 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -441,12 +441,6 @@ static void init_translation_status(struct intel_iommu *iommu) iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED; } -/* Convert generic 'struct iommu_domain to private struct dmar_domain */ -static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) -{ - return container_of(dom, struct dmar_domain, domain); -} - static int __init intel_iommu_setup(char *str) { if (!str) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 980234ae0312..ed7171d2ae1f 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -595,6 +595,12 @@ static inline void __iommu_flush_cache( clflush_cache_range(addr, size); } +/* Convert generic struct iommu_domain to private struct dmar_domain */ +static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct dmar_domain, domain); +} + /* * 0: readable * 1: writable From 3aef9ca6a42aca8c797f867e554ad297b446439f Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:45 +0800 Subject: [PATCH 139/574] iommu/vt-d: Use a helper function to skip agaw for SL An Intel iommu domain uses 5-level page table by default. If the iommu that the domain tries to attach supports less page levels, the top level page tables should be skipped. Add a helper to do this so that it could be used in other places. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-pasid.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 22b30f10b396..d9cea3011b58 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -499,6 +499,25 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return 0; } +/* + * Skip top levels of page tables for iommu which has less agaw + * than default. Unnecessary for PT mode. + */ +static inline int iommu_skip_agaw(struct dmar_domain *domain, + struct intel_iommu *iommu, + struct dma_pte **pgd) +{ + int agaw; + + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + *pgd = phys_to_virt(dma_pte_addr(*pgd)); + if (!dma_pte_present(*pgd)) + return -EINVAL; + } + + return agaw; +} + /* * Set up the scalable mode pasid entry for second only translation type. */ @@ -522,17 +541,11 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -EINVAL; } - /* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. - */ pgd = domain->pgd; - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) { - dev_err(dev, "Invalid domain page table\n"); - return -EINVAL; - } + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err(dev, "Invalid domain page table\n"); + return -EINVAL; } pgd_val = virt_to_phys(pgd); From b0d1f8741b812352fe0e5f3b2381427085f23e19 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:46 +0800 Subject: [PATCH 140/574] iommu/vt-d: Add nested translation helper function Nested translation mode is supported in VT-d 3.0 Spec.CH 3.8. With PASID granular translation type set to 0x11b, translation result from the first level(FL) also subject to a second level(SL) page table translation. This mode is used for SVA virtualization, where FL performs guest virtual to guest physical translation and SL performs guest physical to host physical translation. This patch adds a helper function for setting up nested translation where second level comes from a domain and first level comes from a guest PGD. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 25 ------ drivers/iommu/intel-pasid.c | 174 +++++++++++++++++++++++++++++++++++- drivers/iommu/intel-pasid.h | 10 +++ include/linux/intel-iommu.h | 20 +++++ include/uapi/linux/iommu.h | 5 ++ 5 files changed, 206 insertions(+), 28 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 8027f21073eb..7e85c09eec71 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -296,31 +296,6 @@ static inline void context_clear_entry(struct context_entry *context) static struct dmar_domain *si_domain; static int hw_pass_through = 1; -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) - -/* - * This is a DMA domain allocated through the iommu domain allocation - * interface. But one or more devices belonging to this domain have - * been chosen to use a private domain. We should avoid to use the - * map/unmap/iova_to_phys APIs on it. - */ -#define DOMAIN_FLAG_LOSE_CHILDREN BIT(1) - -/* - * When VT-d works in the scalable mode, it allows DMA translation to - * happen through either first level or second level page table. This - * bit marks that the DMA translation for the domain goes through the - * first level page table, otherwise, it goes through the second level. - */ -#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2) - -/* - * Domain represents a virtual machine which demands iommu nested - * translation mode support. - */ -#define DOMAIN_FLAG_NESTING_MODE BIT(3) - #define for_each_domain_iommu(idx, domain) \ for (idx = 0; idx < g_num_of_iommus; idx++) \ if (domain->iommu_refcnt[idx]) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index d9cea3011b58..c7fa1b79eaf7 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -359,6 +359,16 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void +pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, int pasid) @@ -492,7 +502,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, 1); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); @@ -561,7 +571,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, 2); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -595,7 +605,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, 4); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); @@ -609,3 +619,161 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } + +static int +intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte, + struct iommu_gpasid_bind_data_vtd *pasid_data) +{ + /* + * Not all guest PASID table entry fields are passed down during bind, + * here we only set up the ones that are dependent on guest settings. + * Execution related bits such as NXE, SMEP are not supported. + * Other fields, such as snoop related, are set based on host needs + * regardless of guest settings. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_SRE) { + if (!ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_sre(pte); + } + + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) { + if (!ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + pasid_set_eafe(pte); + } + + /* + * Memory type is only applicable to devices inside processor coherent + * domain. Will add MTS support once coherent devices are available. + */ + if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_MTS_MASK) { + pr_warn_ratelimited("No memory type support %s\n", + iommu->name); + return -EINVAL; + } + + return 0; +} + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * This could be used for guest shared virtual address. In this case, the + * first level page tables are used for GVA-GPA translation in the guest, + * second level page tables are used for GPA-HPA translation. + * + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @gpgd: FLPTPTR: First Level Page translation pointer in GPA + * @pasid: PASID to be programmed in the device PASID table + * @pasid_data: Additional PASID info from the guest bind request + * @domain: Domain info for setting up second level page tables + * @addr_width: Address width of the first level (guest) + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + pgd_t *gpgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width) +{ + struct pasid_entry *pte; + struct dma_pte *pgd; + int ret = 0; + u64 pgd_val; + int agaw; + u16 did; + + if (!ecap_nest(iommu->ecap)) { + pr_err_ratelimited("IOMMU: %s: No nested translation support\n", + iommu->name); + return -EINVAL; + } + + if (!(domain->flags & DOMAIN_FLAG_NESTING_MODE)) { + pr_err_ratelimited("Domain is not in nesting mode, %x\n", + domain->flags); + return -EINVAL; + } + + pte = intel_pasid_get_entry(dev, pasid); + if (WARN_ON(!pte)) + return -EINVAL; + + /* + * Caller must ensure PASID entry is not in use, i.e. not bind the + * same PASID to the same device twice. + */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + + pasid_clear_entry(pte); + + /* Sanity checking performed by caller to make sure address + * width matching in two dimensions: + * 1. CPU vs. IOMMU + * 2. Guest vs. Host. + */ + switch (addr_width) { +#ifdef CONFIG_X86 + case ADDR_WIDTH_5LEVEL: + if (!cpu_feature_enabled(X86_FEATURE_LA57) || + !cap_5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + + pasid_set_flpm(pte, 1); + break; +#endif + case ADDR_WIDTH_4LEVEL: + pasid_set_flpm(pte, 0); + break; + default: + dev_err_ratelimited(dev, "Invalid guest address width %d\n", + addr_width); + return -EINVAL; + } + + /* First level PGD is in GPA, must be supported by the second level */ + if ((unsigned long long)gpgd > domain->max_addr) { + dev_err_ratelimited(dev, + "Guest PGD %llx not supported, max %llx\n", + (unsigned long long)gpgd, domain->max_addr); + return -EINVAL; + } + pasid_set_flptr(pte, (u64)gpgd); + + ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data); + if (ret) + return ret; + + /* Setup the second level based on the given domain */ + pgd = domain->pgd; + + agaw = iommu_skip_agaw(domain, iommu, &pgd); + if (agaw < 0) { + dev_err_ratelimited(dev, "Invalid domain page table\n"); + return -EINVAL; + } + pgd_val = virt_to_phys(pgd); + pasid_set_slptr(pte, pgd_val); + pasid_set_fault_enable(pte); + + did = domain->iommu_did[iommu->seq_id]; + pasid_set_domain_id(pte, did); + + pasid_set_address_width(pte, agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + pasid_flush_caches(iommu, pte, pasid, did); + + return ret; +} diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index 92de6df24ccb..ccd50c2ae75c 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -36,6 +36,7 @@ * to vmalloc or even module mappings. */ #define PASID_FLAG_SUPERVISOR_MODE BIT(0) +#define PASID_FLAG_NESTED BIT(1) /* * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first- @@ -51,6 +52,11 @@ struct pasid_entry { u64 val[8]; }; +#define PASID_ENTRY_PGTT_FL_ONLY (1) +#define PASID_ENTRY_PGTT_SL_ONLY (2) +#define PASID_ENTRY_PGTT_NESTED (3) +#define PASID_ENTRY_PGTT_PT (4) + /* The representative of a PASID table */ struct pasid_table { void *table; /* pasid table pointer */ @@ -99,6 +105,10 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, int pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, int pasid, + struct iommu_gpasid_bind_data_vtd *pasid_data, + struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index ed7171d2ae1f..e0d1fed7cbe4 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -42,6 +42,9 @@ #define DMA_FL_PTE_PRESENT BIT_ULL(0) #define DMA_FL_PTE_XD BIT_ULL(63) +#define ADDR_WIDTH_5LEVEL (57) +#define ADDR_WIDTH_4LEVEL (48) + #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THROUGH 2 @@ -480,6 +483,23 @@ struct context_entry { u64 hi; }; +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) + +/* + * When VT-d works in the scalable mode, it allows DMA translation to + * happen through either first level or second level page table. This + * bit marks that the DMA translation for the domain goes through the + * first level page table, otherwise, it goes through the second level. + */ +#define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(1) + +/* + * Domain represents a virtual machine which demands iommu nested + * translation mode support. + */ +#define DOMAIN_FLAG_NESTING_MODE BIT(2) + struct dmar_domain { int nid; /* node id */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 4ad3496e5c43..e907b7091a46 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -285,6 +285,11 @@ struct iommu_gpasid_bind_data_vtd { __u32 emt; }; +#define IOMMU_SVA_VTD_GPASID_MTS_MASK (IOMMU_SVA_VTD_GPASID_CD | \ + IOMMU_SVA_VTD_GPASID_EMTE | \ + IOMMU_SVA_VTD_GPASID_PCD | \ + IOMMU_SVA_VTD_GPASID_PWT) + /** * struct iommu_gpasid_bind_data - Information about device and guest PASID binding * @version: Version of this data structure From 56722a4398a306585ca3ed39ff54fc907af98618 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:47 +0800 Subject: [PATCH 141/574] iommu/vt-d: Add bind guest PASID support When supporting guest SVA with emulated IOMMU, the guest PASID table is shadowed in VMM. Updates to guest vIOMMU PASID table will result in PASID cache flush which will be passed down to the host as bind guest PASID calls. For the SL page tables, it will be harvested from device's default domain (request w/o PASID), or aux domain in case of mediated device. .-------------. .---------------------------. | vIOMMU | | Guest process CR3, FL only| | | '---------------------------' .----------------/ | PASID Entry |--- PASID cache flush - '-------------' | | | V | | CR3 in GPA '-------------' Guest ------| Shadow |--------------------------|-------- v v v Host .-------------. .----------------------. | pIOMMU | | Bind FL for GVA-GPA | | | '----------------------' .----------------/ | | PASID Entry | V (Nested xlate) '----------------\.------------------------------. | | |SL for GPA-HPA, default domain| | | '------------------------------' '-------------' Where: - FL = First level/stage one page tables - SL = Second level/stage two page tables Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 4 + drivers/iommu/intel-svm.c | 200 ++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 6 +- include/linux/intel-svm.h | 12 +++ 4 files changed, 221 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 7e85c09eec71..f42c548f8421 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5780,6 +5780,10 @@ const struct iommu_ops intel_iommu_ops = { .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, +#ifdef CONFIG_INTEL_IOMMU_SVM + .sva_bind_gpasid = intel_svm_bind_gpasid, + .sva_unbind_gpasid = intel_svm_unbind_gpasid, +#endif }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 2998418f0a38..7d3405c5a198 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -226,6 +226,206 @@ static LIST_HEAD(global_svm_list); list_for_each_entry((sdev), &(svm)->devs, list) \ if ((d) != (sdev)->dev) {} else +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct dmar_domain *dmar_domain; + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = 0; + + if (WARN_ON(!iommu) || !data) + return -EINVAL; + + if (data->version != IOMMU_GPASID_BIND_VERSION_1 || + data->format != IOMMU_PASID_FORMAT_INTEL_VTD) + return -EINVAL; + + if (!dev_is_pci(dev)) + return -ENOTSUPP; + + /* VT-d supports devices with full 20 bit PASIDs only */ + if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX) + return -EINVAL; + + /* + * We only check host PASID range, we have no knowledge to check + * guest PASID range. + */ + if (data->hpasid <= 0 || data->hpasid >= PASID_MAX) + return -EINVAL; + + dmar_domain = to_dmar_domain(domain); + + mutex_lock(&pasid_mutex); + svm = ioasid_find(NULL, data->hpasid, NULL); + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + if (svm) { + /* + * If we found svm for the PASID, there must be at + * least one device bond, otherwise svm should be freed. + */ + if (WARN_ON(list_empty(&svm->devs))) { + ret = -EINVAL; + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + /* + * For devices with aux domains, we should allow + * multiple bind calls with the same PASID and pdev. + */ + if (iommu_dev_feature_enabled(dev, + IOMMU_DEV_FEAT_AUX)) { + sdev->users++; + } else { + dev_warn_ratelimited(dev, + "Already bound with PASID %u\n", + svm->pasid); + ret = -EBUSY; + } + goto out; + } + } else { + /* We come here when PASID has never been bond to a device. */ + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) { + ret = -ENOMEM; + goto out; + } + /* REVISIT: upper layer/VFIO can track host process that bind + * the PASID. ioasid_set = mm might be sufficient for vfio to + * check pasid VMM ownership. We can drop the following line + * once VFIO and IOASID set check is in place. + */ + svm->mm = get_task_mm(current); + svm->pasid = data->hpasid; + if (data->flags & IOMMU_SVA_GPASID_VAL) { + svm->gpasid = data->gpasid; + svm->flags |= SVM_FLAG_GUEST_PASID; + } + ioasid_set_data(data->hpasid, svm); + INIT_LIST_HEAD_RCU(&svm->devs); + mmput(svm->mm); + } + sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); + if (!sdev) { + ret = -ENOMEM; + goto out; + } + sdev->dev = dev; + + /* Only count users if device has aux domains */ + if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) + sdev->users = 1; + + /* Set up device context entry for PASID if not enabled already */ + ret = intel_iommu_enable_pasid(iommu, sdev->dev); + if (ret) { + dev_err_ratelimited(dev, "Failed to enable PASID capability\n"); + kfree(sdev); + goto out; + } + + /* + * PASID table is per device for better security. Therefore, for + * each bind of a new device even with an existing PASID, we need to + * call the nested mode setup function here. + */ + spin_lock(&iommu->lock); + ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)data->gpgd, + data->hpasid, &data->vtd, dmar_domain, + data->addr_width); + spin_unlock(&iommu->lock); + if (ret) { + dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n", + data->hpasid, ret); + /* + * PASID entry should be in cleared state if nested mode + * set up failed. So we only need to clear IOASID tracking + * data such that free call will succeed. + */ + kfree(sdev); + goto out; + } + + svm->flags |= SVM_FLAG_GUEST_MODE; + + init_rcu_head(&sdev->rcu); + list_add_rcu(&sdev->list, &svm->devs); + out: + if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) { + ioasid_set_data(data->hpasid, NULL); + kfree(svm); + } + + mutex_unlock(&pasid_mutex); + return ret; +} + +int intel_svm_unbind_gpasid(struct device *dev, int pasid) +{ + struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = -EINVAL; + + if (WARN_ON(!iommu)) + return -EINVAL; + + mutex_lock(&pasid_mutex); + svm = ioasid_find(NULL, pasid, NULL); + if (!svm) { + ret = -EINVAL; + goto out; + } + + if (IS_ERR(svm)) { + ret = PTR_ERR(svm); + goto out; + } + + for_each_svm_dev(sdev, svm, dev) { + ret = 0; + if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) + sdev->users--; + if (!sdev->users) { + list_del_rcu(&sdev->list); + intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); + /* TODO: Drain in flight PRQ for the PASID since it + * may get reused soon, we don't want to + * confuse with its previous life. + * intel_svm_drain_prq(dev, pasid); + */ + kfree_rcu(sdev, rcu); + + if (list_empty(&svm->devs)) { + /* + * We do not free the IOASID here in that + * IOMMU driver did not allocate it. + * Unlike native SVM, IOASID for guest use was + * allocated prior to the bind call. + * In any case, if the free call comes before + * the unbind, IOMMU driver will get notified + * and perform cleanup. + */ + ioasid_set_data(pasid, NULL); + kfree(svm); + } + } + break; + } +out: + mutex_unlock(&pasid_mutex); + return ret; +} + int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e0d1fed7cbe4..3dfd426dfb03 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -698,7 +698,9 @@ struct dmar_domain *find_domain(struct device *dev); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); - +int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, + struct iommu_gpasid_bind_data *data); +int intel_svm_unbind_gpasid(struct device *dev, int pasid); struct svm_dev_ops; struct intel_svm_dev { @@ -715,9 +717,11 @@ struct intel_svm_dev { struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; + struct intel_iommu *iommu; int flags; int pasid; + int gpasid; /* In case that guest PASID is different from host PASID */ struct list_head devs; struct list_head list; }; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index d7c403d0dd27..1b47ca46373e 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -44,6 +44,18 @@ struct svm_dev_ops { * do such IOTLB flushes automatically. */ #define SVM_FLAG_SUPERVISOR_MODE (1<<1) +/* + * The SVM_FLAG_GUEST_MODE flag is used when a PASID bind is for guest + * processes. Compared to the host bind, the primary differences are: + * 1. mm life cycle management + * 2. fault reporting + */ +#define SVM_FLAG_GUEST_MODE (1<<2) +/* + * The SVM_FLAG_GUEST_PASID flag is used when a guest has its own PASID space, + * which requires guest and host PASID translation at both directions. + */ +#define SVM_FLAG_GUEST_PASID (1<<3) #ifdef CONFIG_INTEL_IOMMU_SVM From 61a06a16e36d830f7811fbf931668d87197d95b7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:48 +0800 Subject: [PATCH 142/574] iommu/vt-d: Support flushing more translation cache types When Shared Virtual Memory is exposed to a guest via vIOMMU, scalable IOTLB invalidation may be passed down from outside IOMMU subsystems. This patch adds invalidation functions that can be used for additional translation cache types. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 39 +++++++++++++++++++++++++++++++++++++ drivers/iommu/intel-pasid.c | 3 ++- include/linux/intel-iommu.h | 21 ++++++++++++++++---- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index f77dae7ba7d4..34ee8f28555f 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1421,6 +1421,45 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, qi_submit_sync(&desc, iommu); } +/* PASID-based device IOTLB Invalidate */ +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, u64 granu) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + desc.qw1 = QI_DEV_EIOTLB_GLOB(granu); + + /* + * If S bit is 0, we only flush a single page. If S bit is set, + * The least significant zero bit indicates the invalidation address + * range. VT-d spec 6.5.2.6. + * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. + * size order = 0 is PAGE_SIZE 4KB + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. + */ + desc.qw1 |= addr & ~mask; + if (size_order) + desc.qw1 |= QI_DEV_EIOTLB_SIZE; + + qi_submit_sync(&desc, iommu); +} + +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, + u64 granu, int pasid) +{ + struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; + + desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | + QI_PC_GRAN(granu) | QI_PC_TYPE; + qi_submit_sync(&desc, iommu); +} + /* * Disable Queued Invalidation interface. */ diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index c7fa1b79eaf7..5d9d9ff49334 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -375,7 +375,8 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, { struct qi_desc desc; - desc.qw0 = QI_PC_DID(did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid); + desc.qw0 = QI_PC_DID(did) | QI_PC_GRAN(QI_PC_PASID_SEL) | + QI_PC_PASID(pasid) | QI_PC_TYPE; desc.qw1 = 0; desc.qw2 = 0; desc.qw3 = 0; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 3dfd426dfb03..a9c984b29a72 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -334,7 +334,7 @@ enum { #define QI_IOTLB_GRAN(gran) (((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4)) #define QI_IOTLB_ADDR(addr) (((u64)addr) & VTD_PAGE_MASK) #define QI_IOTLB_IH(ih) (((u64)ih) << 6) -#define QI_IOTLB_AM(am) (((u8)am)) +#define QI_IOTLB_AM(am) (((u8)am) & 0x3f) #define QI_CC_FM(fm) (((u64)fm) << 48) #define QI_CC_SID(sid) (((u64)sid) << 32) @@ -353,16 +353,21 @@ enum { #define QI_PC_DID(did) (((u64)did) << 16) #define QI_PC_GRAN(gran) (((u64)gran) << 4) -#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0)) -#define QI_PC_PASID_SEL (QI_PC_TYPE | QI_PC_GRAN(1)) +/* PASID cache invalidation granu */ +#define QI_PC_ALL_PASIDS 0 +#define QI_PC_PASID_SEL 1 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_IH(ih) (((u64)ih) << 6) -#define QI_EIOTLB_AM(am) (((u64)am)) +#define QI_EIOTLB_AM(am) (((u64)am) & 0x3f) #define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32) #define QI_EIOTLB_DID(did) (((u64)did) << 16) #define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) +/* QI Dev-IOTLB inv granu */ +#define QI_DEV_IOTLB_GRAN_ALL 1 +#define QI_DEV_IOTLB_GRAN_PASID_SEL 0 + #define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) #define QI_DEV_EIOTLB_GLOB(g) ((u64)g) @@ -679,8 +684,16 @@ extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, u16 qdep, u64 addr, unsigned mask); + void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, unsigned long npages, bool ih); + +void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, u64 granu); +void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, + int pasid); + extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); From 6ee1b77ba3ac0a79fc6f3273f3b27b13240a355e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:49 +0800 Subject: [PATCH 143/574] iommu/vt-d: Add svm/sva invalidate function When Shared Virtual Address (SVA) is enabled for a guest OS via vIOMMU, we need to provide invalidation support at IOMMU API and driver level. This patch adds Intel VT-d specific function to implement iommu passdown invalidate API for shared virtual address. The use case is for supporting caching structure invalidation of assigned SVM capable devices. Emulated IOMMU exposes queue invalidation capability and passes down all descriptors from the guest to the physical IOMMU. The assumption is that guest to host device ID mapping should be resolved prior to calling IOMMU driver. Based on the device handle, host IOMMU driver can replace certain fields before submit to the invalidation queue. Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 171 ++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f42c548f8421..627bb5093317 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5242,6 +5242,176 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain, aux_domain_remove_dev(to_dmar_domain(domain), dev); } +/* + * 2D array for converting and sanitizing IOMMU generic TLB granularity to + * VT-d granularity. Invalidation is typically included in the unmap operation + * as a result of DMA or VFIO unmap. However, for assigned devices guest + * owns the first level page tables. Invalidations of translation caches in the + * guest are trapped and passed down to the host. + * + * vIOMMU in the guest will only expose first level page tables, therefore + * we do not support IOTLB granularity for request without PASID (second level). + * + * For example, to find the VT-d granularity encoding for IOTLB + * type and page selective granularity within PASID: + * X: indexed by iommu cache type + * Y: indexed by enum iommu_inv_granularity + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] + */ + +const static int +inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { + /* + * PASID based IOTLB invalidation: PASID selective (per PASID), + * page selective (address granularity) + */ + {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}, + /* PASID based dev TLBs */ + {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL}, + /* PASID cache */ + {-EINVAL, -EINVAL, -EINVAL} +}; + +static inline int to_vtd_granularity(int type, int granu) +{ + return inv_type_granu_table[type][granu]; +} + +static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules) +{ + u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT; + + /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc. + * IOMMU cache invalidate API passes granu_size in bytes, and number of + * granu size in contiguous memory. + */ + return order_base_2(nr_pages); +} + +#ifdef CONFIG_INTEL_IOMMU_SVM +static int +intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, + struct iommu_cache_invalidate_info *inv_info) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + struct intel_iommu *iommu; + unsigned long flags; + int cache_type; + u8 bus, devfn; + u16 did, sid; + int ret = 0; + u64 size = 0; + + if (!inv_info || !dmar_domain || + inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1) + return -EINVAL; + + if (!dev || !dev_is_pci(dev)) + return -ENODEV; + + iommu = device_to_iommu(dev, &bus, &devfn); + if (!iommu) + return -ENODEV; + + if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE)) + return -EINVAL; + + spin_lock_irqsave(&device_domain_lock, flags); + spin_lock(&iommu->lock); + info = dev->archdata.iommu; + if (!info) { + ret = -EINVAL; + goto out_unlock; + } + did = dmar_domain->iommu_did[iommu->seq_id]; + sid = PCI_DEVID(bus, devfn); + + /* Size is only valid in address selective invalidation */ + if (inv_info->granularity != IOMMU_INV_GRANU_PASID) + size = to_vtd_size(inv_info->addr_info.granule_size, + inv_info->addr_info.nb_granules); + + for_each_set_bit(cache_type, + (unsigned long *)&inv_info->cache, + IOMMU_CACHE_INV_TYPE_NR) { + int granu = 0; + u64 pasid = 0; + + granu = to_vtd_granularity(cache_type, inv_info->granularity); + if (granu == -EINVAL) { + pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n", + cache_type, inv_info->granularity); + break; + } + + /* + * PASID is stored in different locations based on the + * granularity. + */ + if (inv_info->granularity == IOMMU_INV_GRANU_PASID && + (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID)) + pasid = inv_info->pasid_info.pasid; + else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && + (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID)) + pasid = inv_info->addr_info.pasid; + + switch (BIT(cache_type)) { + case IOMMU_CACHE_INV_TYPE_IOTLB: + if (inv_info->granularity == IOMMU_INV_GRANU_ADDR && + size && + (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) { + pr_err_ratelimited("Address out of range, 0x%llx, size order %llu\n", + inv_info->addr_info.addr, size); + ret = -ERANGE; + goto out_unlock; + } + + /* + * If granu is PASID-selective, address is ignored. + * We use npages = -1 to indicate that. + */ + qi_flush_piotlb(iommu, did, pasid, + mm_to_dma_pfn(inv_info->addr_info.addr), + (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size, + inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF); + + /* + * Always flush device IOTLB if ATS is enabled. vIOMMU + * in the guest may assume IOTLB flush is inclusive, + * which is more efficient. + */ + if (info->ats_enabled) + qi_flush_dev_iotlb_pasid(iommu, sid, + info->pfsid, pasid, + info->ats_qdep, + inv_info->addr_info.addr, + size, granu); + break; + case IOMMU_CACHE_INV_TYPE_DEV_IOTLB: + if (info->ats_enabled) + qi_flush_dev_iotlb_pasid(iommu, sid, + info->pfsid, pasid, + info->ats_qdep, + inv_info->addr_info.addr, + size, granu); + else + pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n"); + break; + default: + dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n", + cache_type); + ret = -EINVAL; + } + } +out_unlock: + spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&device_domain_lock, flags); + + return ret; +} +#endif + static int intel_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t hpa, size_t size, int iommu_prot, gfp_t gfp) @@ -5781,6 +5951,7 @@ const struct iommu_ops intel_iommu_ops = { .def_domain_type = device_def_domain_type, .pgsize_bitmap = INTEL_IOMMU_PGSIZES, #ifdef CONFIG_INTEL_IOMMU_SVM + .cache_invalidate = intel_iommu_sva_invalidate, .sva_bind_gpasid = intel_svm_bind_gpasid, .sva_unbind_gpasid = intel_svm_unbind_gpasid, #endif From 24f27d32ab6b71dedcbbeeab8f9bdc143b539ac0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:50 +0800 Subject: [PATCH 144/574] iommu/vt-d: Enlightened PASID allocation Enabling IOMMU in a guest requires communication with the host driver for certain aspects. Use of PASID ID to enable Shared Virtual Addressing (SVA) requires managing PASID's in the host. VT-d 3.0 spec provides a Virtual Command Register (VCMD) to facilitate this. Writes to this register in the guest are trapped by vIOMMU which proxies the call to the host driver. This virtual command interface consists of a capability register, a virtual command register, and a virtual response register. Refer to section 10.4.42, 10.4.43, 10.4.44 for more information. This patch adds the enlightened PASID allocation/free interfaces via the virtual command interface. Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Signed-off-by: Jacob Pan Reviewed-by: Eric Auger Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-pasid.c | 57 +++++++++++++++++++++++++++++++++++++ drivers/iommu/intel-pasid.h | 13 ++++++++- include/linux/intel-iommu.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 5d9d9ff49334..ea8f4ef4e295 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -27,6 +27,63 @@ static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; +int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid) +{ + unsigned long flags; + u8 status_code; + int ret = 0; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + *pasid = VCMD_VRSP_RESULT_PASID(res); + break; + case VCMD_VRSP_SC_NO_PASID_AVAIL: + pr_info("IOMMU: %s: No PASID available\n", iommu->name); + ret = -ENOSPC; + break; + default: + ret = -ENODEV; + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } + + return ret; +} + +void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid) +{ + unsigned long flags; + u8 status_code; + u64 res; + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + dmar_writeq(iommu->reg + DMAR_VCMD_REG, + VCMD_CMD_OPERAND(pasid) | VCMD_CMD_FREE); + IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq, + !(res & VCMD_VRSP_IP), res); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + status_code = VCMD_VRSP_SC(res); + switch (status_code) { + case VCMD_VRSP_SC_SUCCESS: + break; + case VCMD_VRSP_SC_INVALID_PASID: + pr_info("IOMMU: %s: Invalid PASID\n", iommu->name); + break; + default: + pr_warn("IOMMU: %s: Unexpected error code %d\n", + iommu->name, status_code); + } +} + /* * Per device pasid table management: */ diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index ccd50c2ae75c..a41b09b3ffde 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -23,6 +23,16 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) +/* Virtual command interface for enlightened pasid management. */ +#define VCMD_CMD_ALLOC 0x1 +#define VCMD_CMD_FREE 0x2 +#define VCMD_VRSP_IP 0x1 +#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) +#define VCMD_VRSP_SC_SUCCESS 0 +#define VCMD_VRSP_SC_NO_PASID_AVAIL 1 +#define VCMD_VRSP_SC_INVALID_PASID 1 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 8) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. @@ -111,5 +121,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, int pasid); - +int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid); +void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid); #endif /* __INTEL_PASID_H */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a9c984b29a72..addb310b4ded 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -169,6 +169,7 @@ #define ecap_smpwc(e) (((e) >> 48) & 0x1) #define ecap_flts(e) (((e) >> 47) & 0x1) #define ecap_slts(e) (((e) >> 46) & 0x1) +#define ecap_vcs(e) (((e) >> 44) & 0x1) #define ecap_smts(e) (((e) >> 43) & 0x1) #define ecap_dit(e) ((e >> 41) & 0x1) #define ecap_pasid(e) ((e >> 40) & 0x1) From 3375303e82877552f3b2b42309e8233fe715fd9f Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:51 +0800 Subject: [PATCH 145/574] iommu/vt-d: Add custom allocator for IOASID When VT-d driver runs in the guest, PASID allocation must be performed via virtual command interface. This patch registers a custom IOASID allocator which takes precedence over the default XArray based allocator. The resulting IOASID allocation will always come from the host. This ensures that PASID namespace is system- wide. Virtual command registers are used in the guest only, to prevent vmexit cost, we cache the capability and store it during initialization. Signed-off-by: Liu, Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200516062101.29541-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 1 + drivers/iommu/intel-iommu.c | 85 +++++++++++++++++++++++++++++++++++++ include/linux/intel-iommu.h | 7 +++ 3 files changed, 93 insertions(+) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 34ee8f28555f..66af08ad10fb 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -963,6 +963,7 @@ static int map_iommu(struct intel_iommu *iommu, u64 phys_addr) warn_invalid_dmar(phys_addr, " returns all ones"); goto unmap; } + iommu->vccap = dmar_readq(iommu->reg + DMAR_VCCAP_REG); /* the registers might be more than one page */ map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap), diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 627bb5093317..80d0bd561bdd 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1726,6 +1726,9 @@ static void free_dmar_iommu(struct intel_iommu *iommu) if (ecap_prs(iommu->ecap)) intel_svm_finish_prq(iommu); } + if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap)) + ioasid_unregister_allocator(&iommu->pasid_allocator); + #endif } @@ -3038,6 +3041,85 @@ out_unmap: return ret; } +#ifdef CONFIG_INTEL_IOMMU_SVM +static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data) +{ + struct intel_iommu *iommu = data; + ioasid_t ioasid; + + if (!iommu) + return INVALID_IOASID; + /* + * VT-d virtual command interface always uses the full 20 bit + * PASID range. Host can partition guest PASID range based on + * policies but it is out of guest's control. + */ + if (min < PASID_MIN || max > intel_pasid_max_id) + return INVALID_IOASID; + + if (vcmd_alloc_pasid(iommu, &ioasid)) + return INVALID_IOASID; + + return ioasid; +} + +static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data) +{ + struct intel_iommu *iommu = data; + + if (!iommu) + return; + /* + * Sanity check the ioasid owner is done at upper layer, e.g. VFIO + * We can only free the PASID when all the devices are unbound. + */ + if (ioasid_find(NULL, ioasid, NULL)) { + pr_alert("Cannot free active IOASID %d\n", ioasid); + return; + } + vcmd_free_pasid(iommu, ioasid); +} + +static void register_pasid_allocator(struct intel_iommu *iommu) +{ + /* + * If we are running in the host, no need for custom allocator + * in that PASIDs are allocated from the host system-wide. + */ + if (!cap_caching_mode(iommu->cap)) + return; + + if (!sm_supported(iommu)) { + pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n"); + return; + } + + /* + * Register a custom PASID allocator if we are running in a guest, + * guest PASID must be obtained via virtual command interface. + * There can be multiple vIOMMUs in each guest but only one allocator + * is active. All vIOMMU allocators will eventually be calling the same + * host allocator. + */ + if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap)) + return; + + pr_info("Register custom PASID allocator\n"); + iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc; + iommu->pasid_allocator.free = intel_vcmd_ioasid_free; + iommu->pasid_allocator.pdata = (void *)iommu; + if (ioasid_register_allocator(&iommu->pasid_allocator)) { + pr_warn("Custom PASID allocator failed, scalable mode disabled\n"); + /* + * Disable scalable mode on this IOMMU if there + * is no custom allocator. Mixing SM capable vIOMMU + * and non-SM vIOMMU are not supported. + */ + intel_iommu_sm = 0; + } +} +#endif + static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; @@ -3155,6 +3237,9 @@ static int __init init_dmars(void) */ for_each_active_iommu(iommu, drhd) { iommu_flush_write_buffer(iommu); +#ifdef CONFIG_INTEL_IOMMU_SVM + register_pasid_allocator(iommu); +#endif iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index addb310b4ded..e14124f74b3a 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -195,6 +196,9 @@ #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) #define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ +/* Virtual command interface capability */ +#define vccap_pasid(v) (((v) & DMA_VCS_PAS)) /* PASID allocation */ + /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 #define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) @@ -288,6 +292,7 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ do { \ @@ -555,6 +560,7 @@ struct intel_iommu { u64 reg_size; /* size of hw register set */ u64 cap; u64 ecap; + u64 vccap; u32 gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */ raw_spinlock_t register_lock; /* protect register handling */ int seq_id; /* sequence id of the iommu */ @@ -575,6 +581,7 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif struct q_inval *qi; /* Queued invalidation info */ u32 *iommu_state; /* Store iommu states between suspend and resume.*/ From e85bb99b79ca5ad2681612a7bb22f94cc2c71866 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:52 +0800 Subject: [PATCH 146/574] iommu/vt-d: Add get_domain_info() helper Add a get_domain_info() helper to retrieve the valid per-device iommu private data. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 40 +++++++++++++++++++++++++------------ drivers/iommu/intel-pasid.c | 12 +++++------ drivers/iommu/intel-svm.c | 2 +- include/linux/intel-iommu.h | 1 + 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 80d0bd561bdd..a13b723ca38d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -365,6 +365,21 @@ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2)) +struct device_domain_info *get_domain_info(struct device *dev) +{ + struct device_domain_info *info; + + if (!dev) + return NULL; + + info = dev->archdata.iommu; + if (unlikely(info == DUMMY_DEVICE_DOMAIN_INFO || + info == DEFER_DEVICE_DOMAIN_INFO)) + return NULL; + + return info; +} + DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); @@ -2429,7 +2444,7 @@ struct dmar_domain *find_domain(struct device *dev) dev = &pci_real_dma_dev(to_pci_dev(dev))->dev; /* No lock here, assumes no domain exit in normal case */ - info = dev->archdata.iommu; + info = get_domain_info(dev); if (likely(info)) return info->domain; @@ -5012,9 +5027,8 @@ static void dmar_remove_one_dev_info(struct device *dev) unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; - if (info && info != DEFER_DEVICE_DOMAIN_INFO - && info != DUMMY_DEVICE_DOMAIN_INFO) + info = get_domain_info(dev); + if (info) __dmar_remove_one_dev_info(info); spin_unlock_irqrestore(&device_domain_lock, flags); } @@ -5104,7 +5118,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) static inline bool is_aux_domain(struct device *dev, struct iommu_domain *domain) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); return info && info->auxd_enabled && domain->type == IOMMU_DOMAIN_UNMANAGED; @@ -5113,7 +5127,7 @@ is_aux_domain(struct device *dev, struct iommu_domain *domain) static void auxiliary_link_device(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) @@ -5126,7 +5140,7 @@ static void auxiliary_link_device(struct dmar_domain *domain, static void auxiliary_unlink_device(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) @@ -5214,7 +5228,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, return; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); iommu = info->iommu; auxiliary_unlink_device(domain, dev); @@ -5404,7 +5418,7 @@ intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev, spin_lock_irqsave(&device_domain_lock, flags); spin_lock(&iommu->lock); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) { ret = -EINVAL; goto out_unlock; @@ -5768,7 +5782,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev) spin_lock(&iommu->lock); ret = -EINVAL; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_supported) goto out; @@ -5864,7 +5878,7 @@ static int intel_iommu_enable_auxd(struct device *dev) return -ENODEV; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); info->auxd_enabled = 1; spin_unlock_irqrestore(&device_domain_lock, flags); @@ -5877,7 +5891,7 @@ static int intel_iommu_disable_auxd(struct device *dev) unsigned long flags; spin_lock_irqsave(&device_domain_lock, flags); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!WARN_ON(!info)) info->auxd_enabled = 0; spin_unlock_irqrestore(&device_domain_lock, flags); @@ -5954,7 +5968,7 @@ intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) static bool intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat) { - struct device_domain_info *info = dev->archdata.iommu; + struct device_domain_info *info = get_domain_info(dev); if (feat == IOMMU_DEV_FEAT_AUX) return scalable_mode_support() && info && info->auxd_enabled; diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index ea8f4ef4e295..c46a068142b9 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -151,7 +151,7 @@ int intel_pasid_alloc_table(struct device *dev) int size; might_sleep(); - info = dev->archdata.iommu; + info = get_domain_info(dev); if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) return -EINVAL; @@ -198,7 +198,7 @@ void intel_pasid_free_table(struct device *dev) struct pasid_entry *table; int i, max_pde; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !dev_is_pci(dev) || !info->pasid_table) return; @@ -224,7 +224,7 @@ struct pasid_table *intel_pasid_get_table(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info) return NULL; @@ -235,7 +235,7 @@ int intel_pasid_get_dev_max_id(struct device *dev) { struct device_domain_info *info; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_table) return 0; @@ -256,7 +256,7 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid) return NULL; dir = pasid_table->table; - info = dev->archdata.iommu; + info = get_domain_info(dev); dir_index = pasid >> PASID_PDE_SHIFT; index = pasid & PASID_PTE_MASK; @@ -462,7 +462,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, struct device_domain_info *info; u16 sid, qdep, pfsid; - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->ats_enabled) return; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 7d3405c5a198..75a1ba4439f7 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -503,7 +503,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ goto out; } - info = dev->archdata.iommu; + info = get_domain_info(dev); if (!info || !info->pasid_supported) { kfree(sdev); goto out; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e14124f74b3a..caa179e806fc 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -714,6 +714,7 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info, void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct dmar_domain *find_domain(struct device *dev); +struct device_domain_info *get_domain_info(struct device *dev); #ifdef CONFIG_INTEL_IOMMU_SVM extern void intel_svm_check(struct intel_iommu *iommu); From 76fdd6c59532630559a2c63e8645a7033f9623c4 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:53 +0800 Subject: [PATCH 147/574] iommu/vt-d: Report SVA feature with generic flag Query Shared Virtual Address/Memory capability is a generic feature. SVA feature check is the required first step before calling iommu_sva_bind_device(). VT-d checks SVA feature enabling at per IOMMU level during this step, SVA bind device will check and enable PCI ATS, PRS, and PASID capabilities at device level. This patch reports Intel SVM as SVA feature such that generic code (e.g. Uacce [1]) can use it. [1] https://lkml.org/lkml/2020/1/15/604 Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index a13b723ca38d..ed7de7420b3c 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5944,6 +5944,14 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) return !!siov_find_pci_dvsec(to_pci_dev(dev)); } + if (feat == IOMMU_DEV_FEAT_SVA) { + struct device_domain_info *info = get_domain_info(dev); + + return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) && + info->pasid_supported && info->pri_supported && + info->ats_supported; + } + return false; } @@ -5953,6 +5961,16 @@ intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) if (feat == IOMMU_DEV_FEAT_AUX) return intel_iommu_enable_auxd(dev); + if (feat == IOMMU_DEV_FEAT_SVA) { + struct device_domain_info *info = get_domain_info(dev); + + if (!info) + return -EINVAL; + + if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) + return 0; + } + return -ENODEV; } From 064a57d7ddfc46ada02b477b91c478001b03bfa3 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Sat, 16 May 2020 14:20:54 +0800 Subject: [PATCH 148/574] iommu/vt-d: Replace intel SVM APIs with generic SVA APIs This patch is an initial step to replace Intel SVM code with the following IOMMU SVA ops: intel_svm_bind_mm() => iommu_sva_bind_device() intel_svm_unbind_mm() => iommu_sva_unbind_device() intel_svm_is_pasid_valid() => iommu_sva_get_pasid() The features below will continue to work but are not included in this patch in that they are handled mostly within the IOMMU subsystem. - IO page fault - mmu notifier Consolidation of the above will come after merging generic IOMMU sva code[1]. There should not be any changes needed for SVA users such as accelerator device drivers during this time. [1] http://jpbrucker.net/sva/ Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 3 + drivers/iommu/intel-svm.c | 124 ++++++++++++++++++++---------------- include/linux/intel-iommu.h | 6 ++ include/linux/intel-svm.h | 86 ------------------------- 4 files changed, 78 insertions(+), 141 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ed7de7420b3c..7d28ef2e6fe2 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -6071,6 +6071,9 @@ const struct iommu_ops intel_iommu_ops = { .cache_invalidate = intel_iommu_sva_invalidate, .sva_bind_gpasid = intel_svm_bind_gpasid, .sva_unbind_gpasid = intel_svm_unbind_gpasid, + .sva_bind = intel_svm_bind, + .sva_unbind = intel_svm_unbind, + .sva_get_pasid = intel_svm_get_pasid, #endif }; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 75a1ba4439f7..8b66bf45477e 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -426,13 +426,15 @@ out: return ret; } -int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) +/* Caller must hold pasid_mutex, mm reference */ +static int +intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, + struct mm_struct *mm, struct intel_svm_dev **sd) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); struct device_domain_info *info; struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; - struct mm_struct *mm = NULL; int pasid_max; int ret; @@ -449,16 +451,15 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } else pasid_max = 1 << 20; + /* Bind supervisor PASID shuld have mm = NULL */ if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap)) + if (!ecap_srs(iommu->ecap) || mm) { + pr_err("Supervisor PASID with user provided mm.\n"); return -EINVAL; - } else if (pasid) { - mm = get_task_mm(current); - BUG_ON(!mm); + } } - mutex_lock(&pasid_mutex); - if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) { + if (!(flags & SVM_FLAG_PRIVATE_PASID)) { struct intel_svm *t; list_for_each_entry(t, &global_svm_list, list) { @@ -496,9 +497,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ sdev->dev = dev; ret = intel_iommu_enable_pasid(iommu, dev); - if (ret || !pasid) { - /* If they don't actually want to assign a PASID, this is - * just an enabling check/preparation. */ + if (ret) { kfree(sdev); goto out; } @@ -597,18 +596,17 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } } list_add_rcu(&sdev->list, &svm->devs); - - success: - *pasid = svm->pasid; +success: + sdev->pasid = svm->pasid; + sdev->sva.dev = dev; + if (sd) + *sd = sdev; ret = 0; out: - mutex_unlock(&pasid_mutex); - if (mm) - mmput(mm); return ret; } -EXPORT_SYMBOL_GPL(intel_svm_bind_mm); +/* Caller must hold pasid_mutex */ int intel_svm_unbind_mm(struct device *dev, int pasid) { struct intel_svm_dev *sdev; @@ -616,7 +614,6 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) struct intel_svm *svm; int ret = -EINVAL; - mutex_lock(&pasid_mutex); iommu = intel_svm_device_to_iommu(dev); if (!iommu) goto out; @@ -662,45 +659,9 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) break; } out: - mutex_unlock(&pasid_mutex); return ret; } -EXPORT_SYMBOL_GPL(intel_svm_unbind_mm); - -int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - struct intel_iommu *iommu; - struct intel_svm *svm; - int ret = -EINVAL; - - mutex_lock(&pasid_mutex); - iommu = intel_svm_device_to_iommu(dev); - if (!iommu) - goto out; - - svm = ioasid_find(NULL, pasid, NULL); - if (!svm) - goto out; - - if (IS_ERR(svm)) { - ret = PTR_ERR(svm); - goto out; - } - /* init_mm is used in this case */ - if (!svm->mm) - ret = 1; - else if (atomic_read(&svm->mm->mm_users) > 0) - ret = 1; - else - ret = 0; - - out: - mutex_unlock(&pasid_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid); /* Page request queue descriptor */ struct page_req_dsc { @@ -894,3 +855,56 @@ static irqreturn_t prq_event_thread(int irq, void *d) return IRQ_RETVAL(handled); } + +#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) +struct iommu_sva * +intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +{ + struct iommu_sva *sva = ERR_PTR(-EINVAL); + struct intel_svm_dev *sdev = NULL; + int flags = 0; + int ret; + + /* + * TODO: Consolidate with generic iommu-sva bind after it is merged. + * It will require shared SVM data structures, i.e. combine io_mm + * and intel_svm etc. + */ + if (drvdata) + flags = *(int *)drvdata; + mutex_lock(&pasid_mutex); + ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev); + if (ret) + sva = ERR_PTR(ret); + else if (sdev) + sva = &sdev->sva; + else + WARN(!sdev, "SVM bind succeeded with no sdev!\n"); + + mutex_unlock(&pasid_mutex); + + return sva; +} + +void intel_svm_unbind(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + intel_svm_unbind_mm(sdev->dev, sdev->pasid); + mutex_unlock(&pasid_mutex); +} + +int intel_svm_get_pasid(struct iommu_sva *sva) +{ + struct intel_svm_dev *sdev; + int pasid; + + mutex_lock(&pasid_mutex); + sdev = to_intel_svm_dev(sva); + pasid = sdev->pasid; + mutex_unlock(&pasid_mutex); + + return pasid; +} diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index caa179e806fc..42245e1e1b48 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -723,6 +723,10 @@ extern int intel_svm_finish_prq(struct intel_iommu *iommu); int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct iommu_gpasid_bind_data *data); int intel_svm_unbind_gpasid(struct device *dev, int pasid); +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, + void *drvdata); +void intel_svm_unbind(struct iommu_sva *handle); +int intel_svm_get_pasid(struct iommu_sva *handle); struct svm_dev_ops; struct intel_svm_dev { @@ -730,6 +734,8 @@ struct intel_svm_dev { struct rcu_head rcu; struct device *dev; struct svm_dev_ops *ops; + struct iommu_sva sva; + int pasid; int users; u16 did; u16 dev_iotlb:1; diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 1b47ca46373e..c9e7e601950d 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -21,7 +21,6 @@ struct svm_dev_ops { #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) - /* * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" * PASID for the current process. Even if a PASID already exists, a new one @@ -57,89 +56,4 @@ struct svm_dev_ops { */ #define SVM_FLAG_GUEST_PASID (1<<3) -#ifdef CONFIG_INTEL_IOMMU_SVM - -/** - * intel_svm_bind_mm() - Bind the current process to a PASID - * @dev: Device to be granted access - * @pasid: Address for allocated PASID - * @flags: Flags. Later for requesting supervisor mode, etc. - * @ops: Callbacks to device driver - * - * This function attempts to enable PASID support for the given device. - * If the @pasid argument is non-%NULL, a PASID is allocated for access - * to the MM of the current process. - * - * By using a %NULL value for the @pasid argument, this function can - * be used to simply validate that PASID support is available for the - * given device — i.e. that it is behind an IOMMU which has the - * requisite support, and is enabled. - * - * Page faults are handled transparently by the IOMMU code, and there - * should be no need for the device driver to be involved. If a page - * fault cannot be handled (i.e. is an invalid address rather than - * just needs paging in), then the page request will be completed by - * the core IOMMU code with appropriate status, and the device itself - * can then report the resulting fault to its driver via whatever - * mechanism is appropriate. - * - * Multiple calls from the same process may result in the same PASID - * being re-used. A reference count is kept. - */ -extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, - struct svm_dev_ops *ops); - -/** - * intel_svm_unbind_mm() - Unbind a specified PASID - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be unbound - * - * This function allows a PASID to be retired when the device no - * longer requires access to the address space of a given process. - * - * If the use count for the PASID in question reaches zero, the - * PASID is revoked and may no longer be used by hardware. - * - * Device drivers are required to ensure that no access (including - * page requests) is currently outstanding for the PASID in question, - * before calling this function. - */ -extern int intel_svm_unbind_mm(struct device *dev, int pasid); - -/** - * intel_svm_is_pasid_valid() - check if pasid is valid - * @dev: Device for which PASID was allocated - * @pasid: PASID value to be checked - * - * This function checks if the specified pasid is still valid. A - * valid pasid means the backing mm is still having a valid user. - * For kernel callers init_mm is always valid. for other mm, if mm->mm_users - * is non-zero, it is valid. - * - * returns -EINVAL if invalid pasid, 0 if pasid ref count is invalid - * 1 if pasid is valid. - */ -extern int intel_svm_is_pasid_valid(struct device *dev, int pasid); - -#else /* CONFIG_INTEL_IOMMU_SVM */ - -static inline int intel_svm_bind_mm(struct device *dev, int *pasid, - int flags, struct svm_dev_ops *ops) -{ - return -ENOSYS; -} - -static inline int intel_svm_unbind_mm(struct device *dev, int pasid) -{ - BUG(); -} - -static inline int intel_svm_is_pasid_valid(struct device *dev, int pasid) -{ - return -EINVAL; -} -#endif /* CONFIG_INTEL_IOMMU_SVM */ - -#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL)) - #endif /* __INTEL_SVM_H__ */ From 8a1d824625402b3ef3c3e5965663354ff0394d86 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:55 +0800 Subject: [PATCH 149/574] iommu/vt-d: Multiple descriptors per qi_submit_sync() Current qi_submit_sync() only supports single invalidation descriptor per submission and appends wait descriptor after each submission to poll the hardware completion. This extends the qi_submit_sync() helper to support multiple descriptors, and add an option so that the caller could specify the Page-request Drain (PD) bit in the wait descriptor. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 63 +++++++++++++++++------------ drivers/iommu/intel-pasid.c | 4 +- drivers/iommu/intel-svm.c | 6 +-- drivers/iommu/intel_irq_remapping.c | 2 +- include/linux/intel-iommu.h | 9 ++++- 5 files changed, 52 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 66af08ad10fb..60a2970c37ff 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1157,12 +1157,11 @@ static inline void reclaim_free_desc(struct q_inval *qi) } } -static int qi_check_fault(struct intel_iommu *iommu, int index) +static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index) { u32 fault; int head, tail; struct q_inval *qi = iommu->qi; - int wait_index = (index + 1) % QI_LENGTH; int shift = qi_shift(iommu); if (qi->desc_status[wait_index] == QI_ABORT) @@ -1225,17 +1224,21 @@ static int qi_check_fault(struct intel_iommu *iommu, int index) } /* - * Submit the queued invalidation descriptor to the remapping - * hardware unit and wait for its completion. + * Function to submit invalidation descriptors of all types to the queued + * invalidation interface(QI). Multiple descriptors can be submitted at a + * time, a wait descriptor will be appended to each submission to ensure + * hardware has completed the invalidation before return. Wait descriptors + * can be part of the submission but it will not be polled for completion. */ -int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu) +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options) { - int rc; struct q_inval *qi = iommu->qi; - int offset, shift, length; struct qi_desc wait_desc; int wait_index, index; unsigned long flags; + int offset, shift; + int rc, i; if (!qi) return 0; @@ -1244,32 +1247,41 @@ restart: rc = 0; raw_spin_lock_irqsave(&qi->q_lock, flags); - while (qi->free_cnt < 3) { + /* + * Check if we have enough empty slots in the queue to submit, + * the calculation is based on: + * # of desc + 1 wait desc + 1 space between head and tail + */ + while (qi->free_cnt < count + 2) { raw_spin_unlock_irqrestore(&qi->q_lock, flags); cpu_relax(); raw_spin_lock_irqsave(&qi->q_lock, flags); } index = qi->free_head; - wait_index = (index + 1) % QI_LENGTH; + wait_index = (index + count) % QI_LENGTH; shift = qi_shift(iommu); - length = 1 << shift; - qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE; + for (i = 0; i < count; i++) { + offset = ((index + i) % QI_LENGTH) << shift; + memcpy(qi->desc + offset, &desc[i], 1 << shift); + qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE; + } + qi->desc_status[wait_index] = QI_IN_USE; - offset = index << shift; - memcpy(qi->desc + offset, desc, length); wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE; + if (options & QI_OPT_WAIT_DRAIN) + wait_desc.qw0 |= QI_IWD_PRQ_DRAIN; wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]); wait_desc.qw2 = 0; wait_desc.qw3 = 0; offset = wait_index << shift; - memcpy(qi->desc + offset, &wait_desc, length); + memcpy(qi->desc + offset, &wait_desc, 1 << shift); - qi->free_head = (qi->free_head + 2) % QI_LENGTH; - qi->free_cnt -= 2; + qi->free_head = (qi->free_head + count + 1) % QI_LENGTH; + qi->free_cnt -= count + 1; /* * update the HW tail register indicating the presence of @@ -1285,7 +1297,7 @@ restart: * a deadlock where the interrupt context can wait indefinitely * for free slots in the queue. */ - rc = qi_check_fault(iommu, index); + rc = qi_check_fault(iommu, index, wait_index); if (rc) break; @@ -1294,7 +1306,8 @@ restart: raw_spin_lock(&qi->q_lock); } - qi->desc_status[index] = QI_DONE; + for (i = 0; i < count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1318,7 +1331,7 @@ void qi_global_iec(struct intel_iommu *iommu) desc.qw3 = 0; /* should never fail */ - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, @@ -1332,7 +1345,7 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, @@ -1356,7 +1369,7 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, @@ -1378,7 +1391,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* PASID-based IOTLB invalidation */ @@ -1419,7 +1432,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, QI_EIOTLB_AM(mask); } - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* PASID-based device IOTLB Invalidate */ @@ -1448,7 +1461,7 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (size_order) desc.qw1 |= QI_DEV_EIOTLB_SIZE; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, @@ -1458,7 +1471,7 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_TYPE; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } /* diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index c46a068142b9..45e9b5b291bc 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -438,7 +438,7 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void @@ -452,7 +452,7 @@ iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, iommu); + qi_submit_sync(iommu, &desc, 1, 0); } static void diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 8b66bf45477e..5133b2d4428f 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -138,7 +138,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); + qi_submit_sync(svm->iommu, &desc, 1, 0); if (sdev->dev_iotlb) { desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | @@ -162,7 +162,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(&desc, svm->iommu); + qi_submit_sync(svm->iommu, &desc, 1, 0); } } @@ -846,7 +846,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) sizeof(req->priv_data)); resp.qw2 = 0; resp.qw3 = 0; - qi_submit_sync(&resp, iommu); + qi_submit_sync(iommu, &resp, 1, 0); } head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 81e43c1df7ec..a042f123b091 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -151,7 +151,7 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) desc.qw2 = 0; desc.qw3 = 0; - return qi_submit_sync(&desc, iommu); + return qi_submit_sync(iommu, &desc, 1, 0); } static int modify_irte(struct irq_2_iommu *irq_iommu, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 42245e1e1b48..677dee59e3c0 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -333,6 +333,7 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) #define QI_IOTLB_DR(dr) (((u64)dr) << 7) @@ -702,7 +703,13 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int pasid); -extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); +int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, + unsigned int count, unsigned long options); +/* + * Options used in qi_submit_sync: + * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. + */ +#define QI_OPT_WAIT_DRAIN BIT(0) extern int dmar_ir_support(void); From 4c0fa5bfca7eba479002f0a1ecd1bf7631b2f5da Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:56 +0800 Subject: [PATCH 150/574] iommu/vt-d: debugfs: Add support to show inv queue internals Export invalidation queue internals of each iommu device through the debugfs. Example of such dump on a Skylake machine: $ sudo cat /sys/kernel/debug/iommu/intel/invalidation_queue Invalidation queue on IOMMU: dmar1 Base: 0x1672c9000 Head: 80 Tail: 80 Index qw0 qw1 status 0 0000000000000004 0000000000000000 0000000000000000 1 0000000200000025 00000001672be804 0000000000000000 2 0000000000000011 0000000000000000 0000000000000000 3 0000000200000025 00000001672be80c 0000000000000000 4 00000000000000d2 0000000000000000 0000000000000000 5 0000000200000025 00000001672be814 0000000000000000 6 0000000000000014 0000000000000000 0000000000000000 7 0000000200000025 00000001672be81c 0000000000000000 8 0000000000000014 0000000000000000 0000000000000000 9 0000000200000025 00000001672be824 0000000000000000 Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-14-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu-debugfs.c | 62 +++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/iommu/intel-iommu-debugfs.c b/drivers/iommu/intel-iommu-debugfs.c index 3eb1fe240fb0..cf1ebb98e418 100644 --- a/drivers/iommu/intel-iommu-debugfs.c +++ b/drivers/iommu/intel-iommu-debugfs.c @@ -372,6 +372,66 @@ static int domain_translation_struct_show(struct seq_file *m, void *unused) } DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); +static void invalidation_queue_entry_show(struct seq_file *m, + struct intel_iommu *iommu) +{ + int index, shift = qi_shift(iommu); + struct qi_desc *desc; + int offset; + + if (ecap_smts(iommu->ecap)) + seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tqw2\t\t\tqw3\t\t\tstatus\n"); + else + seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tstatus\n"); + + for (index = 0; index < QI_LENGTH; index++) { + offset = index << shift; + desc = iommu->qi->desc + offset; + if (ecap_smts(iommu->ecap)) + seq_printf(m, "%5d\t%016llx\t%016llx\t%016llx\t%016llx\t%016x\n", + index, desc->qw0, desc->qw1, + desc->qw2, desc->qw3, + iommu->qi->desc_status[index]); + else + seq_printf(m, "%5d\t%016llx\t%016llx\t%016x\n", + index, desc->qw0, desc->qw1, + iommu->qi->desc_status[index]); + } +} + +static int invalidation_queue_show(struct seq_file *m, void *unused) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + unsigned long flags; + struct q_inval *qi; + int shift; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + qi = iommu->qi; + shift = qi_shift(iommu); + + if (!qi || !ecap_qis(iommu->ecap)) + continue; + + seq_printf(m, "Invalidation queue on IOMMU: %s\n", iommu->name); + + raw_spin_lock_irqsave(&qi->q_lock, flags); + seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n", + (u64)virt_to_phys(qi->desc), + dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift, + dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift); + invalidation_queue_entry_show(m, iommu); + raw_spin_unlock_irqrestore(&qi->q_lock, flags); + seq_putc(m, '\n'); + } + rcu_read_unlock(); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(invalidation_queue); + #ifdef CONFIG_IRQ_REMAP static void ir_tbl_remap_entry_show(struct seq_file *m, struct intel_iommu *iommu) @@ -490,6 +550,8 @@ void __init intel_iommu_debugfs_init(void) debugfs_create_file("domain_translation_struct", 0444, intel_iommu_debug, NULL, &domain_translation_struct_fops); + debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug, + NULL, &invalidation_queue_fops); #ifdef CONFIG_IRQ_REMAP debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, NULL, &ir_translation_struct_fops); From 37e91bd4b39922b31ca4e6c4eabb0d7140b14e74 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:57 +0800 Subject: [PATCH 151/574] iommu/vt-d: Disable non-recoverable fault processing before unbind When a PASID is used for SVA by the device, it's possible that the PASID entry is cleared before the device flushes all ongoing DMA requests. The IOMMU should tolerate and ignore the non-recoverable faults caused by the untranslated requests from this device. For example, when an exception happens, the process terminates before the device driver stops DMA and call IOMMU driver to unbind PASID. The flow of process exist is as follows: do_exit() { exit_mm() { mm_put(); exit_mmap() { intel_invalidate_range() //mmu notifier tlb_finish_mmu() mmu_notifier_release(mm) { intel_iommu_release() { [2] intel_iommu_teardown_pasid(); intel_iommu_flush_tlbs(); } } unmap_vmas(); free_pgtables(); }; } exit_files(tsk) { close_files() { dsa_close(); [1] dsa_stop_dma(); intel_svm_unbind_pasid(); } } } Care must be taken on VT-d to avoid unrecoverable faults between the time window of [1] and [2]. [Process exist flow was contributed by Jacob Pan.] Intel VT-d provides such function through the FPD bit of the PASID entry. This sets FPD bit when PASID entry is changing from present to nonpresent in the mm notifier and will clear it when the pasid is unbound. Signed-off-by: Lu Baolu Reviewed-by: Jacob Pan Link: https://lore.kernel.org/r/20200516062101.29541-15-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 4 ++-- drivers/iommu/intel-pasid.c | 26 +++++++++++++++++++++----- drivers/iommu/intel-pasid.h | 4 +++- drivers/iommu/intel-svm.c | 9 ++++++--- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 7d28ef2e6fe2..3c5cc3424e90 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5005,7 +5005,7 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) if (info->dev) { if (dev_is_pci(info->dev) && sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, info->dev, - PASID_RID2PASID); + PASID_RID2PASID, false); iommu_disable_dev_iotlb(info); domain_context_clear(iommu, info->dev); @@ -5234,7 +5234,7 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, auxiliary_unlink_device(domain, dev); spin_lock(&iommu->lock); - intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid); + intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); domain_detach_iommu(domain, iommu); spin_unlock(&iommu->lock); diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 45e9b5b291bc..25d749830500 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -292,7 +292,20 @@ static inline void pasid_clear_entry(struct pasid_entry *pe) WRITE_ONCE(pe->val[7], 0); } -static void intel_pasid_clear_entry(struct device *dev, int pasid) +static inline void pasid_clear_entry_with_fpd(struct pasid_entry *pe) +{ + WRITE_ONCE(pe->val[0], PASID_PTE_FPD); + WRITE_ONCE(pe->val[1], 0); + WRITE_ONCE(pe->val[2], 0); + WRITE_ONCE(pe->val[3], 0); + WRITE_ONCE(pe->val[4], 0); + WRITE_ONCE(pe->val[5], 0); + WRITE_ONCE(pe->val[6], 0); + WRITE_ONCE(pe->val[7], 0); +} + +static void +intel_pasid_clear_entry(struct device *dev, int pasid, bool fault_ignore) { struct pasid_entry *pe; @@ -300,7 +313,10 @@ static void intel_pasid_clear_entry(struct device *dev, int pasid) if (WARN_ON(!pe)) return; - pasid_clear_entry(pe); + if (fault_ignore && pasid_pte_is_present(pe)) + pasid_clear_entry_with_fpd(pe); + else + pasid_clear_entry(pe); } static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) @@ -473,8 +489,8 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, qi_flush_dev_iotlb(iommu, sid, pfsid, qdep, 0, 64 - VTD_PAGE_SHIFT); } -void intel_pasid_tear_down_entry(struct intel_iommu *iommu, - struct device *dev, int pasid) +void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, + int pasid, bool fault_ignore) { struct pasid_entry *pte; u16 did; @@ -484,7 +500,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, return; did = pasid_get_domain_id(pte); - intel_pasid_clear_entry(dev, pasid); + intel_pasid_clear_entry(dev, pasid, fault_ignore); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index a41b09b3ffde..c5318d40e0fa 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -15,6 +15,7 @@ #define PASID_MAX 0x100000 #define PASID_PTE_MASK 0x3F #define PASID_PTE_PRESENT 1 +#define PASID_PTE_FPD 2 #define PDE_PFN_MASK PAGE_MASK #define PASID_PDE_SHIFT 6 #define MAX_NR_PASID_BITS 20 @@ -120,7 +121,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct iommu_gpasid_bind_data_vtd *pasid_data, struct dmar_domain *domain, int addr_width); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, - struct device *dev, int pasid); + struct device *dev, int pasid, + bool fault_ignore); int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid); void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid); #endif /* __INTEL_PASID_H */ diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 5133b2d4428f..960a3610e852 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -207,7 +207,8 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) */ rcu_read_lock(); list_for_each_entry_rcu(sdev, &svm->devs, list) { - intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid); + intel_pasid_tear_down_entry(svm->iommu, sdev->dev, + svm->pasid, true); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); } rcu_read_unlock(); @@ -396,7 +397,8 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) sdev->users--; if (!sdev->users) { list_del_rcu(&sdev->list); - intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_pasid_tear_down_entry(iommu, dev, + svm->pasid, false); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); /* TODO: Drain in flight PRQ for the PASID since it * may get reused soon, we don't want to @@ -639,7 +641,8 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) * to use. We have a *shared* PASID table, because it's * large and has to be physically contiguous. So it's * hard to be as defensive as we might like. */ - intel_pasid_tear_down_entry(iommu, dev, svm->pasid); + intel_pasid_tear_down_entry(iommu, dev, + svm->pasid, false); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); From 66ac4db36f4c12a3b02db864ccb0801cd938b6de Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:58 +0800 Subject: [PATCH 152/574] iommu/vt-d: Add page request draining support When a PASID is stopped or terminated, there can be pending PRQs (requests that haven't received responses) in remapping hardware. This adds the interface to drain page requests and call it when a PASID is terminated. Signed-off-by: Jacob Pan Signed-off-by: Liu Yi L Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-16-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-svm.c | 107 ++++++++++++++++++++++++++++++++++-- include/linux/intel-iommu.h | 4 ++ 2 files changed, 106 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 960a3610e852..5ab71107afd5 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -23,6 +23,7 @@ #include "intel-pasid.h" static irqreturn_t prq_event_thread(int irq, void *d); +static void intel_svm_drain_prq(struct device *dev, int pasid); #define PRQ_ORDER 0 @@ -66,6 +67,8 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + init_completion(&iommu->prq_complete); + return 0; } @@ -399,12 +402,8 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) list_del_rcu(&sdev->list); intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - /* TODO: Drain in flight PRQ for the PASID since it - * may get reused soon, we don't want to - * confuse with its previous life. - * intel_svm_drain_prq(dev, pasid); - */ kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { @@ -643,6 +642,7 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) * hard to be as defensive as we might like. */ intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); + intel_svm_drain_prq(dev, svm->pasid); intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); @@ -721,6 +721,93 @@ static bool is_canonical_address(u64 addr) return (((saddr << shift) >> shift) == saddr); } +/** + * intel_svm_drain_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +static void intel_svm_drain_prq(struct device *dev, int pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + struct pci_dev *pdev; + int head, tail; + u16 sid, did; + int qdep; + + info = get_domain_info(dev); + if (WARN_ON(!info || !dev_is_pci(dev))) + return; + + if (!info->pri_enabled) + return; + + iommu = info->iommu; + domain = info->domain; + pdev = to_pci_dev(dev); + sid = PCI_DEVID(info->bus, info->devfn); + did = domain->iommu_did[iommu->seq_id]; + qdep = pci_ats_queue_depth(pdev); + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. + */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (!req->pasid_present || req->pasid != pasid) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + desc[1].qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | + QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | + QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(info->pfsid); +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_iommu *iommu = d; @@ -856,6 +943,16 @@ static irqreturn_t prq_event_thread(int irq, void *d) dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. + */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + return IRQ_RETVAL(handled); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 677dee59e3c0..21633cee6331 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -292,6 +292,8 @@ /* PRS_REG */ #define DMA_PRS_PPR ((u32)1) +#define DMA_PRS_PRO ((u32)2) + #define DMA_VCS_PAS ((u64)1) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ @@ -333,6 +335,7 @@ enum { #define QI_IWD_STATUS_DATA(d) (((u64)d) << 32) #define QI_IWD_STATUS_WRITE (((u64)1) << 5) +#define QI_IWD_FENCE (((u64)1) << 6) #define QI_IWD_PRQ_DRAIN (((u64)1) << 7) #define QI_IOTLB_DID(did) (((u64)did) << 16) @@ -582,6 +585,7 @@ struct intel_iommu { #ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ + struct completion prq_complete; struct ioasid_allocator_ops pasid_allocator; /* Custom allocator for PASIDs */ #endif struct q_inval *qi; /* Queued invalidation info */ From 81ebd91a436b87158b2ab6c71a51395316b147dc Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:20:59 +0800 Subject: [PATCH 153/574] iommu/vt-d: Remove redundant IOTLB flush IOTLB flush already included in the PASID tear down and the page request drain process. There is no need to flush again. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20200516062101.29541-17-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-svm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 5ab71107afd5..42f916b9667e 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -209,11 +209,9 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * *has* to handle gracefully without affecting other processes. */ rcu_read_lock(); - list_for_each_entry_rcu(sdev, &svm->devs, list) { + list_for_each_entry_rcu(sdev, &svm->devs, list) intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid, true); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); - } rcu_read_unlock(); } @@ -403,7 +401,6 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid) intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); intel_svm_drain_prq(dev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { @@ -643,7 +640,6 @@ int intel_svm_unbind_mm(struct device *dev, int pasid) intel_pasid_tear_down_entry(iommu, dev, svm->pasid, false); intel_svm_drain_prq(dev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0); kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { From 7482fd59259a7f23e191c14b8a126a0f6981b3e4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 16 May 2020 14:21:00 +0800 Subject: [PATCH 154/574] iommu/vt-d: Remove duplicated check in intel_svm_bind_mm() The info and info->pasid_support have already been checked in previous intel_iommu_enable_pasid() call. No need to check again. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-18-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-svm.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 42f916b9667e..11366dc91971 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -501,11 +501,6 @@ intel_svm_bind_mm(struct device *dev, int flags, struct svm_dev_ops *ops, } info = get_domain_info(dev); - if (!info || !info->pasid_supported) { - kfree(sdev); - goto out; - } - sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); if (info->ats_enabled) { From e70b081c6f376471d7a9fee69e12e8f05ac2925d Mon Sep 17 00:00:00 2001 From: Tom Murphy Date: Sat, 16 May 2020 14:21:01 +0800 Subject: [PATCH 155/574] iommu/vt-d: Remove IOVA handling code from the non-dma_ops path There's no need for the non-dma_ops path to keep track of IOVAs. The whole point of the non-dma_ops path is that it allows the IOVAs to be handled separately. The IOVA handling code removed in this patch is pointless. Signed-off-by: Tom Murphy Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200516062101.29541-19-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 95 +++++++++++++------------------------ 1 file changed, 32 insertions(+), 63 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 3c5cc3424e90..f75d7d9c231f 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1892,11 +1892,6 @@ static int dmar_init_reserved_ranges(void) return 0; } -static void domain_reserve_special_ranges(struct dmar_domain *domain) -{ - copy_reserved_iova(&reserved_iova_list, &domain->iovad); -} - static inline int guestwidth_to_adjustwidth(int gaw) { int agaw; @@ -1918,7 +1913,8 @@ static void domain_exit(struct dmar_domain *domain) domain_remove_dev_info(domain); /* destroy iovas */ - put_iova_domain(&domain->iovad); + if (domain->domain.type == IOMMU_DOMAIN_DMA) + put_iova_domain(&domain->iovad); if (domain->pgd) { struct page *freelist; @@ -2627,19 +2623,9 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, } static int iommu_domain_identity_map(struct dmar_domain *domain, - unsigned long long start, - unsigned long long end) + unsigned long first_vpfn, + unsigned long last_vpfn) { - unsigned long first_vpfn = start >> VTD_PAGE_SHIFT; - unsigned long last_vpfn = end >> VTD_PAGE_SHIFT; - - if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn), - dma_to_mm_pfn(last_vpfn))) { - pr_err("Reserving iova failed\n"); - return -ENOMEM; - } - - pr_debug("Mapping reserved region %llx-%llx\n", start, end); /* * RMRR range might have overlap with physical memory range, * clear it first @@ -2677,7 +2663,8 @@ static int __init si_domain_init(int hw) for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { ret = iommu_domain_identity_map(si_domain, - PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + mm_to_dma_pfn(start_pfn), + mm_to_dma_pfn(end_pfn)); if (ret) return ret; } @@ -4547,58 +4534,37 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { struct memory_notify *mhp = v; - unsigned long long start, end; - unsigned long start_vpfn, last_vpfn; + unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn); + unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn + + mhp->nr_pages - 1); switch (val) { case MEM_GOING_ONLINE: - start = mhp->start_pfn << PAGE_SHIFT; - end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1; - if (iommu_domain_identity_map(si_domain, start, end)) { - pr_warn("Failed to build identity map for [%llx-%llx]\n", - start, end); + if (iommu_domain_identity_map(si_domain, + start_vpfn, last_vpfn)) { + pr_warn("Failed to build identity map for [%lx-%lx]\n", + start_vpfn, last_vpfn); return NOTIFY_BAD; } break; case MEM_OFFLINE: case MEM_CANCEL_ONLINE: - start_vpfn = mm_to_dma_pfn(mhp->start_pfn); - last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1); - while (start_vpfn <= last_vpfn) { - struct iova *iova; + { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; struct page *freelist; - iova = find_iova(&si_domain->iovad, start_vpfn); - if (iova == NULL) { - pr_debug("Failed get IOVA for PFN %lx\n", - start_vpfn); - break; - } - - iova = split_and_remove_iova(&si_domain->iovad, iova, - start_vpfn, last_vpfn); - if (iova == NULL) { - pr_warn("Failed to split IOVA PFN [%lx-%lx]\n", - start_vpfn, last_vpfn); - return NOTIFY_BAD; - } - - freelist = domain_unmap(si_domain, iova->pfn_lo, - iova->pfn_hi); + freelist = domain_unmap(si_domain, + start_vpfn, last_vpfn); rcu_read_lock(); for_each_active_iommu(iommu, drhd) iommu_flush_iotlb_psi(iommu, si_domain, - iova->pfn_lo, iova_size(iova), + start_vpfn, mhp->nr_pages, !freelist, 0); rcu_read_unlock(); dma_free_pagelist(freelist); - - start_vpfn = iova->pfn_hi + 1; - free_iova_mem(iova); } break; } @@ -4626,8 +4592,9 @@ static void free_all_cpu_cached_iovas(unsigned int cpu) for (did = 0; did < cap_ndoms(iommu->cap); did++) { domain = get_iommu_domain(iommu, (u16)did); - if (!domain) + if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA) continue; + free_cpu_cached_iovas(cpu, &domain->iovad); } } @@ -5037,9 +5004,6 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; - init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); - domain_reserve_special_ranges(domain); - /* calculate AGAW */ domain->gaw = guest_width; adjust_width = guestwidth_to_adjustwidth(guest_width); @@ -5058,11 +5022,21 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) return 0; } +static void intel_init_iova_domain(struct dmar_domain *dmar_domain) +{ + init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN); + copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad); + + if (!intel_iommu_strict && + init_iova_flush_queue(&dmar_domain->iovad, + iommu_flush_iova, iova_entry_free)) + pr_info("iova flush queue initialization failed\n"); +} + static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) { struct dmar_domain *dmar_domain; struct iommu_domain *domain; - int ret; switch (type) { case IOMMU_DOMAIN_DMA: @@ -5079,13 +5053,8 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } - if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) { - ret = init_iova_flush_queue(&dmar_domain->iovad, - iommu_flush_iova, - iova_entry_free); - if (ret) - pr_info("iova flush queue initialization failed\n"); - } + if (type == IOMMU_DOMAIN_DMA) + intel_init_iova_domain(dmar_domain); domain_update_iommu_cap(dmar_domain); From 5df362a53f7d36e032668e7e6725d80622b98525 Mon Sep 17 00:00:00 2001 From: Tero Kristo via iommu Date: Fri, 24 Apr 2020 17:58:28 +0300 Subject: [PATCH 156/574] iommu/omap: Add registration for DT fwnode pointer The fwnode pointer must be passed to the iommu core, so that the core can map the IOMMU towards device requests properly. Without this, some IOMMU clients like OMAP remoteproc will fail the iommu configuration multiple times with -EPROBE_DEFER, which will eventually be ignored with a kernel warning banner. Signed-off-by: Tero Kristo Link: https://lore.kernel.org/r/20200424145828.3159-1-t-kristo@ti.com Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 5a9ba815863b..c8282cc212cb 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1236,6 +1236,7 @@ static int omap_iommu_probe(struct platform_device *pdev) goto out_group; iommu_device_set_ops(&obj->iommu, &omap_iommu_ops); + iommu_device_set_fwnode(&obj->iommu, &of->fwnode); err = iommu_device_register(&obj->iommu); if (err) From 8bbe13f52cb79666ada12033f04bdeb4741b7188 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 15 May 2020 16:08:43 +0800 Subject: [PATCH 157/574] iommu/mediatek-v1: Add def_domain_type The MediaTek V1 IOMMU is arm32 whose default domain type is IOMMU_DOMAIN_UNMANAGED. Add this to satisfy the bus_iommu_probe to enter "probe_finalize". The iommu framework will create a iommu domain for each a device. But all the devices share a iommu domain here, thus we skip all the other domains in the "attach_device" except the domain we create internally with arm_iommu_create_mapping. Also a minor change: in the attach_device, "data" always is not null. Remove "if (!data) return". Signed-off-by: Yong Wu Link: https://lore.kernel.org/r/1589530123-30240-1-git-send-email-yong.wu@mediatek.com Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu_v1.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 7bdd74c7cb9f..f353b072a5d0 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -265,10 +265,13 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, { struct mtk_iommu_data *data = dev_iommu_priv_get(dev); struct mtk_iommu_domain *dom = to_mtk_domain(domain); + struct dma_iommu_mapping *mtk_mapping; int ret; - if (!data) - return -ENODEV; + /* Only allow the domain created internally. */ + mtk_mapping = data->dev->archdata.iommu; + if (mtk_mapping->domain != domain) + return 0; if (!data->m4u_dom) { data->m4u_dom = dom; @@ -288,9 +291,6 @@ static void mtk_iommu_detach_device(struct iommu_domain *domain, { struct mtk_iommu_data *data = dev_iommu_priv_get(dev); - if (!data) - return; - mtk_iommu_config(data, dev, false); } @@ -416,6 +416,11 @@ static int mtk_iommu_create_mapping(struct device *dev, return 0; } +static int mtk_iommu_def_domain_type(struct device *dev) +{ + return IOMMU_DOMAIN_UNMANAGED; +} + static struct iommu_device *mtk_iommu_probe_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -525,6 +530,7 @@ static const struct iommu_ops mtk_iommu_ops = { .probe_device = mtk_iommu_probe_device, .probe_finalize = mtk_iommu_probe_finalize, .release_device = mtk_iommu_release_device, + .def_domain_type = mtk_iommu_def_domain_type, .device_group = generic_device_group, .pgsize_bitmap = ~0UL << MT2701_IOMMU_PAGE_SHIFT, }; From 06020196c82e26e835b739dfa91a2717b0d37148 Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Fri, 8 May 2020 09:49:55 +0800 Subject: [PATCH 158/574] iommu/arm-smmu-v3: remove set but not used variable 'smmu' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes gcc '-Wunused-but-set-variable' warning: drivers/iommu/arm-smmu-v3.c:2989:26: warning: variable ‘smmu’ set but not used [-Wunused-but-set-variable] struct arm_smmu_device *smmu; Reported-by: Hulk Robot Signed-off-by: Chen Zhou Link: https://lore.kernel.org/r/20200508014955.87630-1-chenzhou10@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 42e1ee7e5197..89ee9c5d8b88 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2986,13 +2986,11 @@ static void arm_smmu_release_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_master *master; - struct arm_smmu_device *smmu; if (!fwspec || fwspec->ops != &arm_smmu_ops) return; master = dev_iommu_priv_get(dev); - smmu = master->smmu; arm_smmu_detach_dev(master); arm_smmu_disable_pasid(master); kfree(master); From d100ff3843b731c5c0c974bc9210cf092a7ec9b6 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Mon, 11 May 2020 23:25:32 +0530 Subject: [PATCH 159/574] iommu/arm-smmu-qcom: Request direct mapping for modem device The modem remote processor has two access paths to DDR. One path is directly connected to DDR and another path goes through an SMMU. The SMMU path is configured to be a direct mapping because it's used by various peripherals in the modem subsystem. Typically this direct mapping is configured statically at EL2 by QHEE (Qualcomm's Hypervisor Execution Environment) before the kernel is entered. In certain firmware configuration, especially when the kernel is already in full control of the SMMU, defer programming the modem SIDs to the kernel. Let's add compatibles here so that we can have the kernel program the SIDs for the modem in these cases. Signed-off-by: Sibi Sankar Reviewed-by: Bjorn Andersson Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20200511175532.25874-1-sibis@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm-smmu-qcom.c b/drivers/iommu/arm-smmu-qcom.c index 5bedf21587a5..cf01d0215a39 100644 --- a/drivers/iommu/arm-smmu-qcom.c +++ b/drivers/iommu/arm-smmu-qcom.c @@ -17,7 +17,9 @@ static const struct of_device_id qcom_smmu_client_of_match[] = { { .compatible = "qcom,mdp4" }, { .compatible = "qcom,mdss" }, { .compatible = "qcom,sc7180-mdss" }, + { .compatible = "qcom,sc7180-mss-pil" }, { .compatible = "qcom,sdm845-mdss" }, + { .compatible = "qcom,sdm845-mss-pil" }, { } }; From 52f3fab0067d6fa9e99c1b7f63265dd48ca76046 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 13 May 2020 13:02:57 +0200 Subject: [PATCH 160/574] iommu/arm-smmu-v3: Don't reserve implementation defined register space Some SMMUv3 implementation embed the Perf Monitor Group Registers (PMCG) inside the first 64kB region of the SMMU. Since PMCG are managed by a separate driver, this layout causes resource reservation conflicts during boot. To avoid this conflict, don't reserve the MMIO regions that are implementation defined. Although devm_ioremap_resource() still works on full pages under the hood, this way we benefit from resource conflict checks. Fixes: 7d839b4b9e00 ("perf/smmuv3: Add arm64 smmuv3 pmu driver") Signed-off-by: Jean-Philippe Brucker Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20200513110255.597203-1-jean-philippe@linaro.org Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 89ee9c5d8b88..5eec8ebdd4b5 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -171,6 +171,8 @@ #define ARM_SMMU_PRIQ_IRQ_CFG1 0xd8 #define ARM_SMMU_PRIQ_IRQ_CFG2 0xdc +#define ARM_SMMU_REG_SZ 0xe00 + /* Common MSI config fields */ #define MSI_CFG0_ADDR_MASK GENMASK_ULL(51, 2) #define MSI_CFG2_SH GENMASK(5, 4) @@ -628,6 +630,7 @@ struct arm_smmu_strtab_cfg { struct arm_smmu_device { struct device *dev; void __iomem *base; + void __iomem *page1; #define ARM_SMMU_FEAT_2_LVL_STRTAB (1 << 0) #define ARM_SMMU_FEAT_2_LVL_CDTAB (1 << 1) @@ -733,9 +736,8 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { static inline void __iomem *arm_smmu_page1_fixup(unsigned long offset, struct arm_smmu_device *smmu) { - if ((offset > SZ_64K) && - (smmu->options & ARM_SMMU_OPT_PAGE0_REGS_ONLY)) - offset -= SZ_64K; + if (offset > SZ_64K) + return smmu->page1 + offset - SZ_64K; return smmu->base + offset; } @@ -4001,6 +4003,18 @@ err_reset_pci_ops: __maybe_unused; return err; } +static void __iomem *arm_smmu_ioremap(struct device *dev, resource_size_t start, + resource_size_t size) +{ + struct resource res = { + .flags = IORESOURCE_MEM, + .start = start, + .end = start + size - 1, + }; + + return devm_ioremap_resource(dev, &res); +} + static int arm_smmu_device_probe(struct platform_device *pdev) { int irq, ret; @@ -4036,10 +4050,23 @@ static int arm_smmu_device_probe(struct platform_device *pdev) } ioaddr = res->start; - smmu->base = devm_ioremap_resource(dev, res); + /* + * Don't map the IMPLEMENTATION DEFINED regions, since they may contain + * the PMCG registers which are reserved by the PMU driver. + */ + smmu->base = arm_smmu_ioremap(dev, ioaddr, ARM_SMMU_REG_SZ); if (IS_ERR(smmu->base)) return PTR_ERR(smmu->base); + if (arm_smmu_resource_size(smmu) > SZ_64K) { + smmu->page1 = arm_smmu_ioremap(dev, ioaddr + SZ_64K, + ARM_SMMU_REG_SZ); + if (IS_ERR(smmu->page1)) + return PTR_ERR(smmu->page1); + } else { + smmu->page1 = smmu->base; + } + /* Interrupt lines */ irq = platform_get_irq_byname_optional(pdev, "combined"); From 81c4389e4835ad14280718f477cf30b64643b721 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Sat, 9 May 2020 12:48:10 +0200 Subject: [PATCH 161/574] drm/msm/mdp5: Add MDP5 configuration for MSM8x36. This change adds MDP5 configuration for MSM8x36-based SoCs, like MSM8936, 8939 and their APQ variants. The configuration is based on MSM8916's, but adds some notable features, like ad and pp blocks, along with some register changes. changes since v1: - add an ad block - add a second mixer @ 0x47000 - adjust .max_width - write a more descriptive commit message Signed-off-by: Konrad Dybcio Reviewed-by: Shawn Guo Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c | 76 ++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c index e3c4c250238b..a7df8dbffdc2 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c @@ -342,6 +342,81 @@ static const struct mdp5_cfg_hw msm8x16_config = { .max_clk = 320000000, }; +static const struct mdp5_cfg_hw msm8x36_config = { + .name = "msm8x36", + .mdp = { + .count = 1, + .base = { 0x0 }, + .caps = MDP_CAP_SMP | + 0, + }, + .smp = { + .mmb_count = 8, + .mmb_size = 10240, + .clients = { + [SSPP_VIG0] = 1, [SSPP_DMA0] = 4, + [SSPP_RGB0] = 7, [SSPP_RGB1] = 8, + }, + }, + .ctl = { + .count = 3, + .base = { 0x01000, 0x01200, 0x01400 }, + .flush_hw_mask = 0x4003ffff, + }, + .pipe_vig = { + .count = 1, + .base = { 0x04000 }, + .caps = MDP_PIPE_CAP_HFLIP | MDP_PIPE_CAP_VFLIP | + MDP_PIPE_CAP_SCALE | MDP_PIPE_CAP_CSC | + MDP_PIPE_CAP_DECIMATION, + }, + .pipe_rgb = { + .count = 2, + .base = { 0x14000, 0x16000 }, + .caps = MDP_PIPE_CAP_HFLIP | MDP_PIPE_CAP_VFLIP | + MDP_PIPE_CAP_DECIMATION, + }, + .pipe_dma = { + .count = 1, + .base = { 0x24000 }, + .caps = MDP_PIPE_CAP_HFLIP | MDP_PIPE_CAP_VFLIP, + }, + .lm = { + .count = 2, + .base = { 0x44000, 0x47000 }, + .instances = { + { .id = 0, .pp = 0, .dspp = 0, + .caps = MDP_LM_CAP_DISPLAY, }, + { .id = 1, .pp = -1, .dspp = -1, + .caps = MDP_LM_CAP_WB, }, + }, + .nb_stages = 8, + .max_width = 2560, + .max_height = 0xFFFF, + }, + .pp = { + .count = 1, + .base = { 0x70000 }, + }, + .ad = { + .count = 1, + .base = { 0x78000 }, + }, + .dspp = { + .count = 1, + .base = { 0x54000 }, + }, + .intf = { + .base = { 0x00000, 0x6a800, 0x6b000 }, + .connect = { + [0] = INTF_DISABLED, + [1] = INTF_DSI, + [2] = INTF_DSI, + }, + }, + .max_clk = 366670000, +}; + static const struct mdp5_cfg_hw msm8x94_config = { .name = "msm8x94", .mdp = { @@ -840,6 +915,7 @@ static const struct mdp5_cfg_handler cfg_handlers_v1[] = { { .revision = 2, .config = { .hw = &msm8x74v2_config } }, { .revision = 3, .config = { .hw = &apq8084_config } }, { .revision = 6, .config = { .hw = &msm8x16_config } }, + { .revision = 8, .config = { .hw = &msm8x36_config } }, { .revision = 9, .config = { .hw = &msm8x94_config } }, { .revision = 7, .config = { .hw = &msm8x96_config } }, { .revision = 11, .config = { .hw = &msm8x76_config } }, From 09b4138ec2878d504c28a35f0c4c9bb4a0b0c5f4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 4 Dec 2019 07:02:20 +0100 Subject: [PATCH 162/574] drm/msm/a6xx: Fix a typo in an error message 'in' is duplicated in the error message. Axe one of them. While at it, slighly improve indentation. Signed-off-by: Christophe JAILLET Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index c4e71abbdd53..40cb4535d554 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -1011,8 +1011,8 @@ static int a6xx_gmu_rpmh_arc_votes_init(struct device *dev, u32 *votes, if (j == pri_count) { DRM_DEV_ERROR(dev, - "Level %u not found in in the RPMh list\n", - level); + "Level %u not found in the RPMh list\n", + level); DRM_DEV_ERROR(dev, "Available levels:\n"); for (j = 0; j < pri_count; j++) DRM_DEV_ERROR(dev, " %u\n", pri[j]); From 6a523388a2d4a1a00ec4153592be997066fb596e Mon Sep 17 00:00:00 2001 From: Hongbo Yao Date: Wed, 4 Dec 2019 13:45:48 +0800 Subject: [PATCH 163/574] drm/msm/dpu: Fix compile warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the following command will get compile warnings: make W=1 drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.o ARCH=arm64 drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c: In function ‘_dpu_crtc_program_lm_output_roi’: drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:91:19: warning: variable ‘dpu_crtc’ set but not used [-Wunused-but-set-variable] struct dpu_crtc *dpu_crtc; ^~~~~~~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c: In function ‘dpu_crtc_atomic_begin’: drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:428:35: warning: variable ‘smmu_state’ set but not used [-Wunused-but-set-variable] struct dpu_crtc_smmu_state_data *smmu_state; ^~~~~~~~~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c: In function ‘dpu_crtc_atomic_flush’: drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:489:25: warning: variable ‘event_thread’ set but not used [-Wunused-but-set-variable] struct msm_drm_thread *event_thread; ^~~~~~~~~~~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c: In function ‘dpu_crtc_destroy_state’: drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:565:19: warning: variable ‘dpu_crtc’ set but not used [-Wunused-but-set-variable] struct dpu_crtc *dpu_crtc; ^~~~~~~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c: In function ‘dpu_crtc_duplicate_state’: drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:664:19: warning: variable ‘dpu_crtc’ set but not used [-Wunused-but-set-variable] struct dpu_crtc *dpu_crtc; ^~~~~~~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c: In function ‘dpu_crtc_disable’: drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:693:26: warning: variable ‘priv’ set but not used [-Wunused-but-set-variable] struct msm_drm_private *priv; ^~~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:691:27: warning: variable ‘mode’ set but not used [-Wunused-but-set-variable] struct drm_display_mode *mode; ^~~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c: In function ‘dpu_crtc_enable’: drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:766:26: warning: variable ‘priv’ set but not used [-Wunused-but-set-variable] struct msm_drm_private *priv; ^~~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c: In function ‘dpu_crtc_init’: drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:1292:18: warning: variable ‘kms’ set but not used [-Wunused-but-set-variable] struct dpu_kms *kms = NULL; ^~~ drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c:663: warning: Excess function parameter 'Returns' description in 'dpu_crtc_duplicate_state' Reported-by: Hulk Robot Signed-off-by: Hongbo Yao Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c index 17448505a9b5..cf3af731a324 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c @@ -88,11 +88,9 @@ static void _dpu_crtc_setup_blend_cfg(struct dpu_crtc_mixer *mixer, static void _dpu_crtc_program_lm_output_roi(struct drm_crtc *crtc) { - struct dpu_crtc *dpu_crtc; struct dpu_crtc_state *crtc_state; int lm_idx, lm_horiz_position; - dpu_crtc = to_dpu_crtc(crtc); crtc_state = to_dpu_crtc_state(crtc->state); lm_horiz_position = 0; @@ -430,7 +428,6 @@ static void dpu_crtc_atomic_begin(struct drm_crtc *crtc, struct drm_encoder *encoder; struct drm_device *dev; unsigned long flags; - struct dpu_crtc_smmu_state_data *smmu_state; if (!crtc) { DPU_ERROR("invalid crtc\n"); @@ -448,7 +445,6 @@ static void dpu_crtc_atomic_begin(struct drm_crtc *crtc, dpu_crtc = to_dpu_crtc(crtc); cstate = to_dpu_crtc_state(crtc->state); dev = crtc->dev; - smmu_state = &dpu_crtc->smmu_state; _dpu_crtc_setup_lm_bounds(crtc, crtc->state); @@ -491,7 +487,6 @@ static void dpu_crtc_atomic_flush(struct drm_crtc *crtc, struct drm_device *dev; struct drm_plane *plane; struct msm_drm_private *priv; - struct msm_drm_thread *event_thread; unsigned long flags; struct dpu_crtc_state *cstate; @@ -513,8 +508,6 @@ static void dpu_crtc_atomic_flush(struct drm_crtc *crtc, return; } - event_thread = &priv->event_thread[crtc->index]; - if (dpu_crtc->event) { DPU_DEBUG("already received dpu_crtc->event\n"); } else { @@ -567,7 +560,6 @@ static void dpu_crtc_atomic_flush(struct drm_crtc *crtc, static void dpu_crtc_destroy_state(struct drm_crtc *crtc, struct drm_crtc_state *state) { - struct dpu_crtc *dpu_crtc; struct dpu_crtc_state *cstate; if (!crtc || !state) { @@ -575,7 +567,6 @@ static void dpu_crtc_destroy_state(struct drm_crtc *crtc, return; } - dpu_crtc = to_dpu_crtc(crtc); cstate = to_dpu_crtc_state(state); DPU_DEBUG("crtc%d\n", crtc->base.id); @@ -662,11 +653,9 @@ static void dpu_crtc_reset(struct drm_crtc *crtc) /** * dpu_crtc_duplicate_state - state duplicate hook * @crtc: Pointer to drm crtc structure - * @Returns: Pointer to new drm_crtc_state structure */ static struct drm_crtc_state *dpu_crtc_duplicate_state(struct drm_crtc *crtc) { - struct dpu_crtc *dpu_crtc; struct dpu_crtc_state *cstate, *old_cstate; if (!crtc || !crtc->state) { @@ -674,7 +663,6 @@ static struct drm_crtc_state *dpu_crtc_duplicate_state(struct drm_crtc *crtc) return NULL; } - dpu_crtc = to_dpu_crtc(crtc); old_cstate = to_dpu_crtc_state(crtc->state); cstate = kmemdup(old_cstate, sizeof(*old_cstate), GFP_KERNEL); if (!cstate) { @@ -693,9 +681,7 @@ static void dpu_crtc_disable(struct drm_crtc *crtc, { struct dpu_crtc *dpu_crtc; struct dpu_crtc_state *cstate; - struct drm_display_mode *mode; struct drm_encoder *encoder; - struct msm_drm_private *priv; unsigned long flags; bool release_bandwidth = false; @@ -705,8 +691,6 @@ static void dpu_crtc_disable(struct drm_crtc *crtc, } dpu_crtc = to_dpu_crtc(crtc); cstate = to_dpu_crtc_state(crtc->state); - mode = &cstate->base.adjusted_mode; - priv = crtc->dev->dev_private; DRM_DEBUG_KMS("crtc%d\n", crtc->base.id); @@ -768,14 +752,12 @@ static void dpu_crtc_enable(struct drm_crtc *crtc, { struct dpu_crtc *dpu_crtc; struct drm_encoder *encoder; - struct msm_drm_private *priv; bool request_bandwidth; if (!crtc) { DPU_ERROR("invalid crtc\n"); return; } - priv = crtc->dev->dev_private; pm_runtime_get_sync(crtc->dev->dev); From ab723b7a992a19b843f798b183f53f7472f598c8 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Fri, 24 Jan 2020 00:57:10 +0100 Subject: [PATCH 164/574] drm/msm: Add syncobj support. This 1) Enables core DRM syncobj support. 2) Adds options to the submission ioctl to wait/signal syncobjs. Just like the wait fence fd, this does inline waits. Using the scheduler would be nice but I believe it is out of scope for this work. Support for timeline syncobjs is implemented and the interface is ready for it, but I'm not enabling it yet until there is some code for turnip to use it. The reset is mostly in there because in the presence of waiting and signalling the same semaphores, resetting them after signalling can become very annoying. v2: - Fixed style issues - Removed a cleanup issue in a failure case - Moved to a copy_from_user per syncobj v3: - Fixed a missing declaration introduced in v2 - Reworked to use ERR_PTR/PTR_ERR - Simplified failure gotos. Used by: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2769 Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_drv.c | 6 +- drivers/gpu/drm/msm/msm_gem_submit.c | 232 ++++++++++++++++++++++++++- include/uapi/drm/msm_drm.h | 24 ++- 3 files changed, 258 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 29295dee2a2e..f6ce40bf3699 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -37,9 +37,10 @@ * - 1.4.0 - softpin, MSM_RELOC_BO_DUMP, and GEM_INFO support to set/get * GEM object's debug name * - 1.5.0 - Add SUBMITQUERY_QUERY ioctl + * - 1.6.0 - Syncobj support */ #define MSM_VERSION_MAJOR 1 -#define MSM_VERSION_MINOR 5 +#define MSM_VERSION_MINOR 6 #define MSM_VERSION_PATCHLEVEL 0 static const struct drm_mode_config_funcs mode_config_funcs = { @@ -1002,7 +1003,8 @@ static struct drm_driver msm_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER | DRIVER_ATOMIC | - DRIVER_MODESET, + DRIVER_MODESET | + DRIVER_SYNCOBJ, .open = msm_open, .postclose = msm_postclose, .lastclose = drm_fb_helper_lastclose, diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 385d4965a8d0..6630aa817505 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -8,7 +8,9 @@ #include #include +#include #include +#include #include "msm_drv.h" #include "msm_gpu.h" @@ -391,6 +393,186 @@ static void submit_cleanup(struct msm_gem_submit *submit) } } + +struct msm_submit_post_dep { + struct drm_syncobj *syncobj; + uint64_t point; + struct dma_fence_chain *chain; +}; + +static struct drm_syncobj **msm_wait_deps(struct drm_device *dev, + struct drm_file *file, + uint64_t in_syncobjs_addr, + uint32_t nr_in_syncobjs, + size_t syncobj_stride, + struct msm_ringbuffer *ring) +{ + struct drm_syncobj **syncobjs = NULL; + struct drm_msm_gem_submit_syncobj syncobj_desc = {0}; + int ret = 0; + uint32_t i, j; + + syncobjs = kcalloc(nr_in_syncobjs, sizeof(*syncobjs), + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!syncobjs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_in_syncobjs; ++i) { + uint64_t address = in_syncobjs_addr + i * syncobj_stride; + struct dma_fence *fence; + + if (copy_from_user(&syncobj_desc, + u64_to_user_ptr(address), + min(syncobj_stride, sizeof(syncobj_desc)))) { + ret = -EFAULT; + break; + } + + if (syncobj_desc.point && + !drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE)) { + ret = -EOPNOTSUPP; + break; + } + + if (syncobj_desc.flags & ~MSM_SUBMIT_SYNCOBJ_FLAGS) { + ret = -EINVAL; + break; + } + + ret = drm_syncobj_find_fence(file, syncobj_desc.handle, + syncobj_desc.point, 0, &fence); + if (ret) + break; + + if (!dma_fence_match_context(fence, ring->fctx->context)) + ret = dma_fence_wait(fence, true); + + dma_fence_put(fence); + if (ret) + break; + + if (syncobj_desc.flags & MSM_SUBMIT_SYNCOBJ_RESET) { + syncobjs[i] = + drm_syncobj_find(file, syncobj_desc.handle); + if (!syncobjs[i]) { + ret = -EINVAL; + break; + } + } + } + + if (ret) { + for (j = 0; j <= i; ++j) { + if (syncobjs[j]) + drm_syncobj_put(syncobjs[j]); + } + kfree(syncobjs); + return ERR_PTR(ret); + } + return syncobjs; +} + +static void msm_reset_syncobjs(struct drm_syncobj **syncobjs, + uint32_t nr_syncobjs) +{ + uint32_t i; + + for (i = 0; syncobjs && i < nr_syncobjs; ++i) { + if (syncobjs[i]) + drm_syncobj_replace_fence(syncobjs[i], NULL); + } +} + +static struct msm_submit_post_dep *msm_parse_post_deps(struct drm_device *dev, + struct drm_file *file, + uint64_t syncobjs_addr, + uint32_t nr_syncobjs, + size_t syncobj_stride) +{ + struct msm_submit_post_dep *post_deps; + struct drm_msm_gem_submit_syncobj syncobj_desc = {0}; + int ret = 0; + uint32_t i, j; + + post_deps = kmalloc_array(nr_syncobjs, sizeof(*post_deps), + GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (!post_deps) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_syncobjs; ++i) { + uint64_t address = syncobjs_addr + i * syncobj_stride; + + if (copy_from_user(&syncobj_desc, + u64_to_user_ptr(address), + min(syncobj_stride, sizeof(syncobj_desc)))) { + ret = -EFAULT; + break; + } + + post_deps[i].point = syncobj_desc.point; + post_deps[i].chain = NULL; + + if (syncobj_desc.flags) { + ret = -EINVAL; + break; + } + + if (syncobj_desc.point) { + if (!drm_core_check_feature(dev, + DRIVER_SYNCOBJ_TIMELINE)) { + ret = -EOPNOTSUPP; + break; + } + + post_deps[i].chain = + kmalloc(sizeof(*post_deps[i].chain), + GFP_KERNEL); + if (!post_deps[i].chain) { + ret = -ENOMEM; + break; + } + } + + post_deps[i].syncobj = + drm_syncobj_find(file, syncobj_desc.handle); + if (!post_deps[i].syncobj) { + ret = -EINVAL; + break; + } + } + + if (ret) { + for (j = 0; j <= i; ++j) { + kfree(post_deps[j].chain); + if (post_deps[j].syncobj) + drm_syncobj_put(post_deps[j].syncobj); + } + + kfree(post_deps); + return ERR_PTR(ret); + } + + return post_deps; +} + +static void msm_process_post_deps(struct msm_submit_post_dep *post_deps, + uint32_t count, struct dma_fence *fence) +{ + uint32_t i; + + for (i = 0; post_deps && i < count; ++i) { + if (post_deps[i].chain) { + drm_syncobj_add_point(post_deps[i].syncobj, + post_deps[i].chain, + fence, post_deps[i].point); + post_deps[i].chain = NULL; + } else { + drm_syncobj_replace_fence(post_deps[i].syncobj, + fence); + } + } +} + int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct drm_file *file) { @@ -403,6 +585,8 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct sync_file *sync_file = NULL; struct msm_gpu_submitqueue *queue; struct msm_ringbuffer *ring; + struct msm_submit_post_dep *post_deps = NULL; + struct drm_syncobj **syncobjs_to_reset = NULL; int out_fence_fd = -1; struct pid *pid = get_pid(task_pid(current)); bool has_ww_ticket = false; @@ -411,6 +595,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, if (!gpu) return -ENXIO; + if (args->pad) + return -EINVAL; + /* for now, we just have 3d pipe.. eventually this would need to * be more clever to dispatch to appropriate gpu module: */ @@ -458,9 +645,29 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, return ret; } + if (args->flags & MSM_SUBMIT_SYNCOBJ_IN) { + syncobjs_to_reset = msm_wait_deps(dev, file, + args->in_syncobjs, + args->nr_in_syncobjs, + args->syncobj_stride, ring); + if (IS_ERR(syncobjs_to_reset)) + return PTR_ERR(syncobjs_to_reset); + } + + if (args->flags & MSM_SUBMIT_SYNCOBJ_OUT) { + post_deps = msm_parse_post_deps(dev, file, + args->out_syncobjs, + args->nr_out_syncobjs, + args->syncobj_stride); + if (IS_ERR(post_deps)) { + ret = PTR_ERR(post_deps); + goto out_post_unlock; + } + } + ret = mutex_lock_interruptible(&dev->struct_mutex); if (ret) - return ret; + goto out_post_unlock; if (args->flags & MSM_SUBMIT_FENCE_FD_OUT) { out_fence_fd = get_unused_fd_flags(O_CLOEXEC); @@ -587,6 +794,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, args->fence_fd = out_fence_fd; } + msm_reset_syncobjs(syncobjs_to_reset, args->nr_in_syncobjs); + msm_process_post_deps(post_deps, args->nr_out_syncobjs, + submit->fence); + + out: submit_cleanup(submit); if (has_ww_ticket) @@ -597,5 +809,23 @@ out_unlock: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd); mutex_unlock(&dev->struct_mutex); + +out_post_unlock: + if (!IS_ERR_OR_NULL(post_deps)) { + for (i = 0; i < args->nr_out_syncobjs; ++i) { + kfree(post_deps[i].chain); + drm_syncobj_put(post_deps[i].syncobj); + } + kfree(post_deps); + } + + if (!IS_ERR_OR_NULL(syncobjs_to_reset)) { + for (i = 0; i < args->nr_in_syncobjs; ++i) { + if (syncobjs_to_reset[i]) + drm_syncobj_put(syncobjs_to_reset[i]); + } + kfree(syncobjs_to_reset); + } + return ret; } diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 0b85ed6a3710..19806eb3a8e8 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -217,13 +217,28 @@ struct drm_msm_gem_submit_bo { #define MSM_SUBMIT_FENCE_FD_IN 0x40000000 /* enable input fence_fd */ #define MSM_SUBMIT_FENCE_FD_OUT 0x20000000 /* enable output fence_fd */ #define MSM_SUBMIT_SUDO 0x10000000 /* run submitted cmds from RB */ +#define MSM_SUBMIT_SYNCOBJ_IN 0x08000000 /* enable input syncobj */ +#define MSM_SUBMIT_SYNCOBJ_OUT 0x04000000 /* enable output syncobj */ #define MSM_SUBMIT_FLAGS ( \ MSM_SUBMIT_NO_IMPLICIT | \ MSM_SUBMIT_FENCE_FD_IN | \ MSM_SUBMIT_FENCE_FD_OUT | \ MSM_SUBMIT_SUDO | \ + MSM_SUBMIT_SYNCOBJ_IN | \ + MSM_SUBMIT_SYNCOBJ_OUT | \ 0) +#define MSM_SUBMIT_SYNCOBJ_RESET 0x00000001 /* Reset syncobj after wait. */ +#define MSM_SUBMIT_SYNCOBJ_FLAGS ( \ + MSM_SUBMIT_SYNCOBJ_RESET | \ + 0) + +struct drm_msm_gem_submit_syncobj { + __u32 handle; /* in, syncobj handle. */ + __u32 flags; /* in, from MSM_SUBMIT_SYNCOBJ_FLAGS */ + __u64 point; /* in, timepoint for timeline syncobjs. */ +}; + /* Each cmdstream submit consists of a table of buffers involved, and * one or more cmdstream buffers. This allows for conditional execution * (context-restore), and IB buffers needed for per tile/bin draw cmds. @@ -236,7 +251,14 @@ struct drm_msm_gem_submit { __u64 bos; /* in, ptr to array of submit_bo's */ __u64 cmds; /* in, ptr to array of submit_cmd's */ __s32 fence_fd; /* in/out fence fd (see MSM_SUBMIT_FENCE_FD_IN/OUT) */ - __u32 queueid; /* in, submitqueue id */ + __u32 queueid; /* in, submitqueue id */ + __u64 in_syncobjs; /* in, ptr to to array of drm_msm_gem_submit_syncobj */ + __u64 out_syncobjs; /* in, ptr to to array of drm_msm_gem_submit_syncobj */ + __u32 nr_in_syncobjs; /* in, number of entries in in_syncobj */ + __u32 nr_out_syncobjs; /* in, number of entries in out_syncobj. */ + __u32 syncobj_stride; /* in, stride of syncobj arrays. */ + __u32 pad; /*in, reserved for future use, always 0. */ + }; /* The normal way to synchronize with the GPU is just to CPU_PREP on From 20aebe83698feb107d5a66b6cfd1d54459ccdfcf Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 30 Apr 2020 12:24:27 -0700 Subject: [PATCH 165/574] drm/msm: Fix undefined "rd_full" link error rd_full should be defined outside the CONFIG_DEBUG_FS region, in order to be able to link the msm driver even when CONFIG_DEBUG_FS is disabled. Fixes: e515af8d4a6f ("drm/msm: devcoredump should dump MSM_SUBMIT_BO_DUMP buffers") Reported-by: Stephen Rothwell Signed-off-by: Bjorn Andersson Reviewed-by: Rob Clark Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_rd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_rd.c b/drivers/gpu/drm/msm/msm_rd.c index 732f65df5c4f..fea30e7aa9e8 100644 --- a/drivers/gpu/drm/msm/msm_rd.c +++ b/drivers/gpu/drm/msm/msm_rd.c @@ -29,8 +29,6 @@ * or shader programs (if not emitted inline in cmdstream). */ -#ifdef CONFIG_DEBUG_FS - #include #include #include @@ -47,6 +45,8 @@ bool rd_full = false; MODULE_PARM_DESC(rd_full, "If true, $debugfs/.../rd will snapshot all buffer contents"); module_param_named(rd_full, rd_full, bool, 0600); +#ifdef CONFIG_DEBUG_FS + enum rd_sect_type { RD_NONE, RD_TEST, /* ascii text */ From e4b397f6a54c1b653b65c3dbcf3d3a157f580355 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 12 Apr 2020 16:35:09 +0200 Subject: [PATCH 166/574] drm/msm: Fix typo Duplicated 'we' Signed-off-by: Christophe JAILLET Reviewed-by: Abhinav Kumar Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c index 998bef1190a3..b5fed67c4651 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c @@ -959,7 +959,7 @@ static int mdp5_crtc_cursor_set(struct drm_crtc *crtc, if (!ctl) return -EINVAL; - /* don't support LM cursors when we we have source split enabled */ + /* don't support LM cursors when we have source split enabled */ if (mdp5_cstate->pipeline.r_mixer) return -EINVAL; @@ -1030,7 +1030,7 @@ static int mdp5_crtc_cursor_move(struct drm_crtc *crtc, int x, int y) return -EINVAL; } - /* don't support LM cursors when we we have source split enabled */ + /* don't support LM cursors when we have source split enabled */ if (mdp5_cstate->pipeline.r_mixer) return -EINVAL; From e4337877c5d578722c0716f131fb774522013cf5 Mon Sep 17 00:00:00 2001 From: Roy Spliet Date: Tue, 7 Apr 2020 18:07:37 +0100 Subject: [PATCH 167/574] drm/msm/mdp5: Fix mdp5_init error path for failed mdp5_kms allocation When allocation for mdp5_kms fails, calling mdp5_destroy() leads to undefined behaviour, likely a nullptr exception or use-after-free troubles. Signed-off-by: Roy Spliet Reviewed-by: Abhinav Kumar Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c index 47b989834af1..c23a2fa13fb9 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c @@ -943,7 +943,8 @@ static int mdp5_init(struct platform_device *pdev, struct drm_device *dev) return 0; fail: - mdp5_destroy(pdev); + if (mdp5_kms) + mdp5_destroy(pdev); return ret; } From e47616df008b1059c57892fb34883403a6933231 Mon Sep 17 00:00:00 2001 From: Kalyan Thota Date: Tue, 24 Mar 2020 15:31:18 +0530 Subject: [PATCH 168/574] drm/msm/dpu: add support for color processing blocks in dpu driver This change adds support to configure dspp blocks in the dpu driver. Macro description of the changes coming in this patch. 1) Add dspp definitions in the hw catalog. 2) Add capability to reserve dspp blocks in the display data path. 3) Attach the reserved block to the encoder. Signed-off-by: Kalyan Thota Tested-by: Fritz Koenig Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/Makefile | 1 + drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.h | 2 + drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 12 ++- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 39 ++++++--- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h | 39 +++++++++ drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c | 26 ++++++ drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h | 3 + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c | 82 +++++++++++++++++++ drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h | 69 ++++++++++++++++ drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h | 2 + drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h | 1 + drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c | 58 ++++++++++++- drivers/gpu/drm/msm/disp/dpu1/dpu_rm.h | 2 + drivers/gpu/drm/msm/msm_drv.h | 1 + 14 files changed, 322 insertions(+), 15 deletions(-) create mode 100644 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c create mode 100644 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index 1579cf0d828f..42f8aae28b31 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -65,6 +65,7 @@ msm-y := \ disp/dpu1/dpu_hw_lm.o \ disp/dpu1/dpu_hw_pingpong.o \ disp/dpu1/dpu_hw_sspp.o \ + disp/dpu1/dpu_hw_dspp.o \ disp/dpu1/dpu_hw_top.o \ disp/dpu1/dpu_hw_util.o \ disp/dpu1/dpu_hw_vbif.o \ diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.h index 5174e86124cc..cec3474340e8 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.h @@ -73,12 +73,14 @@ struct dpu_crtc_smmu_state_data { * struct dpu_crtc_mixer: stores the map for each virtual pipeline in the CRTC * @hw_lm: LM HW Driver context * @lm_ctl: CTL Path HW driver context + * @lm_dspp: DSPP HW driver context * @mixer_op_mode: mixer blending operation mode * @flush_mask: mixer flush mask for ctl, mixer and pipe */ struct dpu_crtc_mixer { struct dpu_hw_mixer *hw_lm; struct dpu_hw_ctl *lm_ctl; + struct dpu_hw_dspp *hw_dspp; u32 mixer_op_mode; u32 flush_mask; }; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c index a1b79ee2bd9d..63976dcd2ac8 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c @@ -20,6 +20,7 @@ #include "dpu_hw_catalog.h" #include "dpu_hw_intf.h" #include "dpu_hw_ctl.h" +#include "dpu_hw_dspp.h" #include "dpu_formats.h" #include "dpu_encoder_phys.h" #include "dpu_crtc.h" @@ -536,6 +537,7 @@ static struct msm_display_topology dpu_encoder_get_topology( * 1 LM, 1 INTF * 2 LM, 1 INTF (stream merge to support high resolution interfaces) * + * Adding color blocks only to primary interface */ if (intf_count == 2) topology.num_lm = 2; @@ -544,6 +546,9 @@ static struct msm_display_topology dpu_encoder_get_topology( else topology.num_lm = (mode->hdisplay > MAX_HDISPLAY_SPLIT) ? 2 : 1; + if (dpu_enc->disp_info.intf_type == DRM_MODE_ENCODER_DSI) + topology.num_dspp = topology.num_lm; + topology.num_enc = 0; topology.num_intf = intf_count; @@ -959,7 +964,8 @@ static void dpu_encoder_virt_mode_set(struct drm_encoder *drm_enc, struct dpu_hw_blk *hw_pp[MAX_CHANNELS_PER_ENC]; struct dpu_hw_blk *hw_ctl[MAX_CHANNELS_PER_ENC]; struct dpu_hw_blk *hw_lm[MAX_CHANNELS_PER_ENC]; - int num_lm, num_ctl, num_pp; + struct dpu_hw_blk *hw_dspp[MAX_CHANNELS_PER_ENC] = { NULL }; + int num_lm, num_ctl, num_pp, num_dspp; int i, j; if (!drm_enc) { @@ -1008,6 +1014,9 @@ static void dpu_encoder_virt_mode_set(struct drm_encoder *drm_enc, drm_enc->base.id, DPU_HW_BLK_CTL, hw_ctl, ARRAY_SIZE(hw_ctl)); num_lm = dpu_rm_get_assigned_resources(&dpu_kms->rm, global_state, drm_enc->base.id, DPU_HW_BLK_LM, hw_lm, ARRAY_SIZE(hw_lm)); + num_dspp = dpu_rm_get_assigned_resources(&dpu_kms->rm, global_state, + drm_enc->base.id, DPU_HW_BLK_DSPP, hw_dspp, + ARRAY_SIZE(hw_dspp)); for (i = 0; i < MAX_CHANNELS_PER_ENC; i++) dpu_enc->hw_pp[i] = i < num_pp ? to_dpu_hw_pingpong(hw_pp[i]) @@ -1020,6 +1029,7 @@ static void dpu_encoder_virt_mode_set(struct drm_encoder *drm_enc, cstate->mixers[i].hw_lm = to_dpu_hw_mixer(hw_lm[i]); cstate->mixers[i].lm_ctl = to_dpu_hw_ctl(hw_ctl[ctl_idx]); + cstate->mixers[i].hw_dspp = to_dpu_hw_dspp(hw_dspp[i]); } cstate->num_mixers = num_lm; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index c567917541e8..19d065ad0b25 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -41,6 +41,8 @@ #define PINGPONG_SDM845_SPLIT_MASK \ (PINGPONG_SDM845_MASK | BIT(DPU_PINGPONG_TE2)) +#define DSPP_SC7180_MASK 0 + #define DEFAULT_PIXEL_RAM_SIZE (50 * 1024) #define DEFAULT_DPU_LINE_WIDTH 2048 #define DEFAULT_DPU_OUTPUT_LINE_WIDTH 2560 @@ -291,29 +293,30 @@ static const struct dpu_lm_sub_blks sdm845_lm_sblk = { }, }; -#define LM_BLK(_name, _id, _base, _fmask, _sblk, _pp, _lmpair) \ +#define LM_BLK(_name, _id, _base, _fmask, _sblk, _pp, _lmpair, _dspp) \ { \ .name = _name, .id = _id, \ .base = _base, .len = 0x320, \ .features = _fmask, \ .sblk = _sblk, \ .pingpong = _pp, \ - .lm_pair_mask = (1 << _lmpair) \ + .lm_pair_mask = (1 << _lmpair), \ + .dspp = _dspp \ } static const struct dpu_lm_cfg sdm845_lm[] = { LM_BLK("lm_0", LM_0, 0x44000, MIXER_SDM845_MASK, - &sdm845_lm_sblk, PINGPONG_0, LM_1), + &sdm845_lm_sblk, PINGPONG_0, LM_1, 0), LM_BLK("lm_1", LM_1, 0x45000, MIXER_SDM845_MASK, - &sdm845_lm_sblk, PINGPONG_1, LM_0), + &sdm845_lm_sblk, PINGPONG_1, LM_0, 0), LM_BLK("lm_2", LM_2, 0x46000, MIXER_SDM845_MASK, - &sdm845_lm_sblk, PINGPONG_2, LM_5), + &sdm845_lm_sblk, PINGPONG_2, LM_5, 0), LM_BLK("lm_3", LM_3, 0x0, MIXER_SDM845_MASK, - &sdm845_lm_sblk, PINGPONG_MAX, 0), + &sdm845_lm_sblk, PINGPONG_MAX, 0, 0), LM_BLK("lm_4", LM_4, 0x0, MIXER_SDM845_MASK, - &sdm845_lm_sblk, PINGPONG_MAX, 0), + &sdm845_lm_sblk, PINGPONG_MAX, 0, 0), LM_BLK("lm_5", LM_5, 0x49000, MIXER_SDM845_MASK, - &sdm845_lm_sblk, PINGPONG_3, LM_2), + &sdm845_lm_sblk, PINGPONG_3, LM_2, 0), }; /* SC7180 */ @@ -328,11 +331,25 @@ static const struct dpu_lm_sub_blks sc7180_lm_sblk = { static const struct dpu_lm_cfg sc7180_lm[] = { LM_BLK("lm_0", LM_0, 0x44000, MIXER_SC7180_MASK, - &sc7180_lm_sblk, PINGPONG_0, LM_1), + &sc7180_lm_sblk, PINGPONG_0, LM_1, DSPP_0), LM_BLK("lm_1", LM_1, 0x45000, MIXER_SC7180_MASK, - &sc7180_lm_sblk, PINGPONG_1, LM_0), + &sc7180_lm_sblk, PINGPONG_1, LM_0, 0), }; +/************************************************************* + * DSPP sub blocks config + *************************************************************/ +#define DSPP_BLK(_name, _id, _base) \ + {\ + .name = _name, .id = _id, \ + .base = _base, .len = 0x1800, \ + .features = DSPP_SC7180_MASK, \ + .sblk = NULL, \ + } + +static const struct dpu_dspp_cfg sc7180_dspp[] = { + DSPP_BLK("dspp_0", DSPP_0, 0x54000), +}; /************************************************************* * PINGPONG sub blocks config *************************************************************/ @@ -587,6 +604,8 @@ static void sc7180_cfg_init(struct dpu_mdss_cfg *dpu_cfg) .sspp = sc7180_sspp, .mixer_count = ARRAY_SIZE(sc7180_lm), .mixer = sc7180_lm, + .dspp_count = ARRAY_SIZE(sc7180_dspp), + .dspp = sc7180_dspp, .pingpong_count = ARRAY_SIZE(sc7180_pp), .pingpong = sc7180_pp, .intf_count = ARRAY_SIZE(sc7180_intf), diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h index 09df7d87dd43..f7de43838c69 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h @@ -145,6 +145,17 @@ enum { DPU_MIXER_MAX }; +/** + * DSPP sub-blocks + * @DPU_DSPP_PCC Panel color correction block + * @DPU_DSPP_GC Gamma correction block + */ +enum { + DPU_DSPP_PCC = 0x1, + DPU_DSPP_GC, + DPU_DSPP_MAX +}; + /** * PINGPONG sub-blocks * @DPU_PINGPONG_TE Tear check block @@ -377,6 +388,16 @@ struct dpu_lm_sub_blks { struct dpu_pp_blk gc; }; +/** + * struct dpu_dspp_sub_blks: Information of DSPP block + * @gc : gamma correction block + * @pcc: pixel color correction block + */ +struct dpu_dspp_sub_blks { + struct dpu_pp_blk gc; + struct dpu_pp_blk pcc; +}; + struct dpu_pingpong_sub_blks { struct dpu_pp_blk te; struct dpu_pp_blk te2; @@ -471,9 +492,23 @@ struct dpu_lm_cfg { DPU_HW_BLK_INFO; const struct dpu_lm_sub_blks *sblk; u32 pingpong; + u32 dspp; unsigned long lm_pair_mask; }; +/** + * struct dpu_dspp_cfg - information of DSPP blocks + * @id enum identifying this block + * @base register offset of this block + * @features bit mask identifying sub-blocks/features + * supported by this block + * @sblk sub-blocks information + */ +struct dpu_dspp_cfg { + DPU_HW_BLK_INFO; + const struct dpu_dspp_sub_blks *sblk; +}; + /** * struct dpu_pingpong_cfg - information of PING-PONG blocks * @id enum identifying this block @@ -688,6 +723,9 @@ struct dpu_mdss_cfg { u32 ad_count; + u32 dspp_count; + const struct dpu_dspp_cfg *dspp; + /* Add additional block data structures here */ struct dpu_perf_cfg perf; @@ -716,6 +754,7 @@ struct dpu_mdss_hw_cfg_handler { #define BLK_PINGPONG(s) ((s)->pingpong) #define BLK_INTF(s) ((s)->intf) #define BLK_AD(s) ((s)->ad) +#define BLK_DSPP(s) ((s)->dspp) /** * dpu_hw_catalog_init - dpu hardware catalog init API retrieves diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c index 831e5f7a9b7f..613ae8f0cfcd 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c @@ -272,6 +272,31 @@ static int dpu_hw_ctl_active_get_bitmask_intf(struct dpu_hw_ctl *ctx, return 0; } +static uint32_t dpu_hw_ctl_get_bitmask_dspp(struct dpu_hw_ctl *ctx, + enum dpu_dspp dspp) +{ + uint32_t flushbits = 0; + + switch (dspp) { + case DSPP_0: + flushbits = BIT(13); + break; + case DSPP_1: + flushbits = BIT(14); + break; + case DSPP_2: + flushbits = BIT(15); + break; + case DSPP_3: + flushbits = BIT(21); + break; + default: + return 0; + } + + return flushbits; +} + static u32 dpu_hw_ctl_poll_reset_status(struct dpu_hw_ctl *ctx, u32 timeout_us) { struct dpu_hw_blk_reg_map *c = &ctx->hw; @@ -548,6 +573,7 @@ static void _setup_ctl_ops(struct dpu_hw_ctl_ops *ops, ops->setup_blendstage = dpu_hw_ctl_setup_blendstage; ops->get_bitmask_sspp = dpu_hw_ctl_get_bitmask_sspp; ops->get_bitmask_mixer = dpu_hw_ctl_get_bitmask_mixer; + ops->get_bitmask_dspp = dpu_hw_ctl_get_bitmask_dspp; }; static struct dpu_hw_blk_ops dpu_hw_ops; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h index 09e1263c72e2..ec579b470a80 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h @@ -139,6 +139,9 @@ struct dpu_hw_ctl_ops { uint32_t (*get_bitmask_mixer)(struct dpu_hw_ctl *ctx, enum dpu_lm blk); + uint32_t (*get_bitmask_dspp)(struct dpu_hw_ctl *ctx, + enum dpu_dspp blk); + /** * Query the value of the intf flush mask * No effect on hardware diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c new file mode 100644 index 000000000000..75c82e92bd2a --- /dev/null +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2015-2018, The Linux Foundation. All rights reserved. + */ + +#include "dpu_hwio.h" +#include "dpu_hw_catalog.h" +#include "dpu_hw_lm.h" +#include "dpu_hw_dspp.h" +#include "dpu_kms.h" + + +static void _setup_dspp_ops(struct dpu_hw_dspp *c, + unsigned long features) +{ + return; +} + +static const struct dpu_dspp_cfg *_dspp_offset(enum dpu_dspp dspp, + const struct dpu_mdss_cfg *m, + void __iomem *addr, + struct dpu_hw_blk_reg_map *b) +{ + int i; + + if (!m || !addr || !b) + return ERR_PTR(-EINVAL); + + for (i = 0; i < m->dspp_count; i++) { + if (dspp == m->dspp[i].id) { + b->base_off = addr; + b->blk_off = m->dspp[i].base; + b->length = m->dspp[i].len; + b->hwversion = m->hwversion; + b->log_mask = DPU_DBG_MASK_DSPP; + return &m->dspp[i]; + } + } + + return ERR_PTR(-EINVAL); +} + +static struct dpu_hw_blk_ops dpu_hw_ops; + +struct dpu_hw_dspp *dpu_hw_dspp_init(enum dpu_dspp idx, + void __iomem *addr, + const struct dpu_mdss_cfg *m) +{ + struct dpu_hw_dspp *c; + const struct dpu_dspp_cfg *cfg; + + if (!addr || !m) + return ERR_PTR(-EINVAL); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return ERR_PTR(-ENOMEM); + + cfg = _dspp_offset(idx, m, addr, &c->hw); + if (IS_ERR_OR_NULL(cfg)) { + kfree(c); + return ERR_PTR(-EINVAL); + } + + /* Assign ops */ + c->idx = idx; + c->cap = cfg; + _setup_dspp_ops(c, c->cap->features); + + dpu_hw_blk_init(&c->base, DPU_HW_BLK_DSPP, idx, &dpu_hw_ops); + + return c; +} + +void dpu_hw_dspp_destroy(struct dpu_hw_dspp *dspp) +{ + if (dspp) + dpu_hw_blk_destroy(&dspp->base); + + kfree(dspp); +} + + diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h new file mode 100644 index 000000000000..09807ea3f0fd --- /dev/null +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2015-2018, The Linux Foundation. All rights reserved. + */ + +#ifndef _DPU_HW_DSPP_H +#define _DPU_HW_DSPP_H + +#include "dpu_hw_blk.h" + +struct dpu_hw_dspp; + +/** + * struct dpu_hw_dspp_ops - interface to the dspp hardware driver functions + * Caller must call the init function to get the dspp context for each dspp + * Assumption is these functions will be called after clocks are enabled + */ +struct dpu_hw_dspp_ops { + + void (*dummy)(struct dpu_hw_dspp *ctx); +}; + +/** + * struct dpu_hw_dspp - dspp description + * @base: Hardware block base structure + * @hw: Block hardware details + * @idx: DSPP index + * @cap: Pointer to layer_cfg + * @ops: Pointer to operations possible for this DSPP + */ +struct dpu_hw_dspp { + struct dpu_hw_blk base; + struct dpu_hw_blk_reg_map hw; + + /* dspp */ + int idx; + const struct dpu_dspp_cfg *cap; + + /* Ops */ + struct dpu_hw_dspp_ops ops; +}; + +/** + * dpu_hw_dspp - convert base object dpu_hw_base to container + * @hw: Pointer to base hardware block + * return: Pointer to hardware block container + */ +static inline struct dpu_hw_dspp *to_dpu_hw_dspp(struct dpu_hw_blk *hw) +{ + return container_of(hw, struct dpu_hw_dspp, base); +} + +/** + * dpu_hw_dspp_init - initializes the dspp hw driver object. + * should be called once before accessing every dspp. + * @idx: DSPP index for which driver object is required + * @addr: Mapped register io address of MDP + * @Return: pointer to structure or ERR_PTR + */ +struct dpu_hw_dspp *dpu_hw_dspp_init(enum dpu_dspp idx, + void __iomem *addr, const struct dpu_mdss_cfg *m); + +/** + * dpu_hw_dspp_destroy(): Destroys DSPP driver context + * @dspp: Pointer to DSPP driver context + */ +void dpu_hw_dspp_destroy(struct dpu_hw_dspp *dspp); + +#endif /*_DPU_HW_DSPP_H */ + diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h index 686882132bf6..402dc5832361 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h @@ -95,6 +95,7 @@ enum dpu_hw_blk_type { DPU_HW_BLK_PINGPONG, DPU_HW_BLK_INTF, DPU_HW_BLK_WB, + DPU_HW_BLK_DSPP, DPU_HW_BLK_MAX, }; @@ -425,5 +426,6 @@ struct dpu_mdss_color { #define DPU_DBG_MASK_TOP (1 << 7) #define DPU_DBG_MASK_VBIF (1 << 8) #define DPU_DBG_MASK_ROT (1 << 9) +#define DPU_DBG_MASK_DSPP (1 << 10) #endif /* _DPU_HW_MDSS_H */ diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h index 211f5de99a44..4e32d040f1e6 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h @@ -158,6 +158,7 @@ struct dpu_global_state { uint32_t mixer_to_enc_id[LM_MAX - LM_0]; uint32_t ctl_to_enc_id[CTL_MAX - CTL_0]; uint32_t intf_to_enc_id[INTF_MAX - INTF_0]; + uint32_t dspp_to_enc_id[DSPP_MAX - DSPP_0]; }; struct dpu_global_state diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c index 9b62451b01ee..9b2b5044e8e0 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c @@ -9,6 +9,7 @@ #include "dpu_hw_ctl.h" #include "dpu_hw_pingpong.h" #include "dpu_hw_intf.h" +#include "dpu_hw_dspp.h" #include "dpu_encoder.h" #include "dpu_trace.h" @@ -174,6 +175,23 @@ int dpu_rm_init(struct dpu_rm *rm, rm->ctl_blks[ctl->id - CTL_0] = &hw->base; } + for (i = 0; i < cat->dspp_count; i++) { + struct dpu_hw_dspp *hw; + const struct dpu_dspp_cfg *dspp = &cat->dspp[i]; + + if (dspp->id < DSPP_0 || dspp->id >= DSPP_MAX) { + DPU_ERROR("skip dspp %d with invalid id\n", dspp->id); + continue; + } + hw = dpu_hw_dspp_init(dspp->id, mmio, cat); + if (IS_ERR_OR_NULL(hw)) { + rc = PTR_ERR(hw); + DPU_ERROR("failed dspp object creation: err %d\n", rc); + goto fail; + } + rm->dspp_blks[dspp->id - DSPP_0] = &hw->base; + } + return 0; fail: @@ -222,12 +240,17 @@ static bool _dpu_rm_check_lm_peer(struct dpu_rm *rm, int primary_idx, * if lm, and all other hardwired blocks connected to the lm (pp) is * available and appropriate * @pp_idx: output parameter, index of pingpong block attached to the layer - * mixer in rm->pongpong_blks[]. + * mixer in rm->pingpong_blks[]. + * @dspp_idx: output parameter, index of dspp block attached to the layer + * mixer in rm->dspp_blks[]. + * @reqs: input parameter, rm requirements for HW blocks needed in the + * datapath. * @Return: true if lm matches all requirements, false otherwise */ static bool _dpu_rm_check_lm_and_get_connected_blks(struct dpu_rm *rm, struct dpu_global_state *global_state, - uint32_t enc_id, int lm_idx, int *pp_idx) + uint32_t enc_id, int lm_idx, int *pp_idx, int *dspp_idx, + struct dpu_rm_requirements *reqs) { const struct dpu_lm_cfg *lm_cfg; int idx; @@ -251,6 +274,23 @@ static bool _dpu_rm_check_lm_and_get_connected_blks(struct dpu_rm *rm, return false; } *pp_idx = idx; + + if (!reqs->topology.num_dspp) + return true; + + idx = lm_cfg->dspp - DSPP_0; + if (idx < 0 || idx >= ARRAY_SIZE(rm->dspp_blks)) { + DPU_ERROR("failed to get dspp on lm %d\n", lm_cfg->dspp); + return false; + } + + if (reserved_by_other(global_state->dspp_to_enc_id, idx, enc_id)) { + DPU_DEBUG("lm %d dspp %d already reserved\n", lm_cfg->id, + lm_cfg->dspp); + return false; + } + *dspp_idx = idx; + return true; } @@ -262,6 +302,7 @@ static int _dpu_rm_reserve_lms(struct dpu_rm *rm, { int lm_idx[MAX_BLOCKS]; int pp_idx[MAX_BLOCKS]; + int dspp_idx[MAX_BLOCKS] = {0}; int i, j, lm_count = 0; if (!reqs->topology.num_lm) { @@ -279,7 +320,8 @@ static int _dpu_rm_reserve_lms(struct dpu_rm *rm, lm_idx[lm_count] = i; if (!_dpu_rm_check_lm_and_get_connected_blks(rm, global_state, - enc_id, i, &pp_idx[lm_count])) { + enc_id, i, &pp_idx[lm_count], + &dspp_idx[lm_count], reqs)) { continue; } @@ -299,7 +341,8 @@ static int _dpu_rm_reserve_lms(struct dpu_rm *rm, if (!_dpu_rm_check_lm_and_get_connected_blks(rm, global_state, enc_id, j, - &pp_idx[lm_count])) { + &pp_idx[lm_count], &dspp_idx[lm_count], + reqs)) { continue; } @@ -316,6 +359,8 @@ static int _dpu_rm_reserve_lms(struct dpu_rm *rm, for (i = 0; i < lm_count; i++) { global_state->mixer_to_enc_id[lm_idx[i]] = enc_id; global_state->pingpong_to_enc_id[pp_idx[i]] = enc_id; + global_state->dspp_to_enc_id[dspp_idx[i]] = + reqs->topology.num_dspp ? enc_id : 0; trace_dpu_rm_reserve_lms(lm_idx[i] + LM_0, enc_id, pp_idx[i] + PINGPONG_0); @@ -560,6 +605,11 @@ int dpu_rm_get_assigned_resources(struct dpu_rm *rm, hw_to_enc_id = global_state->intf_to_enc_id; max_blks = ARRAY_SIZE(rm->intf_blks); break; + case DPU_HW_BLK_DSPP: + hw_blks = rm->dspp_blks; + hw_to_enc_id = global_state->dspp_to_enc_id; + max_blks = ARRAY_SIZE(rm->dspp_blks); + break; default: DPU_ERROR("blk type %d not managed by rm\n", type); return 0; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.h index 6d2b04f306f0..08726bb1063a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_rm.h @@ -19,6 +19,7 @@ struct dpu_global_state; * @mixer_blks: array of layer mixer hardware resources * @ctl_blks: array of ctl hardware resources * @intf_blks: array of intf hardware resources + * @dspp_blks: array of dspp hardware resources * @lm_max_width: cached layer mixer maximum width * @rm_lock: resource manager mutex */ @@ -27,6 +28,7 @@ struct dpu_rm { struct dpu_hw_blk *mixer_blks[LM_MAX - LM_0]; struct dpu_hw_blk *ctl_blks[CTL_MAX - CTL_0]; struct dpu_hw_blk *intf_blks[INTF_MAX - INTF_0]; + struct dpu_hw_blk *dspp_blks[DSPP_MAX - DSPP_0]; uint32_t lm_max_width; }; diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 194d900a460e..0ab49aa781a3 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -105,6 +105,7 @@ struct msm_display_topology { u32 num_lm; u32 num_enc; u32 num_intf; + u32 num_dspp; }; /** From 4259ff7ae509ed880b3a7bb685972c3a3bf4b74b Mon Sep 17 00:00:00 2001 From: Kalyan Thota Date: Tue, 24 Mar 2020 15:31:19 +0530 Subject: [PATCH 169/574] drm/msm/dpu: add support for pcc color block in dpu driver This change adds support for color correction sub block for SC7180 device. Signed-off-by: Kalyan Thota Tested-by: Fritz Koenig Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c | 77 +++++++++++++++++++ .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 9 ++- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c | 49 +++++++++++- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h | 33 +++++++- 4 files changed, 164 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c index cf3af731a324..e15b42a780e0 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,7 @@ #include "dpu_kms.h" #include "dpu_hw_lm.h" #include "dpu_hw_ctl.h" +#include "dpu_hw_dspp.h" #include "dpu_crtc.h" #include "dpu_plane.h" #include "dpu_encoder.h" @@ -40,6 +42,9 @@ /* timeout in ms waiting for frame done */ #define DPU_CRTC_FRAME_DONE_TIMEOUT_MS 60 +#define CONVERT_S3_15(val) \ + (((((u64)val) & ~BIT_ULL(63)) >> 17) & GENMASK_ULL(17, 0)) + static struct dpu_kms *_dpu_crtc_get_kms(struct drm_crtc *crtc) { struct msm_drm_private *priv = crtc->dev->dev_private; @@ -420,6 +425,74 @@ static void _dpu_crtc_setup_lm_bounds(struct drm_crtc *crtc, drm_mode_debug_printmodeline(adj_mode); } +static void _dpu_crtc_get_pcc_coeff(struct drm_crtc_state *state, + struct dpu_hw_pcc_cfg *cfg) +{ + struct drm_color_ctm *ctm; + + memset(cfg, 0, sizeof(struct dpu_hw_pcc_cfg)); + + ctm = (struct drm_color_ctm *)state->ctm->data; + + if (!ctm) + return; + + cfg->r.r = CONVERT_S3_15(ctm->matrix[0]); + cfg->g.r = CONVERT_S3_15(ctm->matrix[1]); + cfg->b.r = CONVERT_S3_15(ctm->matrix[2]); + + cfg->r.g = CONVERT_S3_15(ctm->matrix[3]); + cfg->g.g = CONVERT_S3_15(ctm->matrix[4]); + cfg->b.g = CONVERT_S3_15(ctm->matrix[5]); + + cfg->r.b = CONVERT_S3_15(ctm->matrix[6]); + cfg->g.b = CONVERT_S3_15(ctm->matrix[7]); + cfg->b.b = CONVERT_S3_15(ctm->matrix[8]); +} + +static void _dpu_crtc_setup_cp_blocks(struct drm_crtc *crtc) +{ + struct drm_crtc_state *state = crtc->state; + struct dpu_crtc_state *cstate = to_dpu_crtc_state(crtc->state); + struct dpu_crtc_mixer *mixer = cstate->mixers; + struct dpu_hw_pcc_cfg cfg; + struct dpu_hw_ctl *ctl; + struct dpu_hw_mixer *lm; + struct dpu_hw_dspp *dspp; + int i; + + + if (!state->color_mgmt_changed) + return; + + for (i = 0; i < cstate->num_mixers; i++) { + ctl = mixer[i].lm_ctl; + lm = mixer[i].hw_lm; + dspp = mixer[i].hw_dspp; + + if (!dspp || !dspp->ops.setup_pcc) + continue; + + if (!state->ctm) { + dspp->ops.setup_pcc(dspp, NULL); + } else { + _dpu_crtc_get_pcc_coeff(state, &cfg); + dspp->ops.setup_pcc(dspp, &cfg); + } + + mixer[i].flush_mask |= ctl->ops.get_bitmask_dspp(ctl, + mixer[i].hw_dspp->idx); + + /* stage config flush mask */ + ctl->ops.update_pending_flush(ctl, mixer[i].flush_mask); + + DPU_DEBUG("lm %d, ctl %d, flush mask 0x%x\n", + mixer[i].hw_lm->idx - DSPP_0, + ctl->idx - CTL_0, + mixer[i].flush_mask); + } +} + static void dpu_crtc_atomic_begin(struct drm_crtc *crtc, struct drm_crtc_state *old_state) { @@ -471,6 +544,8 @@ static void dpu_crtc_atomic_begin(struct drm_crtc *crtc, _dpu_crtc_blend_setup(crtc); + _dpu_crtc_setup_cp_blocks(crtc); + /* * PP_DONE irq is only used by command mode for now. * It is better to request pending before FLUSH and START trigger @@ -1301,6 +1376,8 @@ struct drm_crtc *dpu_crtc_init(struct drm_device *dev, struct drm_plane *plane, drm_crtc_helper_add(crtc, &dpu_crtc_helper_funcs); + drm_crtc_enable_color_mgmt(crtc, 0, true, 0); + /* save user friendly CRTC name for later */ snprintf(dpu_crtc->name, DPU_CRTC_NAME_SIZE, "crtc%u", crtc->base.id); diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index 19d065ad0b25..731b4fbbf647 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -41,7 +41,7 @@ #define PINGPONG_SDM845_SPLIT_MASK \ (PINGPONG_SDM845_MASK | BIT(DPU_PINGPONG_TE2)) -#define DSPP_SC7180_MASK 0 +#define DSPP_SC7180_MASK BIT(DPU_DSPP_PCC) #define DEFAULT_PIXEL_RAM_SIZE (50 * 1024) #define DEFAULT_DPU_LINE_WIDTH 2048 @@ -339,12 +339,17 @@ static const struct dpu_lm_cfg sc7180_lm[] = { /************************************************************* * DSPP sub blocks config *************************************************************/ +static const struct dpu_dspp_sub_blks sc7180_dspp_sblk = { + .pcc = {.id = DPU_DSPP_PCC, .base = 0x1700, + .len = 0x90, .version = 0x10000}, +}; + #define DSPP_BLK(_name, _id, _base) \ {\ .name = _name, .id = _id, \ .base = _base, .len = 0x1800, \ .features = DSPP_SC7180_MASK, \ - .sblk = NULL, \ + .sblk = &sc7180_dspp_sblk \ } static const struct dpu_dspp_cfg sc7180_dspp[] = { diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c index 75c82e92bd2a..b5189cece3c6 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c @@ -9,10 +9,57 @@ #include "dpu_kms.h" +/* DSPP_PCC */ +#define PCC_EN BIT(0) +#define PCC_DIS 0 +#define PCC_RED_R_OFF 0x10 +#define PCC_RED_G_OFF 0x1C +#define PCC_RED_B_OFF 0x28 +#define PCC_GREEN_R_OFF 0x14 +#define PCC_GREEN_G_OFF 0x20 +#define PCC_GREEN_B_OFF 0x2C +#define PCC_BLUE_R_OFF 0x18 +#define PCC_BLUE_G_OFF 0x24 +#define PCC_BLUE_B_OFF 0x30 + +void dpu_setup_dspp_pcc(struct dpu_hw_dspp *ctx, + struct dpu_hw_pcc_cfg *cfg) +{ + + u32 base = ctx->cap->sblk->pcc.base; + + if (!ctx || !base) { + DRM_ERROR("invalid ctx %pK pcc base 0x%x\n", ctx, base); + return; + } + + if (!cfg) { + DRM_DEBUG_DRIVER("disable pcc feature\n"); + DPU_REG_WRITE(&ctx->hw, base, PCC_DIS); + return; + } + + DPU_REG_WRITE(&ctx->hw, base + PCC_RED_R_OFF, cfg->r.r); + DPU_REG_WRITE(&ctx->hw, base + PCC_RED_G_OFF, cfg->r.g); + DPU_REG_WRITE(&ctx->hw, base + PCC_RED_B_OFF, cfg->r.b); + + DPU_REG_WRITE(&ctx->hw, base + PCC_GREEN_R_OFF, cfg->g.r); + DPU_REG_WRITE(&ctx->hw, base + PCC_GREEN_G_OFF, cfg->g.g); + DPU_REG_WRITE(&ctx->hw, base + PCC_GREEN_B_OFF, cfg->g.b); + + DPU_REG_WRITE(&ctx->hw, base + PCC_BLUE_R_OFF, cfg->b.r); + DPU_REG_WRITE(&ctx->hw, base + PCC_BLUE_G_OFF, cfg->b.g); + DPU_REG_WRITE(&ctx->hw, base + PCC_BLUE_B_OFF, cfg->b.b); + + DPU_REG_WRITE(&ctx->hw, base, PCC_EN); +} + static void _setup_dspp_ops(struct dpu_hw_dspp *c, unsigned long features) { - return; + if (test_bit(DPU_DSPP_PCC, &features) && + IS_SC7180_TARGET(c->hw.hwversion)) + c->ops.setup_pcc = dpu_setup_dspp_pcc; } static const struct dpu_dspp_cfg *_dspp_offset(enum dpu_dspp dspp, diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h index 09807ea3f0fd..7fa189cfcb06 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.h @@ -9,14 +9,45 @@ struct dpu_hw_dspp; +/** + * struct dpu_hw_pcc_coeff - PCC coefficient structure for each color + * component. + * @r: red coefficient. + * @g: green coefficient. + * @b: blue coefficient. + */ + +struct dpu_hw_pcc_coeff { + __u32 r; + __u32 g; + __u32 b; +}; + +/** + * struct dpu_hw_pcc - pcc feature structure + * @r: red coefficients. + * @g: green coefficients. + * @b: blue coefficients. + */ +struct dpu_hw_pcc_cfg { + struct dpu_hw_pcc_coeff r; + struct dpu_hw_pcc_coeff g; + struct dpu_hw_pcc_coeff b; +}; + /** * struct dpu_hw_dspp_ops - interface to the dspp hardware driver functions * Caller must call the init function to get the dspp context for each dspp * Assumption is these functions will be called after clocks are enabled */ struct dpu_hw_dspp_ops { + /** + * setup_pcc - setup dspp pcc + * @ctx: Pointer to dspp context + * @cfg: Pointer to configuration + */ + void (*setup_pcc)(struct dpu_hw_dspp *ctx, struct dpu_hw_pcc_cfg *cfg); - void (*dummy)(struct dpu_hw_dspp *ctx); }; /** From 04d9044f6c577948609c03b4e33b8fbc8b87c4b1 Mon Sep 17 00:00:00 2001 From: Kalyan Thota Date: Wed, 1 Apr 2020 14:18:18 +0530 Subject: [PATCH 170/574] drm/msm/dpu: add support for clk and bw scaling for display This change adds support to scale src clk and bandwidth as per composition requirements. Interconnect registration for bw has been moved to mdp device node from mdss to facilitate the scaling. Signed-off-by: Kalyan Thota Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c | 106 +++++++++++++++--- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 5 +- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h | 4 + drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 37 +++++- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h | 4 + drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c | 9 +- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 82 ++++++++++++++ drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h | 4 + 8 files changed, 228 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c index 11f2bebe3869..24874f62de13 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c @@ -29,6 +29,73 @@ enum dpu_perf_mode { DPU_PERF_MODE_MAX }; +/** + * @_dpu_core_perf_calc_bw() - to calculate BW per crtc + * @kms - pointer to the dpu_kms + * @crtc - pointer to a crtc + * Return: returns aggregated BW for all planes in crtc. + */ +static u64 _dpu_core_perf_calc_bw(struct dpu_kms *kms, + struct drm_crtc *crtc) +{ + struct drm_plane *plane; + struct dpu_plane_state *pstate; + u64 crtc_plane_bw = 0; + u32 bw_factor; + + drm_atomic_crtc_for_each_plane(plane, crtc) { + pstate = to_dpu_plane_state(plane->state); + + if (!pstate) + continue; + + crtc_plane_bw += pstate->plane_fetch_bw; + } + + bw_factor = kms->catalog->perf.bw_inefficiency_factor; + if (bw_factor) + crtc_plane_bw = mult_frac(crtc_plane_bw, bw_factor, 100); + + return crtc_plane_bw; +} + +/** + * _dpu_core_perf_calc_clk() - to calculate clock per crtc + * @kms - pointer to the dpu_kms + * @crtc - pointer to a crtc + * @state - pointer to a crtc state + * Return: returns max clk for all planes in crtc. + */ +static u64 _dpu_core_perf_calc_clk(struct dpu_kms *kms, + struct drm_crtc *crtc, struct drm_crtc_state *state) +{ + struct drm_plane *plane; + struct dpu_plane_state *pstate; + struct drm_display_mode *mode; + u64 crtc_clk; + u32 clk_factor; + + mode = &state->adjusted_mode; + + crtc_clk = mode->vtotal * mode->hdisplay * drm_mode_vrefresh(mode); + + drm_atomic_crtc_for_each_plane(plane, crtc) { + pstate = to_dpu_plane_state(plane->state); + + if (!pstate) + continue; + + crtc_clk = max(pstate->plane_clk, crtc_clk); + } + + clk_factor = kms->catalog->perf.clk_inefficiency_factor; + if (clk_factor) + crtc_clk = mult_frac(crtc_clk, clk_factor, 100); + + return crtc_clk; +} + + static struct dpu_kms *_dpu_crtc_get_kms(struct drm_crtc *crtc) { struct msm_drm_private *priv; @@ -67,12 +134,7 @@ static void _dpu_core_perf_calc_crtc(struct dpu_kms *kms, dpu_cstate = to_dpu_crtc_state(state); memset(perf, 0, sizeof(struct dpu_core_perf_params)); - if (!dpu_cstate->bw_control) { - perf->bw_ctl = kms->catalog->perf.max_bw_high * - 1000ULL; - perf->max_per_pipe_ib = perf->bw_ctl; - perf->core_clk_rate = kms->perf.max_core_clk_rate; - } else if (kms->perf.perf_tune.mode == DPU_PERF_MODE_MINIMUM) { + if (kms->perf.perf_tune.mode == DPU_PERF_MODE_MINIMUM) { perf->bw_ctl = 0; perf->max_per_pipe_ib = 0; perf->core_clk_rate = 0; @@ -80,6 +142,10 @@ static void _dpu_core_perf_calc_crtc(struct dpu_kms *kms, perf->bw_ctl = kms->perf.fix_core_ab_vote; perf->max_per_pipe_ib = kms->perf.fix_core_ib_vote; perf->core_clk_rate = kms->perf.fix_core_clk_rate; + } else { + perf->bw_ctl = _dpu_core_perf_calc_bw(kms, crtc); + perf->max_per_pipe_ib = kms->catalog->perf.min_dram_ib; + perf->core_clk_rate = _dpu_core_perf_calc_clk(kms, crtc, state); } DPU_DEBUG( @@ -132,11 +198,7 @@ int dpu_core_perf_crtc_check(struct drm_crtc *crtc, DPU_DEBUG("crtc:%d bw:%llu ctrl:%d\n", tmp_crtc->base.id, tmp_cstate->new_perf.bw_ctl, tmp_cstate->bw_control); - /* - * For bw check only use the bw if the - * atomic property has been already set - */ - if (tmp_cstate->bw_control) + bw_sum_of_intfs += tmp_cstate->new_perf.bw_ctl; } @@ -152,9 +214,7 @@ int dpu_core_perf_crtc_check(struct drm_crtc *crtc, DPU_DEBUG("final threshold bw limit = %d\n", threshold); - if (!dpu_cstate->bw_control) { - DPU_DEBUG("bypass bandwidth check\n"); - } else if (!threshold) { + if (!threshold) { DPU_ERROR("no bandwidth limits specified\n"); return -E2BIG; } else if (bw > threshold) { @@ -175,7 +235,8 @@ static int _dpu_core_perf_crtc_update_bus(struct dpu_kms *kms, = dpu_crtc_get_client_type(crtc); struct drm_crtc *tmp_crtc; struct dpu_crtc_state *dpu_cstate; - int ret = 0; + int i, ret = 0; + u64 avg_bw; drm_for_each_crtc(tmp_crtc, crtc->dev) { if (tmp_crtc->enabled && @@ -186,10 +247,21 @@ static int _dpu_core_perf_crtc_update_bus(struct dpu_kms *kms, perf.max_per_pipe_ib = max(perf.max_per_pipe_ib, dpu_cstate->new_perf.max_per_pipe_ib); - DPU_DEBUG("crtc=%d bw=%llu\n", tmp_crtc->base.id, - dpu_cstate->new_perf.bw_ctl); + perf.bw_ctl += dpu_cstate->new_perf.bw_ctl; + + DPU_DEBUG("crtc=%d bw=%llu paths:%d\n", + tmp_crtc->base.id, + dpu_cstate->new_perf.bw_ctl, kms->num_paths); } } + + avg_bw = kms->num_paths ? + perf.bw_ctl / kms->num_paths : 0; + + for (i = 0; i < kms->num_paths; i++) + icc_set_bw(kms->path[i], + Bps_to_icc(avg_bw), (perf.max_per_pipe_ib)); + return ret; } diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index 731b4fbbf647..cc8bf06781c0 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -541,7 +541,8 @@ static const struct dpu_perf_cfg sc7180_perf_data = { .max_bw_high = 5500000, .min_core_ib = 2400000, .min_llcc_ib = 800000, - .min_dram_ib = 800000, + .min_dram_ib = 1600000, + .min_prefill_lines = 24, .danger_lut_tbl = {0xff, 0xffff, 0x0}, .qos_lut_tbl = { {.nentry = ARRAY_SIZE(sc7180_qos_linear), @@ -558,6 +559,8 @@ static const struct dpu_perf_cfg sc7180_perf_data = { {.rd_enable = 1, .wr_enable = 1}, {.rd_enable = 1, .wr_enable = 0} }, + .clk_inefficiency_factor = 105, + .bw_inefficiency_factor = 120, }; /************************************************************* diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h index f7de43838c69..f2a5fe2d9d62 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h @@ -651,6 +651,8 @@ struct dpu_perf_cdp_cfg { * @downscaling_prefill_lines downscaling latency in lines * @amortizable_theshold minimum y position for traffic shaping prefill * @min_prefill_lines minimum pipeline latency in lines + * @clk_inefficiency_factor DPU src clock inefficiency factor + * @bw_inefficiency_factor DPU axi bus bw inefficiency factor * @safe_lut_tbl: LUT tables for safe signals * @danger_lut_tbl: LUT tables for danger signals * @qos_lut_tbl: LUT tables for QoS signals @@ -675,6 +677,8 @@ struct dpu_perf_cfg { u32 downscaling_prefill_lines; u32 amortizable_threshold; u32 min_prefill_lines; + u32 clk_inefficiency_factor; + u32 bw_inefficiency_factor; u32 safe_lut_tbl[DPU_QOS_LUT_USAGE_MAX]; u32 danger_lut_tbl[DPU_QOS_LUT_USAGE_MAX]; struct dpu_qos_lut_tbl qos_lut_tbl[DPU_QOS_LUT_USAGE_MAX]; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c index ce19f1d39367..10c33cf7a5d7 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c @@ -303,6 +303,28 @@ static int dpu_kms_global_obj_init(struct dpu_kms *dpu_kms) return 0; } +static int dpu_kms_parse_data_bus_icc_path(struct dpu_kms *dpu_kms) +{ + struct icc_path *path0; + struct icc_path *path1; + struct drm_device *dev = dpu_kms->dev; + + path0 = of_icc_get(dev->dev, "mdp0-mem"); + path1 = of_icc_get(dev->dev, "mdp1-mem"); + + if (IS_ERR_OR_NULL(path0)) + return PTR_ERR_OR_ZERO(path0); + + dpu_kms->path[0] = path0; + dpu_kms->num_paths = 1; + + if (!IS_ERR_OR_NULL(path1)) { + dpu_kms->path[1] = path1; + dpu_kms->num_paths++; + } + return 0; +} + static int dpu_kms_enable_vblank(struct msm_kms *kms, struct drm_crtc *crtc) { return dpu_crtc_vblank(crtc, true); @@ -980,6 +1002,9 @@ static int dpu_kms_hw_init(struct msm_kms *kms) dpu_vbif_init_memtypes(dpu_kms); + if (of_device_is_compatible(dev->dev->of_node, "qcom,sc7180-mdss")) + dpu_kms_parse_data_bus_icc_path(dpu_kms); + pm_runtime_put_sync(&dpu_kms->pdev->dev); return 0; @@ -1085,7 +1110,7 @@ static int dpu_dev_remove(struct platform_device *pdev) static int __maybe_unused dpu_runtime_suspend(struct device *dev) { - int rc = -1; + int i, rc = -1; struct platform_device *pdev = to_platform_device(dev); struct dpu_kms *dpu_kms = platform_get_drvdata(pdev); struct dss_module_power *mp = &dpu_kms->mp; @@ -1094,6 +1119,9 @@ static int __maybe_unused dpu_runtime_suspend(struct device *dev) if (rc) DPU_ERROR("clock disable failed rc:%d\n", rc); + for (i = 0; i < dpu_kms->num_paths; i++) + icc_set_bw(dpu_kms->path[i], 0, 0); + return rc; } @@ -1105,8 +1133,15 @@ static int __maybe_unused dpu_runtime_resume(struct device *dev) struct drm_encoder *encoder; struct drm_device *ddev; struct dss_module_power *mp = &dpu_kms->mp; + int i; ddev = dpu_kms->dev; + + /* Min vote of BW is required before turning on AXI clk */ + for (i = 0; i < dpu_kms->num_paths; i++) + icc_set_bw(dpu_kms->path[i], 0, + dpu_kms->catalog->perf.min_dram_ib); + rc = msm_dss_enable_clk(mp->clk_config, mp->num_clk, true); if (rc) { DPU_ERROR("clock enable failed rc:%d\n", rc); diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h index 4e32d040f1e6..94410ca9bd70 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h @@ -8,6 +8,8 @@ #ifndef __DPU_KMS_H__ #define __DPU_KMS_H__ +#include + #include #include "msm_drv.h" @@ -137,6 +139,8 @@ struct dpu_kms { * when disabled. */ atomic_t bandwidth_ref; + struct icc_path *path[2]; + u32 num_paths; }; struct vsync_info { diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c index 80d3cfc14007..df0a9835bfb1 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c @@ -8,7 +8,6 @@ #include #include #include "dpu_kms.h" -#include #define to_dpu_mdss(x) container_of(x, struct dpu_mdss, base) @@ -315,9 +314,11 @@ int dpu_mdss_init(struct drm_device *dev) } dpu_mdss->mmio_len = resource_size(res); - ret = dpu_mdss_parse_data_bus_icc_path(dev, dpu_mdss); - if (ret) - return ret; + if (!of_device_is_compatible(dev->dev->of_node, "qcom,sc7180-mdss")) { + ret = dpu_mdss_parse_data_bus_icc_path(dev, dpu_mdss); + if (ret) + return ret; + } mp = &dpu_mdss->mp; ret = msm_dss_parse_clock(pdev, mp); diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index 3b9c33e694bf..c2a6e3dacd68 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -131,6 +131,84 @@ static struct dpu_kms *_dpu_plane_get_kms(struct drm_plane *plane) return to_dpu_kms(priv->kms); } +/** + * _dpu_plane_calc_bw - calculate bandwidth required for a plane + * @Plane: Pointer to drm plane. + * Result: Updates calculated bandwidth in the plane state. + * BW Equation: src_w * src_h * bpp * fps * (v_total / v_dest) + * Prefill BW Equation: line src bytes * line_time + */ +static void _dpu_plane_calc_bw(struct drm_plane *plane, + struct drm_framebuffer *fb) +{ + struct dpu_plane *pdpu = to_dpu_plane(plane); + struct dpu_plane_state *pstate; + struct drm_display_mode *mode; + const struct dpu_format *fmt = NULL; + struct dpu_kms *dpu_kms = _dpu_plane_get_kms(plane); + int src_width, src_height, dst_height, fps; + u64 plane_prefill_bw; + u64 plane_bw; + u32 hw_latency_lines; + u32 scale_factor; + int vbp, vpw; + + pstate = to_dpu_plane_state(plane->state); + mode = &plane->state->crtc->mode; + + fmt = dpu_get_dpu_format_ext(fb->format->format, fb->modifier); + + src_width = drm_rect_width(&pdpu->pipe_cfg.src_rect); + src_height = drm_rect_height(&pdpu->pipe_cfg.src_rect); + dst_height = drm_rect_height(&pdpu->pipe_cfg.dst_rect); + fps = drm_mode_vrefresh(mode); + vbp = mode->vtotal - mode->vsync_end; + vpw = mode->vsync_end - mode->vsync_start; + hw_latency_lines = dpu_kms->catalog->perf.min_prefill_lines; + scale_factor = src_height > dst_height ? + mult_frac(src_height, 1, dst_height) : 1; + + plane_bw = + src_width * mode->vtotal * fps * fmt->bpp * scale_factor; + + plane_prefill_bw = + src_width * hw_latency_lines * fps * fmt->bpp * scale_factor; + + plane_prefill_bw = mult_frac(plane_prefill_bw, mode->vtotal, (vbp+vpw)); + + pstate->plane_fetch_bw = max(plane_bw, plane_prefill_bw); +} + + +/** + * _dpu_plane_calc_clk - calculate clock required for a plane + * @Plane: Pointer to drm plane. + * Result: Updates calculated clock in the plane state. + * Clock equation: dst_w * v_total * fps * (src_h / dst_h) + */ +static void _dpu_plane_calc_clk(struct drm_plane *plane) +{ + struct dpu_plane *pdpu = to_dpu_plane(plane); + struct dpu_plane_state *pstate; + struct drm_display_mode *mode; + int dst_width, src_height, dst_height, fps; + + pstate = to_dpu_plane_state(plane->state); + mode = &plane->state->crtc->mode; + + src_height = drm_rect_height(&pdpu->pipe_cfg.src_rect); + dst_width = drm_rect_width(&pdpu->pipe_cfg.dst_rect); + dst_height = drm_rect_height(&pdpu->pipe_cfg.dst_rect); + fps = drm_mode_vrefresh(mode); + + pstate->plane_clk = + dst_width * mode->vtotal * fps; + + if (src_height > dst_height) + pstate->plane_clk = mult_frac(pstate->plane_clk, + src_height, dst_height); +} + /** * _dpu_plane_calc_fill_level - calculate fill level of the given source format * @plane: Pointer to drm plane @@ -1102,6 +1180,10 @@ static void dpu_plane_sspp_atomic_update(struct drm_plane *plane) } _dpu_plane_set_qos_remap(plane); + + _dpu_plane_calc_bw(plane, fb); + + _dpu_plane_calc_clk(plane); } static void _dpu_plane_atomic_disable(struct drm_plane *plane) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h index 456949713e90..ca83b8753d59 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h @@ -25,6 +25,8 @@ * @scaler3_cfg: configuration data for scaler3 * @pixel_ext: configuration data for pixel extensions * @cdp_cfg: CDP configuration + * @plane_fetch_bw: calculated BW per plane + * @plane_clk: calculated clk per plane */ struct dpu_plane_state { struct drm_plane_state base; @@ -39,6 +41,8 @@ struct dpu_plane_state { struct dpu_hw_pixel_ext pixel_ext; struct dpu_hw_pipe_cdp_cfg cdp_cfg; + u64 plane_fetch_bw; + u64 plane_clk; }; /** From 71dc6c08e4c53d37e3756502b1793f440f4d4030 Mon Sep 17 00:00:00 2001 From: Krishna Manikandan Date: Mon, 4 May 2020 19:01:03 +0530 Subject: [PATCH 171/574] drm/msm/dpu: update bandwidth threshold check Maximum allowed bandwidth has no dependency on the type of panel used. Hence, cleanup the code to use max_bw_high as the threshold value for bandwidth checks. Update the maximum allowed bandwidth as 6.8Gbps for SC7180 target. Signed-off-by: Krishna Manikandan Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c | 23 +------------------ .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 4 ++-- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c index 24874f62de13..9697abcbec3f 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c @@ -103,22 +103,6 @@ static struct dpu_kms *_dpu_crtc_get_kms(struct drm_crtc *crtc) return to_dpu_kms(priv->kms); } -static bool _dpu_core_video_mode_intf_connected(struct drm_crtc *crtc) -{ - struct drm_crtc *tmp_crtc; - - drm_for_each_crtc(tmp_crtc, crtc->dev) { - if ((dpu_crtc_get_intf_mode(tmp_crtc) == INTF_MODE_VIDEO) && - tmp_crtc->enabled) { - DPU_DEBUG("video interface connected crtc:%d\n", - tmp_crtc->base.id); - return true; - } - } - - return false; -} - static void _dpu_core_perf_calc_crtc(struct dpu_kms *kms, struct drm_crtc *crtc, struct drm_crtc_state *state, @@ -160,7 +144,6 @@ int dpu_core_perf_crtc_check(struct drm_crtc *crtc, u32 bw, threshold; u64 bw_sum_of_intfs = 0; enum dpu_crtc_client_type curr_client_type; - bool is_video_mode; struct dpu_crtc_state *dpu_cstate; struct drm_crtc *tmp_crtc; struct dpu_kms *kms; @@ -206,11 +189,7 @@ int dpu_core_perf_crtc_check(struct drm_crtc *crtc, bw = DIV_ROUND_UP_ULL(bw_sum_of_intfs, 1000); DPU_DEBUG("calculated bandwidth=%uk\n", bw); - is_video_mode = dpu_crtc_get_intf_mode(crtc) == INTF_MODE_VIDEO; - threshold = (is_video_mode || - _dpu_core_video_mode_intf_connected(crtc)) ? - kms->catalog->perf.max_bw_low : - kms->catalog->perf.max_bw_high; + threshold = kms->catalog->perf.max_bw_high; DPU_DEBUG("final threshold bw limit = %d\n", threshold); diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index cc8bf06781c0..8f2357d9960a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -537,8 +537,8 @@ static const struct dpu_perf_cfg sdm845_perf_data = { }; static const struct dpu_perf_cfg sc7180_perf_data = { - .max_bw_low = 3900000, - .max_bw_high = 5500000, + .max_bw_low = 6800000, + .max_bw_high = 6800000, .min_core_ib = 2400000, .min_llcc_ib = 800000, .min_dram_ib = 1600000, From eadf79286a4badebc95af7061530bdb50a7e6f38 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Fri, 1 May 2020 13:43:26 -0600 Subject: [PATCH 172/574] drm/msm: Check for powered down HW in the devfreq callbacks Writing to the devfreq sysfs nodes while the GPU is powered down can result in a system crash (on a5xx) or a nasty GMU error (on a6xx): $ /sys/class/devfreq/5000000.gpu# echo 500000000 > min_freq [ 104.841625] platform 506a000.gmu: [drm:a6xx_gmu_set_oob] *ERROR* Timeout waiting for GMU OOB set GPU_DCVS: 0x0 Despite the fact that we carefully try to suspend the devfreq device when the hardware is powered down there are lots of holes in the governors that don't check for the suspend state and blindly call into the devfreq callbacks that end up triggering hardware reads in the GPU driver. Call pm_runtime_get_if_in_use() in the gpu_busy() and gpu_set_freq() callbacks to skip the hardware access if it isn't active. v3: Only check pm_runtime_get_if_in_use() for == 0 per Eric Anholt v2: Use pm_runtime_get_if_in_use() per Eric Anholt Cc: stable@vger.kernel.org Reviewed-by: Eric Anholt Signed-off-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 ++++++ drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 8 ++++++++ drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 724024a2243a..662d02289533 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1404,6 +1404,10 @@ static unsigned long a5xx_gpu_busy(struct msm_gpu *gpu) { u64 busy_cycles, busy_time; + /* Only read the gpu busy if the hardware is already active */ + if (pm_runtime_get_if_in_use(&gpu->pdev->dev) == 0) + return 0; + busy_cycles = gpu_read64(gpu, REG_A5XX_RBBM_PERFCTR_RBBM_0_LO, REG_A5XX_RBBM_PERFCTR_RBBM_0_HI); @@ -1412,6 +1416,8 @@ static unsigned long a5xx_gpu_busy(struct msm_gpu *gpu) gpu->devfreq.busy_cycles = busy_cycles; + pm_runtime_put(&gpu->pdev->dev); + if (WARN_ON(busy_time > ~0LU)) return ~0LU; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 40cb4535d554..b4aa93710409 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -108,6 +108,13 @@ static void __a6xx_gmu_set_freq(struct a6xx_gmu *gmu, int index) struct msm_gpu *gpu = &adreno_gpu->base; int ret; + /* + * This can get called from devfreq while the hardware is idle. Don't + * bring up the power if it isn't already active + */ + if (pm_runtime_get_if_in_use(gmu->dev) == 0) + return; + gmu_write(gmu, REG_A6XX_GMU_DCVS_ACK_OPTION, 0); gmu_write(gmu, REG_A6XX_GMU_DCVS_PERF_SETTING, @@ -134,6 +141,7 @@ static void __a6xx_gmu_set_freq(struct a6xx_gmu *gmu, int index) * for now leave it at max so that the performance is nominal. */ icc_set_bw(gpu->icc_path, 0, MBps_to_icc(7216)); + pm_runtime_put(gmu->dev); } void a6xx_gmu_set_freq(struct msm_gpu *gpu, unsigned long freq) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 68af24150de5..2c09d2c21773 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -810,6 +810,11 @@ static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu) struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); u64 busy_cycles, busy_time; + + /* Only read the gpu busy if the hardware is already active */ + if (pm_runtime_get_if_in_use(a6xx_gpu->gmu.dev) == 0) + return 0; + busy_cycles = gmu_read64(&a6xx_gpu->gmu, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_XOCLK_0_L, REG_A6XX_GMU_CX_GMU_POWER_COUNTER_XOCLK_0_H); @@ -819,6 +824,8 @@ static unsigned long a6xx_gpu_busy(struct msm_gpu *gpu) gpu->devfreq.busy_cycles = busy_cycles; + pm_runtime_put(a6xx_gpu->gmu.dev); + if (WARN_ON(busy_time > ~0LU)) return ~0LU; From d3b8877e57247c628966b155c02b4ed2e61b88f2 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:13 -0400 Subject: [PATCH 173/574] drm/msm: add msm_gem_get_and_pin_iova_range This function allows pinning iova to a specific page range (for a6xx GMU). Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_drv.h | 6 +++++- drivers/gpu/drm/msm/msm_gem.c | 28 +++++++++++++++++++++------- drivers/gpu/drm/msm/msm_gem_vma.c | 6 ++++-- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 0ab49aa781a3..108202c35637 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -237,7 +237,8 @@ int msm_crtc_enable_vblank(struct drm_crtc *crtc); void msm_crtc_disable_vblank(struct drm_crtc *crtc); int msm_gem_init_vma(struct msm_gem_address_space *aspace, - struct msm_gem_vma *vma, int npages); + struct msm_gem_vma *vma, int npages, + u64 range_start, u64 range_end); void msm_gem_purge_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma); void msm_gem_unmap_vma(struct msm_gem_address_space *aspace, @@ -277,6 +278,9 @@ vm_fault_t msm_gem_fault(struct vm_fault *vmf); uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj); int msm_gem_get_iova(struct drm_gem_object *obj, struct msm_gem_address_space *aspace, uint64_t *iova); +int msm_gem_get_and_pin_iova_range(struct drm_gem_object *obj, + struct msm_gem_address_space *aspace, uint64_t *iova, + u64 range_start, u64 range_end); int msm_gem_get_and_pin_iova(struct drm_gem_object *obj, struct msm_gem_address_space *aspace, uint64_t *iova); uint64_t msm_gem_iova(struct drm_gem_object *obj, diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 5a6a79fbc9d6..d8f56a34c117 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -389,7 +389,8 @@ put_iova(struct drm_gem_object *obj) } static int msm_gem_get_iova_locked(struct drm_gem_object *obj, - struct msm_gem_address_space *aspace, uint64_t *iova) + struct msm_gem_address_space *aspace, uint64_t *iova, + u64 range_start, u64 range_end) { struct msm_gem_object *msm_obj = to_msm_bo(obj); struct msm_gem_vma *vma; @@ -404,7 +405,8 @@ static int msm_gem_get_iova_locked(struct drm_gem_object *obj, if (IS_ERR(vma)) return PTR_ERR(vma); - ret = msm_gem_init_vma(aspace, vma, obj->size >> PAGE_SHIFT); + ret = msm_gem_init_vma(aspace, vma, obj->size >> PAGE_SHIFT, + range_start, range_end); if (ret) { del_vma(vma); return ret; @@ -443,9 +445,13 @@ static int msm_gem_pin_iova(struct drm_gem_object *obj, msm_obj->sgt, obj->size >> PAGE_SHIFT); } -/* get iova and pin it. Should have a matching put */ -int msm_gem_get_and_pin_iova(struct drm_gem_object *obj, - struct msm_gem_address_space *aspace, uint64_t *iova) +/* + * get iova and pin it. Should have a matching put + * limits iova to specified range (in pages) + */ +int msm_gem_get_and_pin_iova_range(struct drm_gem_object *obj, + struct msm_gem_address_space *aspace, uint64_t *iova, + u64 range_start, u64 range_end) { struct msm_gem_object *msm_obj = to_msm_bo(obj); u64 local; @@ -453,7 +459,8 @@ int msm_gem_get_and_pin_iova(struct drm_gem_object *obj, mutex_lock(&msm_obj->lock); - ret = msm_gem_get_iova_locked(obj, aspace, &local); + ret = msm_gem_get_iova_locked(obj, aspace, &local, + range_start, range_end); if (!ret) ret = msm_gem_pin_iova(obj, aspace); @@ -465,6 +472,13 @@ int msm_gem_get_and_pin_iova(struct drm_gem_object *obj, return ret; } +/* get iova and pin it. Should have a matching put */ +int msm_gem_get_and_pin_iova(struct drm_gem_object *obj, + struct msm_gem_address_space *aspace, uint64_t *iova) +{ + return msm_gem_get_and_pin_iova_range(obj, aspace, iova, 0, U64_MAX); +} + /* * Get an iova but don't pin it. Doesn't need a put because iovas are currently * valid for the life of the object @@ -476,7 +490,7 @@ int msm_gem_get_iova(struct drm_gem_object *obj, int ret; mutex_lock(&msm_obj->lock); - ret = msm_gem_get_iova_locked(obj, aspace, iova); + ret = msm_gem_get_iova_locked(obj, aspace, iova, 0, U64_MAX); mutex_unlock(&msm_obj->lock); return ret; diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 1af5354bcd46..407b7ab82818 100644 --- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -103,7 +103,8 @@ void msm_gem_close_vma(struct msm_gem_address_space *aspace, /* Initialize a new vma and allocate an iova for it */ int msm_gem_init_vma(struct msm_gem_address_space *aspace, - struct msm_gem_vma *vma, int npages) + struct msm_gem_vma *vma, int npages, + u64 range_start, u64 range_end) { int ret; @@ -111,7 +112,8 @@ int msm_gem_init_vma(struct msm_gem_address_space *aspace, return -EBUSY; spin_lock(&aspace->lock); - ret = drm_mm_insert_node(&aspace->mm, &vma->node, npages); + ret = drm_mm_insert_node_in_range(&aspace->mm, &vma->node, npages, 0, + 0, range_start, range_end, 0); spin_unlock(&aspace->lock); if (ret) From 0b462d7a71c07e96b8f02cbc2d134fdc6e80ef34 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:14 -0400 Subject: [PATCH 174/574] drm/msm: add internal MSM_BO_MAP_PRIV flag This flag sets IOMMU_PRIV, which is required for some a6xx GMU objects. Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c | 3 +++ drivers/gpu/drm/msm/msm_gem.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index d8f56a34c117..6277fde13df9 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -428,6 +428,9 @@ static int msm_gem_pin_iova(struct drm_gem_object *obj, if (!(msm_obj->flags & MSM_BO_GPU_READONLY)) prot |= IOMMU_WRITE; + if (msm_obj->flags & MSM_BO_MAP_PRIV) + prot |= IOMMU_PRIV; + WARN_ON(!mutex_is_locked(&msm_obj->lock)); if (WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED)) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 30584eaf8cc8..972490b14ba5 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -13,6 +13,7 @@ /* Additional internal-use only BO flags: */ #define MSM_BO_STOLEN 0x10000000 /* try to use stolen/splash memory */ +#define MSM_BO_MAP_PRIV 0x20000000 /* use IOMMU_PRIV when mapping */ struct msm_gem_address_space { const char *name; From 29ac8979cdf7205bf70ec9be60bd2442acc0422a Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:15 -0400 Subject: [PATCH 175/574] drm/msm/a6xx: use msm_gem for GMU memory objects This gives more fine-grained control over how memory is allocated over the DMA api. In particular, it allows using an address range or pinning to a fixed address. Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 121 +++++++++++++++++--------- drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 9 +- drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 6 +- 3 files changed, 91 insertions(+), 45 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index b4aa93710409..28c825d44f40 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -2,14 +2,16 @@ /* Copyright (c) 2017-2019 The Linux Foundation. All rights reserved. */ #include -#include #include #include #include #include +#include #include "a6xx_gpu.h" #include "a6xx_gmu.xml.h" +#include "msm_gem.h" +#include "msm_mmu.h" static void a6xx_gmu_fault(struct a6xx_gmu *gmu) { @@ -628,7 +630,7 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state) gmu_write(gmu, REG_A6XX_GMU_CM3_BOOT_CONFIG, 0x02); /* Write the iova of the HFI table */ - gmu_write(gmu, REG_A6XX_GMU_HFI_QTBL_ADDR, gmu->hfi->iova); + gmu_write(gmu, REG_A6XX_GMU_HFI_QTBL_ADDR, gmu->hfi.iova); gmu_write(gmu, REG_A6XX_GMU_HFI_QTBL_INFO, 1); gmu_write(gmu, REG_A6XX_GMU_AHB_FENCE_RANGE_0, @@ -927,34 +929,77 @@ int a6xx_gmu_stop(struct a6xx_gpu *a6xx_gpu) return 0; } -static void a6xx_gmu_memory_free(struct a6xx_gmu *gmu, struct a6xx_gmu_bo *bo) +static void a6xx_gmu_memory_free(struct a6xx_gmu *gmu) { - if (IS_ERR_OR_NULL(bo)) - return; + msm_gem_kernel_put(gmu->hfi.obj, gmu->aspace, false); + msm_gem_kernel_put(gmu->debug.obj, gmu->aspace, false); - dma_free_wc(gmu->dev, bo->size, bo->virt, bo->iova); - kfree(bo); + gmu->aspace->mmu->funcs->detach(gmu->aspace->mmu); + msm_gem_address_space_put(gmu->aspace); } -static struct a6xx_gmu_bo *a6xx_gmu_memory_alloc(struct a6xx_gmu *gmu, - size_t size) +static int a6xx_gmu_memory_alloc(struct a6xx_gmu *gmu, struct a6xx_gmu_bo *bo, + size_t size, u64 iova) { - struct a6xx_gmu_bo *bo; + struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); + struct drm_device *dev = a6xx_gpu->base.base.dev; + uint32_t flags = MSM_BO_WC; + u64 range_start, range_end; + int ret; - bo = kzalloc(sizeof(*bo), GFP_KERNEL); - if (!bo) - return ERR_PTR(-ENOMEM); - - bo->size = PAGE_ALIGN(size); - - bo->virt = dma_alloc_wc(gmu->dev, bo->size, &bo->iova, GFP_KERNEL); - - if (!bo->virt) { - kfree(bo); - return ERR_PTR(-ENOMEM); + size = PAGE_ALIGN(size); + if (!iova) { + /* no fixed address - use GMU's uncached range */ + range_start = 0x60000000; + range_end = 0x80000000; + } else { + /* range for fixed address */ + range_start = iova; + range_end = iova + size; } - return bo; + bo->obj = msm_gem_new(dev, size, flags); + if (IS_ERR(bo->obj)) + return PTR_ERR(bo->obj); + + ret = msm_gem_get_and_pin_iova_range(bo->obj, gmu->aspace, &bo->iova, + range_start >> PAGE_SHIFT, range_end >> PAGE_SHIFT); + if (ret) { + drm_gem_object_put(bo->obj); + return ret; + } + + bo->virt = msm_gem_get_vaddr(bo->obj); + bo->size = size; + + return 0; +} + +static int a6xx_gmu_memory_probe(struct a6xx_gmu *gmu) +{ + struct iommu_domain *domain; + int ret; + + domain = iommu_domain_alloc(&platform_bus_type); + if (!domain) + return -ENODEV; + + domain->geometry.aperture_start = 0x00000000; + domain->geometry.aperture_end = 0x7fffffff; + + gmu->aspace = msm_gem_address_space_create(gmu->dev, domain, "gmu"); + if (IS_ERR(gmu->aspace)) { + iommu_domain_free(domain); + return PTR_ERR(gmu->aspace); + } + + ret = gmu->aspace->mmu->funcs->attach(gmu->aspace->mmu); + if (ret) { + msm_gem_address_space_put(gmu->aspace); + return ret; + } + + return 0; } /* Return the 'arc-level' for the given frequency */ @@ -1212,7 +1257,7 @@ void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu) iounmap(gmu->mmio); gmu->mmio = NULL; - a6xx_gmu_memory_free(gmu, gmu->hfi); + a6xx_gmu_memory_free(gmu); free_irq(gmu->gmu_irq, gmu); free_irq(gmu->hfi_irq, gmu); @@ -1234,15 +1279,7 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) gmu->dev = &pdev->dev; - /* Pass force_dma false to require the DT to set the dma region */ - ret = of_dma_configure(gmu->dev, node, false); - if (ret) - return ret; - - /* Set the mask after the of_dma_configure() */ - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(31)); - if (ret) - return ret; + of_dma_configure(gmu->dev, node, true); /* Fow now, don't do anything fancy until we get our feet under us */ gmu->idle_level = GMU_IDLE_STATE_ACTIVE; @@ -1254,20 +1291,26 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) if (ret) goto err_put_device; + ret = a6xx_gmu_memory_probe(gmu); + if (ret) + goto err_put_device; + /* Allocate memory for for the HFI queues */ - gmu->hfi = a6xx_gmu_memory_alloc(gmu, SZ_16K); - if (IS_ERR(gmu->hfi)) + ret = a6xx_gmu_memory_alloc(gmu, &gmu->hfi, SZ_16K, 0); + if (ret) goto err_memory; /* Allocate memory for the GMU debug region */ - gmu->debug = a6xx_gmu_memory_alloc(gmu, SZ_16K); - if (IS_ERR(gmu->debug)) + ret = a6xx_gmu_memory_alloc(gmu, &gmu->debug, SZ_16K, 0); + if (ret) goto err_memory; /* Map the GMU registers */ gmu->mmio = a6xx_gmu_get_mmio(pdev, "gmu"); - if (IS_ERR(gmu->mmio)) + if (IS_ERR(gmu->mmio)) { + ret = PTR_ERR(gmu->mmio); goto err_memory; + } /* Get the HFI and GMU interrupts */ gmu->hfi_irq = a6xx_gmu_get_irq(gmu, pdev, "hfi", a6xx_hfi_irq); @@ -1296,11 +1339,11 @@ err_mmio: iounmap(gmu->mmio); free_irq(gmu->gmu_irq, gmu); free_irq(gmu->hfi_irq, gmu); -err_memory: - a6xx_gmu_memory_free(gmu, gmu->hfi); ret = -ENODEV; +err_memory: + a6xx_gmu_memory_free(gmu); err_put_device: /* Drop reference taken in of_find_device_by_node */ put_device(gmu->dev); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index 4af65a36d5ca..cd66a5e1d7e9 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -10,9 +10,10 @@ #include "a6xx_hfi.h" struct a6xx_gmu_bo { + struct drm_gem_object *obj; void *virt; size_t size; - dma_addr_t iova; + u64 iova; }; /* @@ -43,6 +44,8 @@ struct a6xx_gmu_bo { struct a6xx_gmu { struct device *dev; + struct msm_gem_address_space *aspace; + void * __iomem mmio; int hfi_irq; @@ -52,8 +55,8 @@ struct a6xx_gmu { int idle_level; - struct a6xx_gmu_bo *hfi; - struct a6xx_gmu_bo *debug; + struct a6xx_gmu_bo hfi; + struct a6xx_gmu_bo debug; int nr_clocks; struct clk_bulk_data *clocks; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c index e450e0b97211..b90343d4caf0 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c @@ -176,8 +176,8 @@ static int a6xx_hfi_send_gmu_init(struct a6xx_gmu *gmu, int boot_state) { struct a6xx_hfi_msg_gmu_init_cmd msg = { 0 }; - msg.dbg_buffer_addr = (u32) gmu->debug->iova; - msg.dbg_buffer_size = (u32) gmu->debug->size; + msg.dbg_buffer_addr = (u32) gmu->debug.iova; + msg.dbg_buffer_size = (u32) gmu->debug.size; msg.boot_state = boot_state; return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_INIT, &msg, sizeof(msg), @@ -385,7 +385,7 @@ static void a6xx_hfi_queue_init(struct a6xx_hfi_queue *queue, void a6xx_hfi_init(struct a6xx_gmu *gmu) { - struct a6xx_gmu_bo *hfi = gmu->hfi; + struct a6xx_gmu_bo *hfi = &gmu->hfi; struct a6xx_hfi_queue_table_header *table = hfi->virt; struct a6xx_hfi_queue_header *headers = hfi->virt + sizeof(*table); u64 offset; From a83366ef19eaac14913ec89789638e32ee656480 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:16 -0400 Subject: [PATCH 176/574] drm/msm/a6xx: add A640/A650 to gpulist Add Adreno 640 and 650 GPU info to the gpulist. Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_device.c | 24 ++++++++++++++++++++++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 10 +++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index cb3a6e597d76..1156f72532a4 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -189,6 +189,30 @@ static const struct adreno_info gpulist[] = { .inactive_period = DRM_MSM_INACTIVE_PERIOD, .init = a6xx_gpu_init, .zapfw = "a630_zap.mdt", + }, { + .rev = ADRENO_REV(6, 4, 0, ANY_ID), + .revn = 640, + .name = "A640", + .fw = { + [ADRENO_FW_SQE] = "a630_sqe.fw", + [ADRENO_FW_GMU] = "a640_gmu.bin", + }, + .gmem = SZ_1M, + .inactive_period = DRM_MSM_INACTIVE_PERIOD, + .init = a6xx_gpu_init, + .zapfw = "a640_zap.mdt", + }, { + .rev = ADRENO_REV(6, 5, 0, ANY_ID), + .revn = 650, + .name = "A650", + .fw = { + [ADRENO_FW_SQE] = "a650_sqe.fw", + [ADRENO_FW_GMU] = "a650_gmu.bin", + }, + .gmem = SZ_1M + SZ_128K, + .inactive_period = DRM_MSM_INACTIVE_PERIOD, + .init = a6xx_gpu_init, + .zapfw = "a650_zap.mdt", }, }; diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 1d5c43c22269..a7647eaacc7a 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -197,7 +197,7 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) *value = adreno_gpu->gmem; return 0; case MSM_PARAM_GMEM_BASE: - *value = 0x100000; + *value = !adreno_is_a650(adreno_gpu) ? 0x100000 : 0; return 0; case MSM_PARAM_CHIP_ID: *value = adreno_gpu->rev.patchid | diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 9ff4e550e7bd..88ae1b2813ef 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -237,6 +237,16 @@ static inline int adreno_is_a630(struct adreno_gpu *gpu) return gpu->revn == 630; } +static inline int adreno_is_a640(struct adreno_gpu *gpu) +{ + return gpu->revn == 640; +} + +static inline int adreno_is_a650(struct adreno_gpu *gpu) +{ + return gpu->revn == 650; +} + int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value); const struct firmware *adreno_request_fw(struct adreno_gpu *adreno_gpu, const char *fwname); From 8167e6fa76c8f7174dc9643f60c63bc083b35787 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:17 -0400 Subject: [PATCH 177/574] drm/msm/a6xx: HFI v2 for A640 and A650 Add HFI v2 code paths required by Adreno 640 and 650 GPUs. Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 66 ++++++++++++--- drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 7 ++ drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 6 +- drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 117 ++++++++++++++++++++++++-- drivers/gpu/drm/msm/adreno/a6xx_hfi.h | 50 ++++++++++- 5 files changed, 222 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 28c825d44f40..acffd61c374e 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -136,8 +136,6 @@ static void __a6xx_gmu_set_freq(struct a6xx_gmu *gmu, int index) if (ret) dev_err(gmu->dev, "GMU set GPU frequency error: %d\n", ret); - gmu->freq = gmu->gpu_freqs[index]; - /* * Eventually we will want to scale the path vote with the frequency but * for now leave it at max so that the performance is nominal. @@ -162,7 +160,12 @@ void a6xx_gmu_set_freq(struct msm_gpu *gpu, unsigned long freq) gmu->current_perf_index = perf_index; - __a6xx_gmu_set_freq(gmu, perf_index); + if (gmu->legacy) + __a6xx_gmu_set_freq(gmu, perf_index); + else + a6xx_hfi_set_freq(gmu, perf_index); + + gmu->freq = gmu->gpu_freqs[perf_index]; } unsigned long a6xx_gmu_get_freq(struct msm_gpu *gpu) @@ -242,8 +245,13 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) switch (state) { case GMU_OOB_GPU_SET: - request = GMU_OOB_GPU_SET_REQUEST; - ack = GMU_OOB_GPU_SET_ACK; + if (gmu->legacy) { + request = GMU_OOB_GPU_SET_REQUEST; + ack = GMU_OOB_GPU_SET_ACK; + } else { + request = GMU_OOB_GPU_SET_REQUEST_NEW; + ack = GMU_OOB_GPU_SET_ACK_NEW; + } name = "GPU_SET"; break; case GMU_OOB_BOOT_SLUMBER: @@ -282,6 +290,13 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) /* Clear a pending OOB state in the GMU */ void a6xx_gmu_clear_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state) { + if (!gmu->legacy) { + WARN_ON(state != GMU_OOB_GPU_SET); + gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, + 1 << GMU_OOB_GPU_SET_CLEAR_NEW); + return; + } + switch (state) { case GMU_OOB_GPU_SET: gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, @@ -304,6 +319,9 @@ static int a6xx_sptprac_enable(struct a6xx_gmu *gmu) int ret; u32 val; + if (!gmu->legacy) + return 0; + gmu_write(gmu, REG_A6XX_GMU_GX_SPTPRAC_POWER_CONTROL, 0x778000); ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_SPTPRAC_PWR_CLK_STATUS, val, @@ -323,6 +341,9 @@ static void a6xx_sptprac_disable(struct a6xx_gmu *gmu) u32 val; int ret; + if (!gmu->legacy) + return; + /* Make sure retention is on */ gmu_rmw(gmu, REG_A6XX_GPU_CC_GX_GDSCR, 0, (1 << 11)); @@ -366,6 +387,11 @@ static int a6xx_gmu_notify_slumber(struct a6xx_gmu *gmu) if (gmu->idle_level < GMU_IDLE_STATE_SPTP) a6xx_sptprac_disable(gmu); + if (!gmu->legacy) { + ret = a6xx_hfi_send_prep_slumber(gmu); + goto out; + } + /* Tell the GMU to get ready to slumber */ gmu_write(gmu, REG_A6XX_GMU_BOOT_SLUMBER_OPTION, 1); @@ -381,6 +407,7 @@ static int a6xx_gmu_notify_slumber(struct a6xx_gmu *gmu) } } +out: /* Put fence into allow mode */ gmu_write(gmu, REG_A6XX_GMU_AO_AHB_FENCE_CTRL, 0); return ret; @@ -650,9 +677,11 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state) if (ret) return ret; - ret = a6xx_gmu_gfx_rail_on(gmu); - if (ret) - return ret; + if (gmu->legacy) { + ret = a6xx_gmu_gfx_rail_on(gmu); + if (ret) + return ret; + } /* Enable SPTP_PC if the CPU is responsible for it */ if (gmu->idle_level < GMU_IDLE_STATE_SPTP) { @@ -771,7 +800,10 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu) enable_irq(gmu->hfi_irq); /* Set the GPU to the current freq */ - __a6xx_gmu_set_freq(gmu, gmu->current_perf_index); + if (gmu->legacy) + __a6xx_gmu_set_freq(gmu, gmu->current_perf_index); + else + a6xx_hfi_set_freq(gmu, gmu->current_perf_index); /* * "enable" the GX power domain which won't actually do anything but it @@ -1270,6 +1302,7 @@ void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu) int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) { + struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; struct a6xx_gmu *gmu = &a6xx_gpu->gmu; struct platform_device *pdev = of_find_device_by_node(node); int ret; @@ -1295,16 +1328,21 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) if (ret) goto err_put_device; + if (!adreno_is_a640(adreno_gpu) && !adreno_is_a650(adreno_gpu)) { + /* HFI v1, has sptprac */ + gmu->legacy = true; + + /* Allocate memory for the GMU debug region */ + ret = a6xx_gmu_memory_alloc(gmu, &gmu->debug, SZ_16K, 0); + if (ret) + goto err_memory; + } + /* Allocate memory for for the HFI queues */ ret = a6xx_gmu_memory_alloc(gmu, &gmu->hfi, SZ_16K, 0); if (ret) goto err_memory; - /* Allocate memory for the GMU debug region */ - ret = a6xx_gmu_memory_alloc(gmu, &gmu->debug, SZ_16K, 0); - if (ret) - goto err_memory; - /* Map the GMU registers */ gmu->mmio = a6xx_gmu_get_mmio(pdev, "gmu"); if (IS_ERR(gmu->mmio)) { diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index cd66a5e1d7e9..463e2d5f2bb9 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -79,6 +79,7 @@ struct a6xx_gmu { bool initialized; bool hung; + bool legacy; /* a618 or a630 */ }; static inline u32 gmu_read(struct a6xx_gmu *gmu, u32 offset) @@ -159,10 +160,16 @@ enum a6xx_gmu_oob_state { #define GMU_OOB_GPU_SET_ACK 24 #define GMU_OOB_GPU_SET_CLEAR 24 +#define GMU_OOB_GPU_SET_REQUEST_NEW 30 +#define GMU_OOB_GPU_SET_ACK_NEW 31 +#define GMU_OOB_GPU_SET_CLEAR_NEW 31 + void a6xx_hfi_init(struct a6xx_gmu *gmu); int a6xx_hfi_start(struct a6xx_gmu *gmu, int boot_state); void a6xx_hfi_stop(struct a6xx_gmu *gmu); +int a6xx_hfi_send_prep_slumber(struct a6xx_gmu *gmu); +int a6xx_hfi_set_freq(struct a6xx_gmu *gmu, int index); bool a6xx_gmu_gx_is_on(struct a6xx_gmu *gmu); bool a6xx_gmu_sptprac_is_on(struct a6xx_gmu *gmu); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 2c09d2c21773..20528ede63b6 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -566,8 +566,10 @@ out: */ a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET); - /* Take the GMU out of its special boot mode */ - a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_BOOT_SLUMBER); + if (a6xx_gpu->gmu.legacy) { + /* Take the GMU out of its special boot mode */ + a6xx_gmu_clear_oob(&a6xx_gpu->gmu, GMU_OOB_BOOT_SLUMBER); + } return ret; } diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c index b90343d4caf0..f9db69e77121 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c @@ -17,10 +17,14 @@ static const char * const a6xx_hfi_msg_id[] = { HFI_MSG_ID(HFI_H2F_MSG_BW_TABLE), HFI_MSG_ID(HFI_H2F_MSG_PERF_TABLE), HFI_MSG_ID(HFI_H2F_MSG_TEST), + HFI_MSG_ID(HFI_H2F_MSG_START), + HFI_MSG_ID(HFI_H2F_MSG_CORE_FW_START), + HFI_MSG_ID(HFI_H2F_MSG_GX_BW_PERF_VOTE), + HFI_MSG_ID(HFI_H2F_MSG_PREPARE_SLUMBER), }; -static int a6xx_hfi_queue_read(struct a6xx_hfi_queue *queue, u32 *data, - u32 dwords) +static int a6xx_hfi_queue_read(struct a6xx_gmu *gmu, + struct a6xx_hfi_queue *queue, u32 *data, u32 dwords) { struct a6xx_hfi_queue_header *header = queue->header; u32 i, hdr, index = header->read_index; @@ -48,6 +52,9 @@ static int a6xx_hfi_queue_read(struct a6xx_hfi_queue *queue, u32 *data, index = (index + 1) % header->size; } + if (!gmu->legacy) + index = ALIGN(index, 4) % header->size; + header->read_index = index; return HFI_HEADER_SIZE(hdr); } @@ -73,6 +80,12 @@ static int a6xx_hfi_queue_write(struct a6xx_gmu *gmu, index = (index + 1) % header->size; } + /* Cookify any non used data at the end of the write buffer */ + if (!gmu->legacy) { + for (; index % 4; index = (index + 1) % header->size) + queue->data[index] = 0xfafafafa; + } + header->write_index = index; spin_unlock(&queue->lock); @@ -106,7 +119,7 @@ static int a6xx_hfi_wait_for_ack(struct a6xx_gmu *gmu, u32 id, u32 seqnum, struct a6xx_hfi_msg_response resp; /* Get the next packet */ - ret = a6xx_hfi_queue_read(queue, (u32 *) &resp, + ret = a6xx_hfi_queue_read(gmu, queue, (u32 *) &resp, sizeof(resp) >> 2); /* If the queue is empty our response never made it */ @@ -195,6 +208,28 @@ static int a6xx_hfi_get_fw_version(struct a6xx_gmu *gmu, u32 *version) version, sizeof(*version)); } +static int a6xx_hfi_send_perf_table_v1(struct a6xx_gmu *gmu) +{ + struct a6xx_hfi_msg_perf_table_v1 msg = { 0 }; + int i; + + msg.num_gpu_levels = gmu->nr_gpu_freqs; + msg.num_gmu_levels = gmu->nr_gmu_freqs; + + for (i = 0; i < gmu->nr_gpu_freqs; i++) { + msg.gx_votes[i].vote = gmu->gx_arc_votes[i]; + msg.gx_votes[i].freq = gmu->gpu_freqs[i] / 1000; + } + + for (i = 0; i < gmu->nr_gmu_freqs; i++) { + msg.cx_votes[i].vote = gmu->cx_arc_votes[i]; + msg.cx_votes[i].freq = gmu->gmu_freqs[i] / 1000; + } + + return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_PERF_TABLE, &msg, sizeof(msg), + NULL, 0); +} + static int a6xx_hfi_send_perf_table(struct a6xx_gmu *gmu) { struct a6xx_hfi_msg_perf_table msg = { 0 }; @@ -205,6 +240,7 @@ static int a6xx_hfi_send_perf_table(struct a6xx_gmu *gmu) for (i = 0; i < gmu->nr_gpu_freqs; i++) { msg.gx_votes[i].vote = gmu->gx_arc_votes[i]; + msg.gx_votes[i].acd = 0xffffffff; msg.gx_votes[i].freq = gmu->gpu_freqs[i] / 1000; } @@ -306,7 +342,45 @@ static int a6xx_hfi_send_test(struct a6xx_gmu *gmu) NULL, 0); } -int a6xx_hfi_start(struct a6xx_gmu *gmu, int boot_state) +int a6xx_hfi_send_start(struct a6xx_gmu *gmu) +{ + struct a6xx_hfi_msg_start msg = { 0 }; + + return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_START, &msg, sizeof(msg), + NULL, 0); +} + +int a6xx_hfi_send_core_fw_start(struct a6xx_gmu *gmu) +{ + struct a6xx_hfi_msg_core_fw_start msg = { 0 }; + + return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_CORE_FW_START, &msg, + sizeof(msg), NULL, 0); +} + +int a6xx_hfi_set_freq(struct a6xx_gmu *gmu, int index) +{ + struct a6xx_hfi_gx_bw_perf_vote_cmd msg = { 0 }; + + msg.ack_type = 1; /* blocking */ + msg.freq = index; + msg.bw = 0; /* TODO: bus scaling */ + + return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_GX_BW_PERF_VOTE, &msg, + sizeof(msg), NULL, 0); +} + +int a6xx_hfi_send_prep_slumber(struct a6xx_gmu *gmu) +{ + struct a6xx_hfi_prep_slumber_cmd msg = { 0 }; + + /* TODO: should freq and bw fields be non-zero ? */ + + return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_PREPARE_SLUMBER, &msg, + sizeof(msg), NULL, 0); +} + +static int a6xx_hfi_start_v1(struct a6xx_gmu *gmu, int boot_state) { int ret; @@ -324,7 +398,7 @@ int a6xx_hfi_start(struct a6xx_gmu *gmu, int boot_state) * the GMU firmware */ - ret = a6xx_hfi_send_perf_table(gmu); + ret = a6xx_hfi_send_perf_table_v1(gmu); if (ret) return ret; @@ -341,6 +415,37 @@ int a6xx_hfi_start(struct a6xx_gmu *gmu, int boot_state) return 0; } +int a6xx_hfi_start(struct a6xx_gmu *gmu, int boot_state) +{ + int ret; + + if (gmu->legacy) + return a6xx_hfi_start_v1(gmu, boot_state); + + + ret = a6xx_hfi_send_perf_table(gmu); + if (ret) + return ret; + + ret = a6xx_hfi_send_bw_table(gmu); + if (ret) + return ret; + + ret = a6xx_hfi_send_core_fw_start(gmu); + if (ret) + return ret; + + /* + * Downstream driver sends this in its "a6xx_hw_init" equivalent, + * but seems to be no harm in sending it here + */ + ret = a6xx_hfi_send_start(gmu); + if (ret) + return ret; + + return 0; +} + void a6xx_hfi_stop(struct a6xx_gmu *gmu) { int i; @@ -415,5 +520,5 @@ void a6xx_hfi_init(struct a6xx_gmu *gmu) /* GMU response queue */ offset += SZ_4K; a6xx_hfi_queue_init(&gmu->queues[1], &headers[1], hfi->virt + offset, - hfi->iova + offset, 4); + hfi->iova + offset, gmu->legacy ? 4 : 1); } diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.h b/drivers/gpu/drm/msm/adreno/a6xx_hfi.h index 60d1319fa44f..2bd670ca42d6 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.h @@ -51,7 +51,8 @@ struct a6xx_hfi_queue { /* HFI message types */ #define HFI_MSG_CMD 0 -#define HFI_MSG_ACK 2 +#define HFI_MSG_ACK 1 +#define HFI_MSG_ACK_V1 2 #define HFI_F2H_MSG_ACK 126 @@ -94,7 +95,13 @@ struct perf_level { u32 freq; }; -struct a6xx_hfi_msg_perf_table { +struct perf_gx_level { + u32 vote; + u32 acd; + u32 freq; +}; + +struct a6xx_hfi_msg_perf_table_v1 { u32 header; u32 num_gpu_levels; u32 num_gmu_levels; @@ -103,6 +110,15 @@ struct a6xx_hfi_msg_perf_table { struct perf_level cx_votes[4]; }; +struct a6xx_hfi_msg_perf_table { + u32 header; + u32 num_gpu_levels; + u32 num_gmu_levels; + + struct perf_gx_level gx_votes[16]; + struct perf_level cx_votes[4]; +}; + #define HFI_H2F_MSG_BW_TABLE 3 struct a6xx_hfi_msg_bw_table { @@ -124,4 +140,34 @@ struct a6xx_hfi_msg_test { u32 header; }; +#define HFI_H2F_MSG_START 10 + +struct a6xx_hfi_msg_start { + u32 header; +}; + +#define HFI_H2F_MSG_CORE_FW_START 14 + +struct a6xx_hfi_msg_core_fw_start { + u32 header; + u32 handle; +}; + +#define HFI_H2F_MSG_GX_BW_PERF_VOTE 30 + +struct a6xx_hfi_gx_bw_perf_vote_cmd { + u32 header; + u32 ack_type; + u32 freq; + u32 bw; +}; + +#define HFI_H2F_MSG_PREPARE_SLUMBER 33 + +struct a6xx_hfi_prep_slumber_cmd { + u32 header; + u32 bw; + u32 freq; +}; + #endif From c6ed04f856a4ebbbd8276ea871d8c98590abb0d0 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:18 -0400 Subject: [PATCH 178/574] drm/msm/a6xx: A640/A650 GMU firmware path Newer GPUs have different GMU firmware path. v3: updated a6xx_gmu_fw_load based on feedback, including gmu_write_bulk, and removed extra whitespace change Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 138 +++++++++++++++++++--- drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 10 ++ drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h | 6 + 3 files changed, 138 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index acffd61c374e..749cec7c090e 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -579,6 +579,8 @@ static void a6xx_gmu_power_config(struct a6xx_gmu *gmu) { /* Disable GMU WB/RB buffer */ gmu_write(gmu, REG_A6XX_GMU_SYS_BUS_CONFIG, 0x1); + gmu_write(gmu, REG_A6XX_GMU_ICACHE_CONFIG, 0x1); + gmu_write(gmu, REG_A6XX_GMU_DCACHE_CONFIG, 0x1); gmu_write(gmu, REG_A6XX_GMU_PWR_COL_INTER_FRAME_CTRL, 0x9c40400); @@ -608,14 +610,95 @@ static void a6xx_gmu_power_config(struct a6xx_gmu *gmu) A6XX_GMU_RPMH_CTRL_GFX_VOTE_ENABLE); } +struct block_header { + u32 addr; + u32 size; + u32 type; + u32 value; + u32 data[]; +}; + +/* this should be a general kernel helper */ +static int in_range(u32 addr, u32 start, u32 size) +{ + return addr >= start && addr < start + size; +} + +static bool fw_block_mem(struct a6xx_gmu_bo *bo, const struct block_header *blk) +{ + if (!in_range(blk->addr, bo->iova, bo->size)) + return false; + + memcpy(bo->virt + blk->addr - bo->iova, blk->data, blk->size); + return true; +} + +static int a6xx_gmu_fw_load(struct a6xx_gmu *gmu) +{ + struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); + struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; + const struct firmware *fw_image = adreno_gpu->fw[ADRENO_FW_GMU]; + const struct block_header *blk; + u32 reg_offset; + + u32 itcm_base = 0x00000000; + u32 dtcm_base = 0x00040000; + + if (adreno_is_a650(adreno_gpu)) + dtcm_base = 0x10004000; + + if (gmu->legacy) { + /* Sanity check the size of the firmware that was loaded */ + if (fw_image->size > 0x8000) { + DRM_DEV_ERROR(gmu->dev, + "GMU firmware is bigger than the available region\n"); + return -EINVAL; + } + + gmu_write_bulk(gmu, REG_A6XX_GMU_CM3_ITCM_START, + (u32*) fw_image->data, fw_image->size); + return 0; + } + + + for (blk = (const struct block_header *) fw_image->data; + (const u8*) blk < fw_image->data + fw_image->size; + blk = (const struct block_header *) &blk->data[blk->size >> 2]) { + if (blk->size == 0) + continue; + + if (in_range(blk->addr, itcm_base, SZ_16K)) { + reg_offset = (blk->addr - itcm_base) >> 2; + gmu_write_bulk(gmu, + REG_A6XX_GMU_CM3_ITCM_START + reg_offset, + blk->data, blk->size); + } else if (in_range(blk->addr, dtcm_base, SZ_16K)) { + reg_offset = (blk->addr - dtcm_base) >> 2; + gmu_write_bulk(gmu, + REG_A6XX_GMU_CM3_DTCM_START + reg_offset, + blk->data, blk->size); + } else if (!fw_block_mem(&gmu->icache, blk) && + !fw_block_mem(&gmu->dcache, blk) && + !fw_block_mem(&gmu->dummy, blk)) { + DRM_DEV_ERROR(gmu->dev, + "failed to match fw block (addr=%.8x size=%d data[0]=%.8x)\n", + blk->addr, blk->size, blk->data[0]); + } + } + + return 0; +} + static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state) { static bool rpmh_init; struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; - int i, ret; + int ret; u32 chipid; - u32 *image; + + if (adreno_is_a650(adreno_gpu)) + gmu_write(gmu, REG_A6XX_GPU_GMU_CX_GMU_CX_FAL_INTF, 1); if (state == GMU_WARM_BOOT) { ret = a6xx_rpmh_start(gmu); @@ -626,13 +709,6 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state) "GMU firmware is not loaded\n")) return -ENOENT; - /* Sanity check the size of the firmware that was loaded */ - if (adreno_gpu->fw[ADRENO_FW_GMU]->size > 0x8000) { - DRM_DEV_ERROR(gmu->dev, - "GMU firmware is bigger than the available region\n"); - return -EINVAL; - } - /* Turn on register retention */ gmu_write(gmu, REG_A6XX_GMU_GENERAL_7, 1); @@ -646,11 +722,9 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state) return ret; } - image = (u32 *) adreno_gpu->fw[ADRENO_FW_GMU]->data; - - for (i = 0; i < adreno_gpu->fw[ADRENO_FW_GMU]->size >> 2; i++) - gmu_write(gmu, REG_A6XX_GMU_CM3_ITCM_START + i, - image[i]); + ret = a6xx_gmu_fw_load(gmu); + if (ret) + return ret; } gmu_write(gmu, REG_A6XX_GMU_CM3_FW_INIT_RESULT, 0); @@ -783,6 +857,13 @@ int a6xx_gmu_resume(struct a6xx_gpu *a6xx_gpu) status = gmu_read(gmu, REG_A6XX_GMU_GENERAL_7) == 1 ? GMU_WARM_BOOT : GMU_COLD_BOOT; + /* + * Warm boot path does not work on newer GPUs + * Presumably this is because icache/dcache regions must be restored + */ + if (!gmu->legacy) + status = GMU_COLD_BOOT; + ret = a6xx_gmu_fw_start(gmu, status); if (ret) goto out; @@ -965,6 +1046,9 @@ static void a6xx_gmu_memory_free(struct a6xx_gmu *gmu) { msm_gem_kernel_put(gmu->hfi.obj, gmu->aspace, false); msm_gem_kernel_put(gmu->debug.obj, gmu->aspace, false); + msm_gem_kernel_put(gmu->icache.obj, gmu->aspace, false); + msm_gem_kernel_put(gmu->dcache.obj, gmu->aspace, false); + msm_gem_kernel_put(gmu->dummy.obj, gmu->aspace, false); gmu->aspace->mmu->funcs->detach(gmu->aspace->mmu); msm_gem_address_space_put(gmu->aspace); @@ -982,12 +1066,14 @@ static int a6xx_gmu_memory_alloc(struct a6xx_gmu *gmu, struct a6xx_gmu_bo *bo, size = PAGE_ALIGN(size); if (!iova) { /* no fixed address - use GMU's uncached range */ - range_start = 0x60000000; + range_start = 0x60000000 + PAGE_SIZE; /* skip dummy page */ range_end = 0x80000000; } else { /* range for fixed address */ range_start = iova; range_end = iova + size; + /* use IOMMU_PRIV for icache/dcache */ + flags |= MSM_BO_MAP_PRIV; } bo->obj = msm_gem_new(dev, size, flags); @@ -1328,7 +1414,27 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) if (ret) goto err_put_device; - if (!adreno_is_a640(adreno_gpu) && !adreno_is_a650(adreno_gpu)) { + /* Allocate memory for the GMU dummy page */ + ret = a6xx_gmu_memory_alloc(gmu, &gmu->dummy, SZ_4K, 0x60000000); + if (ret) + goto err_memory; + + if (adreno_is_a650(adreno_gpu)) { + ret = a6xx_gmu_memory_alloc(gmu, &gmu->icache, + SZ_16M - SZ_16K, 0x04000); + if (ret) + goto err_memory; + } else if (adreno_is_a640(adreno_gpu)) { + ret = a6xx_gmu_memory_alloc(gmu, &gmu->icache, + SZ_256K - SZ_16K, 0x04000); + if (ret) + goto err_memory; + + ret = a6xx_gmu_memory_alloc(gmu, &gmu->dcache, + SZ_256K - SZ_16K, 0x44000); + if (ret) + goto err_memory; + } else { /* HFI v1, has sptprac */ gmu->legacy = true; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index 463e2d5f2bb9..c6d8c0d1f90b 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -57,6 +57,9 @@ struct a6xx_gmu { struct a6xx_gmu_bo hfi; struct a6xx_gmu_bo debug; + struct a6xx_gmu_bo icache; + struct a6xx_gmu_bo dcache; + struct a6xx_gmu_bo dummy; int nr_clocks; struct clk_bulk_data *clocks; @@ -92,6 +95,13 @@ static inline void gmu_write(struct a6xx_gmu *gmu, u32 offset, u32 value) return msm_writel(value, gmu->mmio + (offset << 2)); } +static inline void +gmu_write_bulk(struct a6xx_gmu *gmu, u32 offset, const u32 *data, u32 size) +{ + memcpy_toio(gmu->mmio + (offset << 2), data, size); + wmb(); +} + static inline void gmu_rmw(struct a6xx_gmu *gmu, u32 reg, u32 mask, u32 or) { u32 val = gmu_read(gmu, reg); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h index 1cc1c135236b..eb2cd41dae6e 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h @@ -101,6 +101,10 @@ static inline uint32_t A6XX_HFI_IRQ_OOB_MASK(uint32_t val) #define REG_A6XX_GMU_DCVS_RETURN 0x000023ff +#define REG_A6XX_GMU_ICACHE_CONFIG 0x00004c00 + +#define REG_A6XX_GMU_DCACHE_CONFIG 0x00004c01 + #define REG_A6XX_GMU_SYS_BUS_CONFIG 0x00004c0f #define REG_A6XX_GMU_CM3_SYSRESET 0x00005000 @@ -199,6 +203,8 @@ static inline uint32_t A6XX_GMU_GPU_NAP_CTRL_SID(uint32_t val) #define REG_A6XX_GPU_GMU_CX_GMU_RPMH_POWER_STATE 0x000050ec +#define REG_A6XX_GPU_GMU_CX_GMU_CX_FAL_INTF 0x000050f0 + #define REG_A6XX_GMU_BOOT_KMD_LM_HANDSHAKE 0x000051f0 #define REG_A6XX_GMU_LLM_GLM_SLEEP_CTRL 0x00005157 From 02ef80c54e7cd70fe1f422b0315fd1534033e382 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:19 -0400 Subject: [PATCH 179/574] drm/msm/a6xx: update pdc/rscc GMU registers for A640/A650 Update the gmu_pdc registers for A640 and A650. Some of the RSCC registers on A650 are in a separate region. Note this also changes the address of these registers: RSCC_TCS1_DRV0_STATUS RSCC_TCS2_DRV0_STATUS RSCC_TCS3_DRV0_STATUS Based on the values in msm-4.14 and msm-4.19 kernels. v3: replaced adreno_is_a650 around ->rscc with checks for "rscc" resource Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 90 ++++++++++++++--------- drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 10 +++ drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h | 38 +++++----- 3 files changed, 85 insertions(+), 53 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 749cec7c090e..48d2e8defe59 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -429,7 +429,7 @@ static int a6xx_rpmh_start(struct a6xx_gmu *gmu) return ret; } - ret = gmu_poll_timeout(gmu, REG_A6XX_RSCC_SEQ_BUSY_DRV0, val, + ret = gmu_poll_timeout_rscc(gmu, REG_A6XX_RSCC_SEQ_BUSY_DRV0, val, !val, 100, 10000); if (ret) { @@ -455,7 +455,7 @@ static void a6xx_rpmh_stop(struct a6xx_gmu *gmu) gmu_write(gmu, REG_A6XX_GMU_RSCC_CONTROL_REQ, 1); - ret = gmu_poll_timeout(gmu, REG_A6XX_GPU_RSCC_RSC_STATUS0_DRV0, + ret = gmu_poll_timeout_rscc(gmu, REG_A6XX_GPU_RSCC_RSC_STATUS0_DRV0, val, val & (1 << 16), 100, 10000); if (ret) DRM_DEV_ERROR(gmu->dev, "Unable to power off the GPU RSC\n"); @@ -478,32 +478,48 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu) struct platform_device *pdev = to_platform_device(gmu->dev); void __iomem *pdcptr = a6xx_gmu_get_mmio(pdev, "gmu_pdc"); void __iomem *seqptr = a6xx_gmu_get_mmio(pdev, "gmu_pdc_seq"); + uint32_t pdc_address_offset; if (!pdcptr || !seqptr) goto err; + if (adreno_is_a618(adreno_gpu) || adreno_is_a640(adreno_gpu)) + pdc_address_offset = 0x30090; + else if (adreno_is_a650(adreno_gpu)) + pdc_address_offset = 0x300a0; + else + pdc_address_offset = 0x30080; + /* Disable SDE clock gating */ - gmu_write(gmu, REG_A6XX_GPU_RSCC_RSC_STATUS0_DRV0, BIT(24)); + gmu_write_rscc(gmu, REG_A6XX_GPU_RSCC_RSC_STATUS0_DRV0, BIT(24)); /* Setup RSC PDC handshake for sleep and wakeup */ - gmu_write(gmu, REG_A6XX_RSCC_PDC_SLAVE_ID_DRV0, 1); - gmu_write(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_DATA, 0); - gmu_write(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_ADDR, 0); - gmu_write(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_DATA + 2, 0); - gmu_write(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_ADDR + 2, 0); - gmu_write(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_DATA + 4, 0x80000000); - gmu_write(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_ADDR + 4, 0); - gmu_write(gmu, REG_A6XX_RSCC_OVERRIDE_START_ADDR, 0); - gmu_write(gmu, REG_A6XX_RSCC_PDC_SEQ_START_ADDR, 0x4520); - gmu_write(gmu, REG_A6XX_RSCC_PDC_MATCH_VALUE_LO, 0x4510); - gmu_write(gmu, REG_A6XX_RSCC_PDC_MATCH_VALUE_HI, 0x4514); + gmu_write_rscc(gmu, REG_A6XX_RSCC_PDC_SLAVE_ID_DRV0, 1); + gmu_write_rscc(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_DATA, 0); + gmu_write_rscc(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_ADDR, 0); + gmu_write_rscc(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_DATA + 2, 0); + gmu_write_rscc(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_ADDR + 2, 0); + gmu_write_rscc(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_DATA + 4, 0x80000000); + gmu_write_rscc(gmu, REG_A6XX_RSCC_HIDDEN_TCS_CMD0_ADDR + 4, 0); + gmu_write_rscc(gmu, REG_A6XX_RSCC_OVERRIDE_START_ADDR, 0); + gmu_write_rscc(gmu, REG_A6XX_RSCC_PDC_SEQ_START_ADDR, 0x4520); + gmu_write_rscc(gmu, REG_A6XX_RSCC_PDC_MATCH_VALUE_LO, 0x4510); + gmu_write_rscc(gmu, REG_A6XX_RSCC_PDC_MATCH_VALUE_HI, 0x4514); /* Load RSC sequencer uCode for sleep and wakeup */ - gmu_write(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0, 0xa7a506a0); - gmu_write(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 1, 0xa1e6a6e7); - gmu_write(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 2, 0xa2e081e1); - gmu_write(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 3, 0xe9a982e2); - gmu_write(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 4, 0x0020e8a8); + if (adreno_is_a650(adreno_gpu)) { + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0, 0xeaaae5a0); + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 1, 0xe1a1ebab); + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 2, 0xa2e0a581); + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 3, 0xecac82e2); + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 4, 0x0020edad); + } else { + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0, 0xa7a506a0); + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 1, 0xa1e6a6e7); + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 2, 0xa2e081e1); + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 3, 0xe9a982e2); + gmu_write_rscc(gmu, REG_A6XX_RSCC_SEQ_MEM_0_DRV0 + 4, 0x0020e8a8); + } /* Load PDC sequencer uCode for power up and power down sequence */ pdc_write(seqptr, REG_A6XX_PDC_GPU_SEQ_MEM_0, 0xfebea1e1); @@ -524,10 +540,7 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu) pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS1_CMD0_DATA + 4, 0x0); pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS1_CMD0_MSGID + 8, 0x10108); - if (adreno_is_a618(adreno_gpu)) - pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS1_CMD0_ADDR + 8, 0x30090); - else - pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS1_CMD0_ADDR + 8, 0x30080); + pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS1_CMD0_ADDR + 8, pdc_address_offset); pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS1_CMD0_DATA + 8, 0x0); pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD_ENABLE_BANK, 7); @@ -539,17 +552,12 @@ static void a6xx_gmu_rpmh_init(struct a6xx_gmu *gmu) pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_MSGID + 4, 0x10108); pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_ADDR + 4, 0x30000); - if (adreno_is_a618(adreno_gpu)) + if (adreno_is_a618(adreno_gpu) || adreno_is_a650(adreno_gpu)) pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_DATA + 4, 0x2); else pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_DATA + 4, 0x3); - - pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_MSGID + 8, 0x10108); - if (adreno_is_a618(adreno_gpu)) - pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_ADDR + 8, 0x30090); - else - pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_ADDR + 8, 0x30080); + pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_ADDR + 8, pdc_address_offset); pdc_write(pdcptr, REG_A6XX_PDC_GPU_TCS3_CMD0_DATA + 8, 0x3); /* Setup GPU PDC */ @@ -796,13 +804,13 @@ static void a6xx_gmu_rpmh_off(struct a6xx_gmu *gmu) u32 val; /* Make sure there are no outstanding RPMh votes */ - gmu_poll_timeout(gmu, REG_A6XX_RSCC_TCS0_DRV0_STATUS, val, + gmu_poll_timeout_rscc(gmu, REG_A6XX_RSCC_TCS0_DRV0_STATUS, val, (val & 1), 100, 10000); - gmu_poll_timeout(gmu, REG_A6XX_RSCC_TCS1_DRV0_STATUS, val, + gmu_poll_timeout_rscc(gmu, REG_A6XX_RSCC_TCS1_DRV0_STATUS, val, (val & 1), 100, 10000); - gmu_poll_timeout(gmu, REG_A6XX_RSCC_TCS2_DRV0_STATUS, val, + gmu_poll_timeout_rscc(gmu, REG_A6XX_RSCC_TCS2_DRV0_STATUS, val, (val & 1), 100, 10000); - gmu_poll_timeout(gmu, REG_A6XX_RSCC_TCS3_DRV0_STATUS, val, + gmu_poll_timeout_rscc(gmu, REG_A6XX_RSCC_TCS3_DRV0_STATUS, val, (val & 1), 100, 1000); } @@ -1361,6 +1369,7 @@ static int a6xx_gmu_get_irq(struct a6xx_gmu *gmu, struct platform_device *pdev, void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu) { struct a6xx_gmu *gmu = &a6xx_gpu->gmu; + struct platform_device *pdev = to_platform_device(gmu->dev); if (!gmu->initialized) return; @@ -1373,7 +1382,10 @@ void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu) } iounmap(gmu->mmio); + if (platform_get_resource_byname(pdev, IORESOURCE_MEM, "rscc")) + iounmap(gmu->rscc); gmu->mmio = NULL; + gmu->rscc = NULL; a6xx_gmu_memory_free(gmu); @@ -1456,6 +1468,14 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) goto err_memory; } + if (adreno_is_a650(adreno_gpu)) { + gmu->rscc = a6xx_gmu_get_mmio(pdev, "rscc"); + if (IS_ERR(gmu->rscc)) + goto err_mmio; + } else { + gmu->rscc = gmu->mmio + 0x23000; + } + /* Get the HFI and GMU interrupts */ gmu->hfi_irq = a6xx_gmu_get_irq(gmu, pdev, "hfi", a6xx_hfi_irq); gmu->gmu_irq = a6xx_gmu_get_irq(gmu, pdev, "gmu", a6xx_gmu_irq); @@ -1481,6 +1501,8 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) err_mmio: iounmap(gmu->mmio); + if (platform_get_resource_byname(pdev, IORESOURCE_MEM, "rscc")) + iounmap(gmu->rscc); free_irq(gmu->gmu_irq, gmu); free_irq(gmu->hfi_irq, gmu); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index c6d8c0d1f90b..e16c16bb65bf 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -47,6 +47,7 @@ struct a6xx_gmu { struct msm_gem_address_space *aspace; void * __iomem mmio; + void * __iomem rscc; int hfi_irq; int gmu_irq; @@ -125,6 +126,15 @@ static inline u64 gmu_read64(struct a6xx_gmu *gmu, u32 lo, u32 hi) readl_poll_timeout((gmu)->mmio + ((addr) << 2), val, cond, \ interval, timeout) +static inline void gmu_write_rscc(struct a6xx_gmu *gmu, u32 offset, u32 value) +{ + return msm_writel(value, gmu->rscc + (offset << 2)); +} + +#define gmu_poll_timeout_rscc(gmu, addr, val, cond, interval, timeout) \ + readl_poll_timeout((gmu)->rscc + ((addr) << 2), val, cond, \ + interval, timeout) + /* * These are the available OOB (out of band requests) to the GMU where "out of * band" means that the CPU talks to the GMU directly and not through HFI. diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h index eb2cd41dae6e..b4357ea550ec 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h @@ -336,8 +336,6 @@ static inline uint32_t A6XX_GMU_GPU_NAP_CTRL_SID(uint32_t val) #define REG_A6XX_GMU_AO_SPARE_CNTL 0x00009316 -#define REG_A6XX_GPU_RSCC_RSC_STATUS0_DRV0 0x00008c04 - #define REG_A6XX_GMU_RSCC_CONTROL_REQ 0x00009307 #define REG_A6XX_GMU_RSCC_CONTROL_ACK 0x00009308 @@ -350,39 +348,41 @@ static inline uint32_t A6XX_GMU_GPU_NAP_CTRL_SID(uint32_t val) #define REG_A6XX_GPU_CC_GX_DOMAIN_MISC 0x00009d42 -#define REG_A6XX_RSCC_PDC_SEQ_START_ADDR 0x00008c08 +#define REG_A6XX_GPU_RSCC_RSC_STATUS0_DRV0 0x00000004 -#define REG_A6XX_RSCC_PDC_MATCH_VALUE_LO 0x00008c09 +#define REG_A6XX_RSCC_PDC_SEQ_START_ADDR 0x00000008 -#define REG_A6XX_RSCC_PDC_MATCH_VALUE_HI 0x00008c0a +#define REG_A6XX_RSCC_PDC_MATCH_VALUE_LO 0x00000009 -#define REG_A6XX_RSCC_PDC_SLAVE_ID_DRV0 0x00008c0b +#define REG_A6XX_RSCC_PDC_MATCH_VALUE_HI 0x0000000a -#define REG_A6XX_RSCC_HIDDEN_TCS_CMD0_ADDR 0x00008c0d +#define REG_A6XX_RSCC_PDC_SLAVE_ID_DRV0 0x0000000b -#define REG_A6XX_RSCC_HIDDEN_TCS_CMD0_DATA 0x00008c0e +#define REG_A6XX_RSCC_HIDDEN_TCS_CMD0_ADDR 0x0000000d -#define REG_A6XX_RSCC_TIMESTAMP_UNIT0_TIMESTAMP_L_DRV0 0x00008c82 +#define REG_A6XX_RSCC_HIDDEN_TCS_CMD0_DATA 0x0000000e -#define REG_A6XX_RSCC_TIMESTAMP_UNIT0_TIMESTAMP_H_DRV0 0x00008c83 +#define REG_A6XX_RSCC_TIMESTAMP_UNIT0_TIMESTAMP_L_DRV0 0x00000082 -#define REG_A6XX_RSCC_TIMESTAMP_UNIT1_EN_DRV0 0x00008c89 +#define REG_A6XX_RSCC_TIMESTAMP_UNIT0_TIMESTAMP_H_DRV0 0x00000083 -#define REG_A6XX_RSCC_TIMESTAMP_UNIT1_OUTPUT_DRV0 0x00008c8c +#define REG_A6XX_RSCC_TIMESTAMP_UNIT1_EN_DRV0 0x00000089 -#define REG_A6XX_RSCC_OVERRIDE_START_ADDR 0x00008d00 +#define REG_A6XX_RSCC_TIMESTAMP_UNIT1_OUTPUT_DRV0 0x0000008c -#define REG_A6XX_RSCC_SEQ_BUSY_DRV0 0x00008d01 +#define REG_A6XX_RSCC_OVERRIDE_START_ADDR 0x00000100 -#define REG_A6XX_RSCC_SEQ_MEM_0_DRV0 0x00008d80 +#define REG_A6XX_RSCC_SEQ_BUSY_DRV0 0x00000101 -#define REG_A6XX_RSCC_TCS0_DRV0_STATUS 0x00008f46 +#define REG_A6XX_RSCC_SEQ_MEM_0_DRV0 0x00000180 -#define REG_A6XX_RSCC_TCS1_DRV0_STATUS 0x000090ae +#define REG_A6XX_RSCC_TCS0_DRV0_STATUS 0x00000346 -#define REG_A6XX_RSCC_TCS2_DRV0_STATUS 0x00009216 +#define REG_A6XX_RSCC_TCS1_DRV0_STATUS 0x000003ee -#define REG_A6XX_RSCC_TCS3_DRV0_STATUS 0x0000937e +#define REG_A6XX_RSCC_TCS2_DRV0_STATUS 0x00000496 + +#define REG_A6XX_RSCC_TCS3_DRV0_STATUS 0x0000053e #endif /* A6XX_GMU_XML */ From ad4968d51dd3a22a5a85728fec76bb8711c9c995 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:20 -0400 Subject: [PATCH 180/574] drm/msm/a6xx: enable GMU log This is required for a650 to work. Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 15 +++++++++++++++ drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 1 + drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h | 4 ++++ 3 files changed, 20 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 48d2e8defe59..8c07850728bc 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -209,6 +209,12 @@ static int a6xx_gmu_start(struct a6xx_gmu *gmu) u32 val; gmu_write(gmu, REG_A6XX_GMU_CM3_SYSRESET, 1); + + /* Set the log wptr index + * note: downstream saves the value in poweroff and restores it here + */ + gmu_write(gmu, REG_A6XX_GPU_GMU_CX_GMU_PWR_COL_CP_RESP, 0); + gmu_write(gmu, REG_A6XX_GMU_CM3_SYSRESET, 0); ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_CM3_FW_INIT_RESULT, val, @@ -752,6 +758,9 @@ static int a6xx_gmu_fw_start(struct a6xx_gmu *gmu, unsigned int state) gmu_write(gmu, REG_A6XX_GMU_HFI_SFR_ADDR, chipid); + gmu_write(gmu, REG_A6XX_GPU_GMU_CX_GMU_PWR_COL_CP_MSG, + gmu->log.iova | (gmu->log.size / SZ_4K - 1)); + /* Set up the lowest idle level on the GMU */ a6xx_gmu_power_config(gmu); @@ -1057,6 +1066,7 @@ static void a6xx_gmu_memory_free(struct a6xx_gmu *gmu) msm_gem_kernel_put(gmu->icache.obj, gmu->aspace, false); msm_gem_kernel_put(gmu->dcache.obj, gmu->aspace, false); msm_gem_kernel_put(gmu->dummy.obj, gmu->aspace, false); + msm_gem_kernel_put(gmu->log.obj, gmu->aspace, false); gmu->aspace->mmu->funcs->detach(gmu->aspace->mmu); msm_gem_address_space_put(gmu->aspace); @@ -1461,6 +1471,11 @@ int a6xx_gmu_init(struct a6xx_gpu *a6xx_gpu, struct device_node *node) if (ret) goto err_memory; + /* Allocate memory for the GMU log region */ + ret = a6xx_gmu_memory_alloc(gmu, &gmu->log, SZ_4K, 0); + if (ret) + goto err_memory; + /* Map the GMU registers */ gmu->mmio = a6xx_gmu_get_mmio(pdev, "gmu"); if (IS_ERR(gmu->mmio)) { diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h index e16c16bb65bf..47df4745db50 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h @@ -61,6 +61,7 @@ struct a6xx_gmu { struct a6xx_gmu_bo icache; struct a6xx_gmu_bo dcache; struct a6xx_gmu_bo dummy; + struct a6xx_gmu_bo log; int nr_clocks; struct clk_bulk_data *clocks; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h index b4357ea550ec..176ae94d9fe6 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.xml.h @@ -205,6 +205,10 @@ static inline uint32_t A6XX_GMU_GPU_NAP_CTRL_SID(uint32_t val) #define REG_A6XX_GPU_GMU_CX_GMU_CX_FAL_INTF 0x000050f0 +#define REG_A6XX_GPU_GMU_CX_GMU_PWR_COL_CP_MSG 0x00005100 + +#define REG_A6XX_GPU_GMU_CX_GMU_PWR_COL_CP_RESP 0x00005101 + #define REG_A6XX_GMU_BOOT_KMD_LM_HANDSHAKE 0x000051f0 #define REG_A6XX_GMU_LLM_GLM_SLEEP_CTRL 0x00005157 From 24e6938ec604b7dc0306c972c1aa029ff03bb36a Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 23 Apr 2020 17:09:21 -0400 Subject: [PATCH 181/574] drm/msm/a6xx: update a6xx_hw_init for A640 and A650 Adreno 640 and 650 GPUs need some registers set differently. Signed-off-by: Jonathan Marek Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx.xml.h | 14 +++++++ drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 56 ++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx.xml.h b/drivers/gpu/drm/msm/adreno/a6xx.xml.h index ed78fee2a262..47840b73cdda 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx.xml.h +++ b/drivers/gpu/drm/msm/adreno/a6xx.xml.h @@ -1047,6 +1047,8 @@ enum a6xx_tex_type { #define REG_A6XX_CP_MISC_CNTL 0x00000840 +#define REG_A6XX_CP_APRIV_CNTL 0x00000844 + #define REG_A6XX_CP_ROQ_THRESHOLDS_1 0x000008c1 #define REG_A6XX_CP_ROQ_THRESHOLDS_2 0x000008c2 @@ -1764,6 +1766,8 @@ static inline uint32_t A6XX_CP_PROTECT_REG_MASK_LEN(uint32_t val) #define REG_A6XX_RBBM_VBIF_CLIENT_QOS_CNTL 0x00000010 +#define REG_A6XX_RBBM_GBIF_CLIENT_QOS_CNTL 0x00000011 + #define REG_A6XX_RBBM_INTERFACE_HANG_INT_CNTL 0x0000001f #define REG_A6XX_RBBM_INT_CLEAR_CMD 0x00000037 @@ -2418,6 +2422,16 @@ static inline uint32_t A6XX_UCHE_CLIENT_PF_PERFSEL(uint32_t val) #define REG_A6XX_TPL1_NC_MODE_CNTL 0x0000b604 +#define REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_0 0x0000b608 + +#define REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_1 0x0000b609 + +#define REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_2 0x0000b60a + +#define REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_3 0x0000b60b + +#define REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_4 0x0000b60c + #define REG_A6XX_TPL1_PERFCTR_TP_SEL_0 0x0000b610 #define REG_A6XX_TPL1_PERFCTR_TP_SEL_1 0x0000b611 diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 20528ede63b6..6f335ae179c8 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -414,7 +414,17 @@ static int a6xx_hw_init(struct msm_gpu *gpu) a6xx_set_hwcg(gpu, true); /* VBIF/GBIF start*/ - gpu_write(gpu, REG_A6XX_RBBM_VBIF_CLIENT_QOS_CNTL, 0x3); + if (adreno_is_a640(adreno_gpu) || adreno_is_a650(adreno_gpu)) { + gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE0, 0x00071620); + gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE1, 0x00071620); + gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE2, 0x00071620); + gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE3, 0x00071620); + gpu_write(gpu, REG_A6XX_GBIF_QSB_SIDE3, 0x00071620); + gpu_write(gpu, REG_A6XX_RBBM_GBIF_CLIENT_QOS_CNTL, 0x3); + } else { + gpu_write(gpu, REG_A6XX_RBBM_VBIF_CLIENT_QOS_CNTL, 0x3); + } + if (adreno_is_a630(adreno_gpu)) gpu_write(gpu, REG_A6XX_VBIF_GATE_OFF_WRREQ_EN, 0x00000009); @@ -429,25 +439,35 @@ static int a6xx_hw_init(struct msm_gpu *gpu) gpu_write(gpu, REG_A6XX_UCHE_WRITE_THRU_BASE_LO, 0xfffff000); gpu_write(gpu, REG_A6XX_UCHE_WRITE_THRU_BASE_HI, 0x0001ffff); - /* Set the GMEM VA range [0x100000:0x100000 + gpu->gmem - 1] */ - gpu_write64(gpu, REG_A6XX_UCHE_GMEM_RANGE_MIN_LO, - REG_A6XX_UCHE_GMEM_RANGE_MIN_HI, 0x00100000); + if (!adreno_is_a650(adreno_gpu)) { + /* Set the GMEM VA range [0x100000:0x100000 + gpu->gmem - 1] */ + gpu_write64(gpu, REG_A6XX_UCHE_GMEM_RANGE_MIN_LO, + REG_A6XX_UCHE_GMEM_RANGE_MIN_HI, 0x00100000); - gpu_write64(gpu, REG_A6XX_UCHE_GMEM_RANGE_MAX_LO, - REG_A6XX_UCHE_GMEM_RANGE_MAX_HI, - 0x00100000 + adreno_gpu->gmem - 1); + gpu_write64(gpu, REG_A6XX_UCHE_GMEM_RANGE_MAX_LO, + REG_A6XX_UCHE_GMEM_RANGE_MAX_HI, + 0x00100000 + adreno_gpu->gmem - 1); + } gpu_write(gpu, REG_A6XX_UCHE_FILTER_CNTL, 0x804); gpu_write(gpu, REG_A6XX_UCHE_CACHE_WAYS, 0x4); - gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2, 0x010000c0); + if (adreno_is_a640(adreno_gpu) || adreno_is_a650(adreno_gpu)) + gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2, 0x02000140); + else + gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_2, 0x010000c0); gpu_write(gpu, REG_A6XX_CP_ROQ_THRESHOLDS_1, 0x8040362c); /* Setting the mem pool size */ gpu_write(gpu, REG_A6XX_CP_MEM_POOL_SIZE, 128); /* Setting the primFifo thresholds default values */ - gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, (0x300 << 11)); + if (adreno_is_a650(adreno_gpu)) + gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, 0x00300000); + else if (adreno_is_a640(adreno_gpu)) + gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, 0x00200000); + else + gpu_write(gpu, REG_A6XX_PC_DBG_ECO_CNTL, (0x300 << 11)); /* Set the AHB default slave response to "ERROR" */ gpu_write(gpu, REG_A6XX_CP_AHB_CNTL, 0x1); @@ -471,6 +491,19 @@ static int a6xx_hw_init(struct msm_gpu *gpu) gpu_write(gpu, REG_A6XX_UCHE_CLIENT_PF, 1); + /* Set weights for bicubic filtering */ + if (adreno_is_a650(adreno_gpu)) { + gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_0, 0); + gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_1, + 0x3fe05ff4); + gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_2, + 0x3fa0ebee); + gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_3, + 0x3f5193ed); + gpu_write(gpu, REG_A6XX_TPL1_BICUBIC_WEIGHTS_TABLE_4, + 0x3f0243f0); + } + /* Protect registers from the CP */ gpu_write(gpu, REG_A6XX_CP_PROTECT_CNTL, 0x00000003); @@ -508,6 +541,11 @@ static int a6xx_hw_init(struct msm_gpu *gpu) A6XX_PROTECT_RDONLY(0x980, 0x4)); gpu_write(gpu, REG_A6XX_CP_PROTECT(25), A6XX_PROTECT_RW(0xa630, 0x0)); + if (adreno_is_a650(adreno_gpu)) { + gpu_write(gpu, REG_A6XX_CP_APRIV_CNTL, + (1 << 6) | (1 << 5) | (1 << 3) | (1 << 2) | (1 << 1)); + } + /* Enable interrupts */ gpu_write(gpu, REG_A6XX_RBBM_INT_0_MASK, A6XX_INT_MASK); From dc0fa5eb765d881977bab7568245f653407481be Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sat, 9 May 2020 20:38:45 +0800 Subject: [PATCH 182/574] drm/msm/a4xx: add adreno a405 support It adds support for adreno a405 found on MSM8939. The adreno_is_a430() check in adreno_submit() needs an extension to cover a405. The downstream driver suggests it should cover the whole a4xx generation. That's why it gets changed to adreno_is_a4xx(), while a420 is not tested though. Signed-off-by: Shawn Guo Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a4xx_gpu.c | 29 +++++++++++++--------- drivers/gpu/drm/msm/adreno/adreno_device.c | 11 ++++++++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 5 ++++ 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c index 253d8d85daad..70de59751188 100644 --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c @@ -66,19 +66,22 @@ static void a4xx_enable_hwcg(struct msm_gpu *gpu) } } - for (i = 0; i < 4; i++) { - gpu_write(gpu, REG_A4XX_RBBM_CLOCK_CTL_MARB_CCU(i), - 0x00000922); - } + /* No CCU for A405 */ + if (!adreno_is_a405(adreno_gpu)) { + for (i = 0; i < 4; i++) { + gpu_write(gpu, REG_A4XX_RBBM_CLOCK_CTL_MARB_CCU(i), + 0x00000922); + } - for (i = 0; i < 4; i++) { - gpu_write(gpu, REG_A4XX_RBBM_CLOCK_HYST_RB_MARB_CCU(i), - 0x00000000); - } + for (i = 0; i < 4; i++) { + gpu_write(gpu, REG_A4XX_RBBM_CLOCK_HYST_RB_MARB_CCU(i), + 0x00000000); + } - for (i = 0; i < 4; i++) { - gpu_write(gpu, REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1(i), - 0x00000001); + for (i = 0; i < 4; i++) { + gpu_write(gpu, REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1(i), + 0x00000001); + } } gpu_write(gpu, REG_A4XX_RBBM_CLOCK_MODE_GPC, 0x02222222); @@ -137,7 +140,9 @@ static int a4xx_hw_init(struct msm_gpu *gpu) uint32_t *ptr, len; int i, ret; - if (adreno_is_a420(adreno_gpu)) { + if (adreno_is_a405(adreno_gpu)) { + gpu_write(gpu, REG_A4XX_VBIF_ROUND_ROBIN_QOS_ARB, 0x00000003); + } else if (adreno_is_a420(adreno_gpu)) { gpu_write(gpu, REG_A4XX_VBIF_ABIT_SORT, 0x0001001F); gpu_write(gpu, REG_A4XX_VBIF_ABIT_SORT_CONF, 0x000000A4); gpu_write(gpu, REG_A4XX_VBIF_GATE_OFF_WRREQ_EN, 0x00000001); diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 1156f72532a4..7732f03d9e3a 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -92,6 +92,17 @@ static const struct adreno_info gpulist[] = { .gmem = SZ_1M, .inactive_period = DRM_MSM_INACTIVE_PERIOD, .init = a3xx_gpu_init, + }, { + .rev = ADRENO_REV(4, 0, 5, ANY_ID), + .revn = 405, + .name = "A405", + .fw = { + [ADRENO_FW_PM4] = "a420_pm4.fw", + [ADRENO_FW_PFP] = "a420_pfp.fw", + }, + .gmem = SZ_256K, + .inactive_period = DRM_MSM_INACTIVE_PERIOD, + .init = a4xx_gpu_init, }, { .rev = ADRENO_REV(4, 2, 0, ANY_ID), .revn = 420, diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index a7647eaacc7a..ff7f441932b8 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -459,7 +459,7 @@ void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, break; /* fall-thru */ case MSM_SUBMIT_CMD_BUF: - OUT_PKT3(ring, adreno_is_a430(adreno_gpu) ? + OUT_PKT3(ring, adreno_is_a4xx(adreno_gpu) ? CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD, 2); OUT_RING(ring, lower_32_bits(submit->cmd[i].iova)); OUT_RING(ring, submit->cmd[i].size); diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 88ae1b2813ef..dea97c8317e0 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -202,6 +202,11 @@ static inline bool adreno_is_a4xx(struct adreno_gpu *gpu) return (gpu->revn >= 400) && (gpu->revn < 500); } +static inline int adreno_is_a405(struct adreno_gpu *gpu) +{ + return gpu->revn == 405; +} + static inline int adreno_is_a420(struct adreno_gpu *gpu) { return gpu->revn == 420; From d3b68ddf1d38366d3dd5afae21f167aaf5161d32 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sat, 9 May 2020 20:38:46 +0800 Subject: [PATCH 183/574] drm/msm/a4xx: add a405_registers for a405 device A405 device has a different set of registers than a4xx_registers. It has no VMIDMT or XPU registers, and VBIF registers are different. Let's add a405_registers for a405 device. As adreno_is_a405() works only after adreno_gpu_init() gets called, the assignments get moved down after adreno_gpu_init(). Signed-off-by: Shawn Guo Reviewed-by: Jordan Crouse Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a4xx_gpu.c | 53 +++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c index 70de59751188..9e244982974e 100644 --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c @@ -445,6 +445,52 @@ static const unsigned int a4xx_registers[] = { ~0 /* sentinel */ }; +static const unsigned int a405_registers[] = { + /* RBBM */ + 0x0000, 0x0002, 0x0004, 0x0021, 0x0023, 0x0024, 0x0026, 0x0026, + 0x0028, 0x002B, 0x002E, 0x0034, 0x0037, 0x0044, 0x0047, 0x0066, + 0x0068, 0x0095, 0x009C, 0x0170, 0x0174, 0x01AF, + /* CP */ + 0x0200, 0x0233, 0x0240, 0x0250, 0x04C0, 0x04DD, 0x0500, 0x050B, + 0x0578, 0x058F, + /* VSC */ + 0x0C00, 0x0C03, 0x0C08, 0x0C41, 0x0C50, 0x0C51, + /* GRAS */ + 0x0C80, 0x0C81, 0x0C88, 0x0C8F, + /* RB */ + 0x0CC0, 0x0CC0, 0x0CC4, 0x0CD2, + /* PC */ + 0x0D00, 0x0D0C, 0x0D10, 0x0D17, 0x0D20, 0x0D23, + /* VFD */ + 0x0E40, 0x0E4A, + /* VPC */ + 0x0E60, 0x0E61, 0x0E63, 0x0E68, + /* UCHE */ + 0x0E80, 0x0E84, 0x0E88, 0x0E95, + /* GRAS CTX 0 */ + 0x2000, 0x2004, 0x2008, 0x2067, 0x2070, 0x2078, 0x207B, 0x216E, + /* PC CTX 0 */ + 0x21C0, 0x21C6, 0x21D0, 0x21D0, 0x21D9, 0x21D9, 0x21E5, 0x21E7, + /* VFD CTX 0 */ + 0x2200, 0x2204, 0x2208, 0x22A9, + /* GRAS CTX 1 */ + 0x2400, 0x2404, 0x2408, 0x2467, 0x2470, 0x2478, 0x247B, 0x256E, + /* PC CTX 1 */ + 0x25C0, 0x25C6, 0x25D0, 0x25D0, 0x25D9, 0x25D9, 0x25E5, 0x25E7, + /* VFD CTX 1 */ + 0x2600, 0x2604, 0x2608, 0x26A9, + /* VBIF version 0x20050000*/ + 0x3000, 0x3007, 0x302C, 0x302C, 0x3030, 0x3030, 0x3034, 0x3036, + 0x3038, 0x3038, 0x303C, 0x303D, 0x3040, 0x3040, 0x3049, 0x3049, + 0x3058, 0x3058, 0x305B, 0x3061, 0x3064, 0x3068, 0x306C, 0x306D, + 0x3080, 0x3088, 0x308B, 0x308C, 0x3090, 0x3094, 0x3098, 0x3098, + 0x309C, 0x309C, 0x30C0, 0x30C0, 0x30C8, 0x30C8, 0x30D0, 0x30D0, + 0x30D8, 0x30D8, 0x30E0, 0x30E0, 0x3100, 0x3100, 0x3108, 0x3108, + 0x3110, 0x3110, 0x3118, 0x3118, 0x3120, 0x3120, 0x3124, 0x3125, + 0x3129, 0x3129, 0x340C, 0x340C, 0x3410, 0x3410, + ~0 /* sentinel */ +}; + static struct msm_gpu_state *a4xx_gpu_state_get(struct msm_gpu *gpu) { struct msm_gpu_state *state = kzalloc(sizeof(*state), GFP_KERNEL); @@ -568,13 +614,14 @@ struct msm_gpu *a4xx_gpu_init(struct drm_device *dev) gpu->perfcntrs = NULL; gpu->num_perfcntrs = 0; - adreno_gpu->registers = a4xx_registers; - adreno_gpu->reg_offsets = a4xx_register_offsets; - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) goto fail; + adreno_gpu->registers = adreno_is_a405(adreno_gpu) ? a405_registers : + a4xx_registers; + adreno_gpu->reg_offsets = a4xx_register_offsets; + /* if needed, allocate gmem: */ if (adreno_is_a4xx(adreno_gpu)) { ret = adreno_gpu_ocmem_init(dev->dev, adreno_gpu, From 4e399b3ba8b937aaa49df836117ce4dc39bf3fcd Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 May 2020 12:43:36 +0200 Subject: [PATCH 184/574] dt-bindings: Document JZ47xx VPU auxiliary processor Inside the Video Processing Unit (VPU) of the recent JZ47xx SoCs from Ingenic is a second Xburst MIPS CPU very similar to the main core. This document describes the devicetree bindings for this auxiliary processor. Signed-off-by: Paul Cercueil Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20200515104340.10473-1-paul@crapouillou.net Signed-off-by: Bjorn Andersson --- .../bindings/remoteproc/ingenic,vpu.yaml | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml diff --git a/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml b/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml new file mode 100644 index 000000000000..c019f9fbe916 --- /dev/null +++ b/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/remoteproc/ingenic,vpu.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: Ingenic Video Processing Unit bindings + +description: + Inside the Video Processing Unit (VPU) of the recent JZ47xx SoCs from + Ingenic is a second Xburst MIPS CPU very similar to the main core. + This document describes the devicetree bindings for this auxiliary + processor. + +maintainers: + - Paul Cercueil + +properties: + compatible: + const: ingenic,jz4770-vpu-rproc + + reg: + items: + - description: aux registers + - description: tcsm0 registers + - description: tcsm1 registers + - description: sram registers + + reg-names: + items: + - const: aux + - const: tcsm0 + - const: tcsm1 + - const: sram + + clocks: + items: + - description: aux clock + - description: vpu clock + + clock-names: + items: + - const: aux + - const: vpu + + interrupts: + description: VPU hardware interrupt + +required: + - compatible + - reg + - reg-names + - clocks + - clock-names + - interrupts + +additionalProperties: false + +examples: + - | + #include + + vpu: video-decoder@132a0000 { + compatible = "ingenic,jz4770-vpu-rproc"; + + reg = <0x132a0000 0x20>, /* AUX */ + <0x132b0000 0x4000>, /* TCSM0 */ + <0x132c0000 0xc000>, /* TCSM1 */ + <0x132f0000 0x7000>; /* SRAM */ + reg-names = "aux", "tcsm0", "tcsm1", "sram"; + + clocks = <&cgu JZ4770_CLK_AUX>, <&cgu JZ4770_CLK_VPU>; + clock-names = "aux", "vpu"; + + interrupt-parent = <&cpuintc>; + interrupts = <3>; + }; From a99a37f6cd5a74d5b22c08544aa6c5890813c8ba Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 May 2020 12:43:38 +0200 Subject: [PATCH 185/574] remoteproc: Add support for runtime PM Call pm_runtime_get_sync() before the firmware is loaded, and pm_runtime_put() after the remote processor has been stopped. Even though the remoteproc device has no PM callbacks, this allows the parent device's PM callbacks to be properly called. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20200515104340.10473-3-paul@crapouillou.net Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 9c6d9ff7a1fc..0cc015fabf78 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1382,6 +1383,12 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw) if (ret) return ret; + ret = pm_runtime_get_sync(dev); + if (ret < 0) { + dev_err(dev, "pm_runtime_get_sync failed: %d\n", ret); + return ret; + } + dev_info(dev, "Booting fw image %s, size %zd\n", name, fw->size); /* @@ -1391,7 +1398,7 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw) ret = rproc_enable_iommu(rproc); if (ret) { dev_err(dev, "can't enable iommu: %d\n", ret); - return ret; + goto put_pm_runtime; } /* Prepare rproc for firmware loading if needed */ @@ -1445,6 +1452,8 @@ unprepare_rproc: rproc_unprepare_device(rproc); disable_iommu: rproc_disable_iommu(rproc); +put_pm_runtime: + pm_runtime_put(dev); return ret; } @@ -1882,6 +1891,8 @@ void rproc_shutdown(struct rproc *rproc) rproc_disable_iommu(rproc); + pm_runtime_put(dev); + /* Free the copy of the resource table */ kfree(rproc->cached_table); rproc->cached_table = NULL; @@ -2172,6 +2183,9 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, rproc->state = RPROC_OFFLINE; + pm_runtime_no_callbacks(&rproc->dev); + pm_runtime_enable(&rproc->dev); + return rproc; put_device: @@ -2191,6 +2205,7 @@ EXPORT_SYMBOL(rproc_alloc); */ void rproc_free(struct rproc *rproc) { + pm_runtime_disable(&rproc->dev); put_device(&rproc->dev); } EXPORT_SYMBOL(rproc_free); From 48f0a1bbb7586c94e0f15116b06f8179df2fd60f Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 May 2020 12:43:39 +0200 Subject: [PATCH 186/574] remoteproc: ingenic: Added remoteproc driver This driver is used to boot, communicate with and load firmwares to the MIPS co-processor found in the VPU hardware of the JZ47xx SoCs from Ingenic. Signed-off-by: Paul Cercueil Acked-by: Mathieu Poirier Link: https://lore.kernel.org/r/20200515104340.10473-4-paul@crapouillou.net Signed-off-by: Bjorn Andersson --- drivers/remoteproc/Kconfig | 9 + drivers/remoteproc/Makefile | 1 + drivers/remoteproc/ingenic_rproc.c | 280 +++++++++++++++++++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 drivers/remoteproc/ingenic_rproc.c diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index fbaed079b299..c4d1731295eb 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -23,6 +23,15 @@ config IMX_REMOTEPROC It's safe to say N here. +config INGENIC_VPU_RPROC + tristate "Ingenic JZ47xx VPU remoteproc support" + depends on MIPS || COMPILE_TEST + help + Say y or m here to support the VPU in the JZ47xx SoCs from Ingenic. + + This can be either built-in or a loadable module. + If unsure say N. + config MTK_SCP tristate "Mediatek SCP support" depends on ARCH_MEDIATEK diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile index 0effd3825035..e8b886e511f0 100644 --- a/drivers/remoteproc/Makefile +++ b/drivers/remoteproc/Makefile @@ -10,6 +10,7 @@ remoteproc-y += remoteproc_sysfs.o remoteproc-y += remoteproc_virtio.o remoteproc-y += remoteproc_elf_loader.o obj-$(CONFIG_IMX_REMOTEPROC) += imx_rproc.o +obj-$(CONFIG_INGENIC_VPU_RPROC) += ingenic_rproc.o obj-$(CONFIG_MTK_SCP) += mtk_scp.o mtk_scp_ipi.o obj-$(CONFIG_OMAP_REMOTEPROC) += omap_remoteproc.o obj-$(CONFIG_WKUP_M3_RPROC) += wkup_m3_rproc.o diff --git a/drivers/remoteproc/ingenic_rproc.c b/drivers/remoteproc/ingenic_rproc.c new file mode 100644 index 000000000000..189020d77b25 --- /dev/null +++ b/drivers/remoteproc/ingenic_rproc.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Ingenic JZ47xx remoteproc driver + * Copyright 2019, Paul Cercueil + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "remoteproc_internal.h" + +#define REG_AUX_CTRL 0x0 +#define REG_AUX_MSG_ACK 0x10 +#define REG_AUX_MSG 0x14 +#define REG_CORE_MSG_ACK 0x18 +#define REG_CORE_MSG 0x1C + +#define AUX_CTRL_SLEEP BIT(31) +#define AUX_CTRL_MSG_IRQ_EN BIT(3) +#define AUX_CTRL_NMI_RESETS BIT(2) +#define AUX_CTRL_NMI BIT(1) +#define AUX_CTRL_SW_RESET BIT(0) + +struct vpu_mem_map { + const char *name; + unsigned int da; +}; + +struct vpu_mem_info { + const struct vpu_mem_map *map; + unsigned long len; + void __iomem *base; +}; + +static const struct vpu_mem_map vpu_mem_map[] = { + { "tcsm0", 0x132b0000 }, + { "tcsm1", 0xf4000000 }, + { "sram", 0x132f0000 }, +}; + +/** + * struct vpu - Ingenic VPU remoteproc private structure + * @irq: interrupt number + * @clks: pointers to the VPU and AUX clocks + * @aux_base: raw pointer to the AUX interface registers + * @mem_info: array of struct vpu_mem_info, which contain the mapping info of + * each of the external memories + * @dev: private pointer to the device + */ +struct vpu { + int irq; + struct clk_bulk_data clks[2]; + void __iomem *aux_base; + struct vpu_mem_info mem_info[ARRAY_SIZE(vpu_mem_map)]; + struct device *dev; +}; + +static int ingenic_rproc_start(struct rproc *rproc) +{ + struct vpu *vpu = rproc->priv; + u32 ctrl; + + enable_irq(vpu->irq); + + /* Reset the AUX and enable message IRQ */ + ctrl = AUX_CTRL_NMI_RESETS | AUX_CTRL_NMI | AUX_CTRL_MSG_IRQ_EN; + writel(ctrl, vpu->aux_base + REG_AUX_CTRL); + + return 0; +} + +static int ingenic_rproc_stop(struct rproc *rproc) +{ + struct vpu *vpu = rproc->priv; + + disable_irq(vpu->irq); + + /* Keep AUX in reset mode */ + writel(AUX_CTRL_SW_RESET, vpu->aux_base + REG_AUX_CTRL); + + return 0; +} + +static void ingenic_rproc_kick(struct rproc *rproc, int vqid) +{ + struct vpu *vpu = rproc->priv; + + writel(vqid, vpu->aux_base + REG_CORE_MSG); +} + +static void *ingenic_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +{ + struct vpu *vpu = rproc->priv; + void __iomem *va = NULL; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(vpu_mem_map); i++) { + const struct vpu_mem_info *info = &vpu->mem_info[i]; + const struct vpu_mem_map *map = info->map; + + if (da >= map->da && (da + len) < (map->da + info->len)) { + va = info->base + (da - map->da); + break; + } + } + + return (__force void *)va; +} + +static struct rproc_ops ingenic_rproc_ops = { + .start = ingenic_rproc_start, + .stop = ingenic_rproc_stop, + .kick = ingenic_rproc_kick, + .da_to_va = ingenic_rproc_da_to_va, +}; + +static irqreturn_t vpu_interrupt(int irq, void *data) +{ + struct rproc *rproc = data; + struct vpu *vpu = rproc->priv; + u32 vring; + + vring = readl(vpu->aux_base + REG_AUX_MSG); + + /* Ack the interrupt */ + writel(0, vpu->aux_base + REG_AUX_MSG_ACK); + + return rproc_vq_interrupt(rproc, vring); +} + +static void ingenic_rproc_disable_clks(void *data) +{ + struct vpu *vpu = data; + + pm_runtime_resume(vpu->dev); + pm_runtime_disable(vpu->dev); + + clk_bulk_disable_unprepare(ARRAY_SIZE(vpu->clks), vpu->clks); +} + +static int ingenic_rproc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *mem; + struct rproc *rproc; + struct vpu *vpu; + unsigned int i; + int ret; + + rproc = devm_rproc_alloc(dev, "ingenic-vpu", + &ingenic_rproc_ops, NULL, sizeof(*vpu)); + if (!rproc) + return -ENOMEM; + + vpu = rproc->priv; + vpu->dev = &pdev->dev; + platform_set_drvdata(pdev, vpu); + + mem = platform_get_resource_byname(pdev, IORESOURCE_MEM, "aux"); + vpu->aux_base = devm_ioremap_resource(dev, mem); + if (IS_ERR(vpu->aux_base)) { + dev_err(dev, "Failed to ioremap\n"); + return PTR_ERR(vpu->aux_base); + } + + for (i = 0; i < ARRAY_SIZE(vpu_mem_map); i++) { + mem = platform_get_resource_byname(pdev, IORESOURCE_MEM, + vpu_mem_map[i].name); + + vpu->mem_info[i].base = devm_ioremap_resource(dev, mem); + if (IS_ERR(vpu->mem_info[i].base)) { + ret = PTR_ERR(vpu->mem_info[i].base); + dev_err(dev, "Failed to ioremap\n"); + return ret; + } + + vpu->mem_info[i].len = resource_size(mem); + vpu->mem_info[i].map = &vpu_mem_map[i]; + } + + vpu->clks[0].id = "vpu"; + vpu->clks[1].id = "aux"; + + ret = devm_clk_bulk_get(dev, ARRAY_SIZE(vpu->clks), vpu->clks); + if (ret) { + dev_err(dev, "Failed to get clocks\n"); + return ret; + } + + vpu->irq = platform_get_irq(pdev, 0); + if (vpu->irq < 0) + return vpu->irq; + + ret = devm_request_irq(dev, vpu->irq, vpu_interrupt, 0, "VPU", rproc); + if (ret < 0) { + dev_err(dev, "Failed to request IRQ\n"); + return ret; + } + + disable_irq(vpu->irq); + + /* The clocks must be enabled for the firmware to be loaded in TCSM */ + ret = clk_bulk_prepare_enable(ARRAY_SIZE(vpu->clks), vpu->clks); + if (ret) { + dev_err(dev, "Unable to start clocks\n"); + return ret; + } + + pm_runtime_irq_safe(dev); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + pm_runtime_get_sync(dev); + pm_runtime_use_autosuspend(dev); + + ret = devm_add_action_or_reset(dev, ingenic_rproc_disable_clks, vpu); + if (ret) { + dev_err(dev, "Unable to register action\n"); + goto out_pm_put; + } + + ret = devm_rproc_add(dev, rproc); + if (ret) { + dev_err(dev, "Failed to register remote processor\n"); + goto out_pm_put; + } + +out_pm_put: + pm_runtime_put_autosuspend(dev); + + return ret; +} + +static const struct of_device_id ingenic_rproc_of_matches[] = { + { .compatible = "ingenic,jz4770-vpu-rproc", }, + {} +}; +MODULE_DEVICE_TABLE(of, ingenic_rproc_of_matches); + +static int __maybe_unused ingenic_rproc_suspend(struct device *dev) +{ + struct vpu *vpu = dev_get_drvdata(dev); + + clk_bulk_disable(ARRAY_SIZE(vpu->clks), vpu->clks); + + return 0; +} + +static int __maybe_unused ingenic_rproc_resume(struct device *dev) +{ + struct vpu *vpu = dev_get_drvdata(dev); + + return clk_bulk_enable(ARRAY_SIZE(vpu->clks), vpu->clks); +} + +static const struct dev_pm_ops __maybe_unused ingenic_rproc_pm = { + SET_RUNTIME_PM_OPS(ingenic_rproc_suspend, ingenic_rproc_resume, NULL) +}; + +static struct platform_driver ingenic_rproc_driver = { + .probe = ingenic_rproc_probe, + .driver = { + .name = "ingenic-vpu", +#ifdef CONFIG_PM + .pm = &ingenic_rproc_pm, +#endif + .of_match_table = ingenic_rproc_of_matches, + }, +}; +module_platform_driver(ingenic_rproc_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul Cercueil "); +MODULE_DESCRIPTION("Ingenic JZ47xx Remote Processor control driver"); From 1ec5dbef6803293974f262f69b63f19d2ed4f6ba Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 15 May 2020 12:43:40 +0200 Subject: [PATCH 187/574] MAINTAINERS: Add myself as reviewer for Ingenic rproc driver Add myself as the reviewer for the Ingenic VPU remoteproc driver. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20200515104340.10473-5-paul@crapouillou.net Signed-off-by: Bjorn Andersson --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..1677071197a0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8418,6 +8418,7 @@ F: drivers/mtd/nand/raw/ingenic/ F: drivers/pinctrl/pinctrl-ingenic.c F: drivers/power/supply/ingenic-battery.c F: drivers/pwm/pwm-jz4740.c +F: drivers/remoteproc/ingenic_rproc.c F: drivers/rtc/rtc-jz4740.c F: drivers/tty/serial/8250/8250_ingenic.c F: drivers/usb/musb/jz4740.c From 116788689bf8ca3c3337d8dd1a742c08342d2f62 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 15 May 2020 19:35:45 -0700 Subject: [PATCH 188/574] drm/i915: Mark check_shadow_context_ppgtt as maybe unused When CONFIG_DRM_I915_DEBUG_GEM is not set, clang warns: drivers/gpu/drm/i915/gvt/scheduler.c:884:1: warning: function 'check_shadow_context_ppgtt' is not needed and will not be emitted [-Wunneeded-internal-declaration] check_shadow_context_ppgtt(struct execlist_ring_context *c, struct intel_vgpu_mm *m) ^ 1 warning generated. This warning is similar to -Wunused-function but rather than warning that the function is completely unused, it warns that it is used in some expression within the file but that expression will be evaluated to a constant or be optimized away in the final assembly, essentially making it appeared used but really isn't. Usually, this happens when a function or variable is only used in sizeof, where it will appear to be used but will be evaluated at compile time and not be required to be emitted. In this case, the function is only used in GEM_BUG_ON, which is defined as BUILD_BUG_ON_INVALID, which intentionally follows this pattern. To fix this warning, add __maybe_unused to make it clear that this is intentional depending on the configuration. Fixes: bec3df930fbd ("drm/i915/gvt: Support PPGTT table load command") Link: https://github.com/ClangBuiltLinux/linux/issues/1027 Acked-by: Zhenyu Wang Signed-off-by: Nathan Chancellor Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20200516023545.3332334-1-natechancellor@gmail.com --- drivers/gpu/drm/i915/gvt/scheduler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index c00189432b58..3a9bd8e4d8db 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -876,7 +876,7 @@ static void update_guest_pdps(struct intel_vgpu *vgpu, gpa + i * 8, &pdp[7 - i], 4); } -static bool +static __maybe_unused bool check_shadow_context_ppgtt(struct execlist_ring_context *c, struct intel_vgpu_mm *m) { if (m->ppgtt_mm.root_entry_type == GTT_TYPE_PPGTT_ROOT_L4_ENTRY) { From cb7ee52284a244fd14caec73df0d49e02891aac4 Mon Sep 17 00:00:00 2001 From: Aishwarya Ramakrishnan Date: Mon, 18 May 2020 20:33:36 +0530 Subject: [PATCH 189/574] drm/i915/gvt: Use ARRAY_SIZE for vgpu_types Prefer ARRAY_SIZE instead of using sizeof Fixes coccicheck warning: Use ARRAY_SIZE Reviewed-by: Chris Wilson Signed-off-by: Aishwarya Ramakrishnan Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20200518150336.15265-1-aishwaryarj100@gmail.com --- drivers/gpu/drm/i915/gvt/vgpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 1d5ff88078bd..7d361623ff67 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -124,7 +124,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) */ low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE; high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; - num_types = sizeof(vgpu_types) / sizeof(vgpu_types[0]); + num_types = ARRAY_SIZE(vgpu_types); gvt->types = kcalloc(num_types, sizeof(struct intel_vgpu_type), GFP_KERNEL); From ae7d292324b4be60b6d39d2e06bf2a63752f3fcd Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 19 May 2020 09:18:57 +0000 Subject: [PATCH 190/574] iommu/sun50i: Fix return value check in sun50i_iommu_probe() In case of error, the function devm_platform_ioremap_resource() returns ERR_PTR() not NULL. The NULL test in the return value check must be replaced with IS_ERR(). Fixes: 4100b8c229b3 ("iommu: Add Allwinner H6 IOMMU driver") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Acked-by: Maxime Ripard Link: https://lore.kernel.org/r/20200519091857.134170-1-weiyongjun1@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/sun50i-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 9c763d4a8e2a..1fa09ddcebd4 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -941,7 +941,7 @@ static int sun50i_iommu_probe(struct platform_device *pdev) } iommu->base = devm_platform_ioremap_resource(pdev, 0); - if (!iommu->base) { + if (IS_ERR(iommu->base)) { ret = PTR_ERR(iommu->base); goto err_free_group; } From 2bc61fbcc192f0b2e10e51e38f4f485ba5b293ca Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Tue, 19 May 2020 15:57:44 +0800 Subject: [PATCH 191/574] iommu/mediatek-v1: Fix a build warning for a unused variable 'data' This patch fixes a build warning: drivers/iommu/mtk_iommu_v1.c: In function 'mtk_iommu_release_device': >> drivers/iommu/mtk_iommu_v1.c:467:25: warning: variable 'data' set but >> not used [-Wunused-but-set-variable] 467 | struct mtk_iommu_data *data; | ^~~~ It's reported at: https://lore.kernel.org/linux-iommu/202005191458.gY38V8bU%25lkp@intel.com/T/#u Reported-by: kbuild test robot Signed-off-by: Yong Wu Link: https://lore.kernel.org/r/1589875064-662-1-git-send-email-yong.wu@mediatek.com Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu_v1.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index f353b072a5d0..c9d79cff4d17 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -469,12 +469,10 @@ static void mtk_iommu_probe_finalize(struct device *dev) static void mtk_iommu_release_device(struct device *dev) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - struct mtk_iommu_data *data; if (!fwspec || fwspec->ops != &mtk_iommu_ops) return; - data = dev_iommu_priv_get(dev); iommu_fwspec_free(dev); } From a5f304670b80973dfce5bc86cacff20244926cf6 Mon Sep 17 00:00:00 2001 From: Veronika Kabatova Date: Tue, 19 May 2020 22:00:45 +0200 Subject: [PATCH 192/574] selftests: introduce gen_tar Makefile target The gen_kselftest_tar.sh always packages *all* selftests and doesn't pass along any variables to `make install` to influence what should be built. This can result in an early error on the command line ("Unknown tarball format TARGETS=XXX"), or unexpected test failures as the tarball contains tests people wanted to skip on purpose. Since the makefile already contains all the logic, we can add a target for packaging. Keep the default .gz target the script uses, and actually extend the supported formats by using tar's autodetection. To not break current workflows, keep the gen_kselftest_tar.sh script as it is, with an added suggestion to use the makefile target instead. Signed-off-by: Veronika Kabatova Reviewed-by: Stefano Brivio Signed-off-by: Shuah Khan --- Documentation/dev-tools/kselftest.rst | 23 ++++++++++++++++++++ tools/testing/selftests/Makefile | 9 +++++++- tools/testing/selftests/gen_kselftest_tar.sh | 5 +++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index 61ae13c44f91..d4f6341cf4d9 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -151,6 +151,29 @@ note some tests will require root privileges:: $ cd kselftest $ ./run_kselftest.sh +Packaging selftests +=================== + +In some cases packaging is desired, such as when tests need to run on a +different system. To package selftests, run:: + + $ make -C tools/testing/selftests gen_tar + +This generates a tarball in the `INSTALL_PATH/kselftest-packages` directory. By +default, `.gz` format is used. The tar format can be overridden by specifying +a `FORMAT` make variable. Any value recognized by `tar's auto-compress`_ option +is supported, such as:: + + $ make -C tools/testing/selftests gen_tar FORMAT=.xz + +`make gen_tar` invokes `make install` so you can use it to package a subset of +tests by using variables specified in `Running a subset of selftests`_ +section:: + + $ make -C tools/testing/selftests gen_tar TARGETS="bpf" FORMAT=.xz + +.. _tar's auto-compress: https://www.gnu.org/software/tar/manual/html_node/gzip.html#auto_002dcompress + Contributing new tests ====================== diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2ff68702fd41..1195bd85af38 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -249,10 +249,17 @@ else $(error Error: set INSTALL_PATH to use install) endif +FORMAT ?= .gz +TAR_PATH = $(abspath ${INSTALL_PATH}/kselftest-packages/kselftest.tar${FORMAT}) +gen_tar: install + @mkdir -p ${INSTALL_PATH}/kselftest-packages/ + @tar caf ${TAR_PATH} --exclude=kselftest-packages -C ${INSTALL_PATH} . + @echo "Created ${TAR_PATH}" + clean: @for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\ done; -.PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean +.PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean gen_tar diff --git a/tools/testing/selftests/gen_kselftest_tar.sh b/tools/testing/selftests/gen_kselftest_tar.sh index 8b2b6088540d..4a974bc03385 100755 --- a/tools/testing/selftests/gen_kselftest_tar.sh +++ b/tools/testing/selftests/gen_kselftest_tar.sh @@ -49,6 +49,11 @@ main() # directory ./kselftest_install.sh "$install_dir" (cd "$install_work"; tar $copts "$dest"/kselftest${ext} $install_name) + + # Don't put the message at the actual end as people may be parsing the + # "archive created" line in their scripts. + echo -e "\nConsider using 'make gen_tar' instead of this script\n" + echo "Kselftest archive kselftest${ext} created!" # clean up top-level install work directory From c143b7753b308d5f7d03b165a7fdff1dda1f271d Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 15 May 2020 10:08:28 +0000 Subject: [PATCH 193/574] ftrace: show debugging information when panic_on_warn set When an anomaly is detected in the function call modification code, ftrace_bug() is called to disable function tracing as well as give some warn and information that may help debug the problem. But currently, we call FTRACE_WARN_ON_ONCE() first in ftrace_bug(), so when panic_on_warn is set, we can't see the debugging information here. Call FTRACE_WARN_ON_ONCE() at the end of ftrace_bug() to ensure that the debugging information is displayed first. after this patch, the dmesg looks like: ------------[ ftrace bug ]------------ ftrace failed to modify [] bcm2835_handle_irq+0x4/0x58 actual: 1f:20:03:d5 Setting ftrace call site to call ftrace function ftrace record flags: 80000001 (1) expected tramp: ffff80001009d6f0 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1635 at kernel/trace/ftrace.c:2078 ftrace_bug+0x204/0x238 Kernel panic - not syncing: panic_on_warn set ... CPU: 2 PID: 1635 Comm: sh Not tainted 5.7.0-rc5-00033-gb922183867f5 #14 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1b0 show_stack+0x20/0x30 dump_stack+0xc0/0x10c panic+0x16c/0x368 __warn+0x120/0x160 report_bug+0xc8/0x160 bug_handler+0x28/0x98 brk_handler+0x70/0xd0 do_debug_exception+0xcc/0x1ac el1_sync_handler+0xe4/0x120 el1_sync+0x7c/0x100 ftrace_bug+0x204/0x238 Link: https://lkml.kernel.org/r/20200515100828.7091-1-cj.chengjian@huawei.com Signed-off-by: Cheng Jian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bd030b1b9514..cd39cbf3631a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2027,14 +2027,14 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) { unsigned long ip = rec ? rec->ip : 0; + pr_info("------------[ ftrace bug ]------------\n"); + switch (failed) { case -EFAULT: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); print_ip_sym(ip); break; case -EINVAL: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); print_ip_ins(" actual: ", (unsigned char *)ip); @@ -2045,12 +2045,10 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } break; case -EPERM: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); print_ip_sym(ip); break; default: - FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } @@ -2077,6 +2075,8 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) ip = ftrace_get_addr_curr(rec); pr_cont("\n expected tramp: %lx\n", ip); } + + FTRACE_WARN_ON_ONCE(1); } static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) From 98d0a685cf8bcde23f4b0d1aa0a348fcef84569b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 9 May 2020 09:58:25 +0900 Subject: [PATCH 194/574] tools/bootconfig: Add a summary of test cases and return error Add summary lines of test cases and return an error code if any test case fails so that tester don't have to monitor the output. Link: https://lkml.kernel.org/r/158898590533.22749.10269622752797822320.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/test-bootconfig.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh index 81b350ffd03f..eff16b77d5eb 100755 --- a/tools/bootconfig/test-bootconfig.sh +++ b/tools/bootconfig/test-bootconfig.sh @@ -124,9 +124,16 @@ for i in samples/good-* ; do xpass $BOOTCONF -a $i $INITRD done + +echo +echo "=== Summary ===" +echo "# of Passed: $(expr $NO - $NG - 1)" +echo "# of Failed: $NG" + echo if [ $NG -eq 0 ]; then echo "All tests passed" else echo "$NG tests failed" + exit 1 fi From fc9d276f22330e9322a9c592c71e0571810205f7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 7 May 2020 21:30:08 +0200 Subject: [PATCH 195/574] tracing/probe: reverse arguments to list_add Elsewhere in the file, the function trace_kprobe_has_same_kprobe uses a trace_probe_event.probes object as the second argument of list_for_each_entry, ie as a list head, while the list_for_each_entry iterates over the list fields of the trace_probe structures, making them the list elements. So, exchange the arguments on the list_add call to put the list head in the second argument. Since both list_head structures were just initialized, this problem did not cause any loss of information. Link: https://lkml.kernel.org/r/1588879808-24488-1-git-send-email-Julia.Lawall@inria.fr Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Acked-by: Masami Hiramatsu Signed-off-by: Julia Lawall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ab8b6436d53f..b8a928e925c7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1006,7 +1006,7 @@ int trace_probe_init(struct trace_probe *tp, const char *event, INIT_LIST_HEAD(&tp->event->class.fields); INIT_LIST_HEAD(&tp->event->probes); INIT_LIST_HEAD(&tp->list); - list_add(&tp->event->probes, &tp->list); + list_add(&tp->list, &tp->event->probes); call = trace_probe_event_call(tp); call->class = &tp->event->class; From 55d7b4b8292bf7b698e947b1cfe263e4acac6cec Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 8 May 2020 22:07:24 +0800 Subject: [PATCH 196/574] s390: Remove two unused inline functions commit 657480d9c015 ("s390: support KPROBES_ON_FTRACE") left behind this, remove it. Signed-off-by: YueHaibing Acked-by: Sven Schnelle Link: https://lore.kernel.org/r/20200508140724.11324-1-yuehaibing@huawei.com Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/kernel/ftrace.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 4cd9b1ada834..44e01dd1e624 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -72,22 +72,6 @@ static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn) #endif } -static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn) -{ -#ifdef CONFIG_KPROBES - insn->opc = BREAKPOINT_INSTRUCTION; - insn->disp = KPROBE_ON_FTRACE_NOP; -#endif -} - -static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn) -{ -#ifdef CONFIG_KPROBES - insn->opc = BREAKPOINT_INSTRUCTION; - insn->disp = KPROBE_ON_FTRACE_CALL; -#endif -} - int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { From fee36576656f662dcfa4788eaeb033a4c5dd870b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 12 May 2020 09:26:39 +0200 Subject: [PATCH 197/574] s390/cio: Remove unused inline function idset_sch_get_first commit 8ebd51a705c5 ("s390/cio: idset.c: remove some unused functions") left behind this, remove it Link: https://lkml.kernel.org/r/20200508140643.30540-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Reviewed-by: Vineeth Vijayan [vneethv@linux.ibm.com: Slight modification in the title] Signed-off-by: Vineeth Vijayan Signed-off-by: Vasily Gorbik --- drivers/s390/cio/idset.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/s390/cio/idset.c b/drivers/s390/cio/idset.c index 77d0ea7b381b..45f9c0736be4 100644 --- a/drivers/s390/cio/idset.c +++ b/drivers/s390/cio/idset.c @@ -59,18 +59,6 @@ static inline int idset_contains(struct idset *set, int ssid, int id) return test_bit(ssid * set->num_id + id, set->bitmap); } -static inline int idset_get_first(struct idset *set, int *ssid, int *id) -{ - int bitnum; - - bitnum = find_first_bit(set->bitmap, set->num_ssid * set->num_id); - if (bitnum >= set->num_ssid * set->num_id) - return 0; - *ssid = bitnum / set->num_id; - *id = bitnum % set->num_id; - return 1; -} - struct idset *idset_sch_new(void) { return idset_new(max_ssid + 1, __MAX_SUBCHANNEL + 1); From 9de6c08700ce9fab4589c1a44cec2a21812ab565 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 21 Apr 2020 10:22:01 +0200 Subject: [PATCH 198/574] s390/qdio: simplify overlap calculation on Input refill Refilling the Input Queue requires additional checks, as the refilled SBALs can overlap with the ACKs that qdio maintains on the queue. This code path is way too complex, and does a whole bunch of wrap-around checks that the modulo arithmetic in sub_buf() takes care of by itself. So shrink down all that code into a few lines of equivalent functionality. Signed-off-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_main.c | 50 +++++------------------------------- 1 file changed, 7 insertions(+), 43 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 4bd6dbfe8387..c3d696bb106b 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1438,24 +1438,6 @@ out: } EXPORT_SYMBOL_GPL(qdio_activate); -static inline int buf_in_between(int bufnr, int start, int count) -{ - int end = add_buf(start, count); - - if (end > start) { - if (bufnr >= start && bufnr < end) - return 1; - else - return 0; - } - - /* wrap-around case */ - if (bufnr >= start || bufnr < end) - return 1; - else - return 0; -} - /** * handle_inbound - reset processed input buffers * @q: queue containing the buffers @@ -1466,36 +1448,18 @@ static inline int buf_in_between(int bufnr, int start, int count) static int handle_inbound(struct qdio_q *q, unsigned int callflags, int bufnr, int count) { - int diff; + int overlap; qperf_inc(q, inbound_call); - if (!q->u.in.ack_count) - goto set; - - /* protect against stop polling setting an ACK for an emptied slsb */ - if (count == QDIO_MAX_BUFFERS_PER_Q) { - /* overwriting everything, just delete polling status */ - q->u.in.ack_count = 0; - goto set; - } else if (buf_in_between(q->u.in.ack_start, bufnr, count)) { - if (is_qebsm(q)) { - /* partial overwrite, just update ack_start */ - diff = add_buf(bufnr, count); - diff = sub_buf(diff, q->u.in.ack_start); - q->u.in.ack_count -= diff; - if (q->u.in.ack_count <= 0) { - q->u.in.ack_count = 0; - goto set; - } - q->u.in.ack_start = add_buf(q->u.in.ack_start, diff); - } else { - /* the only ACK will be deleted */ - q->u.in.ack_count = 0; - } + /* If any ACKed SBALs are returned to HW, adjust ACK tracking: */ + overlap = min(count - sub_buf(q->u.in.ack_start, bufnr), + q->u.in.ack_count); + if (overlap > 0) { + q->u.in.ack_start = add_buf(q->u.in.ack_start, overlap); + q->u.in.ack_count -= overlap; } -set: count = set_buf_states(q, bufnr, SLSB_CU_INPUT_EMPTY, count); atomic_add(count, &q->nr_buf_used); From 1db85d0e73f3b017c63afcdec1197d4d0736c362 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 21 Apr 2020 10:26:22 +0200 Subject: [PATCH 199/574] s390/qdio: refactor ACK processing for primed SBALs inbound_primed() currently has two code paths - one for QEBSM that knows how to deal with multiple ACKs, and a non-QEBSM path that strictly assumes a single ACK on the queue. In preparation for a subsequent patch, slightly adjust the non-QEBSM path so that it can manage a queue with multiple ACKs. Signed-off-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_main.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index c3d696bb106b..eea3032e68c0 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -466,15 +466,14 @@ static inline void inbound_primed(struct qdio_q *q, unsigned int start, * or by the next inbound run. */ new = add_buf(start, count - 1); - if (q->u.in.ack_count) { - /* reset the previous ACK but first set the new one */ - set_buf_state(q, new, SLSB_P_INPUT_ACK); - set_buf_state(q, q->u.in.ack_start, SLSB_P_INPUT_NOT_INIT); - } else { - q->u.in.ack_count = 1; - set_buf_state(q, new, SLSB_P_INPUT_ACK); - } + set_buf_state(q, new, SLSB_P_INPUT_ACK); + /* delete the previous ACKs */ + if (q->u.in.ack_count) + set_buf_states(q, q->u.in.ack_start, SLSB_P_INPUT_NOT_INIT, + q->u.in.ack_count); + + q->u.in.ack_count = 1; q->u.in.ack_start = new; count--; if (!count) From c70d82e96644bd660ea53209c19f75cd86c560d6 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 21 Apr 2020 10:38:18 +0200 Subject: [PATCH 200/574] s390/qdio: add IRQ reduction for error SBALs SBALs in PRIMED or ERROR state represent new work on the Input Queue. But while inbound_primed() does all sorts of ACK management for new PRIMED work, the same handling is currently missing for ERROR work. In particular the path for ERROR work doesn't clear up _old_ ACKs. Treat ERROR work the same as PRIMED work, but consider that the QEBSM auto-ACK feature doesn't apply here. So we need to set the ACK manually, as if it was a non-QEBSM device. Note that this doesn't aspire to actually improve performance, the main goal is to just unify the code paths and have consistent behaviour. Signed-off-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_main.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index eea3032e68c0..f5596265b053 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -438,15 +438,12 @@ static void process_buffer_error(struct qdio_q *q, unsigned int start, q->sbal[start]->element[15].sflags); } -static inline void inbound_primed(struct qdio_q *q, unsigned int start, - int count) +static inline void inbound_handle_work(struct qdio_q *q, unsigned int start, + int count, bool auto_ack) { int new; - DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in prim:%1d %02x", q->nr, count); - - /* for QEBSM the ACK was already set by EQBS */ - if (is_qebsm(q)) { + if (auto_ack) { if (!q->u.in.ack_count) { q->u.in.ack_count = count; q->u.in.ack_start = start; @@ -507,19 +504,21 @@ static int get_inbound_buffer_frontier(struct qdio_q *q, unsigned int start) switch (state) { case SLSB_P_INPUT_PRIMED: - inbound_primed(q, start, count); + DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in prim:%1d %02x", q->nr, + count); + + inbound_handle_work(q, start, count, is_qebsm(q)); if (atomic_sub_return(count, &q->nr_buf_used) == 0) qperf_inc(q, inbound_queue_full); if (q->irq_ptr->perf_stat_enabled) account_sbals(q, count); return count; case SLSB_P_INPUT_ERROR: + DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in err:%1d %02x", q->nr, + count); + process_buffer_error(q, start, count); - /* - * Interrupts may be avoided as long as the error is present - * so change the buffer state immediately to avoid starvation. - */ - set_buf_states(q, start, SLSB_P_INPUT_NOT_INIT, count); + inbound_handle_work(q, start, count, false); if (atomic_sub_return(count, &q->nr_buf_used) == 0) qperf_inc(q, inbound_queue_full); if (q->irq_ptr->perf_stat_enabled) From a1ceea67f2e5b73cebd456e7fb463b3052bc6344 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 27 Apr 2020 16:25:27 +0200 Subject: [PATCH 201/574] PCI/IOV: Introduce pci_iov_sysfs_link() function Currently pci_iov_add_virtfn() scans the SR-IOV BARs, adds the VF to the bus and also creates the sysfs links between the newly added VF and its parent PF. With pdev->no_vf_scan fencing off the entire pci_iov_add_virtfn() call s390 as the sole pdev->no_vf_scan user thus ends up missing these sysfs links which are required for example by QEMU/libvirt. Instead of duplicating the code refactor pci_iov_add_virtfn() to make sysfs link creation callable separately. Signed-off-by: Niklas Schnelle Acked-by: Bjorn Helgaas Reviewed-by: Pierre Morel Link: https://lore.kernel.org/r/20200506154139.90609-1-schnelle@linux.ibm.com Signed-off-by: Vasily Gorbik --- drivers/pci/iov.c | 36 +++++++++++++++++++++++++----------- include/linux/pci.h | 8 ++++++++ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 4d1f392b05f9..ee6fbe688498 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -133,12 +133,35 @@ static void pci_read_vf_config_common(struct pci_dev *virtfn) &physfn->sriov->subsystem_device); } +int pci_iov_sysfs_link(struct pci_dev *dev, + struct pci_dev *virtfn, int id) +{ + char buf[VIRTFN_ID_LEN]; + int rc; + + sprintf(buf, "virtfn%u", id); + rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf); + if (rc) + goto failed; + rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn"); + if (rc) + goto failed1; + + kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE); + + return 0; + +failed1: + sysfs_remove_link(&dev->dev.kobj, buf); +failed: + return rc; +} + int pci_iov_add_virtfn(struct pci_dev *dev, int id) { int i; int rc = -ENOMEM; u64 size; - char buf[VIRTFN_ID_LEN]; struct pci_dev *virtfn; struct resource *res; struct pci_sriov *iov = dev->sriov; @@ -182,23 +205,14 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id) } pci_device_add(virtfn, virtfn->bus); - - sprintf(buf, "virtfn%u", id); - rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf); + rc = pci_iov_sysfs_link(dev, virtfn, id); if (rc) goto failed1; - rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn"); - if (rc) - goto failed2; - - kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE); pci_bus_add_device(virtfn); return 0; -failed2: - sysfs_remove_link(&dev->dev.kobj, buf); failed1: pci_stop_and_remove_bus_device(virtfn); pci_dev_put(dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 83ce1cdf5676..93a063a7d7f9 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2048,6 +2048,8 @@ int pci_iov_virtfn_devfn(struct pci_dev *dev, int id); int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); void pci_disable_sriov(struct pci_dev *dev); + +int pci_iov_sysfs_link(struct pci_dev *dev, struct pci_dev *virtfn, int id); int pci_iov_add_virtfn(struct pci_dev *dev, int id); void pci_iov_remove_virtfn(struct pci_dev *dev, int id); int pci_num_vf(struct pci_dev *dev); @@ -2073,6 +2075,12 @@ static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id) } static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) { return -ENODEV; } + +static inline int pci_iov_sysfs_link(struct pci_dev *dev, + struct pci_dev *virtfn, int id) +{ + return -ENODEV; +} static inline int pci_iov_add_virtfn(struct pci_dev *dev, int id) { return -ENOSYS; From e5794cf1a270d813a5b9373a6876487d4d154195 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 28 Apr 2020 11:54:46 +0200 Subject: [PATCH 202/574] s390/pci: create links between PFs and VFs On s390 PCI Virtual Functions (VFs) are scanned by firmware and are made available to Linux via the hot-plug interface. As such the common code path of doing the scan directly using the parent Physical Function (PF) is not used and fenced off with the no_vf_scan attribute. Even if the partition created the VFs itself e.g. using the sriov_numvfs attribute of a PF, the PF/VF links thus need to be established after the fact. To do this when a VF is plugged we scan through all functions on the same zbus and test whether they are the parent PF in which case we establish the necessary links. With these links established there is now no more need to fence off pci_iov_remove_virtfn() for pdev->no_vf_scan as the common code now works fine. Signed-off-by: Niklas Schnelle Acked-by: Bjorn Helgaas Reviewed-by: Pierre Morel Link: https://lore.kernel.org/r/20200506154139.90609-3-schnelle@linux.ibm.com Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pci.h | 3 +- arch/s390/include/asm/pci_clp.h | 3 +- arch/s390/pci/pci_bus.c | 72 ++++++++++++++++++++++++++++++++- arch/s390/pci/pci_clp.c | 1 + drivers/pci/iov.c | 3 -- 5 files changed, 75 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index c1558cf071b8..99b92c3e46b0 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -131,7 +131,8 @@ struct zpci_dev { u8 port; u8 rid_available : 1; u8 has_hp_slot : 1; - u8 reserved : 6; + u8 is_physfn : 1; + u8 reserved : 5; unsigned int devfn; /* DEVFN part of the RID*/ struct mutex lock; diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index 896ee41e23e3..eb51272dd2cc 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -95,7 +95,8 @@ struct clp_rsp_query_pci { u16 vfn; /* virtual fn number */ u16 : 3; u16 rid_avail : 1; - u16 : 2; + u16 is_physfn : 1; + u16 reserved1 : 1; u16 mio_addr_avail : 1; u16 util_str_avail : 1; /* utility string available? */ u16 pfgid : 8; /* pci function group id */ diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index ada571d1c630..642a99384688 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -126,6 +126,64 @@ static struct zpci_bus *zpci_bus_alloc(int pchid) return zbus; } +#ifdef CONFIG_PCI_IOV +static int zpci_bus_link_virtfn(struct pci_dev *pdev, + struct pci_dev *virtfn, int vfid) +{ + int rc; + + virtfn->physfn = pci_dev_get(pdev); + rc = pci_iov_sysfs_link(pdev, virtfn, vfid); + if (rc) { + pci_dev_put(pdev); + virtfn->physfn = NULL; + return rc; + } + return 0; +} + +static int zpci_bus_setup_virtfn(struct zpci_bus *zbus, + struct pci_dev *virtfn, int vfn) +{ + int i, cand_devfn; + struct zpci_dev *zdev; + struct pci_dev *pdev; + int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ + int rc = 0; + + virtfn->is_virtfn = 1; + virtfn->multifunction = 0; + WARN_ON(vfid < 0); + /* If the parent PF for the given VF is also configured in the + * instance, it must be on the same zbus. + * We can then identify the parent PF by checking what + * devfn the VF would have if it belonged to that PF using the PF's + * stride and offset. Only if this candidate devfn matches the + * actual devfn will we link both functions. + */ + for (i = 0; i < ZPCI_FUNCTIONS_PER_BUS; i++) { + zdev = zbus->function[i]; + if (zdev && zdev->is_physfn) { + pdev = pci_get_slot(zbus->bus, zdev->devfn); + cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); + if (cand_devfn == virtfn->devfn) { + rc = zpci_bus_link_virtfn(pdev, virtfn, vfid); + break; + } + } + } + return rc; +} +#else +static inline int zpci_bus_setup_virtfn(struct zpci_bus *zbus, + struct pci_dev *virtfn, int vfn) +{ + virtfn->is_virtfn = 1; + virtfn->multifunction = 0; + return 0; +} +#endif + static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) { struct pci_bus *bus; @@ -156,10 +214,20 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) } pdev = pci_scan_single_device(bus, zdev->devfn); - if (pdev) + if (pdev) { + if (!zdev->is_physfn) { + rc = zpci_bus_setup_virtfn(zbus, pdev, zdev->vfn); + if (rc) + goto failed_with_pdev; + } pci_bus_add_device(pdev); - + } return 0; + +failed_with_pdev: + pci_stop_and_remove_bus_device(pdev); + pci_dev_put(pdev); + return rc; } static void zpci_bus_add_devices(struct zpci_bus *zbus) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 9b318824a134..d7bd3c287cf7 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -159,6 +159,7 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, zdev->uid = response->uid; zdev->fmb_length = sizeof(u32) * response->fmb_len; zdev->rid_available = response->rid_avail; + zdev->is_physfn = response->is_physfn; if (!s390_pci_no_rid && zdev->rid_available) zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN; diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index ee6fbe688498..b37e08c4f9d1 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -571,9 +571,6 @@ static void sriov_del_vfs(struct pci_dev *dev) struct pci_sriov *iov = dev->sriov; int i; - if (dev->no_vf_scan) - return; - for (i = 0; i < iov->num_VFs; i++) pci_iov_remove_virtfn(dev, i); } From 4765600fc7aa2f650951c3d0ed19e04e4c9e4b06 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 24 Apr 2020 10:39:04 +0200 Subject: [PATCH 203/574] s390: simplify memory notifier for protecting kdump crash kernel area Assume we have a crashkernel area of 256MB reserved: root@vm0:~# cat /proc/iomem 00000000-6fffffff : System RAM 0f258000-0fcfffff : Kernel code 0fd00000-101d10e3 : Kernel data 105b3000-1068dfff : Kernel bss 70000000-7fffffff : Crash kernel This exactly corresponds to memory block 7 (memory block size is 256MB). Trying to offline that memory block results in: root@vm0:~# echo "offline" > /sys/devices/system/memory/memory7/state -bash: echo: write error: Device or resource busy [ 128.458762] page:000003d081c00000 refcount:1 mapcount:0 mapping:00000000d01cecd4 index:0x0 [ 128.458773] flags: 0x1ffff00000001000(reserved) [ 128.458781] raw: 1ffff00000001000 000003d081c00008 000003d081c00008 0000000000000000 [ 128.458781] raw: 0000000000000000 0000000000000000 ffffffff00000001 0000000000000000 [ 128.458783] page dumped because: unmovable page The craskernel area is marked reserved in the bootmem allocator. This results in the memmap getting initialized (refcount=1, PG_reserved), but the pages are never freed to the page allocator. So these pages look like allocated pages that are unmovable (esp. PG_reserved), and therefore, memory offlining fails early, when trying to isolate the page range. We only have to care about the exchange area, make that clear. Signed-off-by: David Hildenbrand Reviewed-by: Gerald Schaefer Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Philipp Rudo Cc: Gerald Schaefer Cc: Eric W. Biederman Cc: Michal Hocko Link: https://lore.kernel.org/r/20200424083904.8587-1-david@redhat.com Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 36445dd40fdb..12f07565ef64 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -597,9 +597,10 @@ static void __init setup_memory_end(void) #ifdef CONFIG_CRASH_DUMP /* - * When kdump is enabled, we have to ensure that no memory from - * the area [0 - crashkernel memory size] and - * [crashk_res.start - crashk_res.end] is set offline. + * When kdump is enabled, we have to ensure that no memory from the area + * [0 - crashkernel memory size] is set offline - it will be exchanged with + * the crashkernel memory region when kdump is triggered. The crashkernel + * memory region can never get offlined (pages are unmovable). */ static int kdump_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) @@ -610,11 +611,7 @@ static int kdump_mem_notifier(struct notifier_block *nb, return NOTIFY_OK; if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res))) return NOTIFY_BAD; - if (arg->start_pfn > PFN_DOWN(crashk_res.end)) - return NOTIFY_OK; - if (arg->start_pfn + arg->nr_pages - 1 < PFN_DOWN(crashk_res.start)) - return NOTIFY_OK; - return NOTIFY_BAD; + return NOTIFY_OK; } static struct notifier_block kdump_mem_nb = { From d03756aa0535f347f321c681ab0ca0fc7ba335bc Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 7 May 2020 16:21:37 +0200 Subject: [PATCH 204/574] Documentation/s390: Update / remove developerWorks web links s390 documentation now lives in IBM Knowledge Center, so update the link in the zfcpdump documentation. Also, remove the old developerWorks links from the appldata source code. Those were not really documentation related, but rather a reminder to the developer that some documentation has to be adjusted when changing the record layout, which should still be pretty obvious from the remaining comment. Signed-off-by: Gerald Schaefer Signed-off-by: Vasily Gorbik --- Documentation/s390/zfcpdump.rst | 4 ++-- arch/s390/appldata/appldata_mem.c | 4 ---- arch/s390/appldata/appldata_net_sum.c | 4 ---- arch/s390/appldata/appldata_os.c | 4 ---- 4 files changed, 2 insertions(+), 14 deletions(-) diff --git a/Documentation/s390/zfcpdump.rst b/Documentation/s390/zfcpdump.rst index 54e8e7caf7e7..a61de7aa8778 100644 --- a/Documentation/s390/zfcpdump.rst +++ b/Documentation/s390/zfcpdump.rst @@ -46,5 +46,5 @@ initramfs with a user space application that writes the dump to a SCSI partition. For more information on how to use zfcpdump refer to the s390 'Using the Dump -Tools book', which is available from -http://www.ibm.com/developerworks/linux/linux390. +Tools' book, which is available from IBM Knowledge Center: +https://www.ibm.com/support/knowledgecenter/linuxonibm/liaaf/lnz_r_dt.html diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c index e68136c3c23a..21c3147bd92a 100644 --- a/arch/s390/appldata/appldata_mem.c +++ b/arch/s390/appldata/appldata_mem.c @@ -29,10 +29,6 @@ * the structure version (product ID, see appldata_base.c) needs to be changed * as well and all documentation and z/VM applications using it must be * updated. - * - * The record layout is documented in the Linux for zSeries Device Drivers - * book: - * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml */ struct appldata_mem_data { u64 timestamp; diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c index 8bc14b0d1def..59c282ca002f 100644 --- a/arch/s390/appldata/appldata_net_sum.c +++ b/arch/s390/appldata/appldata_net_sum.c @@ -25,10 +25,6 @@ * This is accessed as binary data by z/VM. If changes to it can't be avoided, * the structure version (product ID, see appldata_base.c) needs to be changed * as well and all documentation and z/VM applications using it must be updated. - * - * The record layout is documented in the Linux for zSeries Device Drivers - * book: - * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml */ struct appldata_net_sum_data { u64 timestamp; diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 8bf46d705957..5503217366ec 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -32,10 +32,6 @@ * the structure version (product ID, see appldata_base.c) needs to be changed * as well and all documentation and z/VM applications using it must be * updated. - * - * The record layout is documented in the Linux for zSeries Device Drivers - * book: - * http://oss.software.ibm.com/developerworks/opensource/linux390/index.shtml */ struct appldata_os_per_cpu { u32 per_cpu_user; /* timer ticks spent in user mode */ From bc4b295e87a86bf14333753daeb1c84909197c46 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 8 May 2020 15:51:19 +0200 Subject: [PATCH 205/574] s390/ap: introduce new ap function ap_get_qdev() Provide a new interface function to be used by the ap drivers: struct ap_queue *ap_get_qdev(ap_qid_t qid); Returns ptr to the struct ap_queue device or NULL if there was no ap_queue device with this qid found. When something is found, the reference count of the embedded device is increased. So the caller has to decrease the reference count after use with a call to put_device(&aq->ap_dev.device). With this patch also the ap_card_list is removed from the ap core code and a new hashtable is introduced which stores hnodes of all the ap queues known to the ap bus. The hashtable approach and a first implementation of this interface comes from a previous patch from Anthony Krowiak and an idea from Halil Pasic. Signed-off-by: Harald Freudenberger Suggested-by: Tony Krowiak Suggested-by: Halil Pasic Reviewed-by: Tony Krowiak Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.c | 94 +++++++++++++++++++--------------- drivers/s390/crypto/ap_bus.h | 25 +++++---- drivers/s390/crypto/ap_card.c | 47 +++++++++-------- drivers/s390/crypto/ap_queue.c | 10 ++-- 4 files changed, 95 insertions(+), 81 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 35064443e748..e71ca4a719a5 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -62,8 +62,10 @@ MODULE_PARM_DESC(aqmask, "AP bus domain mask."); static struct device *ap_root_device; -DEFINE_SPINLOCK(ap_list_lock); -LIST_HEAD(ap_card_list); +/* Hashtable of all queue devices on the AP bus */ +DEFINE_HASHTABLE(ap_queues, 8); +/* lock used for the ap_queues hashtable */ +DEFINE_SPINLOCK(ap_queues_lock); /* Default permissions (ioctl, card and domain masking) */ struct ap_perms ap_perms; @@ -414,7 +416,7 @@ static void ap_interrupt_handler(struct airq_struct *airq, bool floating) */ static void ap_tasklet_fn(unsigned long dummy) { - struct ap_card *ac; + int bkt; struct ap_queue *aq; enum ap_wait wait = AP_WAIT_NONE; @@ -425,34 +427,30 @@ static void ap_tasklet_fn(unsigned long dummy) if (ap_using_interrupts()) xchg(ap_airq.lsi_ptr, 0); - spin_lock_bh(&ap_list_lock); - for_each_ap_card(ac) { - for_each_ap_queue(aq, ac) { - spin_lock_bh(&aq->lock); - wait = min(wait, ap_sm_event_loop(aq, AP_EVENT_POLL)); - spin_unlock_bh(&aq->lock); - } + spin_lock_bh(&ap_queues_lock); + hash_for_each(ap_queues, bkt, aq, hnode) { + spin_lock_bh(&aq->lock); + wait = min(wait, ap_sm_event_loop(aq, AP_EVENT_POLL)); + spin_unlock_bh(&aq->lock); } - spin_unlock_bh(&ap_list_lock); + spin_unlock_bh(&ap_queues_lock); ap_wait(wait); } static int ap_pending_requests(void) { - struct ap_card *ac; + int bkt; struct ap_queue *aq; - spin_lock_bh(&ap_list_lock); - for_each_ap_card(ac) { - for_each_ap_queue(aq, ac) { - if (aq->queue_count == 0) - continue; - spin_unlock_bh(&ap_list_lock); - return 1; - } + spin_lock_bh(&ap_queues_lock); + hash_for_each(ap_queues, bkt, aq, hnode) { + if (aq->queue_count == 0) + continue; + spin_unlock_bh(&ap_queues_lock); + return 1; } - spin_unlock_bh(&ap_list_lock); + spin_unlock_bh(&ap_queues_lock); return 0; } @@ -683,24 +681,20 @@ static int ap_device_probe(struct device *dev) } /* Add queue/card to list of active queues/cards */ - spin_lock_bh(&ap_list_lock); - if (is_card_dev(dev)) - list_add(&to_ap_card(dev)->list, &ap_card_list); - else - list_add(&to_ap_queue(dev)->list, - &to_ap_queue(dev)->card->queues); - spin_unlock_bh(&ap_list_lock); + spin_lock_bh(&ap_queues_lock); + if (is_queue_dev(dev)) + hash_add(ap_queues, &to_ap_queue(dev)->hnode, + to_ap_queue(dev)->qid); + spin_unlock_bh(&ap_queues_lock); ap_dev->drv = ap_drv; rc = ap_drv->probe ? ap_drv->probe(ap_dev) : -ENODEV; if (rc) { - spin_lock_bh(&ap_list_lock); - if (is_card_dev(dev)) - list_del_init(&to_ap_card(dev)->list); - else - list_del_init(&to_ap_queue(dev)->list); - spin_unlock_bh(&ap_list_lock); + spin_lock_bh(&ap_queues_lock); + if (is_queue_dev(dev)) + hash_del(&to_ap_queue(dev)->hnode); + spin_unlock_bh(&ap_queues_lock); ap_dev->drv = NULL; } @@ -725,16 +719,33 @@ static int ap_device_remove(struct device *dev) ap_queue_remove(to_ap_queue(dev)); /* Remove queue/card from list of active queues/cards */ - spin_lock_bh(&ap_list_lock); - if (is_card_dev(dev)) - list_del_init(&to_ap_card(dev)->list); - else - list_del_init(&to_ap_queue(dev)->list); - spin_unlock_bh(&ap_list_lock); + spin_lock_bh(&ap_queues_lock); + if (is_queue_dev(dev)) + hash_del(&to_ap_queue(dev)->hnode); + spin_unlock_bh(&ap_queues_lock); return 0; } +struct ap_queue *ap_get_qdev(ap_qid_t qid) +{ + int bkt; + struct ap_queue *aq; + + spin_lock_bh(&ap_queues_lock); + hash_for_each(ap_queues, bkt, aq, hnode) { + if (aq->qid == qid) { + get_device(&aq->ap_dev.device); + spin_unlock_bh(&ap_queues_lock); + return aq; + } + } + spin_unlock_bh(&ap_queues_lock); + + return NULL; +} +EXPORT_SYMBOL(ap_get_qdev); + int ap_driver_register(struct ap_driver *ap_drv, struct module *owner, char *name) { @@ -1506,6 +1517,9 @@ static int __init ap_module_init(void) return -ENODEV; } + /* init ap_queue hashtable */ + hash_init(ap_queues); + /* set up the AP permissions (ioctls, ap and aq masks) */ ap_perms_init(); diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 8e8e37b6c0ee..053cc34d2ca2 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -27,8 +28,8 @@ extern int ap_domain_index; -extern spinlock_t ap_list_lock; -extern struct list_head ap_card_list; +extern DECLARE_HASHTABLE(ap_queues, 8); +extern spinlock_t ap_queues_lock; static inline int ap_test_bit(unsigned int *ptr, unsigned int nr) { @@ -152,8 +153,6 @@ struct ap_device { struct ap_card { struct ap_device ap_dev; - struct list_head list; /* Private list of AP cards. */ - struct list_head queues; /* List of assoc. AP queues */ void *private; /* ap driver private pointer. */ int raw_hwtype; /* AP raw hardware type. */ unsigned int functions; /* AP device function bitfield. */ @@ -166,7 +165,7 @@ struct ap_card { struct ap_queue { struct ap_device ap_dev; - struct list_head list; /* Private list of AP queues. */ + struct hlist_node hnode; /* Node for the ap_queues hashtable */ struct ap_card *card; /* Ptr to assoc. AP card. */ spinlock_t lock; /* Per device lock. */ void *private; /* ap driver private pointer. */ @@ -223,12 +222,6 @@ static inline void ap_release_message(struct ap_message *ap_msg) kzfree(ap_msg->private); } -#define for_each_ap_card(_ac) \ - list_for_each_entry(_ac, &ap_card_list, list) - -#define for_each_ap_queue(_aq, _ac) \ - list_for_each_entry(_aq, &(_ac)->queues, list) - /* * Note: don't use ap_send/ap_recv after using ap_queue_message * for the first time. Otherwise the ap message queue will get @@ -269,6 +262,16 @@ struct ap_perms { extern struct ap_perms ap_perms; extern struct mutex ap_perms_mutex; +/* + * Get ap_queue device for this qid. + * Returns ptr to the struct ap_queue device or NULL if there + * was no ap_queue device with this qid found. When something is + * found, the reference count of the embedded device is increased. + * So the caller has to decrease the reference count after use + * with a call to put_device(&aq->ap_dev.device). + */ +struct ap_queue *ap_get_qdev(ap_qid_t qid); + /* * check APQN for owned/reserved by ap bus and default driver(s). * Checks if this APQN is or will be in use by the ap bus diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c index 0a39dfdb6a1d..6588713319ba 100644 --- a/drivers/s390/crypto/ap_card.c +++ b/drivers/s390/crypto/ap_card.c @@ -66,9 +66,9 @@ static ssize_t request_count_show(struct device *dev, u64 req_cnt; req_cnt = 0; - spin_lock_bh(&ap_list_lock); + spin_lock_bh(&ap_queues_lock); req_cnt = atomic64_read(&ac->total_request_count); - spin_unlock_bh(&ap_list_lock); + spin_unlock_bh(&ap_queues_lock); return scnprintf(buf, PAGE_SIZE, "%llu\n", req_cnt); } @@ -76,13 +76,15 @@ static ssize_t request_count_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct ap_card *ac = to_ap_card(dev); + int bkt; struct ap_queue *aq; + struct ap_card *ac = to_ap_card(dev); - spin_lock_bh(&ap_list_lock); - for_each_ap_queue(aq, ac) - aq->total_request_count = 0; - spin_unlock_bh(&ap_list_lock); + spin_lock_bh(&ap_queues_lock); + hash_for_each(ap_queues, bkt, aq, hnode) + if (ac == aq->card) + aq->total_request_count = 0; + spin_unlock_bh(&ap_queues_lock); atomic64_set(&ac->total_request_count, 0); return count; @@ -93,15 +95,17 @@ static DEVICE_ATTR_RW(request_count); static ssize_t requestq_count_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct ap_card *ac = to_ap_card(dev); + int bkt; struct ap_queue *aq; unsigned int reqq_cnt; + struct ap_card *ac = to_ap_card(dev); reqq_cnt = 0; - spin_lock_bh(&ap_list_lock); - for_each_ap_queue(aq, ac) - reqq_cnt += aq->requestq_count; - spin_unlock_bh(&ap_list_lock); + spin_lock_bh(&ap_queues_lock); + hash_for_each(ap_queues, bkt, aq, hnode) + if (ac == aq->card) + reqq_cnt += aq->requestq_count; + spin_unlock_bh(&ap_queues_lock); return scnprintf(buf, PAGE_SIZE, "%d\n", reqq_cnt); } @@ -110,15 +114,17 @@ static DEVICE_ATTR_RO(requestq_count); static ssize_t pendingq_count_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct ap_card *ac = to_ap_card(dev); + int bkt; struct ap_queue *aq; unsigned int penq_cnt; + struct ap_card *ac = to_ap_card(dev); penq_cnt = 0; - spin_lock_bh(&ap_list_lock); - for_each_ap_queue(aq, ac) - penq_cnt += aq->pendingq_count; - spin_unlock_bh(&ap_list_lock); + spin_lock_bh(&ap_queues_lock); + hash_for_each(ap_queues, bkt, aq, hnode) + if (ac == aq->card) + penq_cnt += aq->pendingq_count; + spin_unlock_bh(&ap_queues_lock); return scnprintf(buf, PAGE_SIZE, "%d\n", penq_cnt); } @@ -163,11 +169,6 @@ static void ap_card_device_release(struct device *dev) { struct ap_card *ac = to_ap_card(dev); - if (!list_empty(&ac->list)) { - spin_lock_bh(&ap_list_lock); - list_del_init(&ac->list); - spin_unlock_bh(&ap_list_lock); - } kfree(ac); } @@ -179,8 +180,6 @@ struct ap_card *ap_card_create(int id, int queue_depth, int raw_type, ac = kzalloc(sizeof(*ac), GFP_KERNEL); if (!ac) return NULL; - INIT_LIST_HEAD(&ac->list); - INIT_LIST_HEAD(&ac->queues); ac->ap_dev.device.release = ap_card_device_release; ac->ap_dev.device.type = &ap_card_type; ac->ap_dev.device_type = comp_type; diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 0eaf1d04e8df..73b077dca3e6 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -568,11 +568,10 @@ static void ap_queue_device_release(struct device *dev) { struct ap_queue *aq = to_ap_queue(dev); - if (!list_empty(&aq->list)) { - spin_lock_bh(&ap_list_lock); - list_del_init(&aq->list); - spin_unlock_bh(&ap_list_lock); - } + spin_lock_bh(&ap_queues_lock); + hash_del(&aq->hnode); + spin_unlock_bh(&ap_queues_lock); + kfree(aq); } @@ -590,7 +589,6 @@ struct ap_queue *ap_queue_create(ap_qid_t qid, int device_type) aq->state = AP_STATE_UNBOUND; aq->interrupt = AP_INTR_DISABLED; spin_lock_init(&aq->lock); - INIT_LIST_HEAD(&aq->list); INIT_LIST_HEAD(&aq->pendingq); INIT_LIST_HEAD(&aq->requestq); timer_setup(&aq->timeout, ap_request_timeout, 0); From a999eb96fdd4da488ac3085c40e20d61de26f6af Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 28 Feb 2020 10:27:22 +0100 Subject: [PATCH 206/574] s390/pci: ioremap() align with generic code Let's use the same signature and parameter names as in the generic ioremap() definition making the physical address' type explicit. Add a check against address wrap around as in the generic lib/ioremap.c:ioremap_prot() code. Finally use free_vm_area() instead of vunmap() as in the generic code. Besides being clearer free_vm_area() can also skip a few additional checks compared with vunmap(). Signed-off-by: Niklas Schnelle Reviewed-by: Gerald Schaefer Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/io.h | 2 +- arch/s390/pci/pci.c | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 5a16f500515a..da014e4f8113 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -26,7 +26,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 -void __iomem *ioremap(unsigned long offset, unsigned long size); +void __iomem *ioremap(phys_addr_t addr, size_t size); void iounmap(volatile void __iomem *addr); static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 3f6670613c57..3902c9f6f2d6 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -226,28 +226,29 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } -void __iomem *ioremap(unsigned long ioaddr, unsigned long size) +void __iomem *ioremap(phys_addr_t addr, size_t size) { + unsigned long offset, vaddr; struct vm_struct *area; - unsigned long offset; + phys_addr_t last_addr; - if (!size) + last_addr = addr + size - 1; + if (!size || last_addr < addr) return NULL; if (!static_branch_unlikely(&have_mio)) - return (void __iomem *) ioaddr; + return (void __iomem *) addr; - offset = ioaddr & ~PAGE_MASK; - ioaddr &= PAGE_MASK; + offset = addr & ~PAGE_MASK; + addr &= PAGE_MASK; size = PAGE_ALIGN(size + offset); area = get_vm_area(size, VM_IOREMAP); if (!area) return NULL; - if (ioremap_page_range((unsigned long) area->addr, - (unsigned long) area->addr + size, - ioaddr, PAGE_KERNEL)) { - vunmap(area->addr); + vaddr = (unsigned long) area->addr; + if (ioremap_page_range(vaddr, vaddr + size, addr, PAGE_KERNEL)) { + free_vm_area(area); return NULL; } return (void __iomem *) ((unsigned long) area->addr + offset); From cf51e129b96847f969bfb8af1ee1516a01a70b39 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 17 May 2020 12:20:40 -0400 Subject: [PATCH 207/574] sparc32: fix register window handling in genregs32_[gs]et() It needs access_process_vm() if the traced process does not share mm with the caller. Solution is similar to what sparc64 does. Note that genregs32_set() is only ever called with pos being 0 or 32 * sizeof(u32) (the latter - as part of PTRACE_SETREGS handling). Cc: stable@kernel.org Signed-off-by: Al Viro --- arch/sparc/kernel/ptrace_32.c | 228 +++++++++++++++------------------- 1 file changed, 98 insertions(+), 130 deletions(-) diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c index 16b50afe7b52..60f7205ebe40 100644 --- a/arch/sparc/kernel/ptrace_32.c +++ b/arch/sparc/kernel/ptrace_32.c @@ -46,82 +46,79 @@ enum sparc_regset { REGSET_FP, }; +static int regwindow32_get(struct task_struct *target, + const struct pt_regs *regs, + u32 *uregs) +{ + unsigned long reg_window = regs->u_regs[UREG_I6]; + int size = 16 * sizeof(u32); + + if (target == current) { + if (copy_from_user(uregs, (void __user *)reg_window, size)) + return -EFAULT; + } else { + if (access_process_vm(target, reg_window, uregs, size, + FOLL_FORCE) != size) + return -EFAULT; + } + return 0; +} + +static int regwindow32_set(struct task_struct *target, + const struct pt_regs *regs, + u32 *uregs) +{ + unsigned long reg_window = regs->u_regs[UREG_I6]; + int size = 16 * sizeof(u32); + + if (target == current) { + if (copy_to_user((void __user *)reg_window, uregs, size)) + return -EFAULT; + } else { + if (access_process_vm(target, reg_window, uregs, size, + FOLL_FORCE | FOLL_WRITE) != size) + return -EFAULT; + } + return 0; +} + static int genregs32_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { const struct pt_regs *regs = target->thread.kregs; - unsigned long __user *reg_window; - unsigned long *k = kbuf; - unsigned long __user *u = ubuf; - unsigned long reg; + u32 uregs[16]; + int ret; if (target == current) flush_user_windows(); - pos /= sizeof(reg); - count /= sizeof(reg); + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + regs->u_regs, + 0, 16 * sizeof(u32)); + if (ret || !count) + return ret; - if (kbuf) { - for (; count > 0 && pos < 16; count--) - *k++ = regs->u_regs[pos++]; - - reg_window = (unsigned long __user *) regs->u_regs[UREG_I6]; - reg_window -= 16; - for (; count > 0 && pos < 32; count--) { - if (get_user(*k++, ®_window[pos++])) - return -EFAULT; - } - } else { - for (; count > 0 && pos < 16; count--) { - if (put_user(regs->u_regs[pos++], u++)) - return -EFAULT; - } - - reg_window = (unsigned long __user *) regs->u_regs[UREG_I6]; - reg_window -= 16; - for (; count > 0 && pos < 32; count--) { - if (get_user(reg, ®_window[pos++]) || - put_user(reg, u++)) - return -EFAULT; - } - } - while (count > 0) { - switch (pos) { - case 32: /* PSR */ - reg = regs->psr; - break; - case 33: /* PC */ - reg = regs->pc; - break; - case 34: /* NPC */ - reg = regs->npc; - break; - case 35: /* Y */ - reg = regs->y; - break; - case 36: /* WIM */ - case 37: /* TBR */ - reg = 0; - break; - default: - goto finish; - } - - if (kbuf) - *k++ = reg; - else if (put_user(reg, u++)) + if (pos < 32 * sizeof(u32)) { + if (regwindow32_get(target, regs, uregs)) return -EFAULT; - pos++; - count--; + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + uregs, + 16 * sizeof(u32), 32 * sizeof(u32)); + if (ret || !count) + return ret; } -finish: - pos *= sizeof(reg); - count *= sizeof(reg); - return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, - 38 * sizeof(reg), -1); + uregs[0] = regs->psr; + uregs[1] = regs->pc; + uregs[2] = regs->npc; + uregs[3] = regs->y; + uregs[4] = 0; /* WIM */ + uregs[5] = 0; /* TBR */ + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + uregs, + 32 * sizeof(u32), 38 * sizeof(u32)); } static int genregs32_set(struct task_struct *target, @@ -130,82 +127,53 @@ static int genregs32_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { struct pt_regs *regs = target->thread.kregs; - unsigned long __user *reg_window; - const unsigned long *k = kbuf; - const unsigned long __user *u = ubuf; - unsigned long reg; + u32 uregs[16]; + u32 psr; + int ret; if (target == current) flush_user_windows(); - pos /= sizeof(reg); - count /= sizeof(reg); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + regs->u_regs, + 0, 16 * sizeof(u32)); + if (ret || !count) + return ret; - if (kbuf) { - for (; count > 0 && pos < 16; count--) - regs->u_regs[pos++] = *k++; - - reg_window = (unsigned long __user *) regs->u_regs[UREG_I6]; - reg_window -= 16; - for (; count > 0 && pos < 32; count--) { - if (put_user(*k++, ®_window[pos++])) - return -EFAULT; - } - } else { - for (; count > 0 && pos < 16; count--) { - if (get_user(reg, u++)) - return -EFAULT; - regs->u_regs[pos++] = reg; - } - - reg_window = (unsigned long __user *) regs->u_regs[UREG_I6]; - reg_window -= 16; - for (; count > 0 && pos < 32; count--) { - if (get_user(reg, u++) || - put_user(reg, ®_window[pos++])) - return -EFAULT; - } - } - while (count > 0) { - unsigned long psr; - - if (kbuf) - reg = *k++; - else if (get_user(reg, u++)) + if (pos < 32 * sizeof(u32)) { + if (regwindow32_get(target, regs, uregs)) return -EFAULT; - - switch (pos) { - case 32: /* PSR */ - psr = regs->psr; - psr &= ~(PSR_ICC | PSR_SYSCALL); - psr |= (reg & (PSR_ICC | PSR_SYSCALL)); - regs->psr = psr; - break; - case 33: /* PC */ - regs->pc = reg; - break; - case 34: /* NPC */ - regs->npc = reg; - break; - case 35: /* Y */ - regs->y = reg; - break; - case 36: /* WIM */ - case 37: /* TBR */ - break; - default: - goto finish; - } - - pos++; - count--; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + uregs, + 16 * sizeof(u32), 32 * sizeof(u32)); + if (ret) + return ret; + if (regwindow32_set(target, regs, uregs)) + return -EFAULT; + if (!count) + return 0; } -finish: - pos *= sizeof(reg); - count *= sizeof(reg); - + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &psr, + 32 * sizeof(u32), 33 * sizeof(u32)); + if (ret) + return ret; + regs->psr = (regs->psr & ~(PSR_ICC | PSR_SYSCALL)) | + (psr & (PSR_ICC | PSR_SYSCALL)); + if (!count) + return 0; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + ®s->pc, + 33 * sizeof(u32), 34 * sizeof(u32)); + if (ret || !count) + return ret; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + ®s->y, + 34 * sizeof(u32), 35 * sizeof(u32)); + if (ret || !count) + return ret; return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - 38 * sizeof(reg), -1); + 35 * sizeof(u32), 38 * sizeof(u32)); } static int fpregs32_get(struct task_struct *target, From 7dcef3988eedbfb40e7e95a821966a029a5a465b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 May 2020 15:07:05 +0300 Subject: [PATCH 208/574] remoteproc: Fix an error code in devm_rproc_alloc() The comments say that this function should return NULL on error and the caller expects NULL returns as well so I have modified the code to match. Returning an ERR_PTR(-ENOMEM) would lead to an OOps. Reviewed-by: Paul Cercueil Reviewed-by: Bjorn Andersson Fixes: 305ac5a766b1 ("remoteproc: Add device-managed variants of rproc_alloc/rproc_add") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20200520120705.GH172354@mwanda Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 0cc015fabf78..9f04c30c4aaf 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2297,7 +2297,7 @@ struct rproc *devm_rproc_alloc(struct device *dev, const char *name, ptr = devres_alloc(devm_rproc_free, sizeof(*ptr), GFP_KERNEL); if (!ptr) - return ERR_PTR(-ENOMEM); + return NULL; rproc = rproc_alloc(dev, name, ops, firmware, len); if (rproc) { From 0299a1a81ca056e79c1a7fb751f936ec0d5c7afe Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 19 May 2020 19:54:46 +0200 Subject: [PATCH 209/574] iommu/arm-smmu-v3: Manage ASIDs with xarray In preparation for sharing some ASIDs with the CPU, use a global xarray to store ASIDs and their context. ASID#0 is now reserved, and the ASID space is global. Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20200519175502.2504091-9-jean-philippe@linaro.org Signed-off-by: Will Deacon --- drivers/iommu/arm-smmu-v3.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 5eec8ebdd4b5..8a908c50c306 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -667,7 +667,6 @@ struct arm_smmu_device { #define ARM_SMMU_MAX_ASIDS (1 << 16) unsigned int asid_bits; - DECLARE_BITMAP(asid_map, ARM_SMMU_MAX_ASIDS); #define ARM_SMMU_MAX_VMIDS (1 << 16) unsigned int vmid_bits; @@ -727,6 +726,8 @@ struct arm_smmu_option_prop { const char *prop; }; +static DEFINE_XARRAY_ALLOC1(asid_xa); + static struct arm_smmu_option_prop arm_smmu_options[] = { { ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" }, { ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"}, @@ -1765,6 +1766,14 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_domain *smmu_domain) cdcfg->cdtab = NULL; } +static void arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd) +{ + if (!cd->asid) + return; + + xa_erase(&asid_xa, cd->asid); +} + /* Stream table manipulation functions */ static void arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc) @@ -2450,10 +2459,9 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; - if (cfg->cdcfg.cdtab) { + if (cfg->cdcfg.cdtab) arm_smmu_free_cd_tables(smmu_domain); - arm_smmu_bitmap_free(smmu->asid_map, cfg->cd.asid); - } + arm_smmu_free_asid(&cfg->cd); } else { struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; if (cfg->vmid) @@ -2468,14 +2476,15 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, struct io_pgtable_cfg *pgtbl_cfg) { int ret; - int asid; + u32 asid; struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr; - asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits); - if (asid < 0) - return asid; + ret = xa_alloc(&asid_xa, &asid, &cfg->cd, + XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); + if (ret) + return ret; cfg->s1cdmax = master->ssid_bits; @@ -2508,7 +2517,7 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, out_free_cd_tables: arm_smmu_free_cd_tables(smmu_domain); out_free_asid: - arm_smmu_bitmap_free(smmu->asid_map, asid); + arm_smmu_free_asid(&cfg->cd); return ret; } From 61016db15b8e20f371352db6a75b044ec3183fe7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 17 May 2020 21:47:43 -0700 Subject: [PATCH 210/574] selftests/exec: Verify execve of non-regular files fail Add a named pipe as an exec target to make sure that non-regular files are rejected by execve() with EACCES. This can help verify commit 73601ea5b7b1 ("fs/open.c: allow opening only regular files during execve()"). Signed-off-by: Kees Cook Signed-off-by: Shuah Khan --- tools/testing/selftests/exec/.gitignore | 1 + tools/testing/selftests/exec/Makefile | 2 +- tools/testing/selftests/exec/execveat.c | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore index c078ece12ff0..94b02a18f230 100644 --- a/tools/testing/selftests/exec/.gitignore +++ b/tools/testing/selftests/exec/.gitignore @@ -9,3 +9,4 @@ execveat.ephemeral execveat.denatured /recursion-depth xxxxxxxx* +pipe diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 33339e31e365..cfafa1f8a2fa 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -4,7 +4,7 @@ CFLAGS += -Wno-nonnull CFLAGS += -D_GNU_SOURCE TEST_GEN_PROGS := execveat -TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir +TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir pipe # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index cbb6efbdb786..67bf7254a48f 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -5,7 +5,9 @@ * Selftests for execveat(2). */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE /* to get O_PATH, AT_EMPTY_PATH */ +#endif #include #include #include @@ -311,6 +313,10 @@ static int run_tests(void) fail += check_execveat_fail(AT_FDCWD, fullname_symlink, AT_SYMLINK_NOFOLLOW, ELOOP); + /* Non-regular file failure */ + fail += check_execveat_fail(dot_dfd, "pipe", 0, EACCES); + unlink("pipe"); + /* Shell script wrapping executable file: */ /* dfd + path */ fail += check_execveat(subdir_dfd, "../script", 0); @@ -384,6 +390,8 @@ static void prerequisites(void) fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755); write(fd, script, strlen(script)); close(fd); + + mkfifo("pipe", 0755); } int main(int argc, char **argv) From babf8a978d497af2fe4111cc80866b9e436bf785 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 21 May 2020 21:37:05 +0100 Subject: [PATCH 211/574] selftests: vdso: Rename vdso_test to vdso_test_gettimeofday Currently the vDSO kselftests have a test called vdso_test which tests the vDSO implementation of gettimeofday(). In preparation for adding tests for other vDSO functionality rename this test to reflect what's going on. Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/.gitignore | 1 + tools/testing/selftests/vDSO/Makefile | 4 ++-- .../selftests/vDSO/{vdso_test.c => vdso_test_gettimeofday.c} | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) rename tools/testing/selftests/vDSO/{vdso_test.c => vdso_test_gettimeofday.c} (89%) diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 382cfb39a1a3..74f49bd5f6dd 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only vdso_test +vdso_test_gettimeofday vdso_standalone_test_x86 diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 9e03d61f52fd..ae15d700b62e 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -4,7 +4,7 @@ include ../lib.mk uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -TEST_GEN_PROGS := $(OUTPUT)/vdso_test +TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday ifeq ($(ARCH),x86) TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 endif @@ -17,7 +17,7 @@ LDLIBS += -lgcc_s endif all: $(TEST_GEN_PROGS) -$(OUTPUT)/vdso_test: parse_vdso.c vdso_test.c +$(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \ vdso_standalone_test_x86.c parse_vdso.c \ diff --git a/tools/testing/selftests/vDSO/vdso_test.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c similarity index 89% rename from tools/testing/selftests/vDSO/vdso_test.c rename to tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index 719d5a6bd664..511c0dc5e47e 100644 --- a/tools/testing/selftests/vDSO/vdso_test.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -1,10 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vdso_test.c: Sample code to test parse_vdso.c + * vdso_test_gettimeofday.c: Sample code to test parse_vdso.c and + * vDSO gettimeofday() * Copyright (c) 2014 Andy Lutomirski * * Compile with: - * gcc -std=gnu99 vdso_test.c parse_vdso.c + * gcc -std=gnu99 vdso_test_gettimeofday.c parse_vdso_gettimeofday.c * * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too. */ From d9aeccec85ec39c4da90cc4b899cbd1961416a0d Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 21 May 2020 16:30:19 +0800 Subject: [PATCH 212/574] drm/msm/a6xx: a6xx_hfi_send_start() can be static Fixes: 8167e6fa76c8 ("drm/msm/a6xx: HFI v2 for A640 and A650") Signed-off-by: kbuild test robot Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c index f9db69e77121..9921e632f1ca 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c @@ -342,7 +342,7 @@ static int a6xx_hfi_send_test(struct a6xx_gmu *gmu) NULL, 0); } -int a6xx_hfi_send_start(struct a6xx_gmu *gmu) +static int a6xx_hfi_send_start(struct a6xx_gmu *gmu) { struct a6xx_hfi_msg_start msg = { 0 }; @@ -350,7 +350,7 @@ int a6xx_hfi_send_start(struct a6xx_gmu *gmu) NULL, 0); } -int a6xx_hfi_send_core_fw_start(struct a6xx_gmu *gmu) +static int a6xx_hfi_send_core_fw_start(struct a6xx_gmu *gmu) { struct a6xx_hfi_msg_core_fw_start msg = { 0 }; From 7d4eedb03fc754625e0e0c482f42bfeabce685ce Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 21 May 2020 15:11:13 +0800 Subject: [PATCH 213/574] drm/msm/dpu: dpu_setup_dspp_pcc() can be static Fixes: 4259ff7ae509 ("drm/msm/dpu: add support for pcc color block in dpu driver") Signed-off-by: kbuild test robot Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c index b5189cece3c6..a7a24539921f 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dspp.c @@ -22,7 +22,7 @@ #define PCC_BLUE_G_OFF 0x24 #define PCC_BLUE_B_OFF 0x30 -void dpu_setup_dspp_pcc(struct dpu_hw_dspp *ctx, +static void dpu_setup_dspp_pcc(struct dpu_hw_dspp *ctx, struct dpu_hw_pcc_cfg *cfg) { From cd76ca4dd63768385fb81927fdc3087ad3bfeefb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 22 May 2020 17:21:38 +0100 Subject: [PATCH 214/574] selftests: vdso: Use a header file to prototype parse_vdso API Both vdso_test_gettimeofday and vdso_standalone_test_x86 use the library in parse_vdso.c but each separately declares the API it offers which is not ideal. Create a header file with prototypes of the functions and use it in both the library and the tests to ensure that the same prototypes are used throughout. Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/parse_vdso.c | 24 +------------- tools/testing/selftests/vDSO/parse_vdso.h | 31 +++++++++++++++++++ .../selftests/vDSO/vdso_standalone_test_x86.c | 4 +-- .../selftests/vDSO/vdso_test_gettimeofday.c | 5 +-- 4 files changed, 34 insertions(+), 30 deletions(-) create mode 100644 tools/testing/selftests/vDSO/parse_vdso.h diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 1dbb4b87268f..413f75620a35 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -21,29 +21,7 @@ #include #include -/* - * To use this vDSO parser, first call one of the vdso_init_* functions. - * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR - * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv. - * Then call vdso_sym for each symbol you want. For example, to look up - * gettimeofday on x86_64, use: - * - * = vdso_sym("LINUX_2.6", "gettimeofday"); - * or - * = vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); - * - * vdso_sym will return 0 if the symbol doesn't exist or if the init function - * failed or was not called. vdso_sym is a little slow, so its return value - * should be cached. - * - * vdso_sym is threadsafe; the init functions are not. - * - * These are the prototypes: - */ -extern void vdso_init_from_auxv(void *auxv); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void *vdso_sym(const char *version, const char *name); - +#include "parse_vdso.h" /* And here's the code. */ #ifndef ELF_BITS diff --git a/tools/testing/selftests/vDSO/parse_vdso.h b/tools/testing/selftests/vDSO/parse_vdso.h new file mode 100644 index 000000000000..de0453067d7c --- /dev/null +++ b/tools/testing/selftests/vDSO/parse_vdso.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef PARSE_VDSO_H +#define PARSE_VDSO_H + +#include + +/* + * To use this vDSO parser, first call one of the vdso_init_* functions. + * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR + * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv. + * Then call vdso_sym for each symbol you want. For example, to look up + * gettimeofday on x86_64, use: + * + * = vdso_sym("LINUX_2.6", "gettimeofday"); + * or + * = vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); + * + * vdso_sym will return 0 if the symbol doesn't exist or if the init function + * failed or was not called. vdso_sym is a little slow, so its return value + * should be cached. + * + * vdso_sym is threadsafe; the init functions are not. + * + * These are the prototypes: + */ +void *vdso_sym(const char *version, const char *name); +void vdso_init_from_sysinfo_ehdr(uintptr_t base); +void vdso_init_from_auxv(void *auxv); + +#endif diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c index 5ac4b00acfbc..8a44ff973ee1 100644 --- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c +++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c @@ -16,9 +16,7 @@ #include #include -extern void *vdso_sym(const char *version, const char *name); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void vdso_init_from_auxv(void *auxv); +#include "parse_vdso.h" /* We need a libc functions... */ int strcmp(const char *a, const char *b) diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index 511c0dc5e47e..8ccc73ed8240 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -17,10 +17,7 @@ #include #include "../kselftest.h" - -extern void *vdso_sym(const char *version, const char *name); -extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); -extern void vdso_init_from_auxv(void *auxv); +#include "parse_vdso.h" /* * ARM64's vDSO exports its gettimeofday() implementation with a different From 2e9a97256616f2f7280f8bed8c98e57bfd745a4d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 22 May 2020 17:21:39 +0100 Subject: [PATCH 215/574] selftests: vdso: Add a selftest for vDSO getcpu() Provide a very basic selftest for getcpu() which similarly to our existing test for gettimeofday() looks up the function in the vDSO and prints the results it gets if the function exists and succeeds. Signed-off-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/.gitignore | 1 + tools/testing/selftests/vDSO/Makefile | 3 +- .../testing/selftests/vDSO/vdso_test_getcpu.c | 54 +++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/vDSO/vdso_test_getcpu.c diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 74f49bd5f6dd..5eb64d41e541 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only vdso_test vdso_test_gettimeofday +vdso_test_getcpu vdso_standalone_test_x86 diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index ae15d700b62e..0069f2f83f86 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -4,7 +4,7 @@ include ../lib.mk uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) -TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday +TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu ifeq ($(ARCH),x86) TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86 endif @@ -18,6 +18,7 @@ endif all: $(TEST_GEN_PROGS) $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c +$(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \ vdso_standalone_test_x86.c parse_vdso.c \ diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c new file mode 100644 index 000000000000..fc25ede131b8 --- /dev/null +++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vdso_test_getcpu.c: Sample code to test parse_vdso.c and vDSO getcpu() + * + * Copyright (c) 2020 Arm Ltd + */ + +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "parse_vdso.h" + +const char *version = "LINUX_2.6"; +const char *name = "__vdso_getcpu"; + +struct getcpu_cache; +typedef long (*getcpu_t)(unsigned int *, unsigned int *, + struct getcpu_cache *); + +int main(int argc, char **argv) +{ + unsigned long sysinfo_ehdr; + unsigned int cpu, node; + getcpu_t get_cpu; + long ret; + + sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { + printf("AT_SYSINFO_EHDR is not present!\n"); + return KSFT_SKIP; + } + + vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); + + get_cpu = (getcpu_t)vdso_sym(version, name); + if (!get_cpu) { + printf("Could not find %s\n", name); + return KSFT_SKIP; + } + + ret = get_cpu(&cpu, &node, 0); + if (ret == 0) { + printf("Running on CPU %u node %u\n", cpu, node); + } else { + printf("%s failed\n", name); + return KSFT_FAIL; + } + + return 0; +} From 5627f9cffee73dca9762135b6a8c7ab7213f31e2 Mon Sep 17 00:00:00 2001 From: Nikita Sobolev Date: Thu, 21 May 2020 17:43:44 +0300 Subject: [PATCH 216/574] Kernel selftests: Add check if TPM devices are supported TPM2 tests set uses /dev/tpm0 and /dev/tpmrm0 without check if they are available. In case, when these devices are not available test fails, but expected behaviour is skipped test. Signed-off-by: Nikita Sobolev Reviewed-by: Jarkko Sakkinen Reviewed-by: Petr Vorel Signed-off-by: Shuah Khan --- tools/testing/selftests/tpm2/test_smoke.sh | 5 +++++ tools/testing/selftests/tpm2/test_space.sh | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tools/testing/selftests/tpm2/test_smoke.sh b/tools/testing/selftests/tpm2/test_smoke.sh index 8155c2ea7ccb..663062701d5a 100755 --- a/tools/testing/selftests/tpm2/test_smoke.sh +++ b/tools/testing/selftests/tpm2/test_smoke.sh @@ -1,6 +1,11 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +[ -f /dev/tpm0 ] || exit $ksft_skip + python -m unittest -v tpm2_tests.SmokeTest python -m unittest -v tpm2_tests.AsyncTest diff --git a/tools/testing/selftests/tpm2/test_space.sh b/tools/testing/selftests/tpm2/test_space.sh index a6f5e346635e..36c9d030a1c6 100755 --- a/tools/testing/selftests/tpm2/test_space.sh +++ b/tools/testing/selftests/tpm2/test_space.sh @@ -1,4 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +[ -f /dev/tpmrm0 ] || exit $ksft_skip + python -m unittest -v tpm2_tests.SpaceTest From 558ae0355a91c7d28fdf4c0011bee6ebb5118632 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 21 May 2020 00:52:52 -0700 Subject: [PATCH 217/574] selftests/timens: handle a case when alarm clocks are not supported This can happen if a testing node doesn't have RTC (real time clock) hardware or it doesn't support alarms. Fixes: 61c57676035d ("selftests/timens: Add Time Namespace test for supported clocks") Acked-by: Vincenzo Frascino Reported-by: Vincenzo Frascino Signed-off-by: Andrei Vagin Signed-off-by: Shuah Khan --- tools/testing/selftests/timens/clock_nanosleep.c | 2 +- tools/testing/selftests/timens/timens.c | 2 +- tools/testing/selftests/timens/timens.h | 13 ++++++++++++- tools/testing/selftests/timens/timer.c | 5 +++++ tools/testing/selftests/timens/timerfd.c | 5 +++++ 5 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/timens/clock_nanosleep.c b/tools/testing/selftests/timens/clock_nanosleep.c index 8e7b7c72ef65..72d41b955fb2 100644 --- a/tools/testing/selftests/timens/clock_nanosleep.c +++ b/tools/testing/selftests/timens/clock_nanosleep.c @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) ksft_set_plan(4); - check_config_posix_timers(); + check_supported_timers(); if (unshare_timens()) return 1; diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c index 098be7c83be3..52b6a1185f52 100644 --- a/tools/testing/selftests/timens/timens.c +++ b/tools/testing/selftests/timens/timens.c @@ -155,7 +155,7 @@ int main(int argc, char *argv[]) nscheck(); - check_config_posix_timers(); + check_supported_timers(); ksft_set_plan(ARRAY_SIZE(clocks) * 2); diff --git a/tools/testing/selftests/timens/timens.h b/tools/testing/selftests/timens/timens.h index e09e7e39bc52..d4fc52d47146 100644 --- a/tools/testing/selftests/timens/timens.h +++ b/tools/testing/selftests/timens/timens.h @@ -14,15 +14,26 @@ #endif static int config_posix_timers = true; +static int config_alarm_timers = true; -static inline void check_config_posix_timers(void) +static inline void check_supported_timers(void) { + struct timespec ts; + if (timer_create(-1, 0, 0) == -1 && errno == ENOSYS) config_posix_timers = false; + + if (clock_gettime(CLOCK_BOOTTIME_ALARM, &ts) == -1 && errno == EINVAL) + config_alarm_timers = false; } static inline bool check_skip(int clockid) { + if (!config_alarm_timers && clockid == CLOCK_BOOTTIME_ALARM) { + ksft_test_result_skip("CLOCK_BOOTTIME_ALARM isn't supported\n"); + return true; + } + if (config_posix_timers) return false; diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c index 96dba11ebe44..5e7f0051bd7b 100644 --- a/tools/testing/selftests/timens/timer.c +++ b/tools/testing/selftests/timens/timer.c @@ -22,6 +22,9 @@ int run_test(int clockid, struct timespec now) timer_t fd; int i; + if (check_skip(clockid)) + return 0; + for (i = 0; i < 2; i++) { struct sigevent sevp = {.sigev_notify = SIGEV_NONE}; int flags = 0; @@ -74,6 +77,8 @@ int main(int argc, char *argv[]) nscheck(); + check_supported_timers(); + ksft_set_plan(3); clock_gettime(CLOCK_MONOTONIC, &mtime_now); diff --git a/tools/testing/selftests/timens/timerfd.c b/tools/testing/selftests/timens/timerfd.c index eff1ec5ff215..9edd43d6b2c1 100644 --- a/tools/testing/selftests/timens/timerfd.c +++ b/tools/testing/selftests/timens/timerfd.c @@ -28,6 +28,9 @@ int run_test(int clockid, struct timespec now) long long elapsed; int fd, i; + if (check_skip(clockid)) + return 0; + if (tclock_gettime(clockid, &now)) return pr_perror("clock_gettime(%d)", clockid); @@ -81,6 +84,8 @@ int main(int argc, char *argv[]) nscheck(); + check_supported_timers(); + ksft_set_plan(3); clock_gettime(CLOCK_MONOTONIC, &mtime_now); From c4714b0045ac74f3b578851e312f9fbccfb382db Mon Sep 17 00:00:00 2001 From: Lothar Rubusch Date: Wed, 15 Apr 2020 20:16:53 +0000 Subject: [PATCH 218/574] Documentation: test.h - fix warnings Fix warnings at 'make htmldocs', and formatting issues in the resulting documentation. - test.h: Fix annotation in kernel-doc parameter description. - Documentation/*.rst: Fixing formatting issues, and a duplicate label issue due to usage of sphinx.ext.autosectionlabel and identical labels within one document (sphinx warning) Signed-off-by: Lothar Rubusch Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- Documentation/dev-tools/kunit/start.rst | 13 ++++++++----- Documentation/dev-tools/kunit/usage.rst | 4 ++-- include/kunit/test.h | 12 ++++++------ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst index e1c5ce80ce12..bb112cf70624 100644 --- a/Documentation/dev-tools/kunit/start.rst +++ b/Documentation/dev-tools/kunit/start.rst @@ -32,15 +32,17 @@ test targets as well. The ``.kunitconfig`` should also contain any other config options required by the tests. A good starting point for a ``.kunitconfig`` is the KUnit defconfig: + .. code-block:: bash cd $PATH_TO_LINUX_REPO cp arch/um/configs/kunit_defconfig .kunitconfig You can then add any other Kconfig options you wish, e.g.: + .. code-block:: none - CONFIG_LIST_KUNIT_TEST=y + CONFIG_LIST_KUNIT_TEST=y :doc:`kunit_tool ` will ensure that all config options set in ``.kunitconfig`` are set in the kernel ``.config`` before running the tests. @@ -54,8 +56,8 @@ using. other tools (such as make menuconfig) to adjust other config options. -Running the tests ------------------ +Running the tests (KUnit Wrapper) +--------------------------------- To make sure that everything is set up correctly, simply invoke the Python wrapper from your kernel repo: @@ -105,8 +107,9 @@ have config options ending in ``_KUNIT_TEST``. KUnit and KUnit tests can be compiled as modules: in this case the tests in a module will be run when the module is loaded. -Running the tests ------------------ + +Running the tests (w/o KUnit Wrapper) +------------------------------------- Build and run your kernel as usual. Test output will be written to the kernel log in `TAP `_ format. diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index 473a2361ec37..3c3fe8b5fecc 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -595,7 +595,7 @@ able to run one test case per invocation. KUnit debugfs representation ============================ When kunit test suites are initialized, they create an associated directory -in /sys/kernel/debug/kunit/. The directory contains one file +in ``/sys/kernel/debug/kunit/``. The directory contains one file - results: "cat results" displays results of each test case and the results of the entire suite for the last test run. @@ -604,4 +604,4 @@ The debugfs representation is primarily of use when kunit test suites are run in a native environment, either as modules or builtin. Having a way to display results like this is valuable as otherwise results can be intermixed with other events in dmesg output. The maximum size of each -results file is KUNIT_LOG_SIZE bytes (defined in include/kunit/test.h). +results file is KUNIT_LOG_SIZE bytes (defined in ``include/kunit/test.h``). diff --git a/include/kunit/test.h b/include/kunit/test.h index 9b0c46a6ca1f..47e61e1d5337 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -175,7 +175,7 @@ struct kunit_suite { void (*exit)(struct kunit *test); struct kunit_case *test_cases; - /* private - internal use only */ + /* private: internal use only */ struct dentry *debugfs; char *log; }; @@ -232,12 +232,12 @@ void __kunit_test_suites_exit(struct kunit_suite **suites); * kunit_test_suites() - used to register one or more &struct kunit_suite * with KUnit. * - * @suites: a statically allocated list of &struct kunit_suite. + * @suites_list...: a statically allocated list of &struct kunit_suite. * - * Registers @suites with the test framework. See &struct kunit_suite for + * Registers @suites_list with the test framework. See &struct kunit_suite for * more information. * - * When builtin, KUnit tests are all run as late_initcalls; this means + * When builtin, KUnit tests are all run as late_initcalls; this means * that they cannot test anything where tests must run at a different init * phase. One significant restriction resulting from this is that KUnit * cannot reliably test anything that is initialize in the late_init phase; @@ -253,8 +253,8 @@ void __kunit_test_suites_exit(struct kunit_suite **suites); * tests from the same place, and at the very least to do so after * everything else is definitely initialized. */ -#define kunit_test_suites(...) \ - static struct kunit_suite *suites[] = { __VA_ARGS__, NULL}; \ +#define kunit_test_suites(suites_list...) \ + static struct kunit_suite *suites[] = {suites_list, NULL}; \ static int kunit_test_suites_init(void) \ { \ return __kunit_test_suites_init(suites); \ From ddbd60c779b4ddaa87173a160ce90146933fb8f9 Mon Sep 17 00:00:00 2001 From: Vitor Massaru Iha Date: Tue, 14 Apr 2020 20:09:50 -0300 Subject: [PATCH 219/574] kunit: use --build_dir=.kunit as default To make KUnit easier to use, and to avoid overwriting object and .config files, the default KUnit build directory is set to .kunit * Related bug: https://bugzilla.kernel.org/show_bug.cgi?id=205221 Fixed up minor merge conflicts - Shuah Khan Signed-off-by: Vitor Massaru Iha Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index b01838b6f5f9..f3a1803bd71f 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -169,7 +169,7 @@ def add_common_opts(parser): parser.add_argument('--build_dir', help='As in the make command, it specifies the build ' 'directory.', - type=str, default='', metavar='build_dir') + type=str, default='.kunit', metavar='build_dir') parser.add_argument('--make_options', help='X=Y make option, can be repeated.', action='append') @@ -245,12 +245,11 @@ def main(argv, linux=None): cli_args = parser.parse_args(argv) if cli_args.subcommand == 'run': - if cli_args.build_dir: - if not os.path.exists(cli_args.build_dir): - os.mkdir(cli_args.build_dir) - kunit_kernel.kunitconfig_path = os.path.join( - cli_args.build_dir, - kunit_kernel.kunitconfig_path) + if not os.path.exists(cli_args.build_dir): + os.mkdir(cli_args.build_dir) + kunit_kernel.kunitconfig_path = os.path.join( + cli_args.build_dir, + kunit_kernel.kunitconfig_path) if not linux: linux = kunit_kernel.LinuxSourceTree() From 9bdf64b35117cc10813d24e1842cd8ee40ecbf19 Mon Sep 17 00:00:00 2001 From: Vitor Massaru Iha Date: Tue, 14 Apr 2020 20:37:53 -0300 Subject: [PATCH 220/574] kunit: use KUnit defconfig by default To improve the usability of KUnit, defconfig is used by default if no kunitconfig is present. * https://bugzilla.kernel.org/show_bug.cgi?id=205259 Fixed up minor merge conflicts - Shuah Khan Signed-off-by: Vitor Massaru Iha Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index f3a1803bd71f..ec73b07d1265 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -32,8 +32,8 @@ KunitExecRequest = namedtuple('KunitExecRequest', KunitParseRequest = namedtuple('KunitParseRequest', ['raw_output', 'input_data']) KunitRequest = namedtuple('KunitRequest', ['raw_output','timeout', 'jobs', - 'build_dir', 'defconfig', - 'alltests', 'make_options']) + 'build_dir', 'alltests', + 'make_options']) KernelDirectoryPath = sys.argv[0].split('tools/testing/kunit/')[0] @@ -60,8 +60,7 @@ def config_tests(linux: kunit_kernel.LinuxSourceTree, kunit_parser.print_with_timestamp('Configuring KUnit Kernel ...') config_start = time.time() - if request.defconfig: - create_default_kunitconfig() + create_default_kunitconfig() success = linux.build_reconfig(request.build_dir, request.make_options) config_end = time.time() if not success: @@ -177,11 +176,6 @@ def add_common_opts(parser): help='Run all KUnit tests through allyesconfig', action='store_true') -def add_config_opts(parser): - parser.add_argument('--defconfig', - help='Uses a default .kunitconfig.', - action='store_true') - def add_build_opts(parser): parser.add_argument('--jobs', help='As in the make command, "Specifies the number of ' @@ -210,7 +204,6 @@ def main(argv, linux=None): # The 'run' command will config, build, exec, and parse in one go. run_parser = subparser.add_parser('run', help='Runs KUnit tests.') add_common_opts(run_parser) - add_config_opts(run_parser) add_build_opts(run_parser) add_exec_opts(run_parser) add_parse_opts(run_parser) @@ -258,7 +251,6 @@ def main(argv, linux=None): cli_args.timeout, cli_args.jobs, cli_args.build_dir, - cli_args.defconfig, cli_args.alltests, cli_args.make_options) result = run_tests(linux, request) From 52da6d513183cf543df6efc95bf504aee0da70d6 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Fri, 22 May 2020 16:03:14 -0600 Subject: [PATCH 221/574] drm/msm: Attach the IOMMU device during initialization Everywhere an IOMMU object is created by msm_gpu_create_address_space the IOMMU device is attached immediately after. Instead of carrying around the infrastructure to do the attach from the device specific code do it directly in the msm_iommu_init() function. This gets it out of the way for more aggressive cleanups that follow. Reviewed-by: Rob Clark Signed-off-by: Jordan Crouse Tested-by: Shawn Guo [squash in rebase fixups and fix for unused fxn] Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 7 ------- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 8 -------- drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c | 4 ---- drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c | 7 ------- drivers/gpu/drm/msm/msm_gem_vma.c | 23 +++++++++++++++++++---- drivers/gpu/drm/msm/msm_gpu.c | 11 +---------- drivers/gpu/drm/msm/msm_gpummu.c | 6 ------ drivers/gpu/drm/msm/msm_iommu.c | 15 +++++++-------- drivers/gpu/drm/msm/msm_mmu.h | 1 - 9 files changed, 27 insertions(+), 55 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 8c07850728bc..0c72ae84e279 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -1114,7 +1114,6 @@ static int a6xx_gmu_memory_alloc(struct a6xx_gmu *gmu, struct a6xx_gmu_bo *bo, static int a6xx_gmu_memory_probe(struct a6xx_gmu *gmu) { struct iommu_domain *domain; - int ret; domain = iommu_domain_alloc(&platform_bus_type); if (!domain) @@ -1129,12 +1128,6 @@ static int a6xx_gmu_memory_probe(struct a6xx_gmu *gmu) return PTR_ERR(gmu->aspace); } - ret = gmu->aspace->mmu->funcs->attach(gmu->aspace->mmu); - if (ret) { - msm_gem_address_space_put(gmu->aspace); - return ret; - } - return 0; } diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c index 10c33cf7a5d7..c5150c67a3b8 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c @@ -794,7 +794,6 @@ static int _dpu_kms_mmu_init(struct dpu_kms *dpu_kms) { struct iommu_domain *domain; struct msm_gem_address_space *aspace; - int ret; domain = iommu_domain_alloc(&platform_bus_type); if (!domain) @@ -810,13 +809,6 @@ static int _dpu_kms_mmu_init(struct dpu_kms *dpu_kms) return PTR_ERR(aspace); } - ret = aspace->mmu->funcs->attach(aspace->mmu); - if (ret) { - DPU_ERROR("failed to attach iommu %d\n", ret); - msm_gem_address_space_put(aspace); - return ret; - } - dpu_kms->base.aspace = aspace; return 0; } diff --git a/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c index dda05436f716..9dba37c6344f 100644 --- a/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c +++ b/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c @@ -518,10 +518,6 @@ struct msm_kms *mdp4_kms_init(struct drm_device *dev) } kms->aspace = aspace; - - ret = aspace->mmu->funcs->attach(aspace->mmu); - if (ret) - goto fail; } else { DRM_DEV_INFO(dev->dev, "no iommu, fallback to phys " "contig buffers for scanout\n"); diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c index c23a2fa13fb9..fe18656848ce 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c @@ -644,13 +644,6 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev) } kms->aspace = aspace; - - ret = aspace->mmu->funcs->attach(aspace->mmu); - if (ret) { - DRM_DEV_ERROR(&pdev->dev, "failed to attach iommu: %d\n", - ret); - goto fail; - } } else { DRM_DEV_INFO(&pdev->dev, "no iommu, fallback to phys contig buffers for scanout\n"); diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 407b7ab82818..72fc1980fff6 100644 --- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -133,8 +133,8 @@ msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, const char *name) { struct msm_gem_address_space *aspace; - u64 size = domain->geometry.aperture_end - - domain->geometry.aperture_start; + u64 start = domain->geometry.aperture_start; + u64 size = domain->geometry.aperture_end - start; aspace = kzalloc(sizeof(*aspace), GFP_KERNEL); if (!aspace) @@ -143,9 +143,18 @@ msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, spin_lock_init(&aspace->lock); aspace->name = name; aspace->mmu = msm_iommu_new(dev, domain); + if (IS_ERR(aspace->mmu)) { + int ret = PTR_ERR(aspace->mmu); - drm_mm_init(&aspace->mm, (domain->geometry.aperture_start >> PAGE_SHIFT), - size >> PAGE_SHIFT); + kfree(aspace); + return ERR_PTR(ret); + } + + /* + * Attaching the IOMMU device changes the aperture values so use the + * cached values instead + */ + drm_mm_init(&aspace->mm, start >> PAGE_SHIFT, size >> PAGE_SHIFT); kref_init(&aspace->kref); @@ -166,6 +175,12 @@ msm_gem_address_space_create_a2xx(struct device *dev, struct msm_gpu *gpu, spin_lock_init(&aspace->lock); aspace->name = name; aspace->mmu = msm_gpummu_new(dev, gpu); + if (IS_ERR(aspace->mmu)) { + int ret = PTR_ERR(aspace->mmu); + + kfree(aspace); + return ERR_PTR(ret); + } drm_mm_init(&aspace->mm, (va_start >> PAGE_SHIFT), size >> PAGE_SHIFT); diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 615c5cda5389..b6f0d7204da9 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -826,7 +826,6 @@ msm_gpu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev, uint64_t va_start, uint64_t va_end) { struct msm_gem_address_space *aspace; - int ret; /* * Setup IOMMU.. eventually we will (I think) do this once per context @@ -851,17 +850,9 @@ msm_gpu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev, va_start, va_end); } - if (IS_ERR(aspace)) { + if (IS_ERR(aspace)) DRM_DEV_ERROR(gpu->dev->dev, "failed to init mmu: %ld\n", PTR_ERR(aspace)); - return ERR_CAST(aspace); - } - - ret = aspace->mmu->funcs->attach(aspace->mmu); - if (ret) { - msm_gem_address_space_put(aspace); - return ERR_PTR(ret); - } return aspace; } diff --git a/drivers/gpu/drm/msm/msm_gpummu.c b/drivers/gpu/drm/msm/msm_gpummu.c index 34980d8eb7ad..0ad0f848560a 100644 --- a/drivers/gpu/drm/msm/msm_gpummu.c +++ b/drivers/gpu/drm/msm/msm_gpummu.c @@ -21,11 +21,6 @@ struct msm_gpummu { #define GPUMMU_PAGE_SIZE SZ_4K #define TABLE_SIZE (sizeof(uint32_t) * GPUMMU_VA_RANGE / GPUMMU_PAGE_SIZE) -static int msm_gpummu_attach(struct msm_mmu *mmu) -{ - return 0; -} - static void msm_gpummu_detach(struct msm_mmu *mmu) { } @@ -85,7 +80,6 @@ static void msm_gpummu_destroy(struct msm_mmu *mmu) } static const struct msm_mmu_funcs funcs = { - .attach = msm_gpummu_attach, .detach = msm_gpummu_detach, .map = msm_gpummu_map, .unmap = msm_gpummu_unmap, diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index ad58cfe5998e..544c51955717 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -23,13 +23,6 @@ static int msm_fault_handler(struct iommu_domain *domain, struct device *dev, return 0; } -static int msm_iommu_attach(struct msm_mmu *mmu) -{ - struct msm_iommu *iommu = to_msm_iommu(mmu); - - return iommu_attach_device(iommu->domain, mmu->dev); -} - static void msm_iommu_detach(struct msm_mmu *mmu) { struct msm_iommu *iommu = to_msm_iommu(mmu); @@ -66,7 +59,6 @@ static void msm_iommu_destroy(struct msm_mmu *mmu) } static const struct msm_mmu_funcs funcs = { - .attach = msm_iommu_attach, .detach = msm_iommu_detach, .map = msm_iommu_map, .unmap = msm_iommu_unmap, @@ -76,6 +68,7 @@ static const struct msm_mmu_funcs funcs = { struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain) { struct msm_iommu *iommu; + int ret; iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); if (!iommu) @@ -85,5 +78,11 @@ struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain) msm_mmu_init(&iommu->base, dev, &funcs); iommu_set_fault_handler(domain, msm_fault_handler, iommu); + ret = iommu_attach_device(iommu->domain, dev); + if (ret) { + kfree(iommu); + return ERR_PTR(ret); + } + return &iommu->base; } diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h index 67a623f14319..bae9e8e67ec1 100644 --- a/drivers/gpu/drm/msm/msm_mmu.h +++ b/drivers/gpu/drm/msm/msm_mmu.h @@ -10,7 +10,6 @@ #include struct msm_mmu_funcs { - int (*attach)(struct msm_mmu *mmu); void (*detach)(struct msm_mmu *mmu); int (*map)(struct msm_mmu *mmu, uint64_t iova, struct sg_table *sgt, unsigned len, int prot); From ccac7ce373c1b5175bcf733fe6223129b8975788 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Fri, 22 May 2020 16:03:15 -0600 Subject: [PATCH 222/574] drm/msm: Refactor address space initialization Refactor how address space initialization works. Instead of having the address space function create the MMU object (and thus require separate but equal functions for gpummu and iommu) use a single function and pass the MMU struct in. Make the generic code cleaner by using target specific functions to create the address space so a2xx can do its own thing in its own space. For all the other targets use a generic helper to initialize IOMMU but leave the door open for newer targets to use customization if they need it. Reviewed-by: Rob Clark Signed-off-by: Jordan Crouse Tested-by: Shawn Guo [squash in rebase fixups] Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a2xx_gpu.c | 16 +++++++ drivers/gpu/drm/msm/adreno/a3xx_gpu.c | 1 + drivers/gpu/drm/msm/adreno/a4xx_gpu.c | 1 + drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 1 + drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 7 ++-- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 1 + drivers/gpu/drm/msm/adreno/adreno_gpu.c | 23 +++++++--- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 8 ++++ drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 10 ++--- drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c | 14 ++++--- drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c | 4 -- drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c | 11 ++++- drivers/gpu/drm/msm/msm_drv.h | 8 +--- drivers/gpu/drm/msm/msm_gem_vma.c | 53 ++++-------------------- drivers/gpu/drm/msm/msm_gpu.c | 40 +----------------- drivers/gpu/drm/msm/msm_gpu.h | 4 +- drivers/gpu/drm/msm/msm_iommu.c | 3 ++ 17 files changed, 86 insertions(+), 119 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a2xx_gpu.c b/drivers/gpu/drm/msm/adreno/a2xx_gpu.c index 1f83bc18d500..60f6472a3e58 100644 --- a/drivers/gpu/drm/msm/adreno/a2xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a2xx_gpu.c @@ -401,6 +401,21 @@ static struct msm_gpu_state *a2xx_gpu_state_get(struct msm_gpu *gpu) return state; } +static struct msm_gem_address_space * +a2xx_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) +{ + struct msm_mmu *mmu = msm_gpummu_new(&pdev->dev, gpu); + struct msm_gem_address_space *aspace; + + aspace = msm_gem_address_space_create(mmu, "gpu", SZ_16M, + SZ_16M + 0xfff * SZ_64K); + + if (IS_ERR(aspace) && !IS_ERR(mmu)) + mmu->funcs->destroy(mmu); + + return aspace; +} + /* Register offset defines for A2XX - copy of A3XX */ static const unsigned int a2xx_register_offsets[REG_ADRENO_REGISTER_MAX] = { REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_BASE, REG_AXXX_CP_RB_BASE), @@ -429,6 +444,7 @@ static const struct adreno_gpu_funcs funcs = { #endif .gpu_state_get = a2xx_gpu_state_get, .gpu_state_put = adreno_gpu_state_put, + .create_address_space = a2xx_create_address_space, }, }; diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c index b67f88872726..0a5ea9f56cb8 100644 --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c @@ -441,6 +441,7 @@ static const struct adreno_gpu_funcs funcs = { #endif .gpu_state_get = a3xx_gpu_state_get, .gpu_state_put = adreno_gpu_state_put, + .create_address_space = adreno_iommu_create_address_space, }, }; diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c index 9e244982974e..b9b26b2bf9c5 100644 --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c @@ -583,6 +583,7 @@ static const struct adreno_gpu_funcs funcs = { #endif .gpu_state_get = a4xx_gpu_state_get, .gpu_state_put = adreno_gpu_state_put, + .create_address_space = adreno_iommu_create_address_space, }, .get_timestamp = a4xx_get_timestamp, }; diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 662d02289533..d95970a73fb4 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1445,6 +1445,7 @@ static const struct adreno_gpu_funcs funcs = { .gpu_busy = a5xx_gpu_busy, .gpu_state_get = a5xx_gpu_state_get, .gpu_state_put = a5xx_gpu_state_put, + .create_address_space = adreno_iommu_create_address_space, }, .get_timestamp = a5xx_get_timestamp, }; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 0c72ae84e279..c5e21b53c4d0 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -1114,15 +1114,14 @@ static int a6xx_gmu_memory_alloc(struct a6xx_gmu *gmu, struct a6xx_gmu_bo *bo, static int a6xx_gmu_memory_probe(struct a6xx_gmu *gmu) { struct iommu_domain *domain; + struct msm_mmu *mmu; domain = iommu_domain_alloc(&platform_bus_type); if (!domain) return -ENODEV; - domain->geometry.aperture_start = 0x00000000; - domain->geometry.aperture_end = 0x7fffffff; - - gmu->aspace = msm_gem_address_space_create(gmu->dev, domain, "gmu"); + mmu = msm_iommu_new(gmu->dev, domain); + gmu->aspace = msm_gem_address_space_create(mmu, "gmu", 0x0, 0x7fffffff); if (IS_ERR(gmu->aspace)) { iommu_domain_free(domain); return PTR_ERR(gmu->aspace); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 6f335ae179c8..a1589e040c57 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -893,6 +893,7 @@ static const struct adreno_gpu_funcs funcs = { #if defined(CONFIG_DRM_MSM_GPU_STATE) .gpu_state_get = a6xx_gpu_state_get, .gpu_state_put = a6xx_gpu_state_put, + .create_address_space = adreno_iommu_create_address_space, #endif }, .get_timestamp = a6xx_get_timestamp, diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index ff7f441932b8..89673c7ed473 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -185,6 +185,23 @@ int adreno_zap_shader_load(struct msm_gpu *gpu, u32 pasid) return zap_shader_load_mdt(gpu, adreno_gpu->info->zapfw, pasid); } +struct msm_gem_address_space * +adreno_iommu_create_address_space(struct msm_gpu *gpu, + struct platform_device *pdev) +{ + struct iommu_domain *iommu = iommu_domain_alloc(&platform_bus_type); + struct msm_mmu *mmu = msm_iommu_new(&pdev->dev, iommu); + struct msm_gem_address_space *aspace; + + aspace = msm_gem_address_space_create(mmu, "gpu", SZ_16M, + 0xfffffff); + + if (IS_ERR(aspace) && !IS_ERR(mmu)) + mmu->funcs->destroy(mmu); + + return aspace; +} + int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -988,12 +1005,6 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, adreno_gpu_config.ioname = "kgsl_3d0_reg_memory"; - adreno_gpu_config.va_start = SZ_16M; - adreno_gpu_config.va_end = 0xffffffff; - /* maximum range of a2xx mmu */ - if (adreno_is_a2xx(adreno_gpu)) - adreno_gpu_config.va_end = SZ_16M + 0xfff * SZ_64K; - adreno_gpu_config.nr_rings = nr_rings; adreno_get_pwrlevels(&pdev->dev, gpu); diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index dea97c8317e0..2f5d2c3acc3a 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -287,6 +287,14 @@ void adreno_gpu_state_destroy(struct msm_gpu_state *state); int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state); int adreno_gpu_state_put(struct msm_gpu_state *state); +/* + * Common helper function to initialize the default address space for arm-smmu + * attached targets + */ +struct msm_gem_address_space * +adreno_iommu_create_address_space(struct msm_gpu *gpu, + struct platform_device *pdev); + /* * For a5xx and a6xx targets load the zap shader that is used to pull the GPU * out of secure mode diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c index c5150c67a3b8..a5da7aacddba 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c @@ -794,18 +794,18 @@ static int _dpu_kms_mmu_init(struct dpu_kms *dpu_kms) { struct iommu_domain *domain; struct msm_gem_address_space *aspace; + struct msm_mmu *mmu; domain = iommu_domain_alloc(&platform_bus_type); if (!domain) return 0; - domain->geometry.aperture_start = 0x1000; - domain->geometry.aperture_end = 0xffffffff; + mmu = msm_iommu_new(dpu_kms->dev->dev, domain); + aspace = msm_gem_address_space_create(mmu, "dpu1", + 0x1000, 0xfffffff); - aspace = msm_gem_address_space_create(dpu_kms->dev->dev, - domain, "dpu1"); if (IS_ERR(aspace)) { - iommu_domain_free(domain); + mmu->funcs->destroy(mmu); return PTR_ERR(aspace); } diff --git a/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c index 9dba37c6344f..08897184b1d9 100644 --- a/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c +++ b/drivers/gpu/drm/msm/disp/mdp4/mdp4_kms.c @@ -510,9 +510,15 @@ struct msm_kms *mdp4_kms_init(struct drm_device *dev) mdelay(16); if (config->iommu) { - aspace = msm_gem_address_space_create(&pdev->dev, - config->iommu, "mdp4"); + struct msm_mmu *mmu = msm_iommu_new(&pdev->dev, + config->iommu); + + aspace = msm_gem_address_space_create(mmu, + "mdp4", 0x1000, 0xffffffff); + if (IS_ERR(aspace)) { + if (!IS_ERR(mmu)) + mmu->funcs->destroy(mmu); ret = PTR_ERR(aspace); goto fail; } @@ -565,10 +571,6 @@ static struct mdp4_platform_config *mdp4_get_config(struct platform_device *dev) /* TODO: Chips that aren't apq8064 have a 200 Mhz max_clk */ config.max_clk = 266667000; config.iommu = iommu_domain_alloc(&platform_bus_type); - if (config.iommu) { - config.iommu->geometry.aperture_start = 0x1000; - config.iommu->geometry.aperture_end = 0xffffffff; - } return &config; } diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c index a7df8dbffdc2..25a13a2a57a9 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_cfg.c @@ -1017,10 +1017,6 @@ static struct mdp5_cfg_platform *mdp5_get_config(struct platform_device *dev) static struct mdp5_cfg_platform config = {}; config.iommu = iommu_domain_alloc(&platform_bus_type); - if (config.iommu) { - config.iommu->geometry.aperture_start = 0x1000; - config.iommu->geometry.aperture_end = 0xffffffff; - } return &config; } diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c index fe18656848ce..54631fbd9389 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c @@ -632,13 +632,20 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev) mdelay(16); if (config->platform.iommu) { + struct msm_mmu *mmu; + iommu_dev = &pdev->dev; if (!dev_iommu_fwspec_get(iommu_dev)) iommu_dev = iommu_dev->parent; - aspace = msm_gem_address_space_create(iommu_dev, - config->platform.iommu, "mdp5"); + mmu = msm_iommu_new(iommu_dev, config->platform.iommu); + + aspace = msm_gem_address_space_create(mmu, "mdp5", + 0x1000, 0xffffffff); + if (IS_ERR(aspace)) { + if (!IS_ERR(mmu)) + mmu->funcs->destroy(mmu); ret = PTR_ERR(aspace); goto fail; } diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 108202c35637..e2d6a6056418 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -252,12 +252,8 @@ void msm_gem_close_vma(struct msm_gem_address_space *aspace, void msm_gem_address_space_put(struct msm_gem_address_space *aspace); struct msm_gem_address_space * -msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, - const char *name); - -struct msm_gem_address_space * -msm_gem_address_space_create_a2xx(struct device *dev, struct msm_gpu *gpu, - const char *name, uint64_t va_start, uint64_t va_end); +msm_gem_address_space_create(struct msm_mmu *mmu, const char *name, + u64 va_start, u64 size); int msm_register_mmu(struct drm_device *dev, struct msm_mmu *mmu); void msm_unregister_mmu(struct drm_device *dev, struct msm_mmu *mmu); diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 72fc1980fff6..5f6a11211b64 100644 --- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -127,14 +127,14 @@ int msm_gem_init_vma(struct msm_gem_address_space *aspace, return 0; } - struct msm_gem_address_space * -msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, - const char *name) +msm_gem_address_space_create(struct msm_mmu *mmu, const char *name, + u64 va_start, u64 size) { struct msm_gem_address_space *aspace; - u64 start = domain->geometry.aperture_start; - u64 size = domain->geometry.aperture_end - start; + + if (IS_ERR(mmu)) + return ERR_CAST(mmu); aspace = kzalloc(sizeof(*aspace), GFP_KERNEL); if (!aspace) @@ -142,48 +142,9 @@ msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, spin_lock_init(&aspace->lock); aspace->name = name; - aspace->mmu = msm_iommu_new(dev, domain); - if (IS_ERR(aspace->mmu)) { - int ret = PTR_ERR(aspace->mmu); + aspace->mmu = mmu; - kfree(aspace); - return ERR_PTR(ret); - } - - /* - * Attaching the IOMMU device changes the aperture values so use the - * cached values instead - */ - drm_mm_init(&aspace->mm, start >> PAGE_SHIFT, size >> PAGE_SHIFT); - - kref_init(&aspace->kref); - - return aspace; -} - -struct msm_gem_address_space * -msm_gem_address_space_create_a2xx(struct device *dev, struct msm_gpu *gpu, - const char *name, uint64_t va_start, uint64_t va_end) -{ - struct msm_gem_address_space *aspace; - u64 size = va_end - va_start; - - aspace = kzalloc(sizeof(*aspace), GFP_KERNEL); - if (!aspace) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&aspace->lock); - aspace->name = name; - aspace->mmu = msm_gpummu_new(dev, gpu); - if (IS_ERR(aspace->mmu)) { - int ret = PTR_ERR(aspace->mmu); - - kfree(aspace); - return ERR_PTR(ret); - } - - drm_mm_init(&aspace->mm, (va_start >> PAGE_SHIFT), - size >> PAGE_SHIFT); + drm_mm_init(&aspace->mm, va_start >> PAGE_SHIFT, size >> PAGE_SHIFT); kref_init(&aspace->kref); diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index b6f0d7204da9..a22d30622306 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -821,42 +821,6 @@ static int get_clocks(struct platform_device *pdev, struct msm_gpu *gpu) return 0; } -static struct msm_gem_address_space * -msm_gpu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev, - uint64_t va_start, uint64_t va_end) -{ - struct msm_gem_address_space *aspace; - - /* - * Setup IOMMU.. eventually we will (I think) do this once per context - * and have separate page tables per context. For now, to keep things - * simple and to get something working, just use a single address space: - */ - if (!adreno_is_a2xx(to_adreno_gpu(gpu))) { - struct iommu_domain *iommu = iommu_domain_alloc(&platform_bus_type); - if (!iommu) - return NULL; - - iommu->geometry.aperture_start = va_start; - iommu->geometry.aperture_end = va_end; - - DRM_DEV_INFO(gpu->dev->dev, "%s: using IOMMU\n", gpu->name); - - aspace = msm_gem_address_space_create(&pdev->dev, iommu, "gpu"); - if (IS_ERR(aspace)) - iommu_domain_free(iommu); - } else { - aspace = msm_gem_address_space_create_a2xx(&pdev->dev, gpu, "gpu", - va_start, va_end); - } - - if (IS_ERR(aspace)) - DRM_DEV_ERROR(gpu->dev->dev, "failed to init mmu: %ld\n", - PTR_ERR(aspace)); - - return aspace; -} - int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, const char *name, struct msm_gpu_config *config) @@ -929,8 +893,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, msm_devfreq_init(gpu); - gpu->aspace = msm_gpu_create_address_space(gpu, pdev, - config->va_start, config->va_end); + + gpu->aspace = gpu->funcs->create_address_space(gpu, pdev); if (gpu->aspace == NULL) DRM_DEV_INFO(drm->dev, "%s: no IOMMU, fallback to VRAM carveout!\n", name); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index be5bc2e8425c..d496b68e58a9 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -21,8 +21,6 @@ struct msm_gpu_state; struct msm_gpu_config { const char *ioname; - uint64_t va_start; - uint64_t va_end; unsigned int nr_rings; }; @@ -64,6 +62,8 @@ struct msm_gpu_funcs { int (*gpu_state_put)(struct msm_gpu_state *state); unsigned long (*gpu_get_freq)(struct msm_gpu *gpu); void (*gpu_set_freq)(struct msm_gpu *gpu, unsigned long freq); + struct msm_gem_address_space *(*create_address_space) + (struct msm_gpu *gpu, struct platform_device *pdev); }; struct msm_gpu { diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index 544c51955717..e773ef8c7203 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -70,6 +70,9 @@ struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain) struct msm_iommu *iommu; int ret; + if (!domain) + return ERR_PTR(-ENODEV); + iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); if (!iommu) return ERR_PTR(-ENOMEM); From fb212ad6cc58c0fd8c0f76578d4392f71e595bd5 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Fri, 22 May 2020 16:03:16 -0600 Subject: [PATCH 223/574] drm/msm: Update the MMU helper function APIs Instead of using a bare unsigned type for the length value for map/unmap functions pass in a size_t to more correctly match up with the underlying APIs. Signed-off-by: Jordan Crouse Tested-by: Shawn Guo Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gpummu.c | 4 ++-- drivers/gpu/drm/msm/msm_iommu.c | 4 ++-- drivers/gpu/drm/msm/msm_mmu.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpummu.c b/drivers/gpu/drm/msm/msm_gpummu.c index 0ad0f848560a..310a31b05faa 100644 --- a/drivers/gpu/drm/msm/msm_gpummu.c +++ b/drivers/gpu/drm/msm/msm_gpummu.c @@ -26,7 +26,7 @@ static void msm_gpummu_detach(struct msm_mmu *mmu) } static int msm_gpummu_map(struct msm_mmu *mmu, uint64_t iova, - struct sg_table *sgt, unsigned len, int prot) + struct sg_table *sgt, size_t len, int prot) { struct msm_gpummu *gpummu = to_msm_gpummu(mmu); unsigned idx = (iova - GPUMMU_VA_START) / GPUMMU_PAGE_SIZE; @@ -54,7 +54,7 @@ static int msm_gpummu_map(struct msm_mmu *mmu, uint64_t iova, return 0; } -static int msm_gpummu_unmap(struct msm_mmu *mmu, uint64_t iova, unsigned len) +static int msm_gpummu_unmap(struct msm_mmu *mmu, uint64_t iova, size_t len) { struct msm_gpummu *gpummu = to_msm_gpummu(mmu); unsigned idx = (iova - GPUMMU_VA_START) / GPUMMU_PAGE_SIZE; diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index e773ef8c7203..3a381a9674c9 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -31,7 +31,7 @@ static void msm_iommu_detach(struct msm_mmu *mmu) } static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova, - struct sg_table *sgt, unsigned len, int prot) + struct sg_table *sgt, size_t len, int prot) { struct msm_iommu *iommu = to_msm_iommu(mmu); size_t ret; @@ -42,7 +42,7 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova, return (ret == len) ? 0 : -EINVAL; } -static int msm_iommu_unmap(struct msm_mmu *mmu, uint64_t iova, unsigned len) +static int msm_iommu_unmap(struct msm_mmu *mmu, uint64_t iova, size_t len) { struct msm_iommu *iommu = to_msm_iommu(mmu); diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h index bae9e8e67ec1..3a534ee59bf6 100644 --- a/drivers/gpu/drm/msm/msm_mmu.h +++ b/drivers/gpu/drm/msm/msm_mmu.h @@ -12,8 +12,8 @@ struct msm_mmu_funcs { void (*detach)(struct msm_mmu *mmu); int (*map)(struct msm_mmu *mmu, uint64_t iova, struct sg_table *sgt, - unsigned len, int prot); - int (*unmap)(struct msm_mmu *mmu, uint64_t iova, unsigned len); + size_t len, int prot); + int (*unmap)(struct msm_mmu *mmu, uint64_t iova, size_t len); void (*destroy)(struct msm_mmu *mmu); }; From d9e19d7966a31ae70edfe0cb7cb044e20343a0c9 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Fri, 22 May 2020 18:29:08 -0400 Subject: [PATCH 224/574] drm/msm/a6xx: skip HFI set freq if GMU is powered down Also skip the newly added HFI set freq path if the GMU is powered down, which was missing because of patches crossing paths. Signed-off-by: Jonathan Marek Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index c5e21b53c4d0..096be97ce9f9 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -110,13 +110,6 @@ static void __a6xx_gmu_set_freq(struct a6xx_gmu *gmu, int index) struct msm_gpu *gpu = &adreno_gpu->base; int ret; - /* - * This can get called from devfreq while the hardware is idle. Don't - * bring up the power if it isn't already active - */ - if (pm_runtime_get_if_in_use(gmu->dev) == 0) - return; - gmu_write(gmu, REG_A6XX_GMU_DCVS_ACK_OPTION, 0); gmu_write(gmu, REG_A6XX_GMU_DCVS_PERF_SETTING, @@ -141,7 +134,6 @@ static void __a6xx_gmu_set_freq(struct a6xx_gmu *gmu, int index) * for now leave it at max so that the performance is nominal. */ icc_set_bw(gpu->icc_path, 0, MBps_to_icc(7216)); - pm_runtime_put(gmu->dev); } void a6xx_gmu_set_freq(struct msm_gpu *gpu, unsigned long freq) @@ -159,13 +151,21 @@ void a6xx_gmu_set_freq(struct msm_gpu *gpu, unsigned long freq) break; gmu->current_perf_index = perf_index; + gmu->freq = gmu->gpu_freqs[perf_index]; + + /* + * This can get called from devfreq while the hardware is idle. Don't + * bring up the power if it isn't already active + */ + if (pm_runtime_get_if_in_use(gmu->dev) == 0) + return; if (gmu->legacy) __a6xx_gmu_set_freq(gmu, perf_index); else a6xx_hfi_set_freq(gmu, perf_index); - gmu->freq = gmu->gpu_freqs[perf_index]; + pm_runtime_put(gmu->dev); } unsigned long a6xx_gmu_get_freq(struct msm_gpu *gpu) From 70b8170e55d3ca9503a53211967faee6b5f18b19 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 May 2020 15:28:24 +0200 Subject: [PATCH 225/574] iommu: Don't call .probe_finalize() under group->mutex The .probe_finalize() call-back of some IOMMU drivers calls into arm_iommu_attach_device(). This function will call back into the IOMMU core code, where it tries to take group->mutex again, resulting in a deadlock. As there is no reason why .probe_finalize() needs to be called under that mutex, move it after the lock has been released to fix the deadlock. Fixes: deac0b3bed26 ("iommu: Split off default domain allocation from group assignment") Reported-by: Yong Wu Tested-by: Yong Wu Signed-off-by: Joerg Roedel Cc: Yong Wu Link: https://lore.kernel.org/r/20200519132824.15163-1-joro@8bytes.org --- drivers/iommu/iommu.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4050569188be..b5ae598af2f4 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1674,17 +1674,8 @@ static void probe_alloc_default_domain(struct bus_type *bus, static int iommu_group_do_dma_attach(struct device *dev, void *data) { struct iommu_domain *domain = data; - const struct iommu_ops *ops; - int ret; - ret = __iommu_attach_device(domain, dev); - - ops = domain->ops; - - if (ret == 0 && ops->probe_finalize) - ops->probe_finalize(dev); - - return ret; + return __iommu_attach_device(domain, dev); } static int __iommu_group_dma_attach(struct iommu_group *group) @@ -1693,6 +1684,21 @@ static int __iommu_group_dma_attach(struct iommu_group *group) iommu_group_do_dma_attach); } +static int iommu_group_do_probe_finalize(struct device *dev, void *data) +{ + struct iommu_domain *domain = data; + + if (domain->ops->probe_finalize) + domain->ops->probe_finalize(dev); + + return 0; +} + +static void __iommu_group_dma_finalize(struct iommu_group *group) +{ + __iommu_group_for_each_dev(group, group->default_domain, + iommu_group_do_probe_finalize); +} static int iommu_do_create_direct_mappings(struct device *dev, void *data) { struct iommu_group *group = data; @@ -1745,6 +1751,8 @@ int bus_iommu_probe(struct bus_type *bus) if (ret) break; + + __iommu_group_dma_finalize(group); } return ret; From bfe6240dfe4f16c20db94bc7c0ab9ffa316fb926 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 19 May 2020 09:34:23 +0800 Subject: [PATCH 226/574] iommu/vt-d: Fix pointer cast warnings on 32 bit Pointers should be casted to unsigned long to avoid "cast from pointer to integer of different size" warnings. drivers/iommu/intel-pasid.c:818:6: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] drivers/iommu/intel-pasid.c:821:9: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] drivers/iommu/intel-pasid.c:824:23: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] drivers/iommu/intel-svm.c:343:45: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] Fixes: b0d1f8741b81 ("iommu/vt-d: Add nested translation helper function") Fixes: 56722a4398a3 ("iommu/vt-d: Add bind guest PASID support") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200519013423.11971-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-pasid.c | 8 ++++---- drivers/iommu/intel-svm.c | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c index 25d749830500..c81f0f17c6ba 100644 --- a/drivers/iommu/intel-pasid.c +++ b/drivers/iommu/intel-pasid.c @@ -815,13 +815,13 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, } /* First level PGD is in GPA, must be supported by the second level */ - if ((unsigned long long)gpgd > domain->max_addr) { + if ((uintptr_t)gpgd > domain->max_addr) { dev_err_ratelimited(dev, - "Guest PGD %llx not supported, max %llx\n", - (unsigned long long)gpgd, domain->max_addr); + "Guest PGD %lx not supported, max %llx\n", + (uintptr_t)gpgd, domain->max_addr); return -EINVAL; } - pasid_set_flptr(pte, (u64)gpgd); + pasid_set_flptr(pte, (uintptr_t)gpgd); ret = intel_pasid_setup_bind_data(iommu, pte, pasid_data); if (ret) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 11366dc91971..acc7555b002d 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -340,7 +340,8 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, * call the nested mode setup function here. */ spin_lock(&iommu->lock); - ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)data->gpgd, + ret = intel_pasid_setup_nested(iommu, dev, + (pgd_t *)(uintptr_t)data->gpgd, data->hpasid, &data->vtd, dmar_domain, data->addr_width); spin_unlock(&iommu->lock); From 7809c4d5805b1d331acfe0047dd9695691d428d4 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 21 May 2020 17:50:30 -0400 Subject: [PATCH 227/574] iommu/vt-d: fix a GCC warning The commit 6ee1b77ba3ac ("iommu/vt-d: Add svm/sva invalidate function") introduced a GCC warning, drivers/iommu/intel-iommu.c:5330:1: warning: 'static' is not at beginning of declaration [-Wold-style-declaration] const static int ^~~~~ Fixes: 6ee1b77ba3ac0 ("iommu/vt-d: Add svm/sva invalidate function") Signed-off-by: Qian Cai Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200521215030.16938-1-cai@lca.pw Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f75d7d9c231f..ff5a30a94679 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5327,7 +5327,7 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain, * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR] */ -const static int +static const int inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = { /* * PASID based IOTLB invalidation: PASID selective (per PASID), From 0fad590fd98f308c903fe8205c0f07a649a0b72e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 19 May 2020 07:31:12 +0100 Subject: [PATCH 228/574] drm/i915: Don't set queue-priority hint when supressing the reschedule We recorded the execlists->queue_priority_hint update for the inflight request without kicking the tasklet. The next submitted request then failed to be scheduled as it had a lower priority than the hint, leaving the HW running with only the inflight request. Fixes: 6cebcf746f3f ("drm/i915: Tweak scheduler's kick_submission()") Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200519063123.20673-1-chris@chris-wilson.co.uk (cherry picked from commit b86fc6e5e89e5645b43f57171c26740ef38f9f4a) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_scheduler.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index f4ea318781f0..cbb880b10c65 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -209,14 +209,6 @@ static void kick_submission(struct intel_engine_cs *engine, if (!inflight) goto unlock; - ENGINE_TRACE(engine, - "bumping queue-priority-hint:%d for rq:%llx:%lld, inflight:%llx:%lld prio %d\n", - prio, - rq->fence.context, rq->fence.seqno, - inflight->fence.context, inflight->fence.seqno, - inflight->sched.attr.priority); - engine->execlists.queue_priority_hint = prio; - /* * If we are already the currently executing context, don't * bother evaluating if we should preempt ourselves. @@ -224,6 +216,14 @@ static void kick_submission(struct intel_engine_cs *engine, if (inflight->context == rq->context) goto unlock; + ENGINE_TRACE(engine, + "bumping queue-priority-hint:%d for rq:%llx:%lld, inflight:%llx:%lld prio %d\n", + prio, + rq->fence.context, rq->fence.seqno, + inflight->fence.context, inflight->fence.seqno, + inflight->sched.attr.priority); + + engine->execlists.queue_priority_hint = prio; if (need_preempt(prio, rq_prio(inflight))) tasklet_hi_schedule(&engine->execlists.tasklet); From b7ccc7858a33ddb3c5516f39b104a9156957c8bb Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 20 May 2020 08:30:48 +0100 Subject: [PATCH 229/574] drm/i915/gt: Remove errant assertion in __intel_context_do_pin This assertion was removed in commit b412c63f1cba ("drm/i915/gt: Report context-is-closed prior to pinning"), but accidentally restored by a cherry-pick into drm-next and now has percolated back to drm-intel-next-queued. Fixes: 2e46a2a0b014 ("drm/i915: Use explicit flag to mark unreachable intel_context") Fixes: 2b703bbda271 ("Merge drm/drm-next into drm-intel-next-queued") References: b412c63f1cba ("drm/i915/gt: Report context-is-closed prior to pinning") Signed-off-by: Chris Wilson Cc: Rodrigo Vivi Cc: Joonas Lahtinen Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20200520073048.2394034-1-chris@chris-wilson.co.uk (cherry picked from commit f2c1061a3677b400a945d9238f17bf33d669acff) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/gt/intel_context.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 74ddb49b2941..e4aece20bc80 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -97,8 +97,6 @@ int __intel_context_do_pin(struct intel_context *ce) { int err; - GEM_BUG_ON(intel_context_is_closed(ce)); - if (unlikely(!test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) { err = intel_context_alloc_state(ce); if (err) From 9ef36fc2d0347f04f75b9d70c0ecc2e3b403bb7f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 21 May 2020 15:06:16 +0100 Subject: [PATCH 230/574] drm/i915: Disable semaphore inter-engine sync without timeslicing Since the removal of the no-semaphore boosting, we rely on timeslicing to reorder passed inter-dependency hogs across the engines. However, we require preemption to support timeslicing into user payloads, and not all machine support preemption so we do not universally enable timeslicing, even when it would correctly preempt our own inter-engine semaphores. Since timeslicing and semaphore priority deboosting is now disabled on Broadwell/Braswell, we have to follow suite and not use semaphores. Testcase: igt/gem_exec_schedule/semaphore-codependency # bdw/bsw Fixes: 18e4af04d218 ("drm/i915: Drop no-semaphore boosting") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Mika Kuoppala Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200521140617.30015-1-chris@chris-wilson.co.uk (cherry picked from commit 0eb670aac27b1d615004c29efec595616e3e091a) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 900ea8b7fc8f..f5d59d18cd5b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -230,7 +230,7 @@ static void intel_context_set_gem(struct intel_context *ce, ce->timeline = intel_timeline_get(ctx->timeline); if (ctx->sched.priority >= I915_PRIORITY_NORMAL && - intel_engine_has_semaphores(ce->engine)) + intel_engine_has_timeslices(ce->engine)) __set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags); } @@ -1969,7 +1969,7 @@ static int __apply_priority(struct intel_context *ce, void *arg) { struct i915_gem_context *ctx = arg; - if (!intel_engine_has_semaphores(ce->engine)) + if (!intel_engine_has_timeslices(ce->engine)) return 0; if (ctx->sched.priority >= I915_PRIORITY_NORMAL) From ef29440b3ccb93f44cf664311b30fbd5f84d9683 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 21 May 2020 15:06:17 +0100 Subject: [PATCH 231/574] drm/i915: Avoid using rq->engine after free during i915_fence_release In order to be valid to dereference during the i915_fence_release, after retiring the fence and releasing its refererences, we assume that rq->engine can only be a real engine (that stay intact until the device is shutdown after all fences have been flushed). However, due to a quirk of preempt-to-busy, we may retire a request that still belongs to a virtual engine and so eventually free it with rq->engine being invalid. To avoid dereferencing that invalid engine, we look at the execution_mask which if it indicates it may be executed on more than one engine, we know it originated on a virtual engine and may still be on one. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1906 Fixes: 43acd6516ca9 ("drm/i915: Keep a per-engine request pool") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200521140617.30015-2-chris@chris-wilson.co.uk (cherry picked from commit 32a4605b38c30689a6a18f3f4c7d3133ac9d3277) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_request.c | 35 +++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 526c1e9acbd5..c282719ad3ac 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -121,8 +121,39 @@ static void i915_fence_release(struct dma_fence *fence) i915_sw_fence_fini(&rq->submit); i915_sw_fence_fini(&rq->semaphore); - /* Keep one request on each engine for reserved use under mempressure */ - if (!cmpxchg(&rq->engine->request_pool, NULL, rq)) + /* + * Keep one request on each engine for reserved use under mempressure + * + * We do not hold a reference to the engine here and so have to be + * very careful in what rq->engine we poke. The virtual engine is + * referenced via the rq->context and we released that ref during + * i915_request_retire(), ergo we must not dereference a virtual + * engine here. Not that we would want to, as the only consumer of + * the reserved engine->request_pool is the power management parking, + * which must-not-fail, and that is only run on the physical engines. + * + * Since the request must have been executed to be have completed, + * we know that it will have been processed by the HW and will + * not be unsubmitted again, so rq->engine and rq->execution_mask + * at this point is stable. rq->execution_mask will be a single + * bit if the last and _only_ engine it could execution on was a + * physical engine, if it's multiple bits then it started on and + * could still be on a virtual engine. Thus if the mask is not a + * power-of-two we assume that rq->engine may still be a virtual + * engine and so a dangling invalid pointer that we cannot dereference + * + * For example, consider the flow of a bonded request through a virtual + * engine. The request is created with a wide engine mask (all engines + * that we might execute on). On processing the bond, the request mask + * is reduced to one or more engines. If the request is subsequently + * bound to a single engine, it will then be constrained to only + * execute on that engine and never returned to the virtual engine + * after timeslicing away, see __unwind_incomplete_requests(). Thus we + * know that if the rq->execution_mask is a single bit, rq->engine + * can be a physical engine with the exact corresponding mask. + */ + if (is_power_of_2(rq->execution_mask) && + !cmpxchg(&rq->engine->request_pool, NULL, rq)) return; kmem_cache_free(global.slab_requests, rq); From 757a9395f33c51c4e6eff2c7c0fbd50226a58224 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 22 May 2020 14:27:06 +0100 Subject: [PATCH 232/574] drm/i915/gem: Avoid iterating an empty list Our __sgt_iter assumes that the scattergather list has at least one element. But during construction we may fail in allocating the first page, and so mark the first element as the terminator. This is unexpected! [22555.524752] RIP: 0010:shmem_get_pages+0x506/0x710 [i915] [22555.524759] Code: 49 8b 2c 24 31 c0 66 89 44 24 40 48 85 ed 0f 84 62 01 00 00 4c 8b 75 00 8b 5d 08 44 8b 7d 0c 48 8b 0d 7e 34 07 e2 49 83 e6 fc <49> 8b 16 41 01 df 48 89 cf 48 89 d0 48 c1 e8 2d 48 85 c9 0f 84 c8 [22555.524765] RSP: 0018:ffffc9000053f9d0 EFLAGS: 00010246 [22555.524770] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8881ffffa000 [22555.524774] RDX: fffffffffffffff4 RSI: ffffffffffffffff RDI: ffffffff821efe00 [22555.524778] RBP: ffff8881b099ab00 R08: 0000000000000000 R09: 00000000fffffff4 [22555.524782] R10: 0000000000000002 R11: 00000000ffec0a02 R12: ffff8881cd3c8d60 [22555.524786] R13: 00000000fffffff4 R14: 0000000000000000 R15: 0000000000000000 [22555.524790] FS: 00007f4fbeb9b9c0(0000) GS:ffff8881f8580000(0000) knlGS:0000000000000000 [22555.524795] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [22555.524799] CR2: 0000000000000000 CR3: 00000001ec7f0004 CR4: 00000000001606e0 [22555.524803] Call Trace: [22555.524919] __i915_gem_object_get_pages+0x4f/0x60 [i915] Fixes: 85d1225ec066 ("drm/i915: Introduce & use new lightweight SGL iterators") Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Tvrtko Ursulin Cc: # v4.8+ Reviewed-by: Matthew Auld Reviewed-by: Maciej Patelczyk Link: https://patchwork.freedesktop.org/patch/msgid/20200522132706.5133-1-chris@chris-wilson.co.uk (cherry picked from commit 957ad9a02be6faa87594c58ac09460cd3d190d0e) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 5d5d7eef3f43..7aff3514d97a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -39,7 +39,6 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj) unsigned long last_pfn = 0; /* suppress gcc warning */ unsigned int max_segment = i915_sg_segment_size(); unsigned int sg_page_sizes; - struct pagevec pvec; gfp_t noreclaim; int ret; @@ -192,13 +191,17 @@ err_sg: sg_mark_end(sg); err_pages: mapping_clear_unevictable(mapping); - pagevec_init(&pvec); - for_each_sgt_page(page, sgt_iter, st) { - if (!pagevec_add(&pvec, page)) + if (sg != st->sgl) { + struct pagevec pvec; + + pagevec_init(&pvec); + for_each_sgt_page(page, sgt_iter, st) { + if (!pagevec_add(&pvec, page)) + check_release_pagevec(&pvec); + } + if (pagevec_count(&pvec)) check_release_pagevec(&pvec); } - if (pagevec_count(&pvec)) - check_release_pagevec(&pvec); sg_free_table(st); kfree(st); From 79659190ee972c05498c338e48d80cb45490c533 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 25 May 2020 15:01:22 +0200 Subject: [PATCH 233/574] iommu: Don't take group reference in iommu_alloc_default_domain() The iommu_alloc_default_domain() function takes a reference to an IOMMU group without releasing it. This causes the group to never be released, with undefined side effects. The function has only one call-site, which takes a group reference on its own, so to fix this leak, do not take another reference in iommu_alloc_default_domain() and pass the group as a function parameter instead. Fixes: 6e1aa2049154 ("iommu: Move default domain allocation to iommu_probe_device()") Reported-by: Sai Prakash Ranjan Signed-off-by: Joerg Roedel Tested-by: Sai Prakash Ranjan Cc: Sai Prakash Ranjan Link: https://lore.kernel.org/r/20200525130122.380-1-joro@8bytes.org Reference: https://lore.kernel.org/lkml/20200522130145.30067-1-saiprakash.ranjan@codeaurora.org/ --- drivers/iommu/iommu.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b5ae598af2f4..298397721144 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -80,7 +80,8 @@ static bool iommu_cmd_line_dma_api(void) return !!(iommu_cmd_line & IOMMU_CMD_LINE_DMA_API); } -static int iommu_alloc_default_domain(struct device *dev); +static int iommu_alloc_default_domain(struct iommu_group *group, + struct device *dev); static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, unsigned type); static int __iommu_attach_device(struct iommu_domain *domain, @@ -251,17 +252,17 @@ int iommu_probe_device(struct device *dev) if (ret) goto err_out; + group = iommu_group_get(dev); + if (!group) + goto err_release; + /* * Try to allocate a default domain - needs support from the * IOMMU driver. There are still some drivers which don't * support default domains, so the return value is not yet * checked. */ - iommu_alloc_default_domain(dev); - - group = iommu_group_get(dev); - if (!group) - goto err_release; + iommu_alloc_default_domain(group, dev); if (group->default_domain) ret = __iommu_attach_device(group->default_domain, dev); @@ -1478,15 +1479,11 @@ static int iommu_group_alloc_default_domain(struct bus_type *bus, return 0; } -static int iommu_alloc_default_domain(struct device *dev) +static int iommu_alloc_default_domain(struct iommu_group *group, + struct device *dev) { - struct iommu_group *group; unsigned int type; - group = iommu_group_get(dev); - if (!group) - return -ENODEV; - if (group->default_domain) return 0; From 521376741b2c26fe53a1ec24d02da24d477eb739 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:00 +0200 Subject: [PATCH 234/574] PCI/ATS: Only enable ATS for trusted devices Add pci_ats_supported(), which checks whether a device has an ATS capability, and whether it is trusted. A device is untrusted if it is plugged into an external-facing port such as Thunderbolt and could be spoofing an existing device to exploit weaknesses in the IOMMU configuration. PCIe ATS is one such weaknesses since it allows endpoints to cache IOMMU translations and emit transactions with 'Translated' Address Type (10b) that partially bypass the IOMMU translation. The SMMUv3 and VT-d IOMMU drivers already disallow ATS and transactions with 'Translated' Address Type for untrusted devices. Add the check to pci_enable_ats() to let other drivers (AMD IOMMU for now) benefit from it. By checking ats_cap, the pci_ats_supported() helper also returns whether ATS was globally disabled with pci=noats, and could later include more things, for example whether the whole PCIe hierarchy down to the endpoint supports ATS. Signed-off-by: Jean-Philippe Brucker Reviewed-by: Joerg Roedel Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20200520152201.3309416-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/pci/ats.c | 18 +++++++++++++++++- include/linux/pci-ats.h | 3 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 390e92f2d8d1..b761c1f72f67 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -30,6 +30,22 @@ void pci_ats_init(struct pci_dev *dev) dev->ats_cap = pos; } +/** + * pci_ats_supported - check if the device can use ATS + * @dev: the PCI device + * + * Returns true if the device supports ATS and is allowed to use it, false + * otherwise. + */ +bool pci_ats_supported(struct pci_dev *dev) +{ + if (!dev->ats_cap) + return false; + + return (dev->untrusted == 0); +} +EXPORT_SYMBOL_GPL(pci_ats_supported); + /** * pci_enable_ats - enable the ATS capability * @dev: the PCI device @@ -42,7 +58,7 @@ int pci_enable_ats(struct pci_dev *dev, int ps) u16 ctrl; struct pci_dev *pdev; - if (!dev->ats_cap) + if (!pci_ats_supported(dev)) return -EINVAL; if (WARN_ON(dev->ats_enabled)) diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index d08f0869f121..f75c307f346d 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -6,11 +6,14 @@ #ifdef CONFIG_PCI_ATS /* Address Translation Service */ +bool pci_ats_supported(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else /* CONFIG_PCI_ATS */ +static inline bool pci_ats_supported(struct pci_dev *d) +{ return false; } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } From 7a441b2110527851f630144ec76ab8409e1d6c61 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:01 +0200 Subject: [PATCH 235/574] iommu/amd: Use pci_ats_supported() The pci_ats_supported() function checks if a device supports ATS and is allowed to use it. In addition to checking that the device has an ATS capability and that the global pci=noats is not set (pci_ats_disabled()), it also checks if a device is untrusted. A device is untrusted if it is plugged into an external-facing port such as Thunderbolt and could be spoofing an existing device to exploit weaknesses in the IOMMU configuration. By calling pci_ats_supported() we keep DTE[I]=0 for untrusted devices and abort transactions with Pretranslated Addresses. Signed-off-by: Jean-Philippe Brucker Reviewed-by: Joerg Roedel Link: https://lore.kernel.org/r/20200520152201.3309416-3-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1880811cec33..254f028cd18e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -291,16 +291,15 @@ static struct iommu_group *acpihid_device_group(struct device *dev) static bool pci_iommuv2_capable(struct pci_dev *pdev) { static const int caps[] = { - PCI_EXT_CAP_ID_ATS, PCI_EXT_CAP_ID_PRI, PCI_EXT_CAP_ID_PASID, }; int i, pos; - if (pci_ats_disabled()) + if (!pci_ats_supported(pdev)) return false; - for (i = 0; i < 3; ++i) { + for (i = 0; i < 2; ++i) { pos = pci_find_ext_capability(pdev, caps[i]); if (pos == 0) return false; @@ -3028,11 +3027,8 @@ int amd_iommu_device_info(struct pci_dev *pdev, memset(info, 0, sizeof(*info)); - if (!pci_ats_disabled()) { - pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); - if (pos) - info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; - } + if (pci_ats_supported(pdev)) + info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (pos) From 0b2527a654190a987d45e2cc9e5c6946eea11fc5 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:02 +0200 Subject: [PATCH 236/574] iommu/arm-smmu-v3: Use pci_ats_supported() The new pci_ats_supported() function checks if a device supports ATS and is allowed to use it. Signed-off-by: Jean-Philippe Brucker Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200520152201.3309416-4-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/iommu/arm-smmu-v3.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 42e1ee7e5197..cb086924419f 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -2652,26 +2652,20 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) } } -#ifdef CONFIG_PCI_ATS static bool arm_smmu_ats_supported(struct arm_smmu_master *master) { - struct pci_dev *pdev; + struct device *dev = master->dev; struct arm_smmu_device *smmu = master->smmu; - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); - if (!(smmu->features & ARM_SMMU_FEAT_ATS) || !dev_is_pci(master->dev) || - !(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS) || pci_ats_disabled()) + if (!(smmu->features & ARM_SMMU_FEAT_ATS)) return false; - pdev = to_pci_dev(master->dev); - return !pdev->untrusted && pdev->ats_cap; + if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_ATS)) + return false; + + return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); } -#else -static bool arm_smmu_ats_supported(struct arm_smmu_master *master) -{ - return false; -} -#endif static void arm_smmu_enable_ats(struct arm_smmu_master *master) { From da656a042568ffbc30881c43a832277f275eea4a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 20 May 2020 17:22:03 +0200 Subject: [PATCH 237/574] iommu/vt-d: Use pci_ats_supported() The pci_ats_supported() helper checks if a device supports ATS and is allowed to use it. By checking the ATS capability it also integrates the pci_ats_disabled() check from pci_ats_init(). Simplify the vt-d checks. Signed-off-by: Jean-Philippe Brucker Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200520152201.3309416-5-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 34e08fa2ce3a..5dea3042820b 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1454,8 +1454,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) info->pri_enabled = 1; #endif - if (!pdev->untrusted && info->ats_supported && - pci_ats_page_aligned(pdev) && + if (info->ats_supported && pci_ats_page_aligned(pdev) && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { info->ats_enabled = 1; domain_update_iotlb(info->domain); @@ -2611,10 +2610,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (dev && dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(info->dev); - if (!pdev->untrusted && - !pci_ats_disabled() && - ecap_dev_iotlb_support(iommu->ecap) && - pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) && + if (ecap_dev_iotlb_support(iommu->ecap) && + pci_ats_supported(pdev) && dmar_find_matched_atsr_unit(pdev)) info->ats_supported = 1; From 9f510d1e4299169e01efeac2275d0792850db956 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Mon, 25 May 2020 23:49:57 +0200 Subject: [PATCH 238/574] iommu/hyper-v: Constify hyperv_ir_domain_ops The struct hyperv_ir_domain_ops is not modified and can be made const to allow the compiler to put it in read-only memory. Before: text data bss dec hex filename 2916 1180 1120 5216 1460 drivers/iommu/hyperv-iommu.o After: text data bss dec hex filename 3044 1052 1120 5216 1460 drivers/iommu/hyperv-iommu.o Signed-off-by: Rikard Falkeborn Acked-by: Wei Liu Link: https://lore.kernel.org/r/20200525214958.30015-2-rikard.falkeborn@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/hyperv-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index a386b83e0e34..3c0c67a99c7b 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -131,7 +131,7 @@ static int hyperv_irq_remapping_activate(struct irq_domain *domain, return 0; } -static struct irq_domain_ops hyperv_ir_domain_ops = { +static const struct irq_domain_ops hyperv_ir_domain_ops = { .alloc = hyperv_irq_remapping_alloc, .free = hyperv_irq_remapping_free, .activate = hyperv_irq_remapping_activate, From 79074f61c022e822451f0e7ac0c8c6e213f75f18 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Mon, 25 May 2020 23:49:58 +0200 Subject: [PATCH 239/574] iommu/sun50i: Constify sun50i_iommu_ops The struct sun50i_iommu_ops is not modified and can be made const to allow the compiler to put it in read-only memory. Before: text data bss dec hex filename 14358 2501 64 16923 421b drivers/iommu/sun50i-iommu.o After: text data bss dec hex filename 14726 2117 64 16907 420b drivers/iommu/sun50i-iommu.o Signed-off-by: Rikard Falkeborn Acked-by: Maxime Ripard Link: https://lore.kernel.org/r/20200525214958.30015-3-rikard.falkeborn@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/sun50i-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 1fa09ddcebd4..fce605e96aa2 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -771,7 +771,7 @@ static int sun50i_iommu_of_xlate(struct device *dev, return iommu_fwspec_add_ids(dev, &id, 1); } -static struct iommu_ops sun50i_iommu_ops = { +static const struct iommu_ops sun50i_iommu_ops = { .pgsize_bitmap = SZ_4K, .attach_dev = sun50i_iommu_attach_device, .detach_dev = sun50i_iommu_detach_device, From 11886c199d8d62e2469cb404235a4ee7266f01c7 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 17 Mar 2020 13:37:49 +0100 Subject: [PATCH 240/574] s390: add machine check SIGP This will be used with the upcoming entry.S changes to signal that there's a machine check pending that cannot be handled in the Machine check handler itself. Reviewed-by: Christian Borntraeger Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/smp.h | 1 + arch/s390/kernel/smp.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 231a51e870fe..7326f110d48c 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -58,5 +58,6 @@ extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); extern void __cpu_die(unsigned int cpu); extern int __cpu_disable(void); +extern void schedule_mcck_handler(void); #endif /* __ASM_SMP_H */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 7eaabbab2213..fc1041257c60 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -61,6 +61,7 @@ enum { ec_schedule = 0, ec_call_function_single, ec_stop_cpu, + ec_mcck_pending, }; enum { @@ -403,6 +404,11 @@ int smp_find_processor_id(u16 address) return -1; } +void schedule_mcck_handler(void) +{ + pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending); +} + bool arch_vcpu_is_preempted(int cpu) { if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) @@ -497,6 +503,8 @@ static void smp_handle_ext_call(void) scheduler_ipi(); if (test_bit(ec_call_function_single, &bits)) generic_smp_call_function_single_interrupt(); + if (test_bit(ec_mcck_pending, &bits)) + s390_handle_mcck(); } static void do_ext_call_interrupt(struct ext_code ext_code, From 0b0ed657fe003fdf4df3766b898e8869950aa1ce Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 20 Feb 2020 12:09:36 +0100 Subject: [PATCH 241/574] s390: remove critical section cleanup from entry.S The current code is rather complex and caused a lot of subtle and hard to debug bugs in the past. Simplify the code by calling the system_call handler with interrupts disabled, save machine state, and re-enable them later. This requires significant changes to the machine check handling code as well. When the machine check interrupt arrived while being in kernel mode the new code will signal pending machine checks with a SIGP external call. When userspace was interrupted, the handler will switch to the kernel stack and directly execute s390_handle_mcck(). Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/nmi.h | 2 +- arch/s390/include/asm/processor.h | 18 +- arch/s390/kernel/entry.S | 464 +++++++----------------------- arch/s390/kernel/idle.c | 14 +- arch/s390/kernel/nmi.c | 23 +- arch/s390/kernel/setup.c | 3 +- arch/s390/kvm/kvm-s390.c | 3 - arch/s390/kvm/vsie.c | 2 - arch/s390/lib/delay.c | 4 +- 9 files changed, 143 insertions(+), 390 deletions(-) diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index b160da8fa14b..5afee80cff58 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -99,7 +99,7 @@ int nmi_alloc_per_cpu(struct lowcore *lc); void nmi_free_per_cpu(struct lowcore *lc); void s390_handle_mcck(void); -void s390_do_machine_check(struct pt_regs *regs); +int s390_do_machine_check(struct pt_regs *regs); #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_NMI_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 555d148ccf32..962da04234af 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -14,17 +14,15 @@ #include -#define CIF_MCCK_PENDING 0 /* machine check handling is pending */ -#define CIF_ASCE_PRIMARY 1 /* primary asce needs fixup / uaccess */ -#define CIF_ASCE_SECONDARY 2 /* secondary asce needs fixup / uaccess */ -#define CIF_NOHZ_DELAY 3 /* delay HZ disable for a tick */ -#define CIF_FPU 4 /* restore FPU registers */ -#define CIF_IGNORE_IRQ 5 /* ignore interrupt (for udelay) */ -#define CIF_ENABLED_WAIT 6 /* in enabled wait state */ -#define CIF_MCCK_GUEST 7 /* machine check happening in guest */ -#define CIF_DEDICATED_CPU 8 /* this CPU is dedicated */ +#define CIF_ASCE_PRIMARY 0 /* primary asce needs fixup / uaccess */ +#define CIF_ASCE_SECONDARY 1 /* secondary asce needs fixup / uaccess */ +#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */ +#define CIF_FPU 3 /* restore FPU registers */ +#define CIF_IGNORE_IRQ 4 /* ignore interrupt (for udelay) */ +#define CIF_ENABLED_WAIT 5 /* in enabled wait state */ +#define CIF_MCCK_GUEST 6 /* machine check happening in guest */ +#define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */ -#define _CIF_MCCK_PENDING BIT(CIF_MCCK_PENDING) #define _CIF_ASCE_PRIMARY BIT(CIF_ASCE_PRIMARY) #define _CIF_ASCE_SECONDARY BIT(CIF_ASCE_SECONDARY) #define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 3ae64914bd14..50ff6dd0f995 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -55,14 +55,11 @@ _TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING) _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ _TIF_SYSCALL_TRACEPOINT) -_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \ - _CIF_ASCE_SECONDARY | _CIF_FPU) +_CIF_WORK = (_CIF_ASCE_PRIMARY | _CIF_ASCE_SECONDARY | _CIF_FPU) _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) _LPP_OFFSET = __LC_LPP -#define BASED(name) name-cleanup_critical(%r13) - .macro TRACE_IRQS_ON #ifdef CONFIG_TRACE_IRQFLAGS basr %r2,%r0 @@ -116,17 +113,39 @@ _LPP_OFFSET = __LC_LPP .macro SWITCH_ASYNC savearea,timer tmhh %r8,0x0001 # interrupting from user ? jnz 2f +#if IS_ENABLED(CONFIG_KVM) lgr %r14,%r9 - cghi %r14,__LC_RETURN_LPSWE - je 0f - slg %r14,BASED(.Lcritical_start) - clg %r14,BASED(.Lcritical_length) - jhe 1f -0: + larl %r13,.Lsie_gmap + slgr %r14,%r13 + lghi %r13,.Lsie_done - .Lsie_gmap + clgr %r14,%r13 + jhe 0f lghi %r11,\savearea # inside critical section, do cleanup - brasl %r14,cleanup_critical - tmhh %r8,0x0001 # retest problem state after cleanup - jnz 2f + brasl %r14,.Lcleanup_sie +#endif +0: larl %r13,.Lpsw_idle_exit + cgr %r13,%r9 + jne 1f + + mvc __CLOCK_IDLE_EXIT(8,%r2), __LC_INT_CLOCK + mvc __TIMER_IDLE_EXIT(8,%r2), __LC_ASYNC_ENTER_TIMER + # account system time going idle + ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT + + lg %r13,__LC_STEAL_TIMER + alg %r13,__CLOCK_IDLE_ENTER(%r2) + slg %r13,__LC_LAST_UPDATE_CLOCK + stg %r13,__LC_STEAL_TIMER + + mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2) + + lg %r13,__LC_SYSTEM_TIMER + alg %r13,__LC_LAST_UPDATE_TIMER + slg %r13,__TIMER_IDLE_ENTER(%r2) + stg %r13,__LC_SYSTEM_TIMER + mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2) + + nihh %r8,0xfcfd # clear wait state and irq bits 1: lg %r14,__LC_ASYNC_STACK # are we already on the target stack? slgr %r14,%r15 srag %r14,%r14,STACK_SHIFT @@ -152,12 +171,30 @@ _LPP_OFFSET = __LC_LPP mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer .endm - .macro REENABLE_IRQS + .macro RESTORE_SM_CLEAR_PER stg %r8,__LC_RETURN_PSW ni __LC_RETURN_PSW,0xbf ssm __LC_RETURN_PSW .endm + .macro ENABLE_INTS + stosm __SF_EMPTY(%r15),3 + .endm + + .macro ENABLE_INTS_TRACE + TRACE_IRQS_ON + ENABLE_INTS + .endm + + .macro DISABLE_INTS + stnsm __SF_EMPTY(%r15),0xfc + .endm + + .macro DISABLE_INTS_TRACE + DISABLE_INTS + TRACE_IRQS_OFF + .endm + .macro STCK savearea #ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES .insn s,0xb27c0000,\savearea # store clock fast @@ -254,8 +291,6 @@ ENTRY(__switch_to) BR_EX %r14 ENDPROC(__switch_to) -.L__critical_start: - #if IS_ENABLED(CONFIG_KVM) /* * sie64a calling convention: @@ -288,7 +323,6 @@ ENTRY(sie64a) BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) .Lsie_entry: sie 0(%r14) -.Lsie_exit: BPOFF BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) .Lsie_skip: @@ -341,7 +375,6 @@ EXPORT_SYMBOL(sie_exit) ENTRY(system_call) stpt __LC_SYNC_ENTER_TIMER -.Lsysc_stmg: stmg %r8,%r15,__LC_SAVE_AREA_SYNC BPOFF lg %r12,__LC_CURRENT @@ -350,7 +383,6 @@ ENTRY(system_call) .Lsysc_per: lg %r15,__LC_KERNEL_STACK la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs -.Lsysc_vtime: UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP stmg %r0,%r7,__PT_R0(%r11) @@ -358,6 +390,7 @@ ENTRY(system_call) mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC stg %r14,__PT_FLAGS(%r11) + ENABLE_INTS .Lsysc_do_svc: # clear user controlled register to prevent speculative use xgr %r0,%r0 @@ -393,26 +426,26 @@ ENTRY(system_call) jnz .Lsysc_work TSTMSK __TI_flags(%r12),_TIF_WORK jnz .Lsysc_work # check for work - TSTMSK __LC_CPU_FLAGS,_CIF_WORK + TSTMSK __LC_CPU_FLAGS,(_CIF_WORK-_CIF_FPU) jnz .Lsysc_work BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP .Lsysc_restore: + DISABLE_INTS + TSTMSK __LC_CPU_FLAGS, _CIF_FPU + jz .Lsysc_skip_fpu + brasl %r14,load_fpu_regs +.Lsysc_skip_fpu: lg %r14,__LC_VDSO_PER_CPU - lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) -.Lsysc_exit_timer: stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER - lmg %r11,%r15,__PT_R11(%r11) - b __LC_RETURN_LPSWE(%r0) -.Lsysc_done: + lmg %r0,%r15,__PT_R0(%r11) + b __LC_RETURN_LPSWE # # One of the work bits is on. Find out which one. # .Lsysc_work: - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jo .Lsysc_mcck_pending TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED jo .Lsysc_reschedule TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART @@ -436,11 +469,9 @@ ENTRY(system_call) jo .Lsysc_sigpending TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME jo .Lsysc_notify_resume - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lsysc_vxrs TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) jnz .Lsysc_asce - j .Lsysc_return # beware of critical section cleanup + j .Lsysc_return # # _TIF_NEED_RESCHED is set, call schedule @@ -449,13 +480,6 @@ ENTRY(system_call) larl %r14,.Lsysc_return jg schedule -# -# _CIF_MCCK_PENDING is set, call handler -# -.Lsysc_mcck_pending: - larl %r14,.Lsysc_return - jg s390_handle_mcck # TIF bit will be cleared by handler - # # _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce # @@ -475,12 +499,6 @@ ENTRY(system_call) larl %r14,.Lsysc_return jg set_fs_fixup -# -# CIF_FPU is set, restore floating-point controls and floating-point registers. -# -.Lsysc_vxrs: - larl %r14,.Lsysc_return - jg load_fpu_regs # # _TIF_SIGPENDING is set, call do_signal @@ -564,7 +582,6 @@ ENTRY(system_call) jnh .Lsysc_tracenogo sllg %r8,%r2,3 lg %r9,0(%r8,%r10) -.Lsysc_tracego: lmg %r3,%r7,__PT_R3(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) lg %r2,__PT_ORIG_GPR2(%r11) @@ -585,8 +602,6 @@ ENTRY(ret_from_fork) la %r11,STACK_FRAME_OVERHEAD(%r15) lg %r12,__LC_CURRENT brasl %r14,schedule_tail - TRACE_IRQS_ON - ssm __LC_SVC_NEW_PSW # reenable interrupts tm __PT_PSW+1(%r11),0x01 # forking a kernel thread ? jne .Lsysc_tracenogo # it's a kernel thread @@ -620,15 +635,16 @@ ENTRY(pgm_check_handler) lghi %r10,1 0: lg %r12,__LC_CURRENT lghi %r11,0 - larl %r13,cleanup_critical lmg %r8,%r9,__LC_PGM_OLD_PSW tmhh %r8,0x0001 # test problem state bit jnz 3f # -> fault in user space #if IS_ENABLED(CONFIG_KVM) # cleanup critical section for program checks in sie64a lgr %r14,%r9 - slg %r14,BASED(.Lsie_critical_start) - clg %r14,BASED(.Lsie_critical_length) + larl %r13,.Lsie_gmap + slgr %r14,%r13 + lghi %r13,.Lsie_done - .Lsie_gmap + clgr %r14,%r13 jhe 1f lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE @@ -680,7 +696,7 @@ ENTRY(pgm_check_handler) mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -6: REENABLE_IRQS +6: RESTORE_SM_CLEAR_PER xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) @@ -702,7 +718,7 @@ ENTRY(pgm_check_handler) # PER event in supervisor state, must be kprobes # .Lpgm_kprobe: - REENABLE_IRQS + RESTORE_SM_CLEAR_PER xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,do_per_trap @@ -713,11 +729,10 @@ ENTRY(pgm_check_handler) # .Lpgm_svcper: mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW - lghi %r13,__TASK_thread larl %r14,.Lsysc_per stg %r14,__LC_RETURN_PSW+8 lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP - lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs + lpswe __LC_RETURN_PSW # branch to .Lsysc_per ENDPROC(pgm_check_handler) /* @@ -729,7 +744,6 @@ ENTRY(io_int_handler) BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r12,__LC_CURRENT - larl %r13,cleanup_critical lmg %r8,%r9,__LC_IO_OLD_PSW SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER stmg %r0,%r7,__PT_R0(%r11) @@ -749,7 +763,12 @@ ENTRY(io_int_handler) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ jo .Lio_restore +#if IS_ENABLED(CONFIG_TRACE_IRQFLAGS) + tmhh %r8,0x300 + jz 1f TRACE_IRQS_OFF +1: +#endif xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) .Lio_loop: lgr %r2,%r11 # pass pointer to pt_regs @@ -767,25 +786,27 @@ ENTRY(io_int_handler) j .Lio_loop .Lio_return: LOCKDEP_SYS_EXIT - TRACE_IRQS_ON -.Lio_tif: TSTMSK __TI_flags(%r12),_TIF_WORK jnz .Lio_work # there is work to do (signals etc.) TSTMSK __LC_CPU_FLAGS,_CIF_WORK jnz .Lio_work .Lio_restore: +#if IS_ENABLED(CONFIG_TRACE_IRQFLAGS) + tm __PT_PSW(%r11),3 + jno 0f + TRACE_IRQS_ON +0: +#endif lg %r14,__LC_VDSO_PER_CPU - lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) tm __PT_PSW+1(%r11),0x01 # returning to user ? jno .Lio_exit_kernel BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP -.Lio_exit_timer: stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER .Lio_exit_kernel: - lmg %r11,%r15,__PT_R11(%r11) - b __LC_RETURN_LPSWE(%r0) + lmg %r0,%r15,__PT_R0(%r11) + b __LC_RETURN_LPSWE .Lio_done: # @@ -813,9 +834,6 @@ ENTRY(io_int_handler) xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) la %r11,STACK_FRAME_OVERHEAD(%r1) lgr %r15,%r1 - # TRACE_IRQS_ON already done at .Lio_return, call - # TRACE_IRQS_OFF to keep things symmetrical - TRACE_IRQS_OFF brasl %r14,preempt_schedule_irq j .Lio_return #else @@ -835,9 +853,6 @@ ENTRY(io_int_handler) # # One of the work bits is on. Find out which one. # -.Lio_work_tif: - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jo .Lio_mcck_pending TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED jo .Lio_reschedule #ifdef CONFIG_LIVEPATCH @@ -854,15 +869,6 @@ ENTRY(io_int_handler) jo .Lio_vxrs TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) jnz .Lio_asce - j .Lio_return # beware of critical section cleanup - -# -# _CIF_MCCK_PENDING is set, call handler -# -.Lio_mcck_pending: - # TRACE_IRQS_ON already done at .Lio_return - brasl %r14,s390_handle_mcck # TIF bit will be cleared by handler - TRACE_IRQS_OFF j .Lio_return # @@ -895,23 +901,19 @@ ENTRY(io_int_handler) # _TIF_GUARDED_STORAGE is set, call guarded_storage_load # .Lio_guarded_storage: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts + ENABLE_INTS_TRACE lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,gs_load_bc_cb - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF + DISABLE_INTS_TRACE j .Lio_return # # _TIF_NEED_RESCHED is set, call schedule # .Lio_reschedule: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts + ENABLE_INTS_TRACE brasl %r14,schedule # call scheduler - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF + DISABLE_INTS_TRACE j .Lio_return # @@ -928,24 +930,20 @@ ENTRY(io_int_handler) # _TIF_SIGPENDING or is set, call do_signal # .Lio_sigpending: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts + ENABLE_INTS_TRACE lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,do_signal - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF + DISABLE_INTS_TRACE j .Lio_return # # _TIF_NOTIFY_RESUME or is set, call do_notify_resume # .Lio_notify_resume: - # TRACE_IRQS_ON already done at .Lio_return - ssm __LC_SVC_NEW_PSW # reenable interrupts + ENABLE_INTS_TRACE lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,do_notify_resume - ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts - TRACE_IRQS_OFF + DISABLE_INTS_TRACE j .Lio_return ENDPROC(io_int_handler) @@ -958,7 +956,6 @@ ENTRY(ext_int_handler) BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r12,__LC_CURRENT - larl %r13,cleanup_critical lmg %r8,%r9,__LC_EXT_OLD_PSW SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER stmg %r0,%r7,__PT_R0(%r11) @@ -981,7 +978,12 @@ ENTRY(ext_int_handler) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ jo .Lio_restore +#if IS_ENABLED(CONFIG_TRACE_IRQFLAGS) + tmhh %r8,0x300 + jz 1f TRACE_IRQS_OFF +1: +#endif xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs lghi %r3,EXT_INTERRUPT @@ -990,11 +992,11 @@ ENTRY(ext_int_handler) ENDPROC(ext_int_handler) /* - * Load idle PSW. The second "half" of this function is in .Lcleanup_idle. + * Load idle PSW. */ ENTRY(psw_idle) stg %r3,__SF_EMPTY(%r15) - larl %r1,.Lpsw_idle_lpsw+4 + larl %r1,.Lpsw_idle_exit stg %r1,__SF_EMPTY+8(%r15) larl %r1,smp_cpu_mtid llgf %r1,0(%r1) @@ -1006,10 +1008,9 @@ ENTRY(psw_idle) BPON STCK __CLOCK_IDLE_ENTER(%r2) stpt __TIMER_IDLE_ENTER(%r2) -.Lpsw_idle_lpsw: lpswe __SF_EMPTY(%r15) +.Lpsw_idle_exit: BR_EX %r14 -.Lpsw_idle_end: ENDPROC(psw_idle) /* @@ -1020,6 +1021,7 @@ ENDPROC(psw_idle) * of the register contents at return from io or a system call. */ ENTRY(save_fpu_regs) + stnsm __SF_EMPTY(%r15),0xfc lg %r2,__LC_CURRENT aghi %r2,__TASK_thread TSTMSK __LC_CPU_FLAGS,_CIF_FPU @@ -1051,6 +1053,7 @@ ENTRY(save_fpu_regs) .Lsave_fpu_regs_done: oi __LC_CPU_FLAGS+7,_CIF_FPU .Lsave_fpu_regs_exit: + ssm __SF_EMPTY(%r15) BR_EX %r14 .Lsave_fpu_regs_end: ENDPROC(save_fpu_regs) @@ -1102,8 +1105,6 @@ load_fpu_regs: .Lload_fpu_regs_end: ENDPROC(load_fpu_regs) -.L__critical_end: - /* * Machine check handler routines */ @@ -1116,7 +1117,6 @@ ENTRY(mcck_int_handler) lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs lg %r12,__LC_CURRENT - larl %r13,cleanup_critical lmg %r8,%r9,__LC_MCK_OLD_PSW TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE jo .Lmcck_panic # yes -> rest of mcck code invalid @@ -1202,15 +1202,13 @@ ENTRY(mcck_int_handler) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,s390_do_machine_check - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lmcck_return + cghi %r2,0 + je .Lmcck_return lg %r1,__LC_KERNEL_STACK # switch to kernel stack mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) la %r11,STACK_FRAME_OVERHEAD(%r1) lgr %r15,%r1 - TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING - jno .Lmcck_return TRACE_IRQS_OFF brasl %r14,s390_handle_mcck TRACE_IRQS_ON @@ -1280,265 +1278,23 @@ ENTRY(stack_overflow) ENDPROC(stack_overflow) #endif -ENTRY(cleanup_critical) - cghi %r9,__LC_RETURN_LPSWE - je .Lcleanup_lpswe #if IS_ENABLED(CONFIG_KVM) - clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap - jl 0f - clg %r9,BASED(.Lcleanup_table_sie+8)# .Lsie_done - jl .Lcleanup_sie -#endif - clg %r9,BASED(.Lcleanup_table) # system_call - jl 0f - clg %r9,BASED(.Lcleanup_table+8) # .Lsysc_do_svc - jl .Lcleanup_system_call - clg %r9,BASED(.Lcleanup_table+16) # .Lsysc_tif - jl 0f - clg %r9,BASED(.Lcleanup_table+24) # .Lsysc_restore - jl .Lcleanup_sysc_tif - clg %r9,BASED(.Lcleanup_table+32) # .Lsysc_done - jl .Lcleanup_sysc_restore - clg %r9,BASED(.Lcleanup_table+40) # .Lio_tif - jl 0f - clg %r9,BASED(.Lcleanup_table+48) # .Lio_restore - jl .Lcleanup_io_tif - clg %r9,BASED(.Lcleanup_table+56) # .Lio_done - jl .Lcleanup_io_restore - clg %r9,BASED(.Lcleanup_table+64) # psw_idle - jl 0f - clg %r9,BASED(.Lcleanup_table+72) # .Lpsw_idle_end - jl .Lcleanup_idle - clg %r9,BASED(.Lcleanup_table+80) # save_fpu_regs - jl 0f - clg %r9,BASED(.Lcleanup_table+88) # .Lsave_fpu_regs_end - jl .Lcleanup_save_fpu_regs - clg %r9,BASED(.Lcleanup_table+96) # load_fpu_regs - jl 0f - clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end - jl .Lcleanup_load_fpu_regs -0: BR_EX %r14,%r11 -ENDPROC(cleanup_critical) - - .align 8 -.Lcleanup_table: - .quad system_call - .quad .Lsysc_do_svc - .quad .Lsysc_tif - .quad .Lsysc_restore - .quad .Lsysc_done - .quad .Lio_tif - .quad .Lio_restore - .quad .Lio_done - .quad psw_idle - .quad .Lpsw_idle_end - .quad save_fpu_regs - .quad .Lsave_fpu_regs_end - .quad load_fpu_regs - .quad .Lload_fpu_regs_end - -#if IS_ENABLED(CONFIG_KVM) -.Lcleanup_table_sie: - .quad .Lsie_gmap - .quad .Lsie_done - .Lcleanup_sie: - cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt? - je 1f - slg %r9,BASED(.Lsie_crit_mcck_start) - clg %r9,BASED(.Lsie_crit_mcck_length) - jh 1f - oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST -1: BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt? + je 1f + larl %r13,.Lsie_entry + slgr %r9,%r13 + larl %r13,.Lsie_skip + clgr %r9,%r13 + jh 1f + oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST +1: BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit BR_EX %r14,%r11 -#endif -.Lcleanup_system_call: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_system_call_insn) - jh 0f - mvc __LC_SYNC_ENTER_TIMER(8),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __LC_SYNC_ENTER_TIMER(8),__LC_MCCK_ENTER_TIMER -0: # check if stmg has been executed - clg %r9,BASED(.Lcleanup_system_call_insn+8) - jh 0f - mvc __LC_SAVE_AREA_SYNC(64),0(%r11) -0: # check if base register setup + TIF bit load has been done - clg %r9,BASED(.Lcleanup_system_call_insn+16) - jhe 0f - # set up saved register r12 task struct pointer - stg %r12,32(%r11) - # set up saved register r13 __TASK_thread offset - mvc 40(8,%r11),BASED(.Lcleanup_system_call_const) -0: # check if the user time update has been done - clg %r9,BASED(.Lcleanup_system_call_insn+24) - jh 0f - lg %r15,__LC_EXIT_TIMER - slg %r15,__LC_SYNC_ENTER_TIMER - alg %r15,__LC_USER_TIMER - stg %r15,__LC_USER_TIMER -0: # check if the system time update has been done - clg %r9,BASED(.Lcleanup_system_call_insn+32) - jh 0f - lg %r15,__LC_LAST_UPDATE_TIMER - slg %r15,__LC_EXIT_TIMER - alg %r15,__LC_SYSTEM_TIMER - stg %r15,__LC_SYSTEM_TIMER -0: # update accounting time stamp - mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - # set up saved register r11 - lg %r15,__LC_KERNEL_STACK - la %r9,STACK_FRAME_OVERHEAD(%r15) - stg %r9,24(%r11) # r11 pt_regs pointer - # fill pt_regs - mvc __PT_R8(64,%r9),__LC_SAVE_AREA_SYNC - stmg %r0,%r7,__PT_R0(%r9) - mvc __PT_PSW(16,%r9),__LC_SVC_OLD_PSW - mvc __PT_INT_CODE(4,%r9),__LC_SVC_ILC - xc __PT_FLAGS(8,%r9),__PT_FLAGS(%r9) - mvi __PT_FLAGS+7(%r9),_PIF_SYSCALL - # setup saved register r15 - stg %r15,56(%r11) # r15 stack pointer - # set new psw address and exit - larl %r9,.Lsysc_do_svc - BR_EX %r14,%r11 -.Lcleanup_system_call_insn: - .quad system_call - .quad .Lsysc_stmg - .quad .Lsysc_per - .quad .Lsysc_vtime+36 - .quad .Lsysc_vtime+42 -.Lcleanup_system_call_const: - .quad __TASK_thread - -.Lcleanup_sysc_tif: - larl %r9,.Lsysc_tif - BR_EX %r14,%r11 - -.Lcleanup_sysc_restore: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_sysc_restore_insn) - jh 0f - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER -0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8) - je 1f - lg %r9,24(%r11) # get saved pointer to pt_regs - mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) - mvc 0(64,%r11),__PT_R8(%r9) - lmg %r0,%r7,__PT_R0(%r9) -.Lcleanup_lpswe: -1: lmg %r8,%r9,__LC_RETURN_PSW - BR_EX %r14,%r11 -.Lcleanup_sysc_restore_insn: - .quad .Lsysc_exit_timer - .quad .Lsysc_done - 4 - -.Lcleanup_io_tif: - larl %r9,.Lio_tif - BR_EX %r14,%r11 - -.Lcleanup_io_restore: - # check if stpt has been executed - clg %r9,BASED(.Lcleanup_io_restore_insn) - jh 0f - mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER -0: clg %r9,BASED(.Lcleanup_io_restore_insn+8) - je 1f - lg %r9,24(%r11) # get saved r11 pointer to pt_regs - mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) - mvc 0(64,%r11),__PT_R8(%r9) - lmg %r0,%r7,__PT_R0(%r9) -1: lmg %r8,%r9,__LC_RETURN_PSW - BR_EX %r14,%r11 -.Lcleanup_io_restore_insn: - .quad .Lio_exit_timer - .quad .Lio_done - 4 - -.Lcleanup_idle: - ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT - # copy interrupt clock & cpu timer - mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_INT_CLOCK - mvc __TIMER_IDLE_EXIT(8,%r2),__LC_ASYNC_ENTER_TIMER - cghi %r11,__LC_SAVE_AREA_ASYNC - je 0f - mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_MCCK_CLOCK - mvc __TIMER_IDLE_EXIT(8,%r2),__LC_MCCK_ENTER_TIMER -0: # check if stck & stpt have been executed - clg %r9,BASED(.Lcleanup_idle_insn) - jhe 1f - mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2) - mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2) -1: # calculate idle cycles - clg %r9,BASED(.Lcleanup_idle_insn) - jl 3f - larl %r1,smp_cpu_mtid - llgf %r1,0(%r1) - ltgr %r1,%r1 - jz 3f - .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+80(%r15) - larl %r3,mt_cycles - ag %r3,__LC_PERCPU_OFFSET - la %r4,__SF_EMPTY+16(%r15) -2: lg %r0,0(%r3) - slg %r0,0(%r4) - alg %r0,64(%r4) - stg %r0,0(%r3) - la %r3,8(%r3) - la %r4,8(%r4) - brct %r1,2b -3: # account system time going idle - lg %r9,__LC_STEAL_TIMER - alg %r9,__CLOCK_IDLE_ENTER(%r2) - slg %r9,__LC_LAST_UPDATE_CLOCK - stg %r9,__LC_STEAL_TIMER - mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2) - lg %r9,__LC_SYSTEM_TIMER - alg %r9,__LC_LAST_UPDATE_TIMER - slg %r9,__TIMER_IDLE_ENTER(%r2) - stg %r9,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2) - # prepare return psw - nihh %r8,0xfcfd # clear irq & wait state bits - lg %r9,48(%r11) # return from psw_idle - BR_EX %r14,%r11 -.Lcleanup_idle_insn: - .quad .Lpsw_idle_lpsw - -.Lcleanup_save_fpu_regs: - larl %r9,save_fpu_regs - BR_EX %r14,%r11 - -.Lcleanup_load_fpu_regs: - larl %r9,load_fpu_regs - BR_EX %r14,%r11 - -/* - * Integer constants - */ - .align 8 -.Lcritical_start: - .quad .L__critical_start -.Lcritical_length: - .quad .L__critical_end - .L__critical_start -#if IS_ENABLED(CONFIG_KVM) -.Lsie_critical_start: - .quad .Lsie_gmap -.Lsie_critical_length: - .quad .Lsie_done - .Lsie_gmap -.Lsie_crit_mcck_start: - .quad .Lsie_entry -.Lsie_crit_mcck_length: - .quad .Lsie_skip - .Lsie_entry #endif .section .rodata, "a" #define SYSCALL(esame,emu) .quad __s390x_ ## esame diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 8f8456816d83..0d7fbdfe995a 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -24,19 +24,19 @@ void enabled_wait(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); unsigned long long idle_time; - unsigned long psw_mask; + unsigned long psw_mask, flags; - trace_hardirqs_on(); /* Wait for external, I/O or machine check interrupt. */ psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); + local_irq_save(flags); /* Call the assembler magic in entry.S */ psw_idle(idle, psw_mask); + local_irq_restore(flags); - trace_hardirqs_off(); /* Account time spent with enabled wait psw loaded as idle time. */ write_seqcount_begin(&idle->seqcount); @@ -118,22 +118,16 @@ u64 arch_cpu_idle_time(int cpu) void arch_cpu_idle_enter(void) { - local_mcck_disable(); } void arch_cpu_idle(void) { - if (!test_cpu_flag(CIF_MCCK_PENDING)) - /* Halt the cpu and keep track of cpu time accounting. */ - enabled_wait(); + enabled_wait(); local_irq_enable(); } void arch_cpu_idle_exit(void) { - local_mcck_enable(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); } void arch_cpu_idle_dead(void) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 0a487fae763e..86c8d5370e7f 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -148,7 +148,6 @@ void s390_handle_mcck(void) local_mcck_disable(); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); - clear_cpu_flag(CIF_MCCK_PENDING); local_mcck_enable(); local_irq_restore(flags); @@ -333,7 +332,7 @@ NOKPROBE_SYMBOL(s390_backup_mcck_info); /* * machine check handler. */ -void notrace s390_do_machine_check(struct pt_regs *regs) +int notrace s390_do_machine_check(struct pt_regs *regs) { static int ipd_count; static DEFINE_SPINLOCK(ipd_lock); @@ -342,6 +341,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) unsigned long long tmp; union mci mci; unsigned long mcck_dam_code; + int mcck_pending = 0; nmi_enter(); inc_irq_stat(NMI_NMI); @@ -400,7 +400,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) */ mcck->kill_task = 1; mcck->mcck_code = mci.val; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } /* @@ -420,8 +420,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) mcck->stp_queue |= stp_sync_check(); if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) mcck->stp_queue |= stp_island_check(); - if (mcck->stp_queue) - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } /* @@ -442,12 +441,12 @@ void notrace s390_do_machine_check(struct pt_regs *regs) if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } if (mci.w) { /* Warning pending */ mcck->warning = 1; - set_cpu_flag(CIF_MCCK_PENDING); + mcck_pending = 1; } /* @@ -462,7 +461,17 @@ void notrace s390_do_machine_check(struct pt_regs *regs) *((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR; } clear_cpu_flag(CIF_MCCK_GUEST); + + if (user_mode(regs) && mcck_pending) { + nmi_exit(); + return 1; + } + + if (mcck_pending) + schedule_mcck_handler(); + nmi_exit(); + return 0; } NOKPROBE_SYMBOL(s390_do_machine_check); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 12f07565ef64..f432b57438f9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -384,8 +384,7 @@ static void __init setup_lowcore_dat_off(void) lc->restart_psw.addr = (unsigned long) restart_int_handler; lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = PSW_KERNEL_BITS | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->svc_new_psw.addr = (unsigned long) system_call; lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 19a81024fe16..5722daf0b603 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3995,9 +3995,6 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) if (need_resched()) schedule(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); - if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); if (rc) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4f6c22d72072..4fde24a1856e 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1002,8 +1002,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (need_resched()) schedule(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index d4aa10795605..daca7bad66de 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(__delay); static void __udelay_disabled(unsigned long long usecs) { - unsigned long cr0, cr0_new, psw_mask; + unsigned long cr0, cr0_new, psw_mask, flags; struct s390_idle_data idle; u64 end; @@ -45,7 +45,9 @@ static void __udelay_disabled(unsigned long long usecs) psw_mask = __extract_psw() | PSW_MASK_EXT | PSW_MASK_WAIT; set_clock_comparator(end); set_cpu_flag(CIF_IGNORE_IRQ); + local_irq_save(flags); psw_idle(&idle, psw_mask); + local_irq_restore(flags); clear_cpu_flag(CIF_IGNORE_IRQ); set_clock_comparator(S390_lowcore.clock_comparator); __ctl_load(cr0, 0, 0); From 0623b7dda24f313bf210119ee6c5a4bdec6ce52d Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Sat, 9 May 2020 15:12:05 +0200 Subject: [PATCH 242/574] s390/qdio: fix up qdio_start_irq() kerneldoc Document the actual semantics, correcting an old copy & paste mistake. Signed-off-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index f5596265b053..d9acf492f73c 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1566,7 +1566,7 @@ int do_QDIO(struct ccw_device *cdev, unsigned int callflags, EXPORT_SYMBOL_GPL(do_QDIO); /** - * qdio_start_irq - process input buffers + * qdio_start_irq - enable interrupt processing for the device * @cdev: associated ccw_device for the qdio subchannel * * Return codes From cafebf8653b3d689b3411daa0d3d7b67fc4f2edb Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 7 May 2020 10:21:53 +0200 Subject: [PATCH 243/574] s390/qdio: remove q->first_to_kick q->first_to_kick is obsolete, and can be replaced by q->first_to_check. Both cursors start off at 0. Out of the three code paths that update first_to_check, the qdio_inspect_queue() path is irrelevant as it doesn't even touch first_to_kick anymore. This leaves us with the two tasklet-driven code paths. Here any update to first_to_check is followed by a call to qdio_kick_handler(), which advances first_to_kick by the same amount. So the two cursors will differ only for a tiny moment. Drivers have no way of deterministically observing this difference, and thus it doesn't matter which of the cursors we use for reporting an error to q->handler. Signed-off-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio.h | 3 --- drivers/s390/cio/qdio_main.c | 18 +++++++----------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index 66191e864b0b..eb13c479e11d 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -221,9 +221,6 @@ struct qdio_q { */ int first_to_check; - /* beginning position for calling the program */ - int first_to_kick; - /* number of buffers in use by the adapter */ atomic_t nr_buf_used; diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index d9acf492f73c..b0e84fba54dd 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -143,7 +143,7 @@ again: DBF_ERROR("%4x EQBS ERROR", SCH_NO(q)); DBF_ERROR("%3d%3d%2d", count, tmp_count, nr); q->handler(q->irq_ptr->cdev, QDIO_ERROR_GET_BUF_STATE, q->nr, - q->first_to_kick, count, q->irq_ptr->int_parm); + q->first_to_check, count, q->irq_ptr->int_parm); return 0; } } @@ -191,7 +191,7 @@ again: DBF_ERROR("%4x SQBS ERROR", SCH_NO(q)); DBF_ERROR("%3d%3d%2d", count, tmp_count, nr); q->handler(q->irq_ptr->cdev, QDIO_ERROR_SET_BUF_STATE, q->nr, - q->first_to_kick, count, q->irq_ptr->int_parm); + q->first_to_check, count, q->irq_ptr->int_parm); return 0; } } @@ -622,10 +622,9 @@ static inline unsigned long qdio_aob_for_buffer(struct qdio_output_q *q, return phys_aob; } -static void qdio_kick_handler(struct qdio_q *q, unsigned int count) +static void qdio_kick_handler(struct qdio_q *q, unsigned int start, + unsigned int count) { - int start = q->first_to_kick; - if (unlikely(q->irq_ptr->state != QDIO_IRQ_STATE_ACTIVE)) return; @@ -642,7 +641,6 @@ static void qdio_kick_handler(struct qdio_q *q, unsigned int count) q->irq_ptr->int_parm); /* for the next time */ - q->first_to_kick = add_buf(start, count); q->qdio_error = 0; } @@ -666,9 +664,9 @@ static void __qdio_inbound_processing(struct qdio_q *q) if (count == 0) return; + qdio_kick_handler(q, start, count); start = add_buf(start, count); q->first_to_check = start; - qdio_kick_handler(q, count); if (!qdio_inbound_q_done(q, start)) { /* means poll time is not yet over */ @@ -824,7 +822,7 @@ static void __qdio_outbound_processing(struct qdio_q *q) count = qdio_outbound_q_moved(q, start); if (count) { q->first_to_check = add_buf(start, count); - qdio_kick_handler(q, count); + qdio_kick_handler(q, start, count); } if (queue_type(q) == QDIO_ZFCP_QFMT && !pci_out_supported(q->irq_ptr) && @@ -945,7 +943,6 @@ static void qdio_handle_activate_check(struct ccw_device *cdev, { struct qdio_irq *irq_ptr = cdev->private->qdio_data; struct qdio_q *q; - int count; DBF_ERROR("%4x ACT CHECK", irq_ptr->schid.sch_no); DBF_ERROR("intp :%lx", intparm); @@ -960,9 +957,8 @@ static void qdio_handle_activate_check(struct ccw_device *cdev, goto no_handler; } - count = sub_buf(q->first_to_check, q->first_to_kick); q->handler(q->irq_ptr->cdev, QDIO_ERROR_ACTIVATE, - q->nr, q->first_to_kick, count, irq_ptr->int_parm); + q->nr, q->first_to_check, 0, irq_ptr->int_parm); no_handler: qdio_set_state(irq_ptr, QDIO_IRQ_STATE_STOPPED); /* From a0138f59265aff4a21356ba9319171f421575b52 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 16 Apr 2020 15:08:41 +0200 Subject: [PATCH 244/574] s390/cio, s390/qeth: cleanup PNSO CHSC CHSC3D (PNSO - perform network subchannel operation) is used for OC0 (Store-network-bridging-information) as well as for OC3 (Store-network-address-information). So common fields are renamed from *brinfo* to *pnso*. Also *_bridge_host_* is changed into *_addr_change_*, e.g. qeth_bridge_host_event to qeth_addr_change_event, for the same reasons. The keywords in the card traces are changed accordingly. Remove unused L3 types, as PNSO will only return Layer2 entries. Make PNSO CHSC implementation more consistent with existing API usage: Add new function ccw_device_pnso() to drivers/s390/cio/device_ops.c and the function declaration to arch/s390/include/asm/ccwdev.h, which takes a struct ccw_device * as parameter instead of schid and calls chsc_pnso(). PNSO CHSC has no strict relationship to qdio. So move the calling function from qdio to qeth_l2 and move the necessary structures to a new file arch/s390/include/asm/chsc.h. Do response code evaluation only in chsc_error_from_response() and use return code in all other places. qeth_anset_makerc() was meant to evaluate the PNSO response code, but never did, because pnso_rc was already non-zero. Indentation was corrected in some places. Signed-off-by: Alexandra Winter Reviewed-by: Peter Oberparleiter Reviewed-by: Vineeth Vijayan Reviewed-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/ccwdev.h | 5 + arch/s390/include/asm/chsc.h | 62 ++++++++++ arch/s390/include/asm/qdio.h | 33 ------ drivers/s390/cio/chsc.c | 40 +++---- drivers/s390/cio/chsc.h | 50 +------- drivers/s390/cio/device_ops.c | 23 ++++ drivers/s390/cio/qdio_main.c | 88 -------------- drivers/s390/net/qeth_l2_main.c | 198 ++++++++++++++++++-------------- 8 files changed, 223 insertions(+), 276 deletions(-) create mode 100644 arch/s390/include/asm/chsc.h diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index 865ce1cb86d5..3cfe1eb89838 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -236,4 +237,8 @@ extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *); struct channel_path_desc_fmt0 *ccw_device_get_chp_desc(struct ccw_device *, int); u8 *ccw_device_get_util_str(struct ccw_device *cdev, int chp_idx); +int ccw_device_pnso(struct ccw_device *cdev, + struct chsc_pnso_area *pnso_area, + struct chsc_pnso_resume_token resume_token, + int cnc); #endif /* _S390_CCWDEV_H_ */ diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h new file mode 100644 index 000000000000..36ce2d25a5fc --- /dev/null +++ b/arch/s390/include/asm/chsc.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 2020 + * + * Author(s): Alexandra Winter + * + * Interface for Channel Subsystem Call + */ +#ifndef _ASM_S390_CHSC_H +#define _ASM_S390_CHSC_H + +#include + +/** + * struct chsc_pnso_naid_l2 - network address information descriptor + * @nit: Network interface token + * @addr_lnid: network address and logical network id (VLAN ID) + */ +struct chsc_pnso_naid_l2 { + u64 nit; + struct { u8 mac[6]; u16 lnid; } addr_lnid; +} __packed; + +struct chsc_pnso_resume_token { + u64 t1; + u64 t2; +} __packed; + +struct chsc_pnso_naihdr { + struct chsc_pnso_resume_token resume_token; + u32:32; + u32 instance; + u32:24; + u8 naids; + u32 reserved[3]; +} __packed; + +struct chsc_pnso_area { + struct chsc_header request; + u8:2; + u8 m:1; + u8:5; + u8:2; + u8 ssid:2; + u8 fmt:4; + u16 sch; + u8:8; + u8 cssid; + u16:16; + u8 oc; + u32:24; + struct chsc_pnso_resume_token resume_token; + u32 n:1; + u32:31; + u32 reserved[3]; + struct chsc_header response; + u32:32; + struct chsc_pnso_naihdr naihdr; + struct chsc_pnso_naid_l2 entries[0]; +} __packed __aligned(PAGE_SIZE); + +#endif /* _ASM_S390_CHSC_H */ diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 86a3796e9be8..e69dbf438f99 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -365,34 +365,6 @@ struct qdio_initialize { struct qdio_outbuf_state *output_sbal_state_array; }; -/** - * enum qdio_brinfo_entry_type - type of address entry for qdio_brinfo_desc() - * @l3_ipv6_addr: entry contains IPv6 address - * @l3_ipv4_addr: entry contains IPv4 address - * @l2_addr_lnid: entry contains MAC address and VLAN ID - */ -enum qdio_brinfo_entry_type {l3_ipv6_addr, l3_ipv4_addr, l2_addr_lnid}; - -/** - * struct qdio_brinfo_entry_XXX - Address entry for qdio_brinfo_desc() - * @nit: Network interface token - * @addr: Address of one of the three types - * - * The struct is passed to the callback function by qdio_brinfo_desc() - */ -struct qdio_brinfo_entry_l3_ipv6 { - u64 nit; - struct { unsigned char _s6_addr[16]; } addr; -} __packed; -struct qdio_brinfo_entry_l3_ipv4 { - u64 nit; - struct { uint32_t _s_addr; } addr; -} __packed; -struct qdio_brinfo_entry_l2 { - u64 nit; - struct { u8 mac[6]; u16 lnid; } addr_lnid; -} __packed; - #define QDIO_STATE_INACTIVE 0x00000002 /* after qdio_cleanup */ #define QDIO_STATE_ESTABLISHED 0x00000004 /* after qdio_establish */ #define QDIO_STATE_ACTIVE 0x00000008 /* after qdio_activate */ @@ -423,10 +395,5 @@ extern int qdio_inspect_queue(struct ccw_device *cdev, unsigned int nr, extern int qdio_shutdown(struct ccw_device *, int); extern int qdio_free(struct ccw_device *); extern int qdio_get_ssqd_desc(struct ccw_device *, struct qdio_ssqd_desc *); -extern int qdio_pnso_brinfo(struct subchannel_id schid, - int cnc, u16 *response, - void (*cb)(void *priv, enum qdio_brinfo_entry_type type, - void *entry), - void *priv); #endif /* __QDIO_H__ */ diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 1ca73c2e5a8f..c314e9495c1b 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -57,6 +57,7 @@ int chsc_error_from_response(int response) case 0x0104: return -EINVAL; case 0x0004: + case 0x0106: /* "Wrong Channel Parm" for the op 0x003d */ return -EOPNOTSUPP; case 0x000b: case 0x0107: /* "Channel busy" for the op 0x003d */ @@ -1336,36 +1337,35 @@ out: EXPORT_SYMBOL_GPL(chsc_scm_info); /** - * chsc_pnso_brinfo() - Perform Network-Subchannel Operation, Bridge Info. + * chsc_pnso() - Perform Network-Subchannel Operation * @schid: id of the subchannel on which PNSO is performed - * @brinfo_area: request and response block for the operation + * @pnso_area: request and response block for the operation * @resume_token: resume token for multiblock response * @cnc: Boolean change-notification control * - * brinfo_area must be allocated by the caller with get_zeroed_page(GFP_KERNEL) + * pnso_area must be allocated by the caller with get_zeroed_page(GFP_KERNEL) * * Returns 0 on success. */ -int chsc_pnso_brinfo(struct subchannel_id schid, - struct chsc_pnso_area *brinfo_area, - struct chsc_brinfo_resume_token resume_token, - int cnc) +int chsc_pnso(struct subchannel_id schid, + struct chsc_pnso_area *pnso_area, + struct chsc_pnso_resume_token resume_token, + int cnc) { - memset(brinfo_area, 0, sizeof(*brinfo_area)); - brinfo_area->request.length = 0x0030; - brinfo_area->request.code = 0x003d; /* network-subchannel operation */ - brinfo_area->m = schid.m; - brinfo_area->ssid = schid.ssid; - brinfo_area->sch = schid.sch_no; - brinfo_area->cssid = schid.cssid; - brinfo_area->oc = 0; /* Store-network-bridging-information list */ - brinfo_area->resume_token = resume_token; - brinfo_area->n = (cnc != 0); - if (chsc(brinfo_area)) + memset(pnso_area, 0, sizeof(*pnso_area)); + pnso_area->request.length = 0x0030; + pnso_area->request.code = 0x003d; /* network-subchannel operation */ + pnso_area->m = schid.m; + pnso_area->ssid = schid.ssid; + pnso_area->sch = schid.sch_no; + pnso_area->cssid = schid.cssid; + pnso_area->oc = 0; /* Store-network-bridging-information list */ + pnso_area->resume_token = resume_token; + pnso_area->n = (cnc != 0); + if (chsc(pnso_area)) return -EIO; - return chsc_error_from_response(brinfo_area->response.code); + return chsc_error_from_response(pnso_area->response.code); } -EXPORT_SYMBOL_GPL(chsc_pnso_brinfo); int chsc_sgib(u32 origin) { diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h index 34de6d77442c..7ecf7e4c402e 100644 --- a/drivers/s390/cio/chsc.h +++ b/drivers/s390/cio/chsc.h @@ -205,52 +205,10 @@ struct chsc_scm_info { int chsc_scm_info(struct chsc_scm_info *scm_area, u64 token); -struct chsc_brinfo_resume_token { - u64 t1; - u64 t2; -} __packed; - -struct chsc_brinfo_naihdr { - struct chsc_brinfo_resume_token resume_token; - u32:32; - u32 instance; - u32:24; - u8 naids; - u32 reserved[3]; -} __packed; - -struct chsc_pnso_area { - struct chsc_header request; - u8:2; - u8 m:1; - u8:5; - u8:2; - u8 ssid:2; - u8 fmt:4; - u16 sch; - u8:8; - u8 cssid; - u16:16; - u8 oc; - u32:24; - struct chsc_brinfo_resume_token resume_token; - u32 n:1; - u32:31; - u32 reserved[3]; - struct chsc_header response; - u32:32; - struct chsc_brinfo_naihdr naihdr; - union { - struct qdio_brinfo_entry_l3_ipv6 l3_ipv6[0]; - struct qdio_brinfo_entry_l3_ipv4 l3_ipv4[0]; - struct qdio_brinfo_entry_l2 l2[0]; - } entries; -} __packed __aligned(PAGE_SIZE); - -int chsc_pnso_brinfo(struct subchannel_id schid, - struct chsc_pnso_area *brinfo_area, - struct chsc_brinfo_resume_token resume_token, - int cnc); +int chsc_pnso(struct subchannel_id schid, + struct chsc_pnso_area *pnso_area, + struct chsc_pnso_resume_token resume_token, + int cnc); int __init chsc_get_cssid(int idx); diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index ccecf6b9504e..963fcc9054c6 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -710,6 +710,29 @@ void ccw_device_get_schid(struct ccw_device *cdev, struct subchannel_id *schid) } EXPORT_SYMBOL_GPL(ccw_device_get_schid); +/** + * ccw_device_pnso() - Perform Network-Subchannel Operation + * @cdev: device on which PNSO is performed + * @pnso_area: request and response block for the operation + * @resume_token: resume token for multiblock response + * @cnc: Boolean change-notification control + * + * pnso_area must be allocated by the caller with get_zeroed_page(GFP_KERNEL) + * + * Returns 0 on success. + */ +int ccw_device_pnso(struct ccw_device *cdev, + struct chsc_pnso_area *pnso_area, + struct chsc_pnso_resume_token resume_token, + int cnc) +{ + struct subchannel_id schid; + + ccw_device_get_schid(cdev, &schid); + return chsc_pnso(schid, pnso_area, resume_token, cnc); +} +EXPORT_SYMBOL_GPL(ccw_device_pnso); + /* * Allocate zeroed dma coherent 31 bit addressable memory using * the subchannels dma pool. Maximal size of allocation supported diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index b0e84fba54dd..610c05f59589 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -1705,94 +1705,6 @@ int qdio_stop_irq(struct ccw_device *cdev) } EXPORT_SYMBOL(qdio_stop_irq); -/** - * qdio_pnso_brinfo() - perform network subchannel op #0 - bridge info. - * @schid: Subchannel ID. - * @cnc: Boolean Change-Notification Control - * @response: Response code will be stored at this address - * @cb: Callback function will be executed for each element - * of the address list - * @priv: Pointer to pass to the callback function. - * - * Performs "Store-network-bridging-information list" operation and calls - * the callback function for every entry in the list. If "change- - * notification-control" is set, further changes in the address list - * will be reported via the IPA command. - */ -int qdio_pnso_brinfo(struct subchannel_id schid, - int cnc, u16 *response, - void (*cb)(void *priv, enum qdio_brinfo_entry_type type, - void *entry), - void *priv) -{ - struct chsc_pnso_area *rr; - int rc; - u32 prev_instance = 0; - int isfirstblock = 1; - int i, size, elems; - - rr = (struct chsc_pnso_area *)get_zeroed_page(GFP_KERNEL); - if (rr == NULL) - return -ENOMEM; - do { - /* on the first iteration, naihdr.resume_token will be zero */ - rc = chsc_pnso_brinfo(schid, rr, rr->naihdr.resume_token, cnc); - if (rc != 0 && rc != -EBUSY) - goto out; - if (rr->response.code != 1) { - rc = -EIO; - continue; - } else - rc = 0; - - if (cb == NULL) - continue; - - size = rr->naihdr.naids; - elems = (rr->response.length - - sizeof(struct chsc_header) - - sizeof(struct chsc_brinfo_naihdr)) / - size; - - if (!isfirstblock && (rr->naihdr.instance != prev_instance)) { - /* Inform the caller that they need to scrap */ - /* the data that was already reported via cb */ - rc = -EAGAIN; - break; - } - isfirstblock = 0; - prev_instance = rr->naihdr.instance; - for (i = 0; i < elems; i++) - switch (size) { - case sizeof(struct qdio_brinfo_entry_l3_ipv6): - (*cb)(priv, l3_ipv6_addr, - &rr->entries.l3_ipv6[i]); - break; - case sizeof(struct qdio_brinfo_entry_l3_ipv4): - (*cb)(priv, l3_ipv4_addr, - &rr->entries.l3_ipv4[i]); - break; - case sizeof(struct qdio_brinfo_entry_l2): - (*cb)(priv, l2_addr_lnid, - &rr->entries.l2[i]); - break; - default: - WARN_ON_ONCE(1); - rc = -EIO; - goto out; - } - } while (rr->response.code == 0x0107 || /* channel busy */ - (rr->response.code == 1 && /* list stored */ - /* resume token is non-zero => list incomplete */ - (rr->naihdr.resume_token.t1 || rr->naihdr.resume_token.t2))); - (*response) = rr->response.code; - -out: - free_page((unsigned long)rr); - return rc; -} -EXPORT_SYMBOL_GPL(qdio_pnso_brinfo); - static int __init init_QDIO(void) { int rc; diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 0bd5b09e7a22..f60b865c73ad 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "qeth_core.h" #include "qeth_l2.h" @@ -27,8 +28,8 @@ static void qeth_bridgeport_query_support(struct qeth_card *card); static void qeth_bridge_state_change(struct qeth_card *card, struct qeth_ipa_cmd *cmd); -static void qeth_bridge_host_event(struct qeth_card *card, - struct qeth_ipa_cmd *cmd); +static void qeth_addr_change_event(struct qeth_card *card, + struct qeth_ipa_cmd *cmd); static void qeth_l2_vnicc_set_defaults(struct qeth_card *card); static void qeth_l2_vnicc_init(struct qeth_card *card); static bool qeth_l2_vnicc_recover_timeout(struct qeth_card *card, u32 vnicc, @@ -628,6 +629,72 @@ static void qeth_l2_set_rx_mode(struct net_device *dev) schedule_work(&card->rx_mode_work); } +/** + * qeth_l2_pnso() - perform network subchannel operation + * @card: qeth_card structure pointer + * @cnc: Boolean Change-Notification Control + * @cb: Callback function will be executed for each element + * of the address list + * @priv: Pointer to pass to the callback function. + * + * Collects network information in a network address list and calls the + * callback function for every entry in the list. If "change-notification- + * control" is set, further changes in the address list will be reported + * via the IPA command. + */ +static int qeth_l2_pnso(struct qeth_card *card, int cnc, + void (*cb)(void *priv, struct chsc_pnso_naid_l2 *entry), + void *priv) +{ + struct ccw_device *ddev = CARD_DDEV(card); + struct chsc_pnso_area *rr; + u32 prev_instance = 0; + int isfirstblock = 1; + int i, size, elems; + int rc; + + QETH_CARD_TEXT(card, 2, "PNSO"); + rr = (struct chsc_pnso_area *)get_zeroed_page(GFP_KERNEL); + if (rr == NULL) + return -ENOMEM; + do { + /* on the first iteration, naihdr.resume_token will be zero */ + rc = ccw_device_pnso(ddev, rr, rr->naihdr.resume_token, cnc); + if (rc) + continue; + if (cb == NULL) + continue; + + size = rr->naihdr.naids; + if (size != sizeof(struct chsc_pnso_naid_l2)) { + WARN_ON_ONCE(1); + continue; + } + + elems = (rr->response.length - sizeof(struct chsc_header) - + sizeof(struct chsc_pnso_naihdr)) / size; + + if (!isfirstblock && (rr->naihdr.instance != prev_instance)) { + /* Inform the caller that they need to scrap */ + /* the data that was already reported via cb */ + rc = -EAGAIN; + break; + } + isfirstblock = 0; + prev_instance = rr->naihdr.instance; + for (i = 0; i < elems; i++) + (*cb)(priv, &rr->entries[i]); + } while ((rc == -EBUSY) || (!rc && /* list stored */ + /* resume token is non-zero => list incomplete */ + (rr->naihdr.resume_token.t1 || rr->naihdr.resume_token.t2))); + + if (rc) + QETH_CARD_TEXT_(card, 2, "PNrp%04x", rr->response.code); + + free_page((unsigned long)rr); + return rc; +} + static const struct net_device_ops qeth_l2_netdev_ops = { .ndo_open = qeth_open, .ndo_stop = qeth_stop, @@ -854,7 +921,7 @@ static int qeth_l2_control_event(struct qeth_card *card, } else return 1; case IPA_CMD_ADDRESS_CHANGE_NOTIF: - qeth_bridge_host_event(card, cmd); + qeth_addr_change_event(card, cmd); return 0; default: return 1; @@ -971,8 +1038,10 @@ enum qeth_an_event_type {anev_reg_unreg, anev_abort, anev_reset}; * for all currently registered addresses. */ static void qeth_bridge_emit_host_event(struct qeth_card *card, - enum qeth_an_event_type evtype, - u8 code, struct net_if_token *token, struct mac_addr_lnid *addr_lnid) + enum qeth_an_event_type evtype, + u8 code, + struct net_if_token *token, + struct mac_addr_lnid *addr_lnid) { char str[7][32]; char *env[8]; @@ -1089,74 +1158,76 @@ static void qeth_bridge_state_change(struct qeth_card *card, queue_work(card->event_wq, &data->worker); } -struct qeth_bridge_host_data { +struct qeth_addr_change_data { struct work_struct worker; struct qeth_card *card; - struct qeth_ipacmd_addr_change hostevs; + struct qeth_ipacmd_addr_change ac_event; }; -static void qeth_bridge_host_event_worker(struct work_struct *work) +static void qeth_addr_change_event_worker(struct work_struct *work) { - struct qeth_bridge_host_data *data = - container_of(work, struct qeth_bridge_host_data, worker); + struct qeth_addr_change_data *data = + container_of(work, struct qeth_addr_change_data, worker); int i; - if (data->hostevs.lost_event_mask) { + QETH_CARD_TEXT(data->card, 4, "adrchgew"); + if (data->ac_event.lost_event_mask) { dev_info(&data->card->gdev->dev, -"Address notification from the Bridge Port stopped %s (%s)\n", - data->card->dev->name, - (data->hostevs.lost_event_mask == 0x01) + "Address change notification stopped on %s (%s)\n", + data->card->dev->name, + (data->ac_event.lost_event_mask == 0x01) ? "Overflow" - : (data->hostevs.lost_event_mask == 0x02) + : (data->ac_event.lost_event_mask == 0x02) ? "Bridge port state change" : "Unknown reason"); mutex_lock(&data->card->sbp_lock); data->card->options.sbp.hostnotification = 0; mutex_unlock(&data->card->sbp_lock); qeth_bridge_emit_host_event(data->card, anev_abort, - 0, NULL, NULL); + 0, NULL, NULL); } else - for (i = 0; i < data->hostevs.num_entries; i++) { + for (i = 0; i < data->ac_event.num_entries; i++) { struct qeth_ipacmd_addr_change_entry *entry = - &data->hostevs.entry[i]; + &data->ac_event.entry[i]; qeth_bridge_emit_host_event(data->card, - anev_reg_unreg, - entry->change_code, - &entry->token, &entry->addr_lnid); + anev_reg_unreg, + entry->change_code, + &entry->token, + &entry->addr_lnid); } kfree(data); } -static void qeth_bridge_host_event(struct qeth_card *card, - struct qeth_ipa_cmd *cmd) +static void qeth_addr_change_event(struct qeth_card *card, + struct qeth_ipa_cmd *cmd) { struct qeth_ipacmd_addr_change *hostevs = &cmd->data.addrchange; - struct qeth_bridge_host_data *data; + struct qeth_addr_change_data *data; int extrasize; - QETH_CARD_TEXT(card, 2, "brhostev"); + QETH_CARD_TEXT(card, 4, "adrchgev"); if (cmd->hdr.return_code != 0x0000) { if (cmd->hdr.return_code == 0x0010) { if (hostevs->lost_event_mask == 0x00) hostevs->lost_event_mask = 0xff; } else { - QETH_CARD_TEXT_(card, 2, "BPHe%04x", + QETH_CARD_TEXT_(card, 2, "ACHN%04x", cmd->hdr.return_code); return; } } extrasize = sizeof(struct qeth_ipacmd_addr_change_entry) * hostevs->num_entries; - data = kzalloc(sizeof(struct qeth_bridge_host_data) + extrasize, - GFP_ATOMIC); + data = kzalloc(sizeof(struct qeth_addr_change_data) + extrasize, + GFP_ATOMIC); if (!data) { - QETH_CARD_TEXT(card, 2, "BPHalloc"); + QETH_CARD_TEXT(card, 2, "ACNalloc"); return; } - INIT_WORK(&data->worker, qeth_bridge_host_event_worker); + INIT_WORK(&data->worker, qeth_addr_change_event_worker); data->card = card; - memcpy(&data->hostevs, hostevs, + memcpy(&data->ac_event, hostevs, sizeof(struct qeth_ipacmd_addr_change) + extrasize); queue_work(card->event_wq, &data->worker); } @@ -1446,63 +1517,18 @@ int qeth_bridgeport_setrole(struct qeth_card *card, enum qeth_sbp_roles role) return qeth_send_ipa_cmd(card, iob, qeth_bridgeport_set_cb, NULL); } -/** - * qeth_anset_makerc() - derive "traditional" error from hardware codes. - * @card: qeth_card structure pointer, for debug messages. - * - * Returns negative errno-compatible error indication or 0 on success. - */ -static int qeth_anset_makerc(struct qeth_card *card, int pnso_rc, u16 response) -{ - int rc; - - if (pnso_rc == 0) - switch (response) { - case 0x0001: - rc = 0; - break; - case 0x0004: - case 0x0100: - case 0x0106: - rc = -EOPNOTSUPP; - dev_err(&card->gdev->dev, - "Setting address notification failed\n"); - break; - case 0x0107: - rc = -EAGAIN; - break; - default: - rc = -EIO; - } - else - rc = -EIO; - - if (rc) { - QETH_CARD_TEXT_(card, 2, "SBPp%04x", pnso_rc); - QETH_CARD_TEXT_(card, 2, "SBPr%04x", response); - } - return rc; -} - static void qeth_bridgeport_an_set_cb(void *priv, - enum qdio_brinfo_entry_type type, void *entry) + struct chsc_pnso_naid_l2 *entry) { struct qeth_card *card = (struct qeth_card *)priv; - struct qdio_brinfo_entry_l2 *l2entry; u8 code; - if (type != l2_addr_lnid) { - WARN_ON_ONCE(1); - return; - } - - l2entry = (struct qdio_brinfo_entry_l2 *)entry; code = IPA_ADDR_CHANGE_CODE_MACADDR; - if (l2entry->addr_lnid.lnid < VLAN_N_VID) + if (entry->addr_lnid.lnid < VLAN_N_VID) code |= IPA_ADDR_CHANGE_CODE_VLANID; qeth_bridge_emit_host_event(card, anev_reg_unreg, code, - (struct net_if_token *)&l2entry->nit, - (struct mac_addr_lnid *)&l2entry->addr_lnid); + (struct net_if_token *)&entry->nit, + (struct mac_addr_lnid *)&entry->addr_lnid); } /** @@ -1518,22 +1544,16 @@ static void qeth_bridgeport_an_set_cb(void *priv, int qeth_bridgeport_an_set(struct qeth_card *card, int enable) { int rc; - u16 response; - struct ccw_device *ddev; - struct subchannel_id schid; if (!card->options.sbp.supported_funcs) return -EOPNOTSUPP; - ddev = CARD_DDEV(card); - ccw_device_get_schid(ddev, &schid); if (enable) { qeth_bridge_emit_host_event(card, anev_reset, 0, NULL, NULL); - rc = qdio_pnso_brinfo(schid, 1, &response, - qeth_bridgeport_an_set_cb, card); + rc = qeth_l2_pnso(card, 1, qeth_bridgeport_an_set_cb, card); } else - rc = qdio_pnso_brinfo(schid, 0, &response, NULL, NULL); - return qeth_anset_makerc(card, rc, response); + rc = qeth_l2_pnso(card, 0, NULL, NULL); + return rc; } static bool qeth_bridgeport_is_in_use(struct qeth_card *card) From e1750a3d9abbea2ece29cac8dc5a6f5bc19c1492 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Fri, 22 May 2020 20:39:22 +0200 Subject: [PATCH 245/574] s390/pci: Log new handle in clp_disable_fh() After disabling a function, the original handle is logged instead of the disabled handle. Link: https://lkml.kernel.org/r/20200522183922.5253-1-ptesarik@suse.com Fixes: 17cdec960cf7 ("s390/pci: Recover handle in clp_set_pci_fn()") Reviewed-by: Pierre Morel Signed-off-by: Petr Tesarik Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_clp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index d7bd3c287cf7..7e735f41a0a6 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -314,14 +314,13 @@ out: int clp_disable_fh(struct zpci_dev *zdev) { - u32 fh = zdev->fh; int rc; if (!zdev_enabled(zdev)) return 0; rc = clp_set_pci_fn(zdev, 0, CLP_SET_DISABLE_PCI_FN); - zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, zdev->fh, rc); return rc; } From 8e923a2168afd221ea26e3d9716f21e9578b5c4d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 11 May 2020 22:36:27 +0900 Subject: [PATCH 246/574] selftests/ftrace: Use printf for backslash included command Since the built-in echo has different behavior in POSIX shell (dash) and bash, kprobe_syntax_errors.tc can fail on dash which interpret backslash escape automatically. To fix this issue, we explicitly use printf "%s" (not interpret backslash escapes) if the command string can include backslash. Reported-by: Liu Yiding Suggested-by: Xiao Yang Signed-off-by: Masami Hiramatsu Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/functions | 8 +++++--- .../ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index 61a3c7e2634d..697c77ef2e2b 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -119,12 +119,14 @@ yield() { ping $LOCALHOST -c 1 || sleep .001 || usleep 1 || sleep 1 } +# Since probe event command may include backslash, explicitly use printf "%s" +# to NOT interpret it. ftrace_errlog_check() { # err-prefix command-with-error-pos-by-^ command-file - pos=$(echo -n "${2%^*}" | wc -c) # error position - command=$(echo "$2" | tr -d ^) + pos=$(printf "%s" "${2%^*}" | wc -c) # error position + command=$(printf "%s" "$2" | tr -d ^) echo "Test command: $command" echo > error_log - (! echo "$command" >> "$3" ) 2> /dev/null + (! printf "%s" "$command" >> "$3" ) 2> /dev/null grep "$1: error:" -A 3 error_log N=$(tail -n 1 error_log | wc -c) # " Command: " and "^\n" => 13 diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index ef1e9bafb098..eb0f4ab4e070 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -91,7 +91,9 @@ esac if grep -q "Create/append/" README && grep -q "imm-value" README; then echo 'p:kprobes/testevent _do_fork' > kprobe_events check_error '^r:kprobes/testevent do_exit' # DIFF_PROBE_TYPE -echo 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events + +# Explicitly use printf "%s" to not interpret \1 +printf "%s" 'p:kprobes/testevent _do_fork abcd=\1' > kprobe_events check_error 'p:kprobes/testevent _do_fork ^bcd=\1' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\1:u8' # DIFF_ARG_TYPE check_error 'p:kprobes/testevent _do_fork ^abcd=\"foo"' # DIFF_ARG_TYPE From 619ee76f5c9f6a1d601d1a056a454d62bf676ae4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 25 May 2020 19:20:57 +0900 Subject: [PATCH 247/574] selftests/ftrace: Return unsupported if no error_log file Check whether error_log file exists in tracing/error_log testcase and return UNSUPPORTED if no error_log file. This can happen if we run the ftracetest on the older stable kernel. Fixes: 4eab1cc461a6 ("selftests/ftrace: Add tracing/error_log testcase") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Shuah Khan --- .../testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc b/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc index 021c03fd885d..23465823532b 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc @@ -14,6 +14,8 @@ if [ ! -f set_event ]; then exit_unsupported fi +[ -f error_log ] || exit_unsupported + ftrace_errlog_check 'event filter parse error' '((sig >= 10 && sig < 15) || dsig ^== 17) && comm != bash' 'events/signal/signal_generate/filter' exit 0 From fb01562e5a8a731bb1807eba0a9fadb355ca2277 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 23 Apr 2020 14:53:28 +0200 Subject: [PATCH 248/574] uacce: Remove mm_exit() op The mm_exit() op will be removed from the SVA API. When a process dies and its mm goes away, the IOMMU driver won't notify device drivers anymore. Drivers should expect to handle a lot more aborted DMA. On the upside, it does greatly simplify the queue management. The uacce_mm struct, that tracks all queues bound to an mm, was only used by the mm_exit() callback. Remove it. Signed-off-by: Jean-Philippe Brucker Acked-by: Jacob Pan Acked-by: Lu Baolu Acked-by: Zhangfei Gao Link: https://lore.kernel.org/r/20200423125329.782066-2-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/misc/uacce/uacce.c | 176 ++++++++++--------------------------- include/linux/uacce.h | 34 ++----- 2 files changed, 53 insertions(+), 157 deletions(-) diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index d39307f060bd..107028e77ca3 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -90,109 +90,39 @@ static long uacce_fops_compat_ioctl(struct file *filep, } #endif -static int uacce_sva_exit(struct device *dev, struct iommu_sva *handle, - void *data) +static int uacce_bind_queue(struct uacce_device *uacce, struct uacce_queue *q) { - struct uacce_mm *uacce_mm = data; - struct uacce_queue *q; + int pasid; + struct iommu_sva *handle; - /* - * No new queue can be added concurrently because no caller can have a - * reference to this mm. But there may be concurrent calls to - * uacce_mm_put(), so we need the lock. - */ - mutex_lock(&uacce_mm->lock); - list_for_each_entry(q, &uacce_mm->queues, list) - uacce_put_queue(q); - uacce_mm->mm = NULL; - mutex_unlock(&uacce_mm->lock); + if (!(uacce->flags & UACCE_DEV_SVA)) + return 0; + handle = iommu_sva_bind_device(uacce->parent, current->mm, NULL); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + pasid = iommu_sva_get_pasid(handle); + if (pasid == IOMMU_PASID_INVALID) { + iommu_sva_unbind_device(handle); + return -ENODEV; + } + + q->handle = handle; + q->pasid = pasid; return 0; } -static struct iommu_sva_ops uacce_sva_ops = { - .mm_exit = uacce_sva_exit, -}; - -static struct uacce_mm *uacce_mm_get(struct uacce_device *uacce, - struct uacce_queue *q, - struct mm_struct *mm) +static void uacce_unbind_queue(struct uacce_queue *q) { - struct uacce_mm *uacce_mm = NULL; - struct iommu_sva *handle = NULL; - int ret; - - lockdep_assert_held(&uacce->mm_lock); - - list_for_each_entry(uacce_mm, &uacce->mm_list, list) { - if (uacce_mm->mm == mm) { - mutex_lock(&uacce_mm->lock); - list_add(&q->list, &uacce_mm->queues); - mutex_unlock(&uacce_mm->lock); - return uacce_mm; - } - } - - uacce_mm = kzalloc(sizeof(*uacce_mm), GFP_KERNEL); - if (!uacce_mm) - return NULL; - - if (uacce->flags & UACCE_DEV_SVA) { - /* - * Safe to pass an incomplete uacce_mm, since mm_exit cannot - * fire while we hold a reference to the mm. - */ - handle = iommu_sva_bind_device(uacce->parent, mm, uacce_mm); - if (IS_ERR(handle)) - goto err_free; - - ret = iommu_sva_set_ops(handle, &uacce_sva_ops); - if (ret) - goto err_unbind; - - uacce_mm->pasid = iommu_sva_get_pasid(handle); - if (uacce_mm->pasid == IOMMU_PASID_INVALID) - goto err_unbind; - } - - uacce_mm->mm = mm; - uacce_mm->handle = handle; - INIT_LIST_HEAD(&uacce_mm->queues); - mutex_init(&uacce_mm->lock); - list_add(&q->list, &uacce_mm->queues); - list_add(&uacce_mm->list, &uacce->mm_list); - - return uacce_mm; - -err_unbind: - if (handle) - iommu_sva_unbind_device(handle); -err_free: - kfree(uacce_mm); - return NULL; -} - -static void uacce_mm_put(struct uacce_queue *q) -{ - struct uacce_mm *uacce_mm = q->uacce_mm; - - lockdep_assert_held(&q->uacce->mm_lock); - - mutex_lock(&uacce_mm->lock); - list_del(&q->list); - mutex_unlock(&uacce_mm->lock); - - if (list_empty(&uacce_mm->queues)) { - if (uacce_mm->handle) - iommu_sva_unbind_device(uacce_mm->handle); - list_del(&uacce_mm->list); - kfree(uacce_mm); - } + if (!q->handle) + return; + iommu_sva_unbind_device(q->handle); + q->handle = NULL; } static int uacce_fops_open(struct inode *inode, struct file *filep) { - struct uacce_mm *uacce_mm = NULL; struct uacce_device *uacce; struct uacce_queue *q; int ret = 0; @@ -205,21 +135,16 @@ static int uacce_fops_open(struct inode *inode, struct file *filep) if (!q) return -ENOMEM; - mutex_lock(&uacce->mm_lock); - uacce_mm = uacce_mm_get(uacce, q, current->mm); - mutex_unlock(&uacce->mm_lock); - if (!uacce_mm) { - ret = -ENOMEM; + ret = uacce_bind_queue(uacce, q); + if (ret) goto out_with_mem; - } q->uacce = uacce; - q->uacce_mm = uacce_mm; if (uacce->ops->get_queue) { - ret = uacce->ops->get_queue(uacce, uacce_mm->pasid, q); + ret = uacce->ops->get_queue(uacce, q->pasid, q); if (ret < 0) - goto out_with_mm; + goto out_with_bond; } init_waitqueue_head(&q->wait); @@ -227,12 +152,14 @@ static int uacce_fops_open(struct inode *inode, struct file *filep) uacce->inode = inode; q->state = UACCE_Q_INIT; + mutex_lock(&uacce->queues_lock); + list_add(&q->list, &uacce->queues); + mutex_unlock(&uacce->queues_lock); + return 0; -out_with_mm: - mutex_lock(&uacce->mm_lock); - uacce_mm_put(q); - mutex_unlock(&uacce->mm_lock); +out_with_bond: + uacce_unbind_queue(q); out_with_mem: kfree(q); return ret; @@ -241,14 +168,12 @@ out_with_mem: static int uacce_fops_release(struct inode *inode, struct file *filep) { struct uacce_queue *q = filep->private_data; - struct uacce_device *uacce = q->uacce; + mutex_lock(&q->uacce->queues_lock); + list_del(&q->list); + mutex_unlock(&q->uacce->queues_lock); uacce_put_queue(q); - - mutex_lock(&uacce->mm_lock); - uacce_mm_put(q); - mutex_unlock(&uacce->mm_lock); - + uacce_unbind_queue(q); kfree(q); return 0; @@ -513,8 +438,8 @@ struct uacce_device *uacce_alloc(struct device *parent, if (ret < 0) goto err_with_uacce; - INIT_LIST_HEAD(&uacce->mm_list); - mutex_init(&uacce->mm_lock); + INIT_LIST_HEAD(&uacce->queues); + mutex_init(&uacce->queues_lock); device_initialize(&uacce->dev); uacce->dev.devt = MKDEV(MAJOR(uacce_devt), uacce->dev_id); uacce->dev.class = uacce_class; @@ -561,8 +486,7 @@ EXPORT_SYMBOL_GPL(uacce_register); */ void uacce_remove(struct uacce_device *uacce) { - struct uacce_mm *uacce_mm; - struct uacce_queue *q; + struct uacce_queue *q, *next_q; if (!uacce) return; @@ -574,24 +498,12 @@ void uacce_remove(struct uacce_device *uacce) unmap_mapping_range(uacce->inode->i_mapping, 0, 0, 1); /* ensure no open queue remains */ - mutex_lock(&uacce->mm_lock); - list_for_each_entry(uacce_mm, &uacce->mm_list, list) { - /* - * We don't take the uacce_mm->lock here. Since we hold the - * device's mm_lock, no queue can be added to or removed from - * this uacce_mm. We may run concurrently with mm_exit, but - * uacce_put_queue() is serialized and iommu_sva_unbind_device() - * waits for the lock that mm_exit is holding. - */ - list_for_each_entry(q, &uacce_mm->queues, list) - uacce_put_queue(q); - - if (uacce->flags & UACCE_DEV_SVA) { - iommu_sva_unbind_device(uacce_mm->handle); - uacce_mm->handle = NULL; - } + mutex_lock(&uacce->queues_lock); + list_for_each_entry_safe(q, next_q, &uacce->queues, list) { + uacce_put_queue(q); + uacce_unbind_queue(q); } - mutex_unlock(&uacce->mm_lock); + mutex_unlock(&uacce->queues_lock); /* disable sva now since no opened queues */ if (uacce->flags & UACCE_DEV_SVA) diff --git a/include/linux/uacce.h b/include/linux/uacce.h index 0e215e6d0534..454c2f6672d7 100644 --- a/include/linux/uacce.h +++ b/include/linux/uacce.h @@ -68,19 +68,21 @@ enum uacce_q_state { * @uacce: pointer to uacce * @priv: private pointer * @wait: wait queue head - * @list: index into uacce_mm - * @uacce_mm: the corresponding mm + * @list: index into uacce queues list * @qfrs: pointer of qfr regions * @state: queue state machine + * @pasid: pasid associated to the mm + * @handle: iommu_sva handle returned by iommu_sva_bind_device() */ struct uacce_queue { struct uacce_device *uacce; void *priv; wait_queue_head_t wait; struct list_head list; - struct uacce_mm *uacce_mm; struct uacce_qfile_region *qfrs[UACCE_MAX_REGION]; enum uacce_q_state state; + int pasid; + struct iommu_sva *handle; }; /** @@ -96,8 +98,8 @@ struct uacce_queue { * @cdev: cdev of the uacce * @dev: dev of the uacce * @priv: private pointer of the uacce - * @mm_list: list head of uacce_mm->list - * @mm_lock: lock for mm_list + * @queues: list of queues + * @queues_lock: lock for queues list * @inode: core vfs */ struct uacce_device { @@ -112,27 +114,9 @@ struct uacce_device { struct cdev *cdev; struct device dev; void *priv; - struct list_head mm_list; - struct mutex mm_lock; - struct inode *inode; -}; - -/** - * struct uacce_mm - keep track of queues bound to a process - * @list: index into uacce_device - * @queues: list of queues - * @mm: the mm struct - * @lock: protects the list of queues - * @pasid: pasid of the uacce_mm - * @handle: iommu_sva handle return from iommu_sva_bind_device - */ -struct uacce_mm { - struct list_head list; struct list_head queues; - struct mm_struct *mm; - struct mutex lock; - int pasid; - struct iommu_sva *handle; + struct mutex queues_lock; + struct inode *inode; }; #if IS_ENABLED(CONFIG_UACCE) From edcc40d2ab5f47f205c2dd2a9aeedd8c77de050a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 23 Apr 2020 14:53:30 +0200 Subject: [PATCH 249/574] iommu: Remove iommu_sva_ops::mm_exit() After binding a device to an mm, device drivers currently need to register a mm_exit handler. This function is called when the mm exits, to gracefully stop DMA targeting the address space and flush page faults to the IOMMU. This is deemed too complex for the MMU release() notifier, which may be triggered by any mmput() invocation, from about 120 callsites [1]. The upcoming SVA module has an example of such complexity: the I/O Page Fault handler would need to call mmput_async() instead of mmput() after handling an IOPF, to avoid triggering the release() notifier which would in turn drain the IOPF queue and lock up. Another concern is the DMA stop function taking too long, up to several minutes [2]. For some mmput() callers this may disturb other users. For example, if the OOM killer picks the mm bound to a device as the victim and that mm's memory is locked, if the release() takes too long, it might choose additional innocent victims to kill. To simplify the MMU release notifier, don't forward the notification to device drivers. Since they don't stop DMA on mm exit anymore, the PASID lifetime is extended: (1) The device driver calls bind(). A PASID is allocated. Here any DMA fault is handled by mm, and on error we don't print anything to dmesg. Userspace can easily trigger errors by issuing DMA on unmapped buffers. (2) exit_mmap(), for example the process took a SIGKILL. This step doesn't happen during normal operations. Remove the pgd from the PASID table, since the page tables are about to be freed. Invalidate the IOTLBs. Here the device may still perform DMA on the address space. Incoming transactions are aborted but faults aren't printed out. ATS Translation Requests return Successful Translation Completions with R=W=0. PRI Page Requests return with Invalid Request. (3) The device driver stops DMA, possibly following release of a fd, and calls unbind(). PASID table is cleared, IOTLB invalidated if necessary. The page fault queues are drained, and the PASID is freed. If DMA for that PASID is still running here, something went seriously wrong and errors should be reported. For now remove iommu_sva_ops entirely. We might need to re-introduce them at some point, for example to notify device drivers of unhandled IOPF. [1] https://lore.kernel.org/linux-iommu/20200306174239.GM31668@ziepe.ca/ [2] https://lore.kernel.org/linux-iommu/4d68da96-0ad5-b412-5987-2f7a6aa796c3@amd.com/ Signed-off-by: Jean-Philippe Brucker Acked-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200423125329.782066-3-jean-philippe@linaro.org Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 11 ----------- include/linux/iommu.h | 30 ------------------------------ 2 files changed, 41 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 298397721144..abcd19118169 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2883,17 +2883,6 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *sva_ops) -{ - if (handle->ops && handle->ops != sva_ops) - return -EEXIST; - - handle->ops = sva_ops; - return 0; -} -EXPORT_SYMBOL_GPL(iommu_sva_set_ops); - int iommu_sva_get_pasid(struct iommu_sva *handle) { const struct iommu_ops *ops = handle->dev->bus->iommu_ops; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7cfd2dddb49d..08d3506172a1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -53,8 +53,6 @@ struct iommu_fault_event; typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *, - void *); typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { @@ -171,25 +169,6 @@ enum iommu_dev_features { #define IOMMU_PASID_INVALID (-1U) -/** - * struct iommu_sva_ops - device driver callbacks for an SVA context - * - * @mm_exit: called when the mm is about to be torn down by exit_mmap. After - * @mm_exit returns, the device must not issue any more transaction - * with the PASID given as argument. - * - * The @mm_exit handler is allowed to sleep. Be careful about the - * locks taken in @mm_exit, because they might lead to deadlocks if - * they are also held when dropping references to the mm. Consider the - * following call chain: - * mutex_lock(A); mmput(mm) -> exit_mm() -> @mm_exit() -> mutex_lock(A) - * Using mmput_async() prevents this scenario. - * - */ -struct iommu_sva_ops { - iommu_mm_exit_handler_t mm_exit; -}; - #ifdef CONFIG_IOMMU_API /** @@ -616,7 +595,6 @@ struct iommu_fwspec { */ struct iommu_sva { struct device *dev; - const struct iommu_sva_ops *ops; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, @@ -664,8 +642,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata); void iommu_sva_unbind_device(struct iommu_sva *handle); -int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops); int iommu_sva_get_pasid(struct iommu_sva *handle); #else /* CONFIG_IOMMU_API */ @@ -1069,12 +1045,6 @@ static inline void iommu_sva_unbind_device(struct iommu_sva *handle) { } -static inline int iommu_sva_set_ops(struct iommu_sva *handle, - const struct iommu_sva_ops *ops) -{ - return -EINVAL; -} - static inline int iommu_sva_get_pasid(struct iommu_sva *handle) { return IOMMU_PASID_INVALID; From 8038bdb8553313ad53bfcffcf8294dd0ab44618f Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 10:56:15 -0600 Subject: [PATCH 250/574] iommu/vt-d: Only clear real DMA device's context entries Domain context mapping can encounter issues with sub-devices of a real DMA device. A sub-device cannot have a valid context entry due to it potentially aliasing another device's 16-bit ID. It's expected that sub-devices of the real DMA device uses the real DMA device's requester when context mapping. This is an issue when a sub-device is removed where the context entry is cleared for all aliases. Other sub-devices are still valid, resulting in those sub-devices being stranded without valid context entries. The correct approach is to use the real DMA device when programming the context entries. The insertion path is correct because device_to_iommu() will return the bus and devfn of the real DMA device. The removal path needs to only operate on the real DMA device, otherwise the entire context entry would be cleared for all sub-devices of the real DMA device. This patch also adds a helper to determine if a struct device is a sub-device of a real DMA device. Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Cc: stable@vger.kernel.org # v5.6+ Signed-off-by: Jon Derrick Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200527165617.297470-2-jonathan.derrick@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index ff5a30a94679..1ff45b2d03ab 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2500,6 +2500,12 @@ static int domain_setup_first_level(struct intel_iommu *iommu, flags); } +static bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, int bus, int devfn, struct device *dev, @@ -4975,7 +4981,8 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) PASID_RID2PASID, false); iommu_disable_dev_iotlb(info); - domain_context_clear(iommu, info->dev); + if (!dev_is_real_dma_subdevice(info->dev)) + domain_context_clear(iommu, info->dev); intel_pasid_free_table(info->dev); } From 4fda230ecddc2573ed88632e98b69b0b9b68c0ad Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 10:56:16 -0600 Subject: [PATCH 251/574] iommu/vt-d: Allocate domain info for real DMA sub-devices Sub-devices of a real DMA device might exist on a separate segment than the real DMA device and its IOMMU. These devices should still have a valid device_domain_info, but the current dma alias model won't allocate info for the subdevice. This patch adds a segment member to struct device_domain_info and uses the sub-device's BDF so that these sub-devices won't alias to other devices. Fixes: 2b0140c69637e ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Cc: stable@vger.kernel.org # v5.6+ Signed-off-by: Jon Derrick Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200527165617.297470-3-jonathan.derrick@intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 19 +++++++++++++++---- include/linux/intel-iommu.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 1ff45b2d03ab..6d39b9bd89a6 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2463,7 +2463,7 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn) struct device_domain_info *info; list_for_each_entry(info, &device_domain_list, global) - if (info->iommu->segment == segment && info->bus == bus && + if (info->segment == segment && info->bus == bus && info->devfn == devfn) return info; @@ -2520,8 +2520,18 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!info) return NULL; - info->bus = bus; - info->devfn = devfn; + if (!dev_is_real_dma_subdevice(dev)) { + info->bus = bus; + info->devfn = devfn; + info->segment = iommu->segment; + } else { + struct pci_dev *pdev = to_pci_dev(dev); + + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->segment = pci_domain_nr(pdev->bus); + } + info->ats_supported = info->pasid_supported = info->pri_supported = 0; info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; info->ats_qdep = 0; @@ -2561,7 +2571,8 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (!found) { struct device_domain_info *info2; - info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn); + info2 = dmar_search_domain_by_dev_info(info->segment, info->bus, + info->devfn); if (info2) { found = info2->domain; info2->dev = dev; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 21633cee6331..4100bd224f5c 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -609,6 +609,7 @@ struct device_domain_info { struct list_head auxiliary_domains; /* auxiliary domains * attached to this device */ + u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ u16 pfsid; /* SRIOV physical function source ID */ From bba9cc2cf82840bd3c9b3f4f7edac2dc8329c241 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 27 May 2020 10:56:17 -0600 Subject: [PATCH 252/574] iommu/vt-d: Remove real DMA lookup in find_domain By removing the real DMA indirection in find_domain(), we can allow sub-devices of a real DMA device to have their own valid device_domain_info. The dmar lookup and context entry removal paths have been fixed to account for sub-devices. Fixes: 2b0140c69637 ("iommu/vt-d: Use pci_real_dma_dev() for mapping") Signed-off-by: Jon Derrick Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20200527165617.297470-4-jonathan.derrick@intel.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=207575 Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 6d39b9bd89a6..5767882aa80f 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2436,9 +2436,6 @@ struct dmar_domain *find_domain(struct device *dev) if (unlikely(attach_deferred(dev) || iommu_dummy(dev))) return NULL; - if (dev_is_pci(dev)) - dev = &pci_real_dma_dev(to_pci_dev(dev))->dev; - /* No lock here, assumes no domain exit in normal case */ info = get_domain_info(dev); if (likely(info)) From 71974cfb6737d45cba66189cdc0f8ba3397e8fe0 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 28 May 2020 11:03:51 -0700 Subject: [PATCH 253/574] iommu/vt-d: Fix compile warning Make intel_svm_unbind_mm() a static function. Fixes: 064a57d7ddfc ("iommu/vt-d: Replace intel SVM APIs with generic SVA APIs") Reported-by: kbuild test robot Signed-off-by: Jacob Pan Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1590689031-79318-1-git-send-email-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel-svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index acc7555b002d..a035ef911fba 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -601,7 +601,7 @@ success: } /* Caller must hold pasid_mutex */ -int intel_svm_unbind_mm(struct device *dev, int pasid) +static int intel_svm_unbind_mm(struct device *dev, int pasid) { struct intel_svm_dev *sdev; struct intel_iommu *iommu; From fb1b6955bbf3470d1a3dfb22efc729fa84c73a89 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:05 +0200 Subject: [PATCH 254/574] iommu/amd: Unexport get_dev_data() This function is internal to the AMD IOMMU driver and only exported because the amd_iommu_v2 modules calls it. But the reason it is called from there could better be handled by amd_iommu_is_attach_deferred(). So unexport get_dev_data() and use amd_iommu_is_attach_deferred() instead. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-3-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 9 +++++---- drivers/iommu/amd_iommu_proto.h | 3 ++- drivers/iommu/amd_iommu_v2.c | 10 ++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1b36c40d0712..73f3ac3ba276 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -280,11 +280,10 @@ static struct iommu_dev_data *find_dev_data(u16 devid) return dev_data; } -struct iommu_dev_data *get_dev_data(struct device *dev) +static struct iommu_dev_data *get_dev_data(struct device *dev) { return dev->archdata.iommu; } -EXPORT_SYMBOL(get_dev_data); /* * Find or create an IOMMU group for a acpihid device. @@ -2706,12 +2705,14 @@ static void amd_iommu_get_resv_regions(struct device *dev, list_add_tail(®ion->list, head); } -static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, - struct device *dev) +bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, + struct device *dev) { struct iommu_dev_data *dev_data = dev->archdata.iommu; + return dev_data->defer_attach; } +EXPORT_SYMBOL_GPL(amd_iommu_is_attach_deferred); static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) { diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 92c2ba6468a0..1c6c12c11368 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -92,5 +92,6 @@ static inline void *iommu_phys_to_virt(unsigned long paddr) } extern bool translation_pre_enabled(struct amd_iommu *iommu); -extern struct iommu_dev_data *get_dev_data(struct device *dev); +extern bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, + struct device *dev); #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index d6d85debd01b..9b6e038150c1 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -517,13 +517,12 @@ static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data) struct amd_iommu_fault *iommu_fault; struct pasid_state *pasid_state; struct device_state *dev_state; + struct pci_dev *pdev = NULL; unsigned long flags; struct fault *fault; bool finish; u16 tag, devid; int ret; - struct iommu_dev_data *dev_data; - struct pci_dev *pdev = NULL; iommu_fault = data; tag = iommu_fault->tag & 0x1ff; @@ -534,12 +533,11 @@ static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data) devid & 0xff); if (!pdev) return -ENODEV; - dev_data = get_dev_data(&pdev->dev); + + ret = NOTIFY_DONE; /* In kdump kernel pci dev is not initialized yet -> send INVALID */ - ret = NOTIFY_DONE; - if (translation_pre_enabled(amd_iommu_rlookup_table[devid]) - && dev_data->defer_attach) { + if (amd_iommu_is_attach_deferred(NULL, &pdev->dev)) { amd_iommu_complete_ppr(pdev, iommu_fault->pasid, PPR_INVALID, tag); goto out; From 1226c370744613b17f7cd564087c4dcd56207a07 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:06 +0200 Subject: [PATCH 255/574] iommu/amd: Let free_pagetable() not rely on domain->pt_root Use 'struct domain_pgtable' instead to free_pagetable(). This solves the problem that amd_iommu_domain_direct_map() needs to restore domain->pt_root after the device table has been updated just to make free_pagetable release the domain page-table. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-4-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 73f3ac3ba276..9b0884ea5f4c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1391,20 +1391,19 @@ static struct page *free_sub_pt(unsigned long root, int mode, return freelist; } -static void free_pagetable(struct protection_domain *domain) +static void free_pagetable(struct domain_pgtable *pgtable) { - struct domain_pgtable pgtable; struct page *freelist = NULL; unsigned long root; - amd_iommu_domain_get_pgtable(domain, &pgtable); - atomic64_set(&domain->pt_root, 0); + if (pgtable->mode == PAGE_MODE_NONE) + return; - BUG_ON(pgtable.mode < PAGE_MODE_NONE || - pgtable.mode > PAGE_MODE_6_LEVEL); + BUG_ON(pgtable->mode < PAGE_MODE_NONE || + pgtable->mode > PAGE_MODE_6_LEVEL); - root = (unsigned long)pgtable.root; - freelist = free_sub_pt(root, pgtable.mode, freelist); + root = (unsigned long)pgtable->root; + freelist = free_sub_pt(root, pgtable->mode, freelist); free_page_list(freelist); } @@ -1823,12 +1822,16 @@ static void free_gcr3_table(struct protection_domain *domain) */ static void dma_ops_domain_free(struct protection_domain *domain) { + struct domain_pgtable pgtable; + if (!domain) return; iommu_put_dma_cookie(&domain->domain); - free_pagetable(domain); + amd_iommu_domain_get_pgtable(domain, &pgtable); + atomic64_set(&domain->pt_root, 0); + free_pagetable(&pgtable); if (domain->id) domain_id_free(domain->id); @@ -2496,9 +2499,8 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) break; default: amd_iommu_domain_get_pgtable(domain, &pgtable); - - if (pgtable.mode != PAGE_MODE_NONE) - free_pagetable(domain); + atomic64_set(&domain->pt_root, 0); + free_pagetable(&pgtable); if (domain->flags & PD_IOMMUV2_MASK) free_gcr3_table(domain); @@ -2796,7 +2798,6 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom) struct protection_domain *domain = to_pdomain(dom); struct domain_pgtable pgtable; unsigned long flags; - u64 pt_root; spin_lock_irqsave(&domain->lock, flags); @@ -2804,18 +2805,13 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom) amd_iommu_domain_get_pgtable(domain, &pgtable); /* Update data structure */ - pt_root = amd_iommu_domain_encode_pgtable(NULL, PAGE_MODE_NONE); - atomic64_set(&domain->pt_root, pt_root); + atomic64_set(&domain->pt_root, 0); /* Make changes visible to IOMMUs */ update_domain(domain); - /* Restore old pgtable in domain->ptroot to free page-table */ - pt_root = amd_iommu_domain_encode_pgtable(pgtable.root, pgtable.mode); - atomic64_set(&domain->pt_root, pt_root); - /* Page-table is not visible to IOMMU anymore, so free it */ - free_pagetable(domain); + free_pagetable(&pgtable); spin_unlock_irqrestore(&domain->lock, flags); } From a71730e225be457dc51fc0b1fe155db18706d0fe Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:07 +0200 Subject: [PATCH 256/574] iommu/amd: Allocate page-table in protection_domain_init() Consolidate the allocation of the domain page-table in one place. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-5-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 48 ++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 9b0884ea5f4c..434435b99a8a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -71,6 +71,8 @@ */ #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) +#define DEFAULT_PGTABLE_LEVEL PAGE_MODE_3_LEVEL + static DEFINE_SPINLOCK(pd_bitmap_lock); /* List of all available dev_data structures */ @@ -99,7 +101,7 @@ struct iommu_cmd { struct kmem_cache *amd_iommu_irq_cache; static void update_domain(struct protection_domain *domain); -static int protection_domain_init(struct protection_domain *domain); +static int protection_domain_init(struct protection_domain *domain, int mode); static void detach_device(struct device *dev); static void update_and_flush_device_table(struct protection_domain *domain, struct domain_pgtable *pgtable); @@ -1847,21 +1849,14 @@ static void dma_ops_domain_free(struct protection_domain *domain) static struct protection_domain *dma_ops_domain_alloc(void) { struct protection_domain *domain; - u64 *pt_root, root; domain = kzalloc(sizeof(struct protection_domain), GFP_KERNEL); if (!domain) return NULL; - if (protection_domain_init(domain)) + if (protection_domain_init(domain, DEFAULT_PGTABLE_LEVEL)) goto free_domain; - pt_root = (void *)get_zeroed_page(GFP_KERNEL); - if (!pt_root) - goto free_domain; - - root = amd_iommu_domain_encode_pgtable(pt_root, PAGE_MODE_3_LEVEL); - atomic64_set(&domain->pt_root, root); domain->flags = PD_DMA_OPS_MASK; if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM) @@ -2401,18 +2396,31 @@ static void protection_domain_free(struct protection_domain *domain) kfree(domain); } -static int protection_domain_init(struct protection_domain *domain) +static int protection_domain_init(struct protection_domain *domain, int mode) { + u64 *pt_root = NULL, root; + + BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL); + spin_lock_init(&domain->lock); domain->id = domain_id_alloc(); if (!domain->id) return -ENOMEM; INIT_LIST_HEAD(&domain->dev_list); + if (mode != PAGE_MODE_NONE) { + pt_root = (void *)get_zeroed_page(GFP_KERNEL); + if (!pt_root) + return -ENOMEM; + } + + root = amd_iommu_domain_encode_pgtable(pt_root, mode); + atomic64_set(&domain->pt_root, root); + return 0; } -static struct protection_domain *protection_domain_alloc(void) +static struct protection_domain *protection_domain_alloc(int mode) { struct protection_domain *domain; @@ -2420,7 +2428,7 @@ static struct protection_domain *protection_domain_alloc(void) if (!domain) return NULL; - if (protection_domain_init(domain)) + if (protection_domain_init(domain, mode)) goto out_err; return domain; @@ -2434,23 +2442,13 @@ out_err: static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) { struct protection_domain *pdomain; - u64 *pt_root, root; switch (type) { case IOMMU_DOMAIN_UNMANAGED: - pdomain = protection_domain_alloc(); + pdomain = protection_domain_alloc(DEFAULT_PGTABLE_LEVEL); if (!pdomain) return NULL; - pt_root = (void *)get_zeroed_page(GFP_KERNEL); - if (!pt_root) { - protection_domain_free(pdomain); - return NULL; - } - - root = amd_iommu_domain_encode_pgtable(pt_root, PAGE_MODE_3_LEVEL); - atomic64_set(&pdomain->pt_root, root); - pdomain->domain.geometry.aperture_start = 0; pdomain->domain.geometry.aperture_end = ~0ULL; pdomain->domain.geometry.force_aperture = true; @@ -2464,11 +2462,9 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) } break; case IOMMU_DOMAIN_IDENTITY: - pdomain = protection_domain_alloc(); + pdomain = protection_domain_alloc(PAGE_MODE_NONE); if (!pdomain) return NULL; - - atomic64_set(&pdomain->pt_root, PAGE_MODE_NONE); break; default: return NULL; From 75b27745097dde18a0b8936d73cf72df61b2253b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:08 +0200 Subject: [PATCH 257/574] iommu/amd: Free page-table in protection_domain_free() Align release of the page-table with the place where it is allocated. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-6-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 434435b99a8a..24e5678b5708 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2387,12 +2387,18 @@ static void cleanup_domain(struct protection_domain *domain) static void protection_domain_free(struct protection_domain *domain) { + struct domain_pgtable pgtable; + if (!domain) return; if (domain->id) domain_id_free(domain->id); + amd_iommu_domain_get_pgtable(domain, &pgtable); + atomic64_set(&domain->pt_root, 0); + free_pagetable(&pgtable); + kfree(domain); } @@ -2476,7 +2482,6 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; - struct domain_pgtable pgtable; domain = to_pdomain(dom); @@ -2494,10 +2499,6 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) dma_ops_domain_free(domain); break; default: - amd_iommu_domain_get_pgtable(domain, &pgtable); - atomic64_set(&domain->pt_root, 0); - free_pagetable(&pgtable); - if (domain->flags & PD_IOMMUV2_MASK) free_gcr3_table(domain); From 301441a0994689b56e98a0e8ecdc383b144ed258 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:09 +0200 Subject: [PATCH 258/574] iommu/amd: Consolidate domain allocation/freeing Merge the allocation code paths of DMA and UNMANAGED domains and remove code duplication. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-7-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 116 +++++++++----------------------------- 1 file changed, 27 insertions(+), 89 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 24e5678b5708..517a5eb30360 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -101,7 +101,6 @@ struct iommu_cmd { struct kmem_cache *amd_iommu_irq_cache; static void update_domain(struct protection_domain *domain); -static int protection_domain_init(struct protection_domain *domain, int mode); static void detach_device(struct device *dev); static void update_and_flush_device_table(struct protection_domain *domain, struct domain_pgtable *pgtable); @@ -1818,58 +1817,6 @@ static void free_gcr3_table(struct protection_domain *domain) free_page((unsigned long)domain->gcr3_tbl); } -/* - * Free a domain, only used if something went wrong in the - * allocation path and we need to free an already allocated page table - */ -static void dma_ops_domain_free(struct protection_domain *domain) -{ - struct domain_pgtable pgtable; - - if (!domain) - return; - - iommu_put_dma_cookie(&domain->domain); - - amd_iommu_domain_get_pgtable(domain, &pgtable); - atomic64_set(&domain->pt_root, 0); - free_pagetable(&pgtable); - - if (domain->id) - domain_id_free(domain->id); - - kfree(domain); -} - -/* - * Allocates a new protection domain usable for the dma_ops functions. - * It also initializes the page table and the address allocator data - * structures required for the dma_ops interface - */ -static struct protection_domain *dma_ops_domain_alloc(void) -{ - struct protection_domain *domain; - - domain = kzalloc(sizeof(struct protection_domain), GFP_KERNEL); - if (!domain) - return NULL; - - if (protection_domain_init(domain, DEFAULT_PGTABLE_LEVEL)) - goto free_domain; - - domain->flags = PD_DMA_OPS_MASK; - - if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM) - goto free_domain; - - return domain; - -free_domain: - dma_ops_domain_free(domain); - - return NULL; -} - /* * little helper function to check whether a given protection domain is a * dma_ops domain @@ -2447,36 +2394,32 @@ out_err: static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) { - struct protection_domain *pdomain; + struct protection_domain *domain; + int mode = DEFAULT_PGTABLE_LEVEL; - switch (type) { - case IOMMU_DOMAIN_UNMANAGED: - pdomain = protection_domain_alloc(DEFAULT_PGTABLE_LEVEL); - if (!pdomain) - return NULL; + if (type == IOMMU_DOMAIN_IDENTITY) + mode = PAGE_MODE_NONE; - pdomain->domain.geometry.aperture_start = 0; - pdomain->domain.geometry.aperture_end = ~0ULL; - pdomain->domain.geometry.force_aperture = true; - - break; - case IOMMU_DOMAIN_DMA: - pdomain = dma_ops_domain_alloc(); - if (!pdomain) { - pr_err("Failed to allocate\n"); - return NULL; - } - break; - case IOMMU_DOMAIN_IDENTITY: - pdomain = protection_domain_alloc(PAGE_MODE_NONE); - if (!pdomain) - return NULL; - break; - default: + domain = protection_domain_alloc(mode); + if (!domain) return NULL; + + domain->domain.geometry.aperture_start = 0; + domain->domain.geometry.aperture_end = ~0ULL; + domain->domain.geometry.force_aperture = true; + + if (type == IOMMU_DOMAIN_DMA) { + if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM) + goto free_domain; + domain->flags = PD_DMA_OPS_MASK; } - return &pdomain->domain; + return &domain->domain; + +free_domain: + protection_domain_free(domain); + + return NULL; } static void amd_iommu_domain_free(struct iommu_domain *dom) @@ -2493,18 +2436,13 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) if (!dom) return; - switch (dom->type) { - case IOMMU_DOMAIN_DMA: - /* Now release the domain */ - dma_ops_domain_free(domain); - break; - default: - if (domain->flags & PD_IOMMUV2_MASK) - free_gcr3_table(domain); + if (dom->type == IOMMU_DOMAIN_DMA) + iommu_put_dma_cookie(&domain->domain); - protection_domain_free(domain); - break; - } + if (domain->flags & PD_IOMMUV2_MASK) + free_gcr3_table(domain); + + protection_domain_free(domain); } static void amd_iommu_detach_device(struct iommu_domain *dom, From e1980df36c5c94899df1fd71f0b0a6dba457051c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:10 +0200 Subject: [PATCH 259/574] iommu/amd: Remove PD_DMA_OPS_MASK This is covered by IOMMU_DOMAIN_DMA from the IOMMU core code already, so remove it. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-8-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 517a5eb30360..1faf2e0dad32 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1817,15 +1817,6 @@ static void free_gcr3_table(struct protection_domain *domain) free_page((unsigned long)domain->gcr3_tbl); } -/* - * little helper function to check whether a given protection domain is a - * dma_ops domain - */ -static bool dma_ops_domain(struct protection_domain *domain) -{ - return domain->flags & PD_DMA_OPS_MASK; -} - static void set_dte_entry(u16 devid, struct protection_domain *domain, struct domain_pgtable *pgtable, bool ats, bool ppr) @@ -2408,11 +2399,9 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) domain->domain.geometry.aperture_end = ~0ULL; domain->domain.geometry.force_aperture = true; - if (type == IOMMU_DOMAIN_DMA) { - if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM) - goto free_domain; - domain->flags = PD_DMA_OPS_MASK; - } + if (type == IOMMU_DOMAIN_DMA && + iommu_get_dma_cookie(&domain->domain) == -ENOMEM) + goto free_domain; return &domain->domain; @@ -3024,17 +3013,18 @@ struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) if (!check_device(dev)) return NULL; - pdomain = get_dev_data(dev)->domain; + pdomain = get_dev_data(dev)->domain; + io_domain = iommu_get_domain_for_dev(dev); if (pdomain == NULL && get_dev_data(dev)->defer_attach) { get_dev_data(dev)->defer_attach = false; - io_domain = iommu_get_domain_for_dev(dev); pdomain = to_pdomain(io_domain); attach_device(dev, pdomain); } + if (pdomain == NULL) return NULL; - if (!dma_ops_domain(pdomain)) + if (io_domain->type != IOMMU_DOMAIN_DMA) return NULL; /* Only return IOMMUv2 domains */ From 786dfe496e3b30b92d11cf7ae1d36d1a458702f6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:11 +0200 Subject: [PATCH 260/574] iommu/amd: Merge private header files Merge amd_iommu_proto.h into amd_iommu.h. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-9-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 4 +- drivers/iommu/amd_iommu.h | 96 +++++++++++++++++++++++++++++- drivers/iommu/amd_iommu_debugfs.c | 5 +- drivers/iommu/amd_iommu_init.c | 4 +- drivers/iommu/amd_iommu_proto.h | 97 ------------------------------- drivers/iommu/amd_iommu_v2.c | 4 +- 6 files changed, 100 insertions(+), 110 deletions(-) delete mode 100644 drivers/iommu/amd_iommu_proto.h diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1faf2e0dad32..862c8d5e63d6 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -43,8 +42,7 @@ #include #include -#include "amd_iommu_proto.h" -#include "amd_iommu_types.h" +#include "amd_iommu.h" #include "irq_remapping.h" #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) diff --git a/drivers/iommu/amd_iommu.h b/drivers/iommu/amd_iommu.h index 12d540d9b59b..f892992c8744 100644 --- a/drivers/iommu/amd_iommu.h +++ b/drivers/iommu/amd_iommu.h @@ -1,9 +1,103 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + */ #ifndef AMD_IOMMU_H #define AMD_IOMMU_H -int __init add_special_device(u8 type, u8 id, u16 *devid, bool cmd_line); +#include + +#include "amd_iommu_types.h" + +extern int amd_iommu_get_num_iommus(void); +extern int amd_iommu_init_dma_ops(void); +extern int amd_iommu_init_passthrough(void); +extern irqreturn_t amd_iommu_int_thread(int irq, void *data); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); +extern int amd_iommu_init_devices(void); +extern void amd_iommu_uninit_devices(void); +extern void amd_iommu_init_notifier(void); +extern int amd_iommu_init_api(void); + +#ifdef CONFIG_AMD_IOMMU_DEBUGFS +void amd_iommu_debugfs_setup(struct amd_iommu *iommu); +#else +static inline void amd_iommu_debugfs_setup(struct amd_iommu *iommu) {} +#endif + +/* Needed for interrupt remapping */ +extern int amd_iommu_prepare(void); +extern int amd_iommu_enable(void); +extern void amd_iommu_disable(void); +extern int amd_iommu_reenable(int); +extern int amd_iommu_enable_faulting(void); +extern int amd_iommu_guest_ir; + +/* IOMMUv2 specific functions */ +struct iommu_domain; + +extern bool amd_iommu_v2_supported(void); +extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); +extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); +extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); +extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); +extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, + u64 address); +extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid); +extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, + unsigned long cr3); +extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid); +extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev); + +#ifdef CONFIG_IRQ_REMAP +extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu); +#else +static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu) +{ + return 0; +} +#endif + +#define PPR_SUCCESS 0x0 +#define PPR_INVALID 0x1 +#define PPR_FAILURE 0xf + +extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, + int status, int tag); + +static inline bool is_rd890_iommu(struct pci_dev *pdev) +{ + return (pdev->vendor == PCI_VENDOR_ID_ATI) && + (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); +} + +static inline bool iommu_feature(struct amd_iommu *iommu, u64 f) +{ + if (!(iommu->cap & (1 << IOMMU_CAP_EFR))) + return false; + + return !!(iommu->features & f); +} + +static inline u64 iommu_virt_to_phys(void *vaddr) +{ + return (u64)__sme_set(virt_to_phys(vaddr)); +} + +static inline void *iommu_phys_to_virt(unsigned long paddr) +{ + return phys_to_virt(__sme_clr(paddr)); +} + +extern bool translation_pre_enabled(struct amd_iommu *iommu); +extern bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, + struct device *dev); +extern int __init add_special_device(u8 type, u8 id, u16 *devid, + bool cmd_line); #ifdef CONFIG_DMI void amd_iommu_apply_ivrs_quirks(void); diff --git a/drivers/iommu/amd_iommu_debugfs.c b/drivers/iommu/amd_iommu_debugfs.c index c6a5c737ef09..545372fcc72f 100644 --- a/drivers/iommu/amd_iommu_debugfs.c +++ b/drivers/iommu/amd_iommu_debugfs.c @@ -8,10 +8,9 @@ */ #include -#include #include -#include "amd_iommu_proto.h" -#include "amd_iommu_types.h" + +#include "amd_iommu.h" static struct dentry *amd_iommu_debugfs; static DEFINE_MUTEX(amd_iommu_debugfs_lock); diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 5b81fd16f5fa..3faff7f80fd2 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -32,9 +31,8 @@ #include #include + #include "amd_iommu.h" -#include "amd_iommu_proto.h" -#include "amd_iommu_types.h" #include "irq_remapping.h" /* diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h deleted file mode 100644 index 1c6c12c11368..000000000000 --- a/drivers/iommu/amd_iommu_proto.h +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. - * Author: Joerg Roedel - */ - -#ifndef _ASM_X86_AMD_IOMMU_PROTO_H -#define _ASM_X86_AMD_IOMMU_PROTO_H - -#include "amd_iommu_types.h" - -extern int amd_iommu_get_num_iommus(void); -extern int amd_iommu_init_dma_ops(void); -extern int amd_iommu_init_passthrough(void); -extern irqreturn_t amd_iommu_int_thread(int irq, void *data); -extern irqreturn_t amd_iommu_int_handler(int irq, void *data); -extern void amd_iommu_apply_erratum_63(u16 devid); -extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); -extern int amd_iommu_init_devices(void); -extern void amd_iommu_uninit_devices(void); -extern void amd_iommu_init_notifier(void); -extern int amd_iommu_init_api(void); - -#ifdef CONFIG_AMD_IOMMU_DEBUGFS -void amd_iommu_debugfs_setup(struct amd_iommu *iommu); -#else -static inline void amd_iommu_debugfs_setup(struct amd_iommu *iommu) {} -#endif - -/* Needed for interrupt remapping */ -extern int amd_iommu_prepare(void); -extern int amd_iommu_enable(void); -extern void amd_iommu_disable(void); -extern int amd_iommu_reenable(int); -extern int amd_iommu_enable_faulting(void); -extern int amd_iommu_guest_ir; - -/* IOMMUv2 specific functions */ -struct iommu_domain; - -extern bool amd_iommu_v2_supported(void); -extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); -extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); -extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); -extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); -extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, - u64 address); -extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid); -extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, - unsigned long cr3); -extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid); -extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev); - -#ifdef CONFIG_IRQ_REMAP -extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu); -#else -static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu) -{ - return 0; -} -#endif - -#define PPR_SUCCESS 0x0 -#define PPR_INVALID 0x1 -#define PPR_FAILURE 0xf - -extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, - int status, int tag); - -static inline bool is_rd890_iommu(struct pci_dev *pdev) -{ - return (pdev->vendor == PCI_VENDOR_ID_ATI) && - (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); -} - -static inline bool iommu_feature(struct amd_iommu *iommu, u64 f) -{ - if (!(iommu->cap & (1 << IOMMU_CAP_EFR))) - return false; - - return !!(iommu->features & f); -} - -static inline u64 iommu_virt_to_phys(void *vaddr) -{ - return (u64)__sme_set(virt_to_phys(vaddr)); -} - -static inline void *iommu_phys_to_virt(unsigned long paddr) -{ - return phys_to_virt(__sme_clr(paddr)); -} - -extern bool translation_pre_enabled(struct amd_iommu *iommu); -extern bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, - struct device *dev); -#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 9b6e038150c1..c8a7b6b39222 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -13,13 +13,11 @@ #include #include #include -#include #include #include #include -#include "amd_iommu_types.h" -#include "amd_iommu_proto.h" +#include "amd_iommu.h" MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Joerg Roedel "); From 05a0542b456e135f362ba83a17ccff73bac0b92f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:12 +0200 Subject: [PATCH 261/574] iommu/amd: Store dev_data as device iommu private data Do not use dev->archdata.iommu anymore and switch to using the private per-device pointer provided by the IOMMU core code. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-10-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 862c8d5e63d6..7461b49580a8 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -279,11 +279,6 @@ static struct iommu_dev_data *find_dev_data(u16 devid) return dev_data; } -static struct iommu_dev_data *get_dev_data(struct device *dev) -{ - return dev->archdata.iommu; -} - /* * Find or create an IOMMU group for a acpihid device. */ @@ -334,7 +329,7 @@ static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum) { struct iommu_dev_data *dev_data; - dev_data = get_dev_data(&pdev->dev); + dev_data = dev_iommu_priv_get(&pdev->dev); return dev_data->errata & (1 << erratum) ? true : false; } @@ -369,7 +364,7 @@ static int iommu_init_device(struct device *dev) struct iommu_dev_data *dev_data; int devid; - if (dev->archdata.iommu) + if (dev_iommu_priv_get(dev)) return 0; devid = get_device_id(dev); @@ -396,7 +391,7 @@ static int iommu_init_device(struct device *dev) dev_data->iommu_v2 = iommu->is_iommu_v2; } - dev->archdata.iommu = dev_data; + dev_iommu_priv_set(dev, dev_data); return 0; } @@ -431,6 +426,8 @@ static void amd_iommu_uninit_device(struct device *dev) if (dev_data->domain) detach_device(dev); + dev_iommu_priv_set(dev, NULL); + /* * We keep dev_data around for unplugged devices and reuse it when the * device is re-plugged - not doing so would introduce a ton of races. @@ -493,7 +490,7 @@ static void amd_iommu_report_page_fault(u16 devid, u16 domain_id, pdev = pci_get_domain_bus_and_slot(0, PCI_BUS_NUM(devid), devid & 0xff); if (pdev) - dev_data = get_dev_data(&pdev->dev); + dev_data = dev_iommu_priv_get(&pdev->dev); if (dev_data && __ratelimit(&dev_data->rs)) { pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n", @@ -2033,7 +2030,7 @@ static int attach_device(struct device *dev, spin_lock_irqsave(&domain->lock, flags); - dev_data = get_dev_data(dev); + dev_data = dev_iommu_priv_get(dev); spin_lock(&dev_data->lock); @@ -2097,7 +2094,7 @@ static void detach_device(struct device *dev) struct iommu_dev_data *dev_data; unsigned long flags; - dev_data = get_dev_data(dev); + dev_data = dev_iommu_priv_get(dev); domain = dev_data->domain; spin_lock_irqsave(&domain->lock, flags); @@ -2146,7 +2143,7 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) iommu = amd_iommu_rlookup_table[devid]; - if (get_dev_data(dev)) + if (dev_iommu_priv_get(dev)) return &iommu->iommu; ret = iommu_init_device(dev); @@ -2435,7 +2432,7 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { - struct iommu_dev_data *dev_data = dev->archdata.iommu; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct amd_iommu *iommu; int devid; @@ -2473,7 +2470,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, if (!check_device(dev)) return -EINVAL; - dev_data = dev->archdata.iommu; + dev_data = dev_iommu_priv_get(dev); dev_data->defer_attach = false; iommu = amd_iommu_rlookup_table[dev_data->devid]; @@ -2632,7 +2629,7 @@ static void amd_iommu_get_resv_regions(struct device *dev, bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, struct device *dev) { - struct iommu_dev_data *dev_data = dev->archdata.iommu; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); return dev_data->defer_attach; } @@ -2659,7 +2656,7 @@ static int amd_iommu_def_domain_type(struct device *dev) { struct iommu_dev_data *dev_data; - dev_data = get_dev_data(dev); + dev_data = dev_iommu_priv_get(dev); if (!dev_data) return 0; @@ -2992,7 +2989,7 @@ int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, struct amd_iommu *iommu; struct iommu_cmd cmd; - dev_data = get_dev_data(&pdev->dev); + dev_data = dev_iommu_priv_get(&pdev->dev); iommu = amd_iommu_rlookup_table[dev_data->devid]; build_complete_ppr(&cmd, dev_data->devid, pasid, status, @@ -3005,16 +3002,19 @@ EXPORT_SYMBOL(amd_iommu_complete_ppr); struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) { struct protection_domain *pdomain; - struct iommu_domain *io_domain; + struct iommu_dev_data *dev_data; struct device *dev = &pdev->dev; + struct iommu_domain *io_domain; if (!check_device(dev)) return NULL; - pdomain = get_dev_data(dev)->domain; + dev_data = dev_iommu_priv_get(&pdev->dev); + pdomain = dev_data->domain; io_domain = iommu_get_domain_for_dev(dev); - if (pdomain == NULL && get_dev_data(dev)->defer_attach) { - get_dev_data(dev)->defer_attach = false; + + if (pdomain == NULL && dev_data->defer_attach) { + dev_data->defer_attach = false; pdomain = to_pdomain(io_domain); attach_device(dev, pdomain); } @@ -3040,7 +3040,7 @@ void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum) if (!amd_iommu_v2_supported()) return; - dev_data = get_dev_data(&pdev->dev); + dev_data = dev_iommu_priv_get(&pdev->dev); dev_data->errata |= (1 << erratum); } EXPORT_SYMBOL(amd_iommu_enable_device_erratum); From 736c3333e3970e7eb0c47a3811588ff0e760f19b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 May 2020 13:53:13 +0200 Subject: [PATCH 262/574] iommu/amd: Remove redundant devid checks Checking the return value of get_device_id() in a code-path which has already done check_device() is not needed, as check_device() does the same check and bails out if it fails. Signed-off-by: Joerg Roedel Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20200527115313.7426-11-joro@8bytes.org --- drivers/iommu/amd_iommu.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 7461b49580a8..7c1884cab91f 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -413,13 +413,8 @@ static void iommu_ignore_device(struct device *dev) static void amd_iommu_uninit_device(struct device *dev) { struct iommu_dev_data *dev_data; - int devid; - devid = get_device_id(dev); - if (devid < 0) - return; - - dev_data = search_dev_data(devid); + dev_data = dev_iommu_priv_get(dev); if (!dev_data) return; @@ -2173,16 +2168,12 @@ static void amd_iommu_probe_finalize(struct device *dev) static void amd_iommu_release_device(struct device *dev) { + int devid = get_device_id(dev); struct amd_iommu *iommu; - int devid; if (!check_device(dev)) return; - devid = get_device_id(dev); - if (devid < 0) - return; - iommu = amd_iommu_rlookup_table[devid]; amd_iommu_uninit_device(dev); From 9271dfd9e0f79e2969dcbe28568bce0fdc4f8f73 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sun, 24 May 2020 02:46:53 -0400 Subject: [PATCH 263/574] drm/amdgpu/pm: return an error during GPU reset or suspend (v2) Return an error for sysfs and debugfs power interfaces during gpu reset and suspend. Prevents access to the hw while it may be in an unusable state. v2: squash in fix to drop suspend check Acked-by: Nirmoy Das Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 171 +++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index d7646cbce346..775e389c9a13 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -163,6 +163,9 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev, enum amd_pm_state_type pm; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -196,6 +199,9 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev, enum amd_pm_state_type state; int ret; + if (adev->in_gpu_reset) + return -EPERM; + if (strncmp("battery", buf, strlen("battery")) == 0) state = POWER_STATE_TYPE_BATTERY; else if (strncmp("balanced", buf, strlen("balanced")) == 0) @@ -297,6 +303,9 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev, enum amd_dpm_forced_level level = 0xff; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -334,6 +343,9 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev, enum amd_dpm_forced_level current_level = 0xff; int ret = 0; + if (adev->in_gpu_reset) + return -EPERM; + if (strncmp("low", buf, strlen("low")) == 0) { level = AMD_DPM_FORCED_LEVEL_LOW; } else if (strncmp("high", buf, strlen("high")) == 0) { @@ -433,6 +445,9 @@ static ssize_t amdgpu_get_pp_num_states(struct device *dev, struct pp_states_info data; int i, buf_len, ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -472,6 +487,9 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev, enum amd_pm_state_type pm = 0; int i = 0, ret = 0; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -508,6 +526,9 @@ static ssize_t amdgpu_get_pp_force_state(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; + if (adev->in_gpu_reset) + return -EPERM; + if (adev->pp_force_state_enabled) return amdgpu_get_pp_cur_state(dev, attr, buf); else @@ -525,6 +546,9 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev, unsigned long idx; int ret; + if (adev->in_gpu_reset) + return -EPERM; + if (strlen(buf) == 1) adev->pp_force_state_enabled = false; else if (is_support_sw_smu(adev)) @@ -580,6 +604,9 @@ static ssize_t amdgpu_get_pp_table(struct device *dev, char *table = NULL; int size, ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -619,6 +646,9 @@ static ssize_t amdgpu_set_pp_table(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; int ret = 0; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -721,6 +751,9 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev, const char delimiter[3] = {' ', '\n', '\0'}; uint32_t type; + if (adev->in_gpu_reset) + return -EPERM; + if (count > 127) return -EINVAL; @@ -810,6 +843,9 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -859,6 +895,9 @@ static ssize_t amdgpu_set_pp_features(struct device *dev, uint64_t featuremask; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = kstrtou64(buf, 0, &featuremask); if (ret) return -EINVAL; @@ -899,6 +938,9 @@ static ssize_t amdgpu_get_pp_features(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -955,6 +997,9 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1018,6 +1063,9 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev, int ret; uint32_t mask = 0; + if (adev->in_gpu_reset) + return -EPERM; + ret = amdgpu_read_mask(buf, count, &mask); if (ret) return ret; @@ -1049,6 +1097,9 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1076,6 +1127,9 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev, uint32_t mask = 0; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = amdgpu_read_mask(buf, count, &mask); if (ret) return ret; @@ -1107,6 +1161,9 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1134,6 +1191,9 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev, int ret; uint32_t mask = 0; + if (adev->in_gpu_reset) + return -EPERM; + ret = amdgpu_read_mask(buf, count, &mask); if (ret) return ret; @@ -1167,6 +1227,9 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1194,6 +1257,9 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev, int ret; uint32_t mask = 0; + if (adev->in_gpu_reset) + return -EPERM; + ret = amdgpu_read_mask(buf, count, &mask); if (ret) return ret; @@ -1227,6 +1293,9 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1254,6 +1323,9 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev, int ret; uint32_t mask = 0; + if (adev->in_gpu_reset) + return -EPERM; + ret = amdgpu_read_mask(buf, count, &mask); if (ret) return ret; @@ -1287,6 +1359,9 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1314,6 +1389,9 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev, int ret; uint32_t mask = 0; + if (adev->in_gpu_reset) + return -EPERM; + ret = amdgpu_read_mask(buf, count, &mask); if (ret) return ret; @@ -1347,6 +1425,9 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev, uint32_t value = 0; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1372,6 +1453,9 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev, int ret; long int value; + if (adev->in_gpu_reset) + return -EPERM; + ret = kstrtol(buf, 0, &value); if (ret) @@ -1410,6 +1494,9 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev, uint32_t value = 0; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1435,6 +1522,9 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev, int ret; long int value; + if (adev->in_gpu_reset) + return -EPERM; + ret = kstrtol(buf, 0, &value); if (ret) @@ -1493,6 +1583,9 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev, ssize_t size; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(ddev->dev); if (ret < 0) return ret; @@ -1528,6 +1621,9 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev, long int profile_mode = 0; const char delimiter[3] = {' ', '\n', '\0'}; + if (adev->in_gpu_reset) + return -EPERM; + tmp[0] = *(buf); tmp[1] = '\0'; ret = kstrtol(tmp, 0, &profile_mode); @@ -1587,6 +1683,9 @@ static ssize_t amdgpu_get_gpu_busy_percent(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; int r, value, size = sizeof(value); + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(ddev->dev); if (r < 0) return r; @@ -1620,6 +1719,9 @@ static ssize_t amdgpu_get_mem_busy_percent(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; int r, value, size = sizeof(value); + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(ddev->dev); if (r < 0) return r; @@ -1658,6 +1760,9 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev, uint64_t count0 = 0, count1 = 0; int ret; + if (adev->in_gpu_reset) + return -EPERM; + if (adev->flags & AMD_IS_APU) return -ENODATA; @@ -1694,6 +1799,9 @@ static ssize_t amdgpu_get_unique_id(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; + if (adev->in_gpu_reset) + return -EPERM; + if (adev->unique_id) return snprintf(buf, PAGE_SIZE, "%016llx\n", adev->unique_id); @@ -1888,6 +1996,9 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev, int channel = to_sensor_dev_attr(attr)->index; int r, temp = 0, size = sizeof(temp); + if (adev->in_gpu_reset) + return -EPERM; + if (channel >= PP_TEMP_MAX) return -EINVAL; @@ -2019,6 +2130,9 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev, u32 pwm_mode = 0; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(adev->ddev->dev); if (ret < 0) return ret; @@ -2050,6 +2164,9 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev, int err, ret; int value; + if (adev->in_gpu_reset) + return -EPERM; + err = kstrtoint(buf, 10, &value); if (err) return err; @@ -2099,6 +2216,9 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev, u32 value; u32 pwm_mode; + if (adev->in_gpu_reset) + return -EPERM; + err = pm_runtime_get_sync(adev->ddev->dev); if (err < 0) return err; @@ -2148,6 +2268,9 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev, int err; u32 speed = 0; + if (adev->in_gpu_reset) + return -EPERM; + err = pm_runtime_get_sync(adev->ddev->dev); if (err < 0) return err; @@ -2178,6 +2301,9 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev, int err; u32 speed = 0; + if (adev->in_gpu_reset) + return -EPERM; + err = pm_runtime_get_sync(adev->ddev->dev); if (err < 0) return err; @@ -2207,6 +2333,9 @@ static ssize_t amdgpu_hwmon_get_fan1_min(struct device *dev, u32 size = sizeof(min_rpm); int r; + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(adev->ddev->dev); if (r < 0) return r; @@ -2232,6 +2361,9 @@ static ssize_t amdgpu_hwmon_get_fan1_max(struct device *dev, u32 size = sizeof(max_rpm); int r; + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(adev->ddev->dev); if (r < 0) return r; @@ -2256,6 +2388,9 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev, int err; u32 rpm = 0; + if (adev->in_gpu_reset) + return -EPERM; + err = pm_runtime_get_sync(adev->ddev->dev); if (err < 0) return err; @@ -2285,6 +2420,9 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev, u32 value; u32 pwm_mode; + if (adev->in_gpu_reset) + return -EPERM; + err = pm_runtime_get_sync(adev->ddev->dev); if (err < 0) return err; @@ -2331,6 +2469,9 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev, u32 pwm_mode = 0; int ret; + if (adev->in_gpu_reset) + return -EPERM; + ret = pm_runtime_get_sync(adev->ddev->dev); if (ret < 0) return ret; @@ -2363,6 +2504,9 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev, int value; u32 pwm_mode; + if (adev->in_gpu_reset) + return -EPERM; + err = kstrtoint(buf, 10, &value); if (err) return err; @@ -2403,6 +2547,9 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev, u32 vddgfx; int r, size = sizeof(vddgfx); + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(adev->ddev->dev); if (r < 0) return r; @@ -2435,6 +2582,9 @@ static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev, u32 vddnb; int r, size = sizeof(vddnb); + if (adev->in_gpu_reset) + return -EPERM; + /* only APUs have vddnb */ if (!(adev->flags & AMD_IS_APU)) return -EINVAL; @@ -2472,6 +2622,9 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev, int r, size = sizeof(u32); unsigned uw; + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(adev->ddev->dev); if (r < 0) return r; @@ -2508,6 +2661,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev, ssize_t size; int r; + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(adev->ddev->dev); if (r < 0) return r; @@ -2537,6 +2693,9 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev, ssize_t size; int r; + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(adev->ddev->dev); if (r < 0) return r; @@ -2567,6 +2726,9 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev, int err; u32 value; + if (adev->in_gpu_reset) + return -EPERM; + if (amdgpu_sriov_vf(adev)) return -EINVAL; @@ -2605,6 +2767,9 @@ static ssize_t amdgpu_hwmon_show_sclk(struct device *dev, uint32_t sclk; int r, size = sizeof(sclk); + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(adev->ddev->dev); if (r < 0) return r; @@ -2637,6 +2802,9 @@ static ssize_t amdgpu_hwmon_show_mclk(struct device *dev, uint32_t mclk; int r, size = sizeof(mclk); + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(adev->ddev->dev); if (r < 0) return r; @@ -3497,6 +3665,9 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data) u32 flags = 0; int r; + if (adev->in_gpu_reset) + return -EPERM; + r = pm_runtime_get_sync(dev->dev); if (r < 0) return r; From cc831b9781e871e2bde00e9ae0041152139ee304 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Tue, 12 May 2020 19:06:37 +0800 Subject: [PATCH 264/574] drm/amd/powerplay: ack the SMUToHost interrupt on receive V2 There will be no further interrupt without proper ack for current one. V2: fix typo to really set ACK bit only Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index ae0361e225bb..aa76c2cea747 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -1561,6 +1561,7 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, * events for SMCToHost interrupt. */ uint32_t ctxid = entry->src_data[0]; + uint32_t data; if (client_id == SOC15_IH_CLIENTID_THM) { switch (src_id) { @@ -1590,6 +1591,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, orderly_poweroff(true); } else if (client_id == SOC15_IH_CLIENTID_MP1) { if (src_id == 0xfe) { + /* ACK SMUToHost interrupt */ + data = RREG32_SOC15(MP1, 0, mmMP1_SMN_IH_SW_INT_CTRL); + data = REG_SET_FIELD(data, MP1_SMN_IH_SW_INT_CTRL, INT_ACK, 1); + WREG32_SOC15(MP1, 0, mmMP1_SMN_IH_SW_INT_CTRL, data); + switch (ctxid) { case 0x3: dev_dbg(adev->dev, "Switched to AC mode!\n"); From 14ed1c908a7a623cc0cbf0203f8201d1b7d31d16 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Thu, 28 May 2020 09:44:44 -0400 Subject: [PATCH 265/574] Revert "drm/amd/display: disable dcn20 abm feature for bring up" This reverts commit 96cb7cf13d8530099c256c053648ad576588c387. This change was used for DCN2 bringup and is no longer desired. In fact it breaks backlight on DCN2 systems. Cc: Alexander Monakov Cc: Hersen Wu Cc: Anthony Koo Cc: Michael Chiu Signed-off-by: Harry Wentland Acked-by: Alex Deucher Reviewed-by: Nicholas Kazlauskas Reported-and-tested-by: Alexander Monakov Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d53c60b37cc6..f42e7e67ddba 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1356,7 +1356,7 @@ static int dm_late_init(void *handle) unsigned int linear_lut[16]; int i; struct dmcu *dmcu = NULL; - bool ret = false; + bool ret; if (!adev->dm.fw_dmcu) return detect_mst_link_for_all_connectors(adev->ddev); @@ -1377,13 +1377,10 @@ static int dm_late_init(void *handle) */ params.min_abm_backlight = 0x28F; - /* todo will enable for navi10 */ - if (adev->asic_type <= CHIP_RAVEN) { - ret = dmcu_load_iram(dmcu, params); + ret = dmcu_load_iram(dmcu, params); - if (!ret) - return -EINVAL; - } + if (!ret) + return -EINVAL; return detect_mst_link_for_all_connectors(adev->ddev); } From a1ef8bad506e4ffa0c57ac5f8cb99ab5cbc3b1fc Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Fri, 29 May 2020 15:18:47 +1000 Subject: [PATCH 266/574] drm/nouveau/disp/gm200-: fix NV_PDISP_SOR_HDMI2_CTRL(n) selection This is a SOR register, and not indexed by the bound head. Fixes display not coming up on high-bandwidth HDMI displays under a number of configurations. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigm200.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigm200.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigm200.c index 9b16a08eb4d9..bf6d41fb0c9f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigm200.c @@ -27,10 +27,10 @@ void gm200_hdmi_scdc(struct nvkm_ior *ior, int head, u8 scdc) { struct nvkm_device *device = ior->disp->engine.subdev.device; - const u32 hoff = head * 0x800; + const u32 soff = nv50_ior_base(ior); const u32 ctrl = scdc & 0x3; - nvkm_mask(device, 0x61c5bc + hoff, 0x00000003, ctrl); + nvkm_mask(device, 0x61c5bc + soff, 0x00000003, ctrl); ior->tmds.high_speed = !!(scdc & 0x2); } From 0ad679d157aa69ddf0ee46b564c9fbb646cf6d4e Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Fri, 29 May 2020 17:57:29 +1000 Subject: [PATCH 267/574] drm/nouveau/kms/gt215-: fix race with audio driver runpm The audio driver can call into nouveau right while we're in the middle of re-fetching the EDID, and decide it no longer needs to be awake. Stop depending on EDID in the audio component get_eld() callback, and instead cache whether audio support is present from the prior modeset. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/dispnv50/disp.c | 4 +++- drivers/gpu/drm/nouveau/nouveau_encoder.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c index 7622490d8602..e8ac510f8298 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c @@ -510,7 +510,7 @@ nv50_audio_component_get_eld(struct device *kdev, int port, int dev_id, if (!nv_connector || !nv_crtc || nv_encoder->or != port || nv_crtc->index != dev_id) continue; - *enabled = drm_detect_monitor_audio(nv_connector->edid); + *enabled = nv_encoder->audio; if (*enabled) { ret = drm_eld_size(nv_connector->base.eld); memcpy(buf, nv_connector->base.eld, @@ -600,6 +600,7 @@ nv50_audio_disable(struct drm_encoder *encoder, struct nouveau_crtc *nv_crtc) (0x0100 << nv_crtc->index), }; + nv_encoder->audio = false; nvif_mthd(&disp->disp->object, 0, &args, sizeof(args)); nv50_audio_component_eld_notify(drm->audio.component, nv_encoder->or, @@ -636,6 +637,7 @@ nv50_audio_enable(struct drm_encoder *encoder, struct drm_display_mode *mode) nvif_mthd(&disp->disp->object, 0, &args, sizeof(args.base) + drm_eld_size(args.data)); + nv_encoder->audio = true; nv50_audio_component_eld_notify(drm->audio.component, nv_encoder->or, nv_crtc->index); diff --git a/drivers/gpu/drm/nouveau/nouveau_encoder.h b/drivers/gpu/drm/nouveau/nouveau_encoder.h index de51733b0476..a72c412ac8b1 100644 --- a/drivers/gpu/drm/nouveau/nouveau_encoder.h +++ b/drivers/gpu/drm/nouveau/nouveau_encoder.h @@ -52,6 +52,7 @@ struct nouveau_encoder { * actually programmed on the hw, not the proposed crtc */ struct drm_crtc *crtc; u32 ctrl; + bool audio; struct drm_display_mode mode; int last_dpms; From f9009efac49c830460f55b9f6c08ee0d76f31b0d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:44:59 -0400 Subject: [PATCH 268/574] ceph: add dentry lease metric support For dentry leases, only count the hit/miss info triggered from the vfs calls. For the cases like request reply handling and ceph_trim_dentries, ignore them. For now, these are only viewable using debugfs. Future patches will allow the client to send the stats to the MDS. The output looks like: item total miss hit ------------------------------------------------- d_lease 11 7 141 URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/Makefile | 2 +- fs/ceph/debugfs.c | 33 +++++++++++++++++++++++++++++---- fs/ceph/dir.c | 12 ++++++++++++ fs/ceph/mds_client.c | 16 ++++++++++++++-- fs/ceph/mds_client.h | 4 ++++ fs/ceph/metric.c | 35 +++++++++++++++++++++++++++++++++++ fs/ceph/metric.h | 17 +++++++++++++++++ fs/ceph/super.h | 1 + 8 files changed, 113 insertions(+), 7 deletions(-) create mode 100644 fs/ceph/metric.c create mode 100644 fs/ceph/metric.h diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 0a0823d378db..50c635dc7f71 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ - debugfs.o util.o + debugfs.o util.o metric.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index dcaed75de9e6..1e7ea58f546a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -124,6 +124,23 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } +static int metric_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_client_metric *m = &mdsc->metric; + + seq_printf(s, "item total miss hit\n"); + seq_printf(s, "-------------------------------------------------\n"); + + seq_printf(s, "%-14s%-16lld%-16lld%lld\n", "d_lease", + atomic64_read(&m->total_dentries), + percpu_counter_sum(&m->d_lease_mis), + percpu_counter_sum(&m->d_lease_hit)); + + return 0; +} + static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) { struct seq_file *s = p; @@ -222,6 +239,7 @@ DEFINE_SHOW_ATTRIBUTE(mdsmap); DEFINE_SHOW_ATTRIBUTE(mdsc); DEFINE_SHOW_ATTRIBUTE(caps); DEFINE_SHOW_ATTRIBUTE(mds_sessions); +DEFINE_SHOW_ATTRIBUTE(metric); /* @@ -255,6 +273,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_mdsmap); debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_metric); debugfs_remove(fsc->debugfs_mdsc); } @@ -295,11 +314,17 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) fsc, &mdsc_fops); + fsc->debugfs_metric = debugfs_create_file("metrics", + 0400, + fsc->client->debugfs_dir, + fsc, + &metric_fops); + fsc->debugfs_caps = debugfs_create_file("caps", - 0400, - fsc->client->debugfs_dir, - fsc, - &caps_fops); + 0400, + fsc->client->debugfs_dir, + fsc, + &caps_fops); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4c4202c93b71..44cf85136ad5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -38,6 +38,8 @@ static int __dir_lease_try_check(const struct dentry *dentry); static int ceph_d_init(struct dentry *dentry) { struct ceph_dentry_info *di; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); if (!di) @@ -48,6 +50,9 @@ static int ceph_d_init(struct dentry *dentry) di->time = jiffies; dentry->d_fsdata = di; INIT_LIST_HEAD(&di->lease_list); + + atomic64_inc(&mdsc->metric.total_dentries); + return 0; } @@ -1709,6 +1714,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; + percpu_counter_inc(&mdsc->metric.d_lease_mis); + op = ceph_snap(dir) == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); @@ -1740,6 +1747,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) dout("d_revalidate %p lookup result=%d\n", dentry, err); } + } else { + percpu_counter_inc(&mdsc->metric.d_lease_hit); } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); @@ -1782,9 +1791,12 @@ static int ceph_d_delete(const struct dentry *dentry) static void ceph_d_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); dout("d_release %p\n", dentry); + atomic64_dec(&fsc->mdsc->metric.total_dentries); + spin_lock(&dentry->d_lock); __dentry_lease_unlist(di); dentry->d_fsdata = NULL; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7c63abf5bea9..39a27c56411a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4323,6 +4323,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc; + int err; mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); if (!mdsc) @@ -4331,8 +4332,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); if (!mdsc->mdsmap) { - kfree(mdsc); - return -ENOMEM; + err = -ENOMEM; + goto err_mdsc; } fsc->mdsc = mdsc; @@ -4371,6 +4372,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); atomic_set(&mdsc->cap_reclaim_pending, 0); + err = ceph_metric_init(&mdsc->metric); + if (err) + goto err_mdsmap; spin_lock_init(&mdsc->dentry_list_lock); INIT_LIST_HEAD(&mdsc->dentry_leases); @@ -4389,6 +4393,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) strscpy(mdsc->nodename, utsname()->nodename, sizeof(mdsc->nodename)); return 0; + +err_mdsmap: + kfree(mdsc->mdsmap); +err_mdsc: + kfree(mdsc); + return err; } /* @@ -4646,6 +4656,8 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) ceph_mdsc_stop(mdsc); + ceph_metric_destroy(&mdsc->metric); + fsc->mdsc = NULL; kfree(mdsc); dout("mdsc_destroy %p done\n", mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 903d9edfd4bf..605995ae3718 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -16,6 +16,8 @@ #include #include +#include "metric.h" + /* The first 8 bits are reserved for old ceph releases */ enum ceph_feature_type { CEPHFS_FEATURE_MIMIC = 8, @@ -454,6 +456,8 @@ struct ceph_mds_client { struct list_head dentry_leases; /* fifo list */ struct list_head dentry_dir_leases; /* lru list */ + struct ceph_client_metric metric; + spinlock_t snapid_map_lock; struct rb_root snapid_map_tree; struct list_head snapid_map_lru; diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c new file mode 100644 index 000000000000..873a3769fa0d --- /dev/null +++ b/fs/ceph/metric.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include + +#include "metric.h" + +int ceph_metric_init(struct ceph_client_metric *m) +{ + int ret; + + if (!m) + return -EINVAL; + + atomic64_set(&m->total_dentries, 0); + ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL); + if (ret) + return ret; + ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL); + if (ret) { + percpu_counter_destroy(&m->d_lease_hit); + return ret; + } + + return 0; +} + +void ceph_metric_destroy(struct ceph_client_metric *m) +{ + if (!m) + return; + + percpu_counter_destroy(&m->d_lease_mis); + percpu_counter_destroy(&m->d_lease_hit); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h new file mode 100644 index 000000000000..da725dacabfe --- /dev/null +++ b/fs/ceph/metric.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_MDS_METRIC_H +#define _FS_CEPH_MDS_METRIC_H + +#include +#include + +/* This is the global metrics */ +struct ceph_client_metric { + atomic64_t total_dentries; + struct percpu_counter d_lease_hit; + struct percpu_counter d_lease_mis; +}; + +extern int ceph_metric_init(struct ceph_client_metric *m); +extern void ceph_metric_destroy(struct ceph_client_metric *m); +#endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 60aac3aee055..5c73cf1ca5ff 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -128,6 +128,7 @@ struct ceph_fs_client { struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; + struct dentry *debugfs_metric; struct dentry *debugfs_mds_sessions; #endif From 1af16d547f3080d71060092d22e79a34527d1d08 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:45:00 -0400 Subject: [PATCH 269/574] ceph: add caps perf metric for each superblock Count hits and misses in the caps cache. If the client has all of the necessary caps when a task needs references, then it's counted as a hit. Any other situation is a miss. URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/acl.c | 2 +- fs/ceph/caps.c | 19 +++++++++++++++++++ fs/ceph/debugfs.c | 16 ++++++++++++++++ fs/ceph/dir.c | 5 +++-- fs/ceph/inode.c | 4 ++-- fs/ceph/metric.c | 26 ++++++++++++++++++++++---- fs/ceph/metric.h | 13 +++++++++++++ fs/ceph/super.h | 8 +++++--- fs/ceph/xattr.c | 4 ++-- 9 files changed, 83 insertions(+), 14 deletions(-) diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 26be6520d3fb..e0465741c591 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -22,7 +22,7 @@ static inline void ceph_set_cached_acl(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); spin_lock(&ci->i_ceph_lock); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + if (__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 0)) set_cached_acl(inode, type, acl); else forget_cached_acl(inode, type); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f1acde6fb9a6..82914866affc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -912,6 +912,20 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) return 0; } +int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, + int touch) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + int r; + + r = __ceph_caps_issued_mask(ci, mask, touch); + if (r) + ceph_update_cap_hit(&fsc->mdsc->metric); + else + ceph_update_cap_mis(&fsc->mdsc->metric); + return r; +} + /* * Return true if mask caps are currently being revoked by an MDS. */ @@ -2685,6 +2699,11 @@ out_unlock: if (snap_rwsem_locked) up_read(&mdsc->snap_rwsem); + if (!ret) + ceph_update_cap_mis(&mdsc->metric); + else if (ret == 1) + ceph_update_cap_hit(&mdsc->metric); + dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); return ret; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 1e7ea58f546a..37b2775949ad 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -129,6 +129,7 @@ static int metric_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client_metric *m = &mdsc->metric; + int i, nr_caps = 0; seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); @@ -138,6 +139,21 @@ static int metric_show(struct seq_file *s, void *p) percpu_counter_sum(&m->d_lease_mis), percpu_counter_sum(&m->d_lease_hit)); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s; + + s = __ceph_lookup_mds_session(mdsc, i); + if (!s) + continue; + nr_caps += s->s_nr_caps; + ceph_put_mds_session(s); + } + mutex_unlock(&mdsc->mutex); + seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps, + percpu_counter_sum(&m->i_caps_mis), + percpu_counter_sum(&m->i_caps_hit)); + return 0; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 44cf85136ad5..93476d447a4b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -349,8 +349,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && - __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { int shared_gen = atomic_read(&ci->i_shared_gen); + spin_unlock(&ci->i_ceph_lock); err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) @@ -767,7 +768,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, !is_root_ceph_dentry(dir, dentry) && ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && - (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7fef94fd1e55..357c937699d5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2288,8 +2288,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); - if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) - return 0; + if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) + return 0; mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 873a3769fa0d..2a4b7394eed4 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -16,13 +16,29 @@ int ceph_metric_init(struct ceph_client_metric *m) ret = percpu_counter_init(&m->d_lease_hit, 0, GFP_KERNEL); if (ret) return ret; + ret = percpu_counter_init(&m->d_lease_mis, 0, GFP_KERNEL); - if (ret) { - percpu_counter_destroy(&m->d_lease_hit); - return ret; - } + if (ret) + goto err_d_lease_mis; + + ret = percpu_counter_init(&m->i_caps_hit, 0, GFP_KERNEL); + if (ret) + goto err_i_caps_hit; + + ret = percpu_counter_init(&m->i_caps_mis, 0, GFP_KERNEL); + if (ret) + goto err_i_caps_mis; return 0; + +err_i_caps_mis: + percpu_counter_destroy(&m->i_caps_hit); +err_i_caps_hit: + percpu_counter_destroy(&m->d_lease_mis); +err_d_lease_mis: + percpu_counter_destroy(&m->d_lease_hit); + + return ret; } void ceph_metric_destroy(struct ceph_client_metric *m) @@ -30,6 +46,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m) if (!m) return; + percpu_counter_destroy(&m->i_caps_mis); + percpu_counter_destroy(&m->i_caps_hit); percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index da725dacabfe..098ee8a9adaf 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -10,8 +10,21 @@ struct ceph_client_metric { atomic64_t total_dentries; struct percpu_counter d_lease_hit; struct percpu_counter d_lease_mis; + + struct percpu_counter i_caps_hit; + struct percpu_counter i_caps_mis; }; extern int ceph_metric_init(struct ceph_client_metric *m); extern void ceph_metric_destroy(struct ceph_client_metric *m); + +static inline void ceph_update_cap_hit(struct ceph_client_metric *m) +{ + percpu_counter_inc(&m->i_caps_hit); +} + +static inline void ceph_update_cap_mis(struct ceph_client_metric *m) +{ + percpu_counter_inc(&m->i_caps_mis); +} #endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5c73cf1ca5ff..47cfd8935b9c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -645,6 +645,8 @@ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); +extern int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, + int t); extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *cap); @@ -657,12 +659,12 @@ static inline int ceph_caps_issued(struct ceph_inode_info *ci) return issued; } -static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, - int touch) +static inline int ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, + int mask, int touch) { int r; spin_lock(&ci->i_ceph_lock); - r = __ceph_caps_issued_mask(ci, mask, touch); + r = __ceph_caps_issued_mask_metric(ci, mask, touch); spin_unlock(&ci->i_ceph_lock); return r; } diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 7b8a070a782d..71ee34d160c3 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -856,7 +856,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, if (ci->i_xattrs.version == 0 || !((req_mask & CEPH_CAP_XATTR_SHARED) || - __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) { + __ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); /* security module gets xattr while filling trace */ @@ -914,7 +914,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ci->i_xattrs.version, ci->i_xattrs.index_version); if (ci->i_xattrs.version == 0 || - !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { + !__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); if (err) From 97e27aaa9a2cbd6238c66b3251d397e0eacc9968 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:45:01 -0400 Subject: [PATCH 270/574] ceph: add read/write latency metric support Calculate the latency for OSD read requests. Add a new r_end_stamp field to struct ceph_osd_request that will hold the time of that the reply was received. Use that to calculate the RTT for each call, and divide the sum of those by number of calls to get averate RTT. Keep a tally of RTT for OSD writes and number of calls to track average latency of OSD writes. URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 20 +++++++++ fs/ceph/debugfs.c | 41 +++++++++++++++++++ fs/ceph/file.c | 30 ++++++++++++++ fs/ceph/metric.c | 72 +++++++++++++++++++++++++++++++++ fs/ceph/metric.h | 22 ++++++++++ include/linux/ceph/osd_client.h | 3 ++ net/ceph/osd_client.c | 3 ++ 7 files changed, 191 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6f4678d98df7..01ad09733ac7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -11,10 +11,12 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" #include "cache.h" +#include "metric.h" #include #include @@ -216,6 +218,9 @@ static int ceph_sync_readpages(struct ceph_fs_client *fsc, if (!rc) rc = ceph_osdc_wait_request(osdc, req); + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + ceph_osdc_put_request(req); dout("readpages result %d\n", rc); return rc; @@ -299,6 +304,7 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data; int rc = req->r_result <= 0 ? req->r_result : 0; int bytes = req->r_result >= 0 ? req->r_result : 0; @@ -336,6 +342,10 @@ unlock: put_page(page); bytes -= PAGE_SIZE; } + + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + kfree(osd_data->pages); } @@ -643,6 +653,9 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc, if (!rc) rc = ceph_osdc_wait_request(osdc, req); + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + ceph_osdc_put_request(req); if (rc == 0) rc = len; @@ -794,6 +807,9 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_clear_error_write(ci); } + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, rc); + /* * We lost the cache cap, need to truncate the page before * it is unlocked, otherwise we'd truncate it later in the @@ -1852,6 +1868,10 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + out_put: ceph_osdc_put_request(req); if (err == -ECANCELED) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 37b2775949ad..30acbc7f9acd 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include @@ -18,6 +20,7 @@ #ifdef CONFIG_DEBUG_FS #include "mds_client.h" +#include "metric.h" static int mdsmap_show(struct seq_file *s, void *p) { @@ -124,13 +127,51 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } +#define CEPH_METRIC_SHOW(name, total, avg, min, max, sq) { \ + s64 _total, _avg, _min, _max, _sq, _st; \ + _avg = ktime_to_us(avg); \ + _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \ + _max = ktime_to_us(max); \ + _total = total - 1; \ + _sq = _total > 0 ? DIV64_U64_ROUND_CLOSEST(sq, _total) : 0; \ + _st = int_sqrt64(_sq); \ + _st = ktime_to_us(_st); \ + seq_printf(s, "%-14s%-12lld%-16lld%-16lld%-16lld%lld\n", \ + name, total, _avg, _min, _max, _st); \ +} + static int metric_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client_metric *m = &mdsc->metric; int i, nr_caps = 0; + s64 total, sum, avg, min, max, sq; + seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); + seq_printf(s, "-----------------------------------------------------------------------------------\n"); + + spin_lock(&m->read_latency_lock); + total = m->total_reads; + sum = m->read_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->read_latency_min; + max = m->read_latency_max; + sq = m->read_latency_sq_sum; + spin_unlock(&m->read_latency_lock); + CEPH_METRIC_SHOW("read", total, avg, min, max, sq); + + spin_lock(&m->write_latency_lock); + total = m->total_writes; + sum = m->write_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->write_latency_min; + max = m->write_latency_max; + sq = m->write_latency_sq_sum; + spin_unlock(&m->write_latency_lock); + CEPH_METRIC_SHOW("write", total, avg, min, max, sq); + + seq_printf(s, "\n"); seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index afdfca965a7f..160644ddaeed 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -11,11 +11,13 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" #include "cache.h" #include "io.h" +#include "metric.h" static __le32 ceph_flags_sys2wire(u32 flags) { @@ -906,6 +908,12 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ret = ceph_osdc_start_request(osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(osdc, req); + + ceph_update_read_latency(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + ret); + ceph_osdc_put_request(req); i_size = i_size_read(inode); @@ -1044,6 +1052,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_client_metric *metric = &fsc->mdsc->metric; BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); @@ -1051,6 +1061,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, osd_data->bvec_pos.iter.bi_size); + /* r_start_latency == 0 means the request was not submitted */ + if (req->r_start_latency) { + if (aio_req->write) + ceph_update_write_latency(metric, req->r_start_latency, + req->r_end_latency, rc); + else + ceph_update_read_latency(metric, req->r_start_latency, + req->r_end_latency, rc); + } + if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; BUG_ON(!aio_req->write); @@ -1179,6 +1199,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_client_metric *metric = &fsc->mdsc->metric; struct ceph_vino vino; struct ceph_osd_request *req; struct bio_vec *bvecs; @@ -1295,6 +1316,13 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (write) + ceph_update_write_latency(metric, req->r_start_latency, + req->r_end_latency, ret); + else + ceph_update_read_latency(metric, req->r_start_latency, + req->r_end_latency, ret); + size = i_size_read(inode); if (!write) { if (ret == -ENOENT) @@ -1466,6 +1494,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, ret); out: ceph_osdc_put_request(req); if (ret != 0) { diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 2a4b7394eed4..f5cf32e7789f 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -2,6 +2,7 @@ #include #include +#include #include "metric.h" @@ -29,6 +30,20 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_i_caps_mis; + spin_lock_init(&m->read_latency_lock); + m->read_latency_sq_sum = 0; + m->read_latency_min = KTIME_MAX; + m->read_latency_max = 0; + m->total_reads = 0; + m->read_latency_sum = 0; + + spin_lock_init(&m->write_latency_lock); + m->write_latency_sq_sum = 0; + m->write_latency_min = KTIME_MAX; + m->write_latency_max = 0; + m->total_writes = 0; + m->write_latency_sum = 0; + return 0; err_i_caps_mis: @@ -51,3 +66,60 @@ void ceph_metric_destroy(struct ceph_client_metric *m) percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); } + +static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, + ktime_t *min, ktime_t *max, + ktime_t *sq_sump, ktime_t lat) +{ + ktime_t total, avg, sq, lsum; + + total = ++(*totalp); + lsum = (*lsump += lat); + + if (unlikely(lat < *min)) + *min = lat; + if (unlikely(lat > *max)) + *max = lat; + + if (unlikely(total == 1)) + return; + + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); + sq = lat - avg; + avg = DIV64_U64_ROUND_CLOSEST(lsum, total); + sq = sq * (lat - avg); + *sq_sump += sq; +} + +void ceph_update_read_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->read_latency_lock); + __update_latency(&m->total_reads, &m->read_latency_sum, + &m->read_latency_min, &m->read_latency_max, + &m->read_latency_sq_sum, lat); + spin_unlock(&m->read_latency_lock); +} + +void ceph_update_write_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc && rc != -ETIMEDOUT)) + return; + + spin_lock(&m->write_latency_lock); + __update_latency(&m->total_writes, &m->write_latency_sum, + &m->write_latency_min, &m->write_latency_max, + &m->write_latency_sq_sum, lat); + spin_unlock(&m->write_latency_lock); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 098ee8a9adaf..ab1543c70a75 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -4,6 +4,7 @@ #include #include +#include /* This is the global metrics */ struct ceph_client_metric { @@ -13,6 +14,20 @@ struct ceph_client_metric { struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; + + spinlock_t read_latency_lock; + u64 total_reads; + ktime_t read_latency_sum; + ktime_t read_latency_sq_sum; + ktime_t read_latency_min; + ktime_t read_latency_max; + + spinlock_t write_latency_lock; + u64 total_writes; + ktime_t write_latency_sum; + ktime_t write_latency_sq_sum; + ktime_t write_latency_min; + ktime_t write_latency_max; }; extern int ceph_metric_init(struct ceph_client_metric *m); @@ -27,4 +42,11 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) { percpu_counter_inc(&m->i_caps_mis); } + +extern void ceph_update_read_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); +extern void ceph_update_write_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); #endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 9d9f745b98a1..734f7c6a9f56 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -213,6 +214,8 @@ struct ceph_osd_request { /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ + ktime_t r_start_latency; /* ktime_t */ + ktime_t r_end_latency; /* ktime_t */ int r_attempts; u32 r_map_dne_bound; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1d4973f8cd7a..ece124a5138e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2373,6 +2373,7 @@ static void account_request(struct ceph_osd_request *req) atomic_inc(&req->r_osdc->num_requests); req->r_start_stamp = jiffies; + req->r_start_latency = ktime_get(); } static void submit_request(struct ceph_osd_request *req, bool wrlocked) @@ -2389,6 +2390,8 @@ static void finish_request(struct ceph_osd_request *req) WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + req->r_end_latency = ktime_get(); + if (req->r_osd) unlink_request(req->r_osd, req); atomic_dec(&osdc->num_requests); From 70c948206f0616c7e46130a26165b6a5d98bade4 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Mar 2020 23:45:02 -0400 Subject: [PATCH 271/574] ceph: add metadata perf metric support Add a new "r_ended" field to struct ceph_mds_request and use that to maintain the average latency of MDS requests. URL: https://tracker.ceph.com/issues/43215 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 10 ++++++++++ fs/ceph/mds_client.c | 7 +++++++ fs/ceph/mds_client.h | 3 +++ fs/ceph/metric.c | 23 +++++++++++++++++++++++ fs/ceph/metric.h | 10 ++++++++++ 5 files changed, 53 insertions(+) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 30acbc7f9acd..070ed8481340 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -171,6 +171,16 @@ static int metric_show(struct seq_file *s, void *p) spin_unlock(&m->write_latency_lock); CEPH_METRIC_SHOW("write", total, avg, min, max, sq); + spin_lock(&m->metadata_latency_lock); + total = m->total_metadatas; + sum = m->metadata_latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->metadata_latency_min; + max = m->metadata_latency_max; + sq = m->metadata_latency_sq_sum; + spin_unlock(&m->metadata_latency_lock); + CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); + seq_printf(s, "\n"); seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 39a27c56411a..20fab5c72d39 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -2201,6 +2202,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) mutex_init(&req->r_fill_mutex); req->r_mdsc = mdsc; req->r_started = jiffies; + req->r_start_latency = ktime_get(); req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); @@ -2547,6 +2549,8 @@ out: static void complete_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + req->r_end_latency = ktime_get(); + if (req->r_callback) req->r_callback(mdsc, req); complete_all(&req->r_completion); @@ -3155,6 +3159,9 @@ out_err: /* kick calling process */ complete_request(mdsc, req); + + ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency, + req->r_end_latency, err); out: ceph_mdsc_put_request(req); return; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 605995ae3718..cceeb33f2ff9 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -299,6 +300,8 @@ struct ceph_mds_request { unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ + unsigned long r_start_latency; /* start time to measure latency */ + unsigned long r_end_latency; /* finish time to measure latency */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index f5cf32e7789f..9217f35bc2b9 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -44,6 +44,13 @@ int ceph_metric_init(struct ceph_client_metric *m) m->total_writes = 0; m->write_latency_sum = 0; + spin_lock_init(&m->metadata_latency_lock); + m->metadata_latency_sq_sum = 0; + m->metadata_latency_min = KTIME_MAX; + m->metadata_latency_max = 0; + m->total_metadatas = 0; + m->metadata_latency_sum = 0; + return 0; err_i_caps_mis: @@ -123,3 +130,19 @@ void ceph_update_write_latency(struct ceph_client_metric *m, &m->write_latency_sq_sum, lat); spin_unlock(&m->write_latency_lock); } + +void ceph_update_metadata_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ktime_t lat = ktime_sub(r_end, r_start); + + if (unlikely(rc && rc != -ENOENT)) + return; + + spin_lock(&m->metadata_latency_lock); + __update_latency(&m->total_metadatas, &m->metadata_latency_sum, + &m->metadata_latency_min, &m->metadata_latency_max, + &m->metadata_latency_sq_sum, lat); + spin_unlock(&m->metadata_latency_lock); +} diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index ab1543c70a75..ccd81285a450 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -28,6 +28,13 @@ struct ceph_client_metric { ktime_t write_latency_sq_sum; ktime_t write_latency_min; ktime_t write_latency_max; + + spinlock_t metadata_latency_lock; + u64 total_metadatas; + ktime_t metadata_latency_sum; + ktime_t metadata_latency_sq_sum; + ktime_t metadata_latency_min; + ktime_t metadata_latency_max; }; extern int ceph_metric_init(struct ceph_client_metric *m); @@ -49,4 +56,7 @@ extern void ceph_update_read_latency(struct ceph_client_metric *m, extern void ceph_update_write_latency(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); +extern void ceph_update_metadata_latency(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc); #endif /* _FS_CEPH_MDS_METRIC_H */ From 0a454bdd501ad1aa30bb72e9581efa338ad6ce5c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 17 Mar 2020 08:47:31 -0400 Subject: [PATCH 272/574] ceph: reorganize __send_cap for less spinlock abuse Get rid of the __releases annotation by breaking it up into two functions: __prep_cap which is done under the spinlock and __send_cap that is done outside it. Add new fields to cap_msg_args for the wake boolean and old_xattr_buf pointer. Nothing checks the return value from __send_cap, so make it void return. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 165 ++++++++++++++++++++++++++----------------------- 1 file changed, 86 insertions(+), 79 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 82914866affc..8b17f4b4ef7c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1181,6 +1181,7 @@ struct cap_msg_args { u64 xattr_version; u64 change_attr; struct ceph_buffer *xattr_buf; + struct ceph_buffer *old_xattr_buf; struct timespec64 atime, mtime, ctime, btime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; @@ -1189,6 +1190,7 @@ struct cap_msg_args { kgid_t gid; umode_t mode; bool inline_data; + bool wake; }; /* @@ -1318,44 +1320,29 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) } /* - * Send a cap msg on the given inode. Update our caps state, then - * drop i_ceph_lock and send the message. + * Prepare to send a cap message to an MDS. Update the cap state, and populate + * the arg struct with the parameters that will need to be sent. This should + * be done under the i_ceph_lock to guard against changes to cap state. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. - * - * Return non-zero if delayed release, or we experienced an error - * such that the caller should requeue + retry later. - * - * called with i_ceph_lock, then drops it. - * caller should hold snap_rwsem (read), s_mutex. */ -static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, - int op, int flags, int used, int want, int retain, - int flushing, u64 flush_tid, u64 oldest_flush_tid) - __releases(cap->ci->i_ceph_lock) +static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, + int op, int flags, int used, int want, int retain, + int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; - struct ceph_buffer *old_blob = NULL; - struct cap_msg_args arg; int held, revoking; - int wake = 0; - int ret; - /* Don't send anything if it's still being created. Return delayed */ - if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { - spin_unlock(&ci->i_ceph_lock); - dout("%s async create in flight for %p\n", __func__, inode); - return 1; - } + lockdep_assert_held(&ci->i_ceph_lock); held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; - dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", - inode, cap, cap->session, + dout("%s %p cap %p session %p %s -> %s (revoking %s)\n", + __func__, inode, cap, cap->session, ceph_cap_string(held), ceph_cap_string(held & retain), ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); @@ -1363,60 +1350,58 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ - if (cap->implemented & ~cap->issued) { - /* - * Wake up any waiters on wanted -> needed transition. - * This is due to the weird transition from buffered - * to sync IO... we need to flush dirty pages _before_ - * allowing sync writes to avoid reordering. - */ - wake = 1; - } + /* + * Wake up any waiters on wanted -> needed transition. This is due to + * the weird transition from buffered to sync IO... we need to flush + * dirty pages _before_ allowing sync writes to avoid reordering. + */ + arg->wake = cap->implemented & ~cap->issued; cap->implemented &= cap->issued | used; cap->mds_wanted = want; - arg.session = cap->session; - arg.ino = ceph_vino(inode).ino; - arg.cid = cap->cap_id; - arg.follows = flushing ? ci->i_head_snapc->seq : 0; - arg.flush_tid = flush_tid; - arg.oldest_flush_tid = oldest_flush_tid; + arg->session = cap->session; + arg->ino = ceph_vino(inode).ino; + arg->cid = cap->cap_id; + arg->follows = flushing ? ci->i_head_snapc->seq : 0; + arg->flush_tid = flush_tid; + arg->oldest_flush_tid = oldest_flush_tid; - arg.size = inode->i_size; - ci->i_reported_size = arg.size; - arg.max_size = ci->i_wanted_max_size; + arg->size = inode->i_size; + ci->i_reported_size = arg->size; + arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) - ci->i_requested_max_size = arg.max_size; + ci->i_requested_max_size = arg->max_size; if (flushing & CEPH_CAP_XATTR_EXCL) { - old_blob = __ceph_build_xattrs_blob(ci); - arg.xattr_version = ci->i_xattrs.version; - arg.xattr_buf = ci->i_xattrs.blob; + arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); + arg->xattr_version = ci->i_xattrs.version; + arg->xattr_buf = ci->i_xattrs.blob; } else { - arg.xattr_buf = NULL; + arg->xattr_buf = NULL; + arg->old_xattr_buf = NULL; } - arg.mtime = inode->i_mtime; - arg.atime = inode->i_atime; - arg.ctime = inode->i_ctime; - arg.btime = ci->i_btime; - arg.change_attr = inode_peek_iversion_raw(inode); + arg->mtime = inode->i_mtime; + arg->atime = inode->i_atime; + arg->ctime = inode->i_ctime; + arg->btime = ci->i_btime; + arg->change_attr = inode_peek_iversion_raw(inode); - arg.op = op; - arg.caps = cap->implemented; - arg.wanted = want; - arg.dirty = flushing; + arg->op = op; + arg->caps = cap->implemented; + arg->wanted = want; + arg->dirty = flushing; - arg.seq = cap->seq; - arg.issue_seq = cap->issue_seq; - arg.mseq = cap->mseq; - arg.time_warp_seq = ci->i_time_warp_seq; + arg->seq = cap->seq; + arg->issue_seq = cap->issue_seq; + arg->mseq = cap->mseq; + arg->time_warp_seq = ci->i_time_warp_seq; - arg.uid = inode->i_uid; - arg.gid = inode->i_gid; - arg.mode = inode->i_mode; + arg->uid = inode->i_uid; + arg->gid = inode->i_gid; + arg->mode = inode->i_mode; - arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && !list_empty(&ci->i_cap_snaps)) { struct ceph_cap_snap *capsnap; @@ -1429,27 +1414,35 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, } } } - arg.flags = flags; + arg->flags = flags; +} - spin_unlock(&ci->i_ceph_lock); +/* + * Send a cap msg on the given inode. + * + * Caller should hold snap_rwsem (read), s_mutex. + */ +static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg, + struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + int ret; - ceph_buffer_put(old_blob); - - ret = send_cap_msg(&arg); + ret = send_cap_msg(arg); if (ret < 0) { pr_err("error sending cap msg, ino (%llx.%llx) " "flushing %s tid %llu, requeue\n", - ceph_vinop(inode), ceph_cap_string(flushing), - flush_tid); + ceph_vinop(inode), ceph_cap_string(arg->dirty), + arg->flush_tid); spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } - if (wake) - wake_up_all(&ci->i_cap_wq); + ceph_buffer_put(arg->old_xattr_buf); - return ret; + if (arg->wake) + wake_up_all(&ci->i_cap_wq); } static inline int __send_flush_snap(struct inode *inode, @@ -1470,6 +1463,7 @@ static inline int __send_flush_snap(struct inode *inode, arg.max_size = 0; arg.xattr_version = capsnap->xattr_version; arg.xattr_buf = capsnap->xattr_blob; + arg.old_xattr_buf = NULL; arg.atime = capsnap->atime; arg.mtime = capsnap->mtime; @@ -1493,6 +1487,7 @@ static inline int __send_flush_snap(struct inode *inode, arg.inline_data = capsnap->inline_data; arg.flags = 0; + arg.wake = false; return send_cap_msg(&arg); } @@ -1967,6 +1962,8 @@ retry_locked: } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + struct cap_msg_args arg; + cap = rb_entry(p, struct ceph_cap, ci_node); /* avoid looping forever */ @@ -2094,9 +2091,12 @@ ack: mds = cap->mds; /* remember mds, so we don't repeat */ - /* __send_cap drops i_ceph_lock */ - __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, + __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + + __send_cap(mdsc, &arg, ci); + goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -2135,6 +2135,7 @@ retry: retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; + struct cap_msg_args arg; if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); @@ -2162,11 +2163,13 @@ retry_locked: flush_tid = __mark_caps_flushing(inode, session, true, &oldest_flush_tid); - /* __send_cap drops i_ceph_lock */ - __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), flushing, flush_tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + + __send_cap(mdsc, &arg, ci); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = @@ -2368,15 +2371,19 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, first_tid = cf->tid + 1; if (cf->caps) { + struct cap_msg_args arg; + dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, cap, cf->tid, ceph_cap_string(cf->caps)); - __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); + spin_unlock(&ci->i_ceph_lock); + __send_cap(mdsc, &arg, ci); } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, From 681ac634883ba162ce5c50c10120c8bf4df81574 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 18 Mar 2020 15:29:34 -0400 Subject: [PATCH 273/574] ceph: split up __finish_cap_flush This function takes a mdsc argument or ci argument, but if both are passed in, it ignores the ci arg. Fortunately, nothing does that, but there's no good reason to have the same function handle both cases. Also, get rid of some branches and just use |= to set the wake_* vals. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 60 ++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8b17f4b4ef7c..74d05e5db68d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1740,30 +1740,33 @@ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) * Remove cap_flush from the mdsc's or inode's flushing cap list. * Return true if caller needs to wake up flush waiters. */ -static bool __finish_cap_flush(struct ceph_mds_client *mdsc, - struct ceph_inode_info *ci, - struct ceph_cap_flush *cf) +static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; - if (mdsc) { - /* are there older pending cap flushes? */ - if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { - prev = list_prev_entry(cf, g_list); - prev->wake = true; - wake = false; - } - list_del(&cf->g_list); - } else if (ci) { - if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { - prev = list_prev_entry(cf, i_list); - prev->wake = true; - wake = false; - } - list_del(&cf->i_list); - } else { - BUG_ON(1); + + if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { + prev = list_prev_entry(cf, g_list); + prev->wake = true; + wake = false; } + list_del(&cf->g_list); + return wake; +} + +static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct ceph_cap_flush *prev; + bool wake = cf->wake; + + if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { + prev = list_prev_entry(cf, i_list); + prev->wake = true; + wake = false; + } + list_del(&cf->i_list); return wake; } @@ -3473,8 +3476,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, if (cf->caps == 0) /* capsnap */ continue; if (cf->tid <= flush_tid) { - if (__finish_cap_flush(NULL, ci, cf)) - wake_ci = true; + wake_ci |= __detach_cap_flush_from_ci(ci, cf); list_add_tail(&cf->i_list, &to_remove); } else { cleaned &= ~cf->caps; @@ -3496,10 +3498,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, spin_lock(&mdsc->cap_dirty_lock); - list_for_each_entry(cf, &to_remove, i_list) { - if (__finish_cap_flush(mdsc, NULL, cf)) - wake_mdsc = true; - } + list_for_each_entry(cf, &to_remove, i_list) + wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); if (ci->i_flushing_caps == 0) { if (list_empty(&ci->i_cap_flush_list)) { @@ -3591,17 +3591,15 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, dout(" removing %p cap_snap %p follows %lld\n", inode, capsnap, follows); list_del(&capsnap->ci_item); - if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush)) - wake_ci = true; + wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); spin_lock(&mdsc->cap_dirty_lock); if (list_empty(&ci->i_cap_flush_list)) list_del_init(&ci->i_flushing_item); - if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush)) - wake_mdsc = true; - + wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, + &capsnap->cap_flush); spin_unlock(&mdsc->cap_dirty_lock); } spin_unlock(&ci->i_ceph_lock); From d7dbfb4f2bdb037758f46271f75ea6c8d35626b4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 18 Mar 2020 15:34:20 -0400 Subject: [PATCH 274/574] ceph: add comments for handle_cap_flush_ack logic Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 74d05e5db68d..056ad0d99438 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3471,14 +3471,26 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { + /* Is this the one that was flushed? */ if (cf->tid == flush_tid) cleaned = cf->caps; - if (cf->caps == 0) /* capsnap */ + + /* Is this a capsnap? */ + if (cf->caps == 0) continue; + if (cf->tid <= flush_tid) { + /* + * An earlier or current tid. The FLUSH_ACK should + * represent a superset of this flush's caps. + */ wake_ci |= __detach_cap_flush_from_ci(ci, cf); list_add_tail(&cf->i_list, &to_remove); } else { + /* + * This is a later one. Any caps in it are still dirty + * so don't count them as cleaned. + */ cleaned &= ~cf->caps; if (!cleaned) break; From 7391fba2678c0288d0daf636ddc4f78de7704f81 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 18 Mar 2020 16:43:30 -0400 Subject: [PATCH 275/574] ceph: don't release i_ceph_lock in handle_cap_trunc There's no reason to do this here. Just have the caller handle it. Also, add a lockdep assertion. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 056ad0d99438..34aa9f4c1780 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3631,10 +3631,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, * * caller hold s_mutex. */ -static void handle_cap_trunc(struct inode *inode, +static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session) - __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -3645,7 +3644,9 @@ static void handle_cap_trunc(struct inode *inode, int implemented = 0; int dirty = __ceph_caps_dirty(ci); int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); - int queue_trunc = 0; + bool queue_trunc = false; + + lockdep_assert_held(&ci->i_ceph_lock); issued |= implemented | dirty; @@ -3653,10 +3654,7 @@ static void handle_cap_trunc(struct inode *inode, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); - spin_unlock(&ci->i_ceph_lock); - - if (queue_trunc) - ceph_queue_vmtruncate(inode); + return queue_trunc; } /* @@ -3905,6 +3903,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, size_t snaptrace_len; void *p, *end; struct cap_extra_info extra_info = {}; + bool queue_trunc; dout("handle_caps from mds%d\n", session->s_mds); @@ -4088,7 +4087,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; case CEPH_CAP_OP_TRUNC: - handle_cap_trunc(inode, h, session); + queue_trunc = handle_cap_trunc(inode, h, session); + spin_unlock(&ci->i_ceph_lock); + if (queue_trunc) + ceph_queue_vmtruncate(inode); break; default: From 7833323363233c75fd8d10b5ceefbb9515cb3e32 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Mar 2020 12:00:16 -0400 Subject: [PATCH 276/574] ceph: don't take i_ceph_lock in handle_cap_import Just take it before calling it. This means we have to do a couple of minor in-memory operations under the spinlock now, but those shouldn't be an issue. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 34aa9f4c1780..f135660a2479 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3805,7 +3805,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, struct ceph_cap **target_cap, int *old_issued) - __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap, *ocap, *new_cap = NULL; @@ -3830,14 +3829,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", inode, ci, mds, mseq, peer); - retry: - spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { if (!new_cap) { spin_unlock(&ci->i_ceph_lock); new_cap = ceph_get_cap(mdsc, NULL); + spin_lock(&ci->i_ceph_lock); goto retry; } cap = new_cap; @@ -4051,6 +4049,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, } else { down_read(&mdsc->snap_rwsem); } + spin_lock(&ci->i_ceph_lock); handle_cap_import(mdsc, inode, h, peer, session, &cap, &extra_info.issued); handle_cap_grant(inode, session, cap, From 4fb5dda39c26fa5b64059dac0a2c3340a4f6f11b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Mar 2020 15:34:13 -0400 Subject: [PATCH 277/574] ceph: document what protects i_dirty_item and i_flushing_item Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 47cfd8935b9c..bb372859c0ad 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -351,7 +351,9 @@ struct ceph_inode_info { struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ - struct list_head i_dirty_item, i_flushing_item; + struct list_head i_dirty_item, i_flushing_item; /* protected by + * mdsc->cap_dirty_lock + */ /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ From dc3da0461cc4b76f2d0c5b12247fcb3b520edbbf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 20 Mar 2020 16:45:45 -0400 Subject: [PATCH 278/574] ceph: fix potential race in ceph_check_caps Nothing ensures that session will still be valid by the time we dereference the pointer. Take and put a reference. In principle, we should always be able to get a reference here, but throw a warning if that's ever not the case. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f135660a2479..d555d2790619 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2044,12 +2044,24 @@ ack: if (mutex_trylock(&session->s_mutex) == 0) { dout("inverting session/ino locks on %p\n", session); + session = ceph_get_mds_session(session); spin_unlock(&ci->i_ceph_lock); if (took_snap_rwsem) { up_read(&mdsc->snap_rwsem); took_snap_rwsem = 0; } - mutex_lock(&session->s_mutex); + if (session) { + mutex_lock(&session->s_mutex); + ceph_put_mds_session(session); + } else { + /* + * Because we take the reference while + * holding the i_ceph_lock, it should + * never be NULL. Throw a warning if it + * ever is. + */ + WARN_ON_ONCE(true); + } goto retry; } } From 88828190f0073bd8f9aa5e2b1caf753d289c6d49 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 20 Mar 2020 17:07:36 -0400 Subject: [PATCH 279/574] ceph: throw a warning if we destroy session with mutex still locked Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 20fab5c72d39..de6bb8829837 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -659,6 +659,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s) if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); + WARN_ON(mutex_is_locked(&s->s_mutex)); xa_destroy(&s->s_delegated_inos); kfree(s); } From 6f05b30ea063a2a05dda47a4105a69267ae5270f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 30 Mar 2020 19:56:37 +0800 Subject: [PATCH 280/574] ceph: reset i_requested_max_size if file write is not wanted write can stuck at waiting for larger max_size in following sequence of events: - client opens a file and writes to position 'A' (larger than unit of max size increment) - client closes the file handle and updates wanted caps (not wanting file write caps) - client opens and truncates the file, writes to position 'A' again. At the 1st event, client set inode's requested_max_size to 'A'. At the 2nd event, mds removes client's writable range, but client does not reset requested_max_size. At the 3rd event, client does not request max size because requested_max_size is already larger than 'A'. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d555d2790619..53db1fcbfdd3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1369,8 +1369,12 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->size = inode->i_size; ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; - if (cap == ci->i_auth_cap) - ci->i_requested_max_size = arg->max_size; + if (cap == ci->i_auth_cap) { + if (want & CEPH_CAP_ANY_FILE_WR) + ci->i_requested_max_size = arg->max_size; + else + ci->i_requested_max_size = 0; + } if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); @@ -3342,10 +3346,6 @@ static void handle_cap_grant(struct inode *inode, ci->i_requested_max_size = 0; } wake = true; - } else if (ci->i_wanted_max_size > ci->i_max_size && - ci->i_wanted_max_size > ci->i_requested_max_size) { - /* CEPH_CAP_OP_IMPORT */ - wake = true; } } @@ -3421,9 +3421,18 @@ static void handle_cap_grant(struct inode *inode, fill_inline = true; } - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + if (ci->i_auth_cap == cap && + le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (newcaps & ~extra_info->issued) wake = true; + + if (ci->i_requested_max_size > max_size || + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { + /* re-request max_size if necessary */ + ci->i_requested_max_size = 0; + wake = true; + } + ceph_kick_flushing_inode_caps(session, ci); spin_unlock(&ci->i_ceph_lock); up_read(&session->s_mdsc->snap_rwsem); @@ -3882,9 +3891,6 @@ retry: __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } - /* make sure we re-request max_size, if necessary */ - ci->i_requested_max_size = 0; - *old_issued = issued; *target_cap = cap; } @@ -4318,6 +4324,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode, cap->issued &= ~drop; cap->implemented &= ~drop; cap->mds_wanted = wanted; + if (cap == ci->i_auth_cap && + !(wanted & CEPH_CAP_ANY_FILE_WR)) + ci->i_requested_max_size = 0; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, From 1cf03a68e791b1673bc4daaa88a0820f34f538f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 1 Apr 2020 17:07:52 -0400 Subject: [PATCH 281/574] ceph: convert mdsc->cap_dirty to a per-session list This is a per-sb list now, but that makes it difficult to tell when the cap is the last dirty one associated with the session. Switch this to be a per-session list, but continue using the mdsc->cap_dirty_lock to protect the lists. This list is only ever walked in ceph_flush_dirty_caps, so change that to walk the sessions array and then flush the caps for inodes on each session's list. If the auth cap ever changes while the inode has dirty caps, then move the inode to the appropriate session for the new auth_cap. Also, ensure that we never remove an auth cap while the inode is still on the s_cap_dirty list. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 78 ++++++++++++++++++++++++++++++++++++-------- fs/ceph/mds_client.c | 2 +- fs/ceph/mds_client.h | 5 +-- fs/ceph/super.h | 19 +++++++++-- 4 files changed, 85 insertions(+), 19 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 53db1fcbfdd3..d5ad2434bb07 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -597,6 +597,27 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, } } +/** + * change_auth_cap_ses - move inode to appropriate lists when auth caps change + * @ci: inode to be moved + * @session: new auth caps session + */ +static void change_auth_cap_ses(struct ceph_inode_info *ci, + struct ceph_mds_session *session) +{ + lockdep_assert_held(&ci->i_ceph_lock); + + if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) + return; + + spin_lock(&session->s_mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) + list_move(&ci->i_dirty_item, &session->s_cap_dirty); + if (!list_empty(&ci->i_flushing_item)) + list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); + spin_unlock(&session->s_mdsc->cap_dirty_lock); +} + /* * Add a capability under the given MDS session. * @@ -727,6 +748,9 @@ void ceph_add_cap(struct inode *inode, if (flags & CEPH_CAP_FLAG_AUTH) { if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { + if (ci->i_auth_cap && + ci->i_auth_cap->session != cap->session) + change_auth_cap_ses(ci, cap->session); ci->i_auth_cap = cap; cap->mds_wanted = wanted; } @@ -1123,8 +1147,10 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); - if (ci->i_auth_cap == cap) + if (ci->i_auth_cap == cap) { + WARN_ON_ONCE(!list_empty(&ci->i_dirty_item)); ci->i_auth_cap = NULL; + } /* remove from session list */ spin_lock(&session->s_cap_lock); @@ -1689,6 +1715,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { + struct ceph_mds_session *session = ci->i_auth_cap->session; + WARN_ON_ONCE(ci->i_prealloc_cap_flush); swap(ci->i_prealloc_cap_flush, *pcf); @@ -1701,7 +1729,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + list_add(&ci->i_dirty_item, &session->s_cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); @@ -3749,15 +3777,9 @@ retry: tcap->issue_seq = t_seq - 1; tcap->issued |= issued; tcap->implemented |= issued; - if (cap == ci->i_auth_cap) + if (cap == ci->i_auth_cap) { ci->i_auth_cap = tcap; - - if (!list_empty(&ci->i_cap_flush_list) && - ci->i_auth_cap == tcap) { - spin_lock(&mdsc->cap_dirty_lock); - list_move_tail(&ci->i_flushing_item, - &tcap->session->s_cap_flushing); - spin_unlock(&mdsc->cap_dirty_lock); + change_auth_cap_ses(ci, tcap->session); } } __ceph_remove_cap(cap, false); @@ -4176,15 +4198,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) /* * Flush all dirty caps to the mds */ -void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +static void flush_dirty_session_caps(struct ceph_mds_session *s) { + struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_inode_info *ci; struct inode *inode; dout("flush_dirty_caps\n"); spin_lock(&mdsc->cap_dirty_lock); - while (!list_empty(&mdsc->cap_dirty)) { - ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + while (!list_empty(&s->s_cap_dirty)) { + ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); inode = &ci->vfs_inode; ihold(inode); @@ -4198,6 +4221,35 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) dout("flush_dirty_caps done\n"); } +static void iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *)) +{ + int mds; + + mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; ++mds) { + struct ceph_mds_session *s; + + if (!mdsc->sessions[mds]) + continue; + + s = ceph_get_mds_session(mdsc->sessions[mds]); + if (!s) + continue; + + mutex_unlock(&mdsc->mutex); + cb(s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + +void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) +{ + iterate_sessions(mdsc, flush_dirty_session_caps); +} + void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index de6bb8829837..588221b9b3d0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -755,6 +755,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_cap_releases); INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); + INIT_LIST_HEAD(&s->s_cap_dirty); INIT_LIST_HEAD(&s->s_cap_flushing); mdsc->sessions[mds] = s; @@ -4373,7 +4374,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; INIT_LIST_HEAD(&mdsc->cap_flush_list); - INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index cceeb33f2ff9..788182adcc51 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -199,8 +199,10 @@ struct ceph_mds_session { struct list_head s_cap_releases; /* waiting cap_release messages */ struct work_struct s_cap_release_work; - /* protected by mutex */ + /* both protected by s_mdsc->cap_dirty_lock */ + struct list_head s_cap_dirty; /* inodes w/ dirty caps */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ + unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; @@ -424,7 +426,6 @@ struct ceph_mds_client { u64 last_cap_flush_tid; struct list_head cap_flush_list; - struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index bb372859c0ad..3a95395a4217 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -351,9 +351,22 @@ struct ceph_inode_info { struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ - struct list_head i_dirty_item, i_flushing_item; /* protected by - * mdsc->cap_dirty_lock - */ + + /* + * Link to the the auth cap's session's s_cap_dirty list. s_cap_dirty + * is protected by the mdsc->cap_dirty_lock, but each individual item + * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty + * requires the mdsc->cap_dirty_lock. List presence for an item can + * be tested under the i_ceph_lock. Changing anything requires both. + */ + struct list_head i_dirty_item; + + /* + * Link to session's s_cap_flushing list. Protected by + * mdsc->cap_dirty_lock. + */ + struct list_head i_flushing_item; + /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ From d67c72e6cce99eab5ab9d62c599e33e5141ff8b4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 1 Apr 2020 18:27:25 -0400 Subject: [PATCH 282/574] ceph: request expedited service on session's last cap flush When flushing a lot of caps to the MDS's at once (e.g. for syncfs), we can end up waiting a substantial amount of time for MDS replies, due to the fact that it may delay some of them so that it can batch them up together in a single journal transaction. This can lead to stalls when calling sync or syncfs. What we'd really like to do is request expedited service on the _last_ cap we're flushing back to the server. If the CHECK_CAPS_FLUSH flag is set on the request and the current inode was the last one on the session->s_cap_dirty list, then mark the request with CEPH_CLIENT_CAPS_SYNC. Note that this heuristic is not perfect. New inodes can race onto the list after we've started flushing, but it does seem to fix some common use cases. URL: https://tracker.ceph.com/issues/44744 Reported-by: Jan Fajerski Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d5ad2434bb07..2558fd14126a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1997,6 +1997,7 @@ retry_locked: } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + int mflags = 0; struct cap_msg_args arg; cap = rb_entry(p, struct ceph_cap, ci_node); @@ -2128,6 +2129,9 @@ ack: flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, false, &oldest_flush_tid); + if (flags & CHECK_CAPS_FLUSH && + list_empty(&session->s_cap_dirty)) + mflags |= CEPH_CLIENT_CAPS_SYNC; } else { flushing = 0; flush_tid = 0; @@ -2138,8 +2142,8 @@ ack: mds = cap->mds; /* remember mds, so we don't repeat */ - __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, - retain, flushing, flush_tid, oldest_flush_tid); + __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, + want, retain, flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(mdsc, &arg, ci); From 829ad4db952aac86d11a62647d2516ab46c2fcd2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 3 Apr 2020 13:09:07 -0400 Subject: [PATCH 283/574] ceph: ceph_kick_flushing_caps needs the s_mutex The mdsc->cap_dirty_lock is not held while walking the list in ceph_kick_flushing_caps, which is not safe. ceph_early_kick_flushing_caps does something similar, but the s_mutex is held while it's called and I think that guards against changes to the list. Ensure we hold the s_mutex when calling ceph_kick_flushing_caps, and add some clarifying comments. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 ++ fs/ceph/mds_client.c | 2 ++ fs/ceph/mds_client.h | 4 +++- fs/ceph/super.h | 7 +++++-- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2558fd14126a..41eb999dadf0 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2518,6 +2518,8 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_cap *cap; u64 oldest_flush_tid; + lockdep_assert_held(&session->s_mutex); + dout("kick_flushing_caps mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 588221b9b3d0..6c283c52d401 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4024,7 +4024,9 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate != CEPH_MDS_STATE_STARTING) pr_info("mds%d recovery completed\n", s->s_mds); kick_requests(mdsc, i); + mutex_lock(&s->s_mutex); ceph_kick_flushing_caps(mdsc, s); + mutex_unlock(&s->s_mutex); wake_up_session_caps(s, RECONNECT); } } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 788182adcc51..43111e408fa2 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -199,8 +199,10 @@ struct ceph_mds_session { struct list_head s_cap_releases; /* waiting cap_release messages */ struct work_struct s_cap_release_work; - /* both protected by s_mdsc->cap_dirty_lock */ + /* See ceph_inode_info->i_dirty_item. */ struct list_head s_cap_dirty; /* inodes w/ dirty caps */ + + /* See ceph_inode_info->i_flushing_item. */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ unsigned long s_renew_requested; /* last time we sent a renew req */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3a95395a4217..b82f82d8213a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -362,8 +362,11 @@ struct ceph_inode_info { struct list_head i_dirty_item; /* - * Link to session's s_cap_flushing list. Protected by - * mdsc->cap_dirty_lock. + * Link to session's s_cap_flushing list. Protected in a similar + * fashion to i_dirty_item, but also by the s_mutex for changes. The + * s_cap_flushing list can be walked while holding either the s_mutex + * or msdc->cap_dirty_lock. List presence can also be checked while + * holding the i_ceph_lock for this inode. */ struct list_head i_flushing_item; From daa668fbacfdb9147b1828fc0f97b685bf4a51e3 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 7 Apr 2020 11:30:19 +0100 Subject: [PATCH 284/574] ceph: normalize 'delta' parameter usage in check_quota_exceeded Function check_quota_exceeded() uses delta parameter only for the QUOTA_CHECK_MAX_BYTES_OP operation. Using this parameter also for MAX_FILES will makes the code cleaner and will be required to support cross-quota-tree renames. Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 19507e2fdb57..7377838d3f8b 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -361,8 +361,6 @@ restart: spin_unlock(&ci->i_ceph_lock); switch (op) { case QUOTA_CHECK_MAX_FILES_OP: - exceeded = (max && (rvalue >= max)); - break; case QUOTA_CHECK_MAX_BYTES_OP: exceeded = (max && (rvalue + delta > max)); break; @@ -417,7 +415,7 @@ bool ceph_quota_is_max_files_exceeded(struct inode *inode) WARN_ON(!S_ISDIR(inode->i_mode)); - return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 0); + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 1); } /* From dffdcd71458e699e839f0bf47c3d42d64210b939 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 7 Apr 2020 11:30:20 +0100 Subject: [PATCH 285/574] ceph: allow rename operation under different quota realms Returning -EXDEV when trying to 'mv' files/directories from different quota realms results in copy+unlink operations instead of the faster CEPH_MDS_OP_RENAME. This will occur even when there aren't any quotas set in the destination directory, or if there's enough space left for the new file(s). This patch adds a new helper function to be called on rename operations which will allow these operations if they can be executed. This patch mimics userland fuse client commit b8954e5734b3 ("client: optimize rename operation under different quota root"). Since ceph_quota_is_same_realm() is now called only from this new helper, make it static. URL: https://tracker.ceph.com/issues/44791 Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 9 ++++---- fs/ceph/quota.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- fs/ceph/super.h | 3 ++- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 93476d447a4b..39f5311404b0 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1209,11 +1209,12 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, op = CEPH_MDS_OP_RENAMESNAP; else return -EROFS; + } else if (old_dir != new_dir) { + err = ceph_quota_check_rename(mdsc, d_inode(old_dentry), + new_dir); + if (err) + return err; } - /* don't allow cross-quota renames */ - if ((old_dir != new_dir) && - (!ceph_quota_is_same_realm(old_dir, new_dir))) - return -EXDEV; dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 7377838d3f8b..198ddde5c1e6 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -264,7 +264,7 @@ restart: return NULL; } -bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) +static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) { struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc; struct ceph_snap_realm *old_realm, *new_realm; @@ -516,3 +516,59 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) return is_updated; } +/* + * ceph_quota_check_rename - check if a rename can be executed + * @mdsc: MDS client instance + * @old: inode to be copied + * @new: destination inode (directory) + * + * This function verifies if a rename (e.g. moving a file or directory) can be + * executed. It forces an rstat update in the @new target directory (and in the + * source @old as well, if it's a directory). The actual check is done both for + * max_files and max_bytes. + * + * This function returns 0 if it's OK to do the rename, or, if quotas are + * exceeded, -EXDEV (if @old is a directory) or -EDQUOT. + */ +int ceph_quota_check_rename(struct ceph_mds_client *mdsc, + struct inode *old, struct inode *new) +{ + struct ceph_inode_info *ci_old = ceph_inode(old); + int ret = 0; + + if (ceph_quota_is_same_realm(old, new)) + return 0; + + /* + * Get the latest rstat for target directory (and for source, if a + * directory) + */ + ret = ceph_do_getattr(new, CEPH_STAT_RSTAT, false); + if (ret) + return ret; + + if (S_ISDIR(old->i_mode)) { + ret = ceph_do_getattr(old, CEPH_STAT_RSTAT, false); + if (ret) + return ret; + ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, + ci_old->i_rbytes); + if (!ret) + ret = check_quota_exceeded(new, + QUOTA_CHECK_MAX_FILES_OP, + ci_old->i_rfiles + + ci_old->i_rsubdirs); + if (ret) + ret = -EXDEV; + } else { + ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, + i_size_read(old)); + if (!ret) + ret = check_quota_exceeded(new, + QUOTA_CHECK_MAX_FILES_OP, 1); + if (ret) + ret = -EDQUOT; + } + + return ret; +} diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b82f82d8213a..226f19c9042f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1210,13 +1210,14 @@ extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); extern bool ceph_quota_is_max_files_exceeded(struct inode *inode); -extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new); extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newlen); extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newlen); extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); +extern int ceph_quota_check_rename(struct ceph_mds_client *mdsc, + struct inode *old, struct inode *new); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ From 53ab8e7cd2d47594e68951994ac083d30f82fce4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:51:38 -0500 Subject: [PATCH 286/574] libceph, rbd: replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd_types.h | 2 +- include/linux/ceph/mon_client.h | 2 +- include/linux/crush/crush.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index ac98ab6ccd3b..a600e0eb6b6f 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -93,7 +93,7 @@ struct rbd_image_header_ondisk { __le32 snap_count; __le32 reserved; __le64 snap_names_len; - struct rbd_image_snap_ondisk snaps[0]; + struct rbd_image_snap_ondisk snaps[]; } __attribute__((packed)); diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index dbb8a6959a73..ce4ffeb384d7 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -19,7 +19,7 @@ struct ceph_monmap { struct ceph_fsid fsid; u32 epoch; u32 num_mon; - struct ceph_entity_inst mon_inst[0]; + struct ceph_entity_inst mon_inst[]; }; struct ceph_mon_client; diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 54741295c70b..38b0e4d50ed9 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -87,7 +87,7 @@ struct crush_rule_mask { struct crush_rule { __u32 len; struct crush_rule_mask mask; - struct crush_rule_step steps[0]; + struct crush_rule_step steps[]; }; #define crush_rule_size(len) (sizeof(struct crush_rule) + \ From 878dabb64117406abd40977b87544d05bb3031fc Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 18 May 2020 18:47:26 +0100 Subject: [PATCH 287/574] ceph: don't return -ESTALE if there's still an open file Similarly to commit 03f219041fdb ("ceph: check i_nlink while converting a file handle to dentry"), this fixes another corner case with name_to_handle_at/open_by_handle_at. The issue has been detected by xfstest generic/467, when doing: - name_to_handle_at("/cephfs/myfile") - open("/cephfs/myfile") - unlink("/cephfs/myfile") - sync; sync; - drop caches - open_by_handle_at() The call to open_by_handle_at should not fail because the file hasn't been deleted yet (only unlinked) and we do have a valid handle to it. -ESTALE shall be returned only if i_nlink is 0 *and* i_count is 1. This patch also makes sure we have LINK caps before checking i_nlink. Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Acked-by: Amir Goldstein Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 79dc06881e78..e088843a7734 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -172,9 +172,16 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct inode *inode = __lookup_inode(sb, ino); + int err; + if (IS_ERR(inode)) return ERR_CAST(inode); - if (inode->i_nlink == 0) { + /* We need LINK caps to reliably check i_nlink */ + err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false); + if (err) + return ERR_PTR(err); + /* -ESTALE if inode as been unlinked and no file is open */ + if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { iput(inode); return ERR_PTR(-ESTALE); } From ea8412b284c09742d5b11721e225b4ff011aa397 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 20 May 2020 03:51:19 -0400 Subject: [PATCH 288/574] ceph: make sure mdsc->mutex is nested in s->s_mutex to fix dead lock send_mds_reconnect takes the s_mutex while the mdsc->mutex is already held. That inverts the locking order documented in mds_client.h. Drop the mdsc->mutex, acquire the s_mutex and then reacquire the mdsc->mutex to prevent a deadlock. URL: https://tracker.ceph.com/issues/45609 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6c283c52d401..0e0ab01694dc 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3769,8 +3769,6 @@ fail: * recovering MDS might have. * * This is a relatively heavyweight operation, but it's rare. - * - * called with mdsc->mutex held. */ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) @@ -4024,7 +4022,9 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate != CEPH_MDS_STATE_STARTING) pr_info("mds%d recovery completed\n", s->s_mds); kick_requests(mdsc, i); + mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); + mutex_lock(&mdsc->mutex); ceph_kick_flushing_caps(mdsc, s); mutex_unlock(&s->s_mutex); wake_up_session_caps(s, RECONNECT); From e64f44a884657358812e6c057957be546db03cbe Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 27 May 2020 09:09:27 -0400 Subject: [PATCH 289/574] ceph: skip checking caps when session reconnecting and releasing reqs It make no sense to check the caps when reconnecting to mds. And for the async dirop caps, they will be put by its _cb() function, so when releasing the requests, it will make no sense too. URL: https://tracker.ceph.com/issues/45635 Signed-off-by: Xiubo Li Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 15 +++++++++++++-- fs/ceph/mds_client.c | 16 ++++++++++++++-- fs/ceph/mds_client.h | 1 + fs/ceph/super.h | 2 ++ 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 41eb999dadf0..972c13aa4225 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3016,7 +3016,8 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, * If we are releasing a WR cap (from a sync write), finalize any affected * cap_snap, and wake up any waiters. */ -void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, + bool skip_checking_caps) { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; @@ -3072,7 +3073,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); - if (last) + if (last && !skip_checking_caps) ceph_check_caps(ci, 0, NULL); else if (flushsnaps) ceph_flush_snaps(ci, NULL); @@ -3082,6 +3083,16 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) iput(inode); } +void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, false); +} + +void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, true); +} + /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0e0ab01694dc..a50497142e59 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -804,7 +804,7 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); - ceph_mdsc_release_dir_caps(req); + ceph_mdsc_release_dir_caps_no_check(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); @@ -3402,6 +3402,18 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) } } +void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) +{ + int dcaps; + + dcaps = xchg(&req->r_dir_caps, 0); + if (dcaps) { + dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), + dcaps); + } +} + /* * called under session->mutex. */ @@ -3434,7 +3446,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, if (req->r_session->s_mds != session->s_mds) continue; - ceph_mdsc_release_dir_caps(req); + ceph_mdsc_release_dir_caps_no_check(req); __send_request(mdsc, session, req, true); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 43111e408fa2..5e0c4073a6be 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -507,6 +507,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 226f19c9042f..5a6cdd39bc10 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1095,6 +1095,8 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, bool snap_rwsem_locked); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, + int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void ceph_flush_snaps(struct ceph_inode_info *ci, From 8a4b863c876d9f135fa00cfe65774c3740970303 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 May 2020 16:46:47 +0200 Subject: [PATCH 290/574] libceph: add non-asserting rbtree insertion helper Needed for the next commit and useful for ceph_pg_pool_info tree as well. I'm leaving the asserting helper in for now, but we should look at getting rid of it in the future. Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 10 ++++-- net/ceph/osdmap.c | 60 +++++++----------------------------- 2 files changed, 19 insertions(+), 51 deletions(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 525b7c3f1c81..4b5a47bcaba4 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -188,7 +188,7 @@ static inline int calc_pages_for(u64 off, u64 len) #define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b)) #define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ -static void insert_##name(struct rb_root *root, type *t) \ +static bool __insert_##name(struct rb_root *root, type *t) \ { \ struct rb_node **n = &root->rb_node; \ struct rb_node *parent = NULL; \ @@ -206,11 +206,17 @@ static void insert_##name(struct rb_root *root, type *t) \ else if (cmp > 0) \ n = &(*n)->rb_right; \ else \ - BUG(); \ + return false; \ } \ \ rb_link_node(&t->nodefld, parent, n); \ rb_insert_color(&t->nodefld, root); \ + return true; \ +} \ +static void __maybe_unused insert_##name(struct rb_root *root, type *t) \ +{ \ + if (!__insert_##name(root, t)) \ + BUG(); \ } \ static void erase_##name(struct rb_root *root, type *t) \ { \ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2a6e63a8edbe..5d00ce2b5339 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -636,48 +636,11 @@ DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare, /* * rbtree of pg pool info */ -static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct ceph_pg_pool_info *pi = NULL; - - while (*p) { - parent = *p; - pi = rb_entry(parent, struct ceph_pg_pool_info, node); - if (new->id < pi->id) - p = &(*p)->rb_left; - else if (new->id > pi->id) - p = &(*p)->rb_right; - else - return -EEXIST; - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, root); - return 0; -} - -static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) -{ - struct ceph_pg_pool_info *pi; - struct rb_node *n = root->rb_node; - - while (n) { - pi = rb_entry(n, struct ceph_pg_pool_info, node); - if (id < pi->id) - n = n->rb_left; - else if (id > pi->id) - n = n->rb_right; - else - return pi; - } - return NULL; -} +DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node) struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) { - return __lookup_pg_pool(&map->pg_pools, id); + return lookup_pg_pool(&map->pg_pools, id); } const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) @@ -690,8 +653,7 @@ const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) if (WARN_ON_ONCE(id > (u64) INT_MAX)) return NULL; - pi = __lookup_pg_pool(&map->pg_pools, (int) id); - + pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->name : NULL; } EXPORT_SYMBOL(ceph_pg_pool_name_by_id); @@ -714,14 +676,14 @@ u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; - pi = __lookup_pg_pool(&map->pg_pools, id); + pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->flags : 0; } EXPORT_SYMBOL(ceph_pg_pool_flags); static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { - rb_erase(&pi->node, root); + erase_pg_pool(root, pi); kfree(pi->name); kfree(pi); } @@ -903,7 +865,7 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) ceph_decode_32_safe(p, end, len, bad); dout(" pool %llu len %d\n", pool, len); ceph_decode_need(p, end, len, bad); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) { char *name = kstrndup(*p, len, GFP_NOFS); @@ -1154,18 +1116,18 @@ static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, ceph_decode_64_safe(p, end, pool, e_inval); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (!incremental || !pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) return -ENOMEM; + RB_CLEAR_NODE(&pi->node); pi->id = pool; - ret = __insert_pg_pool(&map->pg_pools, pi); - if (ret) { + if (!__insert_pg_pool(&map->pg_pools, pi)) { kfree(pi); - return ret; + return -EEXIST; } } @@ -1829,7 +1791,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_pg_pool_info *pi; ceph_decode_64_safe(p, end, pool, e_inval); - pi = __lookup_pg_pool(&map->pg_pools, pool); + pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } From 86403a92c3c5c6c395983fdbfc5e2f29dc39279b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 May 2020 17:09:52 +0200 Subject: [PATCH 291/574] libceph: decode CRUSH device/bucket types and names These would be matched with the provided client location to calculate the locality value. Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/crush/crush.h | 12 ++++++ net/ceph/crush/crush.c | 3 +- net/ceph/osdmap.c | 85 ++++++++++++++++++++++++++++++++++++- 3 files changed, 97 insertions(+), 3 deletions(-) diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 38b0e4d50ed9..33c16f2de7f6 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -301,6 +301,12 @@ struct crush_map { __u32 *choose_tries; #else + /* device/bucket type id -> type name (CrushWrapper::type_map) */ + struct rb_root type_names; + + /* device/bucket id -> name (CrushWrapper::name_map) */ + struct rb_root names; + /* CrushWrapper::choose_args */ struct rb_root choose_args; #endif @@ -342,4 +348,10 @@ struct crush_work { struct crush_work_bucket **work; /* Per-bucket working store */ }; +#ifdef __KERNEL__ +/* osdmap.c */ +void clear_crush_names(struct rb_root *root); +void clear_choose_args(struct crush_map *c); +#endif + #endif diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 3d70244bc1b6..254ded0b05f6 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -2,7 +2,6 @@ #ifdef __KERNEL__ # include # include -void clear_choose_args(struct crush_map *c); #else # include "crush_compat.h" # include "crush.h" @@ -130,6 +129,8 @@ void crush_destroy(struct crush_map *map) #ifndef __KERNEL__ kfree(map->choose_tries); #else + clear_crush_names(&map->type_names); + clear_crush_names(&map->names); clear_choose_args(map); #endif kfree(map); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 5d00ce2b5339..e74130876d3a 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -138,6 +138,79 @@ bad: return -EINVAL; } +struct crush_name_node { + struct rb_node cn_node; + int cn_id; + char cn_name[]; +}; + +static struct crush_name_node *alloc_crush_name(size_t name_len) +{ + struct crush_name_node *cn; + + cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO); + if (!cn) + return NULL; + + RB_CLEAR_NODE(&cn->cn_node); + return cn; +} + +static void free_crush_name(struct crush_name_node *cn) +{ + WARN_ON(!RB_EMPTY_NODE(&cn->cn_node)); + + kfree(cn); +} + +DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node) + +static int decode_crush_names(void **p, void *end, struct rb_root *root) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct crush_name_node *cn; + int id; + u32 name_len; + + ceph_decode_32_safe(p, end, id, e_inval); + ceph_decode_32_safe(p, end, name_len, e_inval); + ceph_decode_need(p, end, name_len, e_inval); + + cn = alloc_crush_name(name_len); + if (!cn) + return -ENOMEM; + + cn->cn_id = id; + memcpy(cn->cn_name, *p, name_len); + cn->cn_name[name_len] = '\0'; + *p += name_len; + + if (!__insert_crush_name(root, cn)) { + free_crush_name(cn); + return -EEXIST; + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +void clear_crush_names(struct rb_root *root) +{ + while (!RB_EMPTY_ROOT(root)) { + struct crush_name_node *cn = + rb_entry(rb_first(root), struct crush_name_node, cn_node); + + erase_crush_name(root, cn); + free_crush_name(cn); + } +} + static struct crush_choose_arg_map *alloc_choose_arg_map(void) { struct crush_choose_arg_map *arg_map; @@ -354,6 +427,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (c == NULL) return ERR_PTR(-ENOMEM); + c->type_names = RB_ROOT; + c->names = RB_ROOT; c->choose_args = RB_ROOT; /* set tunables to default values */ @@ -510,8 +585,14 @@ static struct crush_map *crush_decode(void *pbyval, void *end) } } - ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */ - ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */ + err = decode_crush_names(p, end, &c->type_names); + if (err) + goto fail; + + err = decode_crush_names(p, end, &c->names); + if (err) + goto fail; + ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ /* tunables */ From 45e6aa9f5592cd127367074f4822039cd8a825c3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 22 May 2020 15:24:53 +0200 Subject: [PATCH 292/574] libceph: crush_location infrastructure Allow expressing client's location in terms of CRUSH hierarchy as a set of (bucket type name, bucket name) pairs. The userspace syntax "crush_location = key1=value1 key2=value2" is incompatible with mount options and needed adaptation. Key-value pairs are separated by '|' and we use ':' instead of '=' to separate keys from values. So for: crush_location = host=foo rack=bar one would write: crush_location=host:foo|rack:bar As in userspace, "multipath" locations are supported, so indicating locality for parallel hierarchies is possible: crush_location=rack:foo1|rack:foo2|datacenter:bar Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/osdmap.h | 16 ++++- net/ceph/ceph_common.c | 36 +++++++++++ net/ceph/osdmap.c | 116 +++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 1 deletion(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 4b5a47bcaba4..4733959f1ec7 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -64,6 +64,7 @@ struct ceph_options { int num_mon; char *name; struct ceph_crypto_key *key; + struct rb_root crush_locs; }; /* diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 5e601975745f..8c9d18cc9f45 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -302,9 +302,23 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid); +struct crush_loc { + char *cl_type_name; + char *cl_name; +}; + +struct crush_loc_node { + struct rb_node cl_node; + struct crush_loc cl_loc; /* pointers into cl_data */ + char cl_data[]; +}; + +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs); +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); +void ceph_clear_crush_locs(struct rb_root *locs); + extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); - extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a0e97f6c1072..44770b60bc38 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt, } } + ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs); + if (ret) + return ret; + /* any matching mon ip implies a match */ for (i = 0; i < opt1->num_mon; i++) { if (ceph_monmap_contains(client->monc.monmap, @@ -260,6 +264,7 @@ enum { Opt_secret, Opt_key, Opt_ip, + Opt_crush_location, /* string args above */ Opt_share, Opt_crc, @@ -274,6 +279,7 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages), fsparam_flag_no ("crc", Opt_crc), + fsparam_string ("crush_location", Opt_crush_location), fsparam_string ("fsid", Opt_fsid), fsparam_string ("ip", Opt_ip), fsparam_string ("key", Opt_key), @@ -298,6 +304,7 @@ struct ceph_options *ceph_alloc_options(void) if (!opt) return NULL; + opt->crush_locs = RB_ROOT; opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), GFP_KERNEL); if (!opt->mon_addr) { @@ -320,6 +327,7 @@ void ceph_destroy_options(struct ceph_options *opt) if (!opt) return; + ceph_clear_crush_locs(&opt->crush_locs); kfree(opt->name); if (opt->key) { ceph_crypto_key_destroy(opt->key); @@ -454,6 +462,16 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, if (!opt->key) return -ENOMEM; return get_secret(opt->key, param->string, &log); + case Opt_crush_location: + ceph_clear_crush_locs(&opt->crush_locs); + err = ceph_parse_crush_location(param->string, + &opt->crush_locs); + if (err) { + error_plog(&log, "Failed to parse CRUSH location: %d", + err); + return err; + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -536,6 +554,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, { struct ceph_options *opt = client->options; size_t pos = m->count; + struct rb_node *n; if (opt->name) { seq_puts(m, "name="); @@ -545,6 +564,23 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, if (opt->key) seq_puts(m, "secret=,"); + if (!RB_EMPTY_ROOT(&opt->crush_locs)) { + seq_puts(m, "crush_location="); + for (n = rb_first(&opt->crush_locs); ; ) { + struct crush_loc_node *loc = + rb_entry(n, struct crush_loc_node, cl_node); + + seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name, + loc->cl_loc.cl_name); + n = rb_next(n); + if (!n) + break; + + seq_putc(m, '|'); + } + seq_putc(m, ','); + } + if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); if (opt->flags & CEPH_OPT_NOSHARE) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index e74130876d3a..4b81334e9e5b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2715,3 +2715,119 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, return acting.primary; } EXPORT_SYMBOL(ceph_pg_to_acting_primary); + +static struct crush_loc_node *alloc_crush_loc(size_t type_name_len, + size_t name_len) +{ + struct crush_loc_node *loc; + + loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO); + if (!loc) + return NULL; + + RB_CLEAR_NODE(&loc->cl_node); + return loc; +} + +static void free_crush_loc(struct crush_loc_node *loc) +{ + WARN_ON(!RB_EMPTY_NODE(&loc->cl_node)); + + kfree(loc); +} + +static int crush_loc_compare(const struct crush_loc *loc1, + const struct crush_loc *loc2) +{ + return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?: + strcmp(loc1->cl_name, loc2->cl_name); +} + +DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare, + RB_BYPTR, const struct crush_loc *, cl_node) + +/* + * Parses a set of ':' pairs separated + * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar". + * + * Note that @crush_location is modified by strsep(). + */ +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs) +{ + struct crush_loc_node *loc; + const char *type_name, *name, *colon; + size_t type_name_len, name_len; + + dout("%s '%s'\n", __func__, crush_location); + while ((type_name = strsep(&crush_location, "|"))) { + colon = strchr(type_name, ':'); + if (!colon) + return -EINVAL; + + type_name_len = colon - type_name; + if (type_name_len == 0) + return -EINVAL; + + name = colon + 1; + name_len = strlen(name); + if (name_len == 0) + return -EINVAL; + + loc = alloc_crush_loc(type_name_len, name_len); + if (!loc) + return -ENOMEM; + + loc->cl_loc.cl_type_name = loc->cl_data; + memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len); + loc->cl_loc.cl_type_name[type_name_len] = '\0'; + + loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1; + memcpy(loc->cl_loc.cl_name, name, name_len); + loc->cl_loc.cl_name[name_len] = '\0'; + + if (!__insert_crush_loc(locs, loc)) { + free_crush_loc(loc); + return -EEXIST; + } + + dout("%s type_name '%s' name '%s'\n", __func__, + loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); + } + + return 0; +} + +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2) +{ + struct rb_node *n1 = rb_first(locs1); + struct rb_node *n2 = rb_first(locs2); + int ret; + + for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) { + struct crush_loc_node *loc1 = + rb_entry(n1, struct crush_loc_node, cl_node); + struct crush_loc_node *loc2 = + rb_entry(n2, struct crush_loc_node, cl_node); + + ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc); + if (ret) + return ret; + } + + if (!n1 && n2) + return -1; + if (n1 && !n2) + return 1; + return 0; +} + +void ceph_clear_crush_locs(struct rb_root *locs) +{ + while (!RB_EMPTY_ROOT(locs)) { + struct crush_loc_node *loc = + rb_entry(rb_first(locs), struct crush_loc_node, cl_node); + + erase_crush_loc(locs, loc); + free_crush_loc(loc); + } +} From 117d96a04f007ce8fc2e292369056c3bd09f6f63 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 23 May 2020 11:45:48 +0200 Subject: [PATCH 293/574] libceph: support for balanced and localized reads OSD-side issues with reads from replica have been resolved in Octopus. Reading from replica should be safe wrt. unstable or uncommitted state now, so add support for balanced and localized reads. There are two cases when a read from replica can't be served: - OSD may silently drop the request, expecting the client to notice that the acting set has changed and resend via the usual means (handled with t->used_replica) - OSD may return EAGAIN, expecting the client to resend to the primary, ignoring replica read flags (see handle_reply()) Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/osd_client.h | 1 + include/linux/ceph/osdmap.h | 3 + net/ceph/debugfs.c | 6 +- net/ceph/osd_client.c | 87 +++++++++++++++++++++++++-- net/ceph/osdmap.c | 102 ++++++++++++++++++++++++++++++++ 5 files changed, 193 insertions(+), 6 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 734f7c6a9f56..671fb93e8c60 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -165,6 +165,7 @@ struct ceph_osd_request_target { bool recovery_deletes; unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool used_replica; bool paused; u32 epoch; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 8c9d18cc9f45..3f4498fef6ad 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -317,6 +317,9 @@ int ceph_parse_crush_location(char *crush_location, struct rb_root *locs); int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); void ceph_clear_crush_locs(struct rb_root *locs); +int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, + struct rb_root *locs); + extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 1344f232ecc5..409d505ff320 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -81,11 +81,13 @@ static int osdmap_show(struct seq_file *s, void *p) u32 state = map->osd_state[i]; char sb[64]; - seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n", i, ceph_pr_addr(addr), ((map->osd_weight[i]*100) >> 16), ceph_osdmap_state_str(sb, sizeof(sb), state), - ((ceph_get_primary_affinity(map, i)*100) >> 16)); + ((ceph_get_primary_affinity(map, i)*100) >> 16), + ceph_get_crush_locality(map, i, + &client->options->crush_locs)); } for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { struct ceph_pg_mapping *pg = diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ece124a5138e..4ce6cdc744e4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1497,6 +1497,45 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, (osdc->osdmap->epoch < osdc->epoch_barrier); } +static int pick_random_replica(const struct ceph_osds *acting) +{ + int i = prandom_u32() % acting->size; + + dout("%s picked osd%d, primary osd%d\n", __func__, + acting->osds[i], acting->primary); + return i; +} + +/* + * Picks the closest replica based on client's location given by + * crush_location option. Prefers the primary if the locality is + * the same. + */ +static int pick_closest_replica(struct ceph_osd_client *osdc, + const struct ceph_osds *acting) +{ + struct ceph_options *opt = osdc->client->options; + int best_i, best_locality; + int i = 0, locality; + + do { + locality = ceph_get_crush_locality(osdc->osdmap, + acting->osds[i], + &opt->crush_locs); + if (i == 0 || + (locality >= 0 && best_locality < 0) || + (locality >= 0 && best_locality >= 0 && + locality < best_locality)) { + best_i = i; + best_locality = locality; + } + } while (++i < acting->size); + + dout("%s picked osd%d with locality %d, primary osd%d\n", __func__, + acting->osds[best_i], best_locality, acting->primary); + return best_i; +} + enum calc_target_result { CALC_TARGET_NO_ACTION = 0, CALC_TARGET_NEED_RESEND, @@ -1510,6 +1549,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_pg_pool_info *pi; struct ceph_pg pgid, last_pgid; struct ceph_osds up, acting; + bool is_read = t->flags & CEPH_OSD_FLAG_READ; + bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; bool force_resend = false; bool unpaused = false; bool legacy_change = false; @@ -1540,9 +1581,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, ceph_oid_copy(&t->target_oid, &t->base_oid); ceph_oloc_copy(&t->target_oloc, &t->base_oloc); if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { - if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + if (is_read && pi->read_tier >= 0) t->target_oloc.pool = pi->read_tier; - if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + if (is_write && pi->write_tier >= 0) t->target_oloc.pool = pi->write_tier; pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); @@ -1581,7 +1622,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, unpaused = true; } legacy_change = ceph_pg_compare(&t->pgid, &pgid) || - ceph_osds_changed(&t->acting, &acting, any_change); + ceph_osds_changed(&t->acting, &acting, + t->used_replica || any_change); if (t->pg_num) split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); @@ -1597,7 +1639,24 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->sort_bitwise = sort_bitwise; t->recovery_deletes = recovery_deletes; - t->osd = acting.primary; + if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) && + !is_write && pi->type == CEPH_POOL_TYPE_REP && + acting.size > 1) { + int pos; + + WARN_ON(!is_read || acting.osds[0] != acting.primary); + if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) { + pos = pick_random_replica(&acting); + } else { + pos = pick_closest_replica(osdc, &acting); + } + t->osd = acting.osds[pos]; + t->used_replica = pos > 0; + } else { + t->osd = acting.primary; + t->used_replica = false; + } } if (unpaused || legacy_change || force_resend || split) @@ -3660,6 +3719,26 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) goto out_unlock_osdc; } + if (m.result == -EAGAIN) { + dout("req %p tid %llu EAGAIN\n", req, req->r_tid); + unlink_request(osd, req); + mutex_unlock(&osd->lock); + + /* + * The object is missing on the replica or not (yet) + * readable. Clear pgid to force a resend to the primary + * via legacy_change. + */ + req->r_t.pgid.pool = 0; + req->r_t.pgid.seed = 0; + WARN_ON(!req->r_t.used_replica); + req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + req->r_tid = 0; + __submit_request(req, false); + goto out_unlock_osdc; + } + if (m.num_ops != req->r_num_ops) { pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, req->r_num_ops, req->r_tid); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4b81334e9e5b..96c25f5e064a 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2831,3 +2831,105 @@ void ceph_clear_crush_locs(struct rb_root *locs) free_crush_loc(loc); } } + +/* + * [a-zA-Z0-9-_.]+ + */ +static bool is_valid_crush_name(const char *name) +{ + do { + if (!('a' <= *name && *name <= 'z') && + !('A' <= *name && *name <= 'Z') && + !('0' <= *name && *name <= '9') && + *name != '-' && *name != '_' && *name != '.') + return false; + } while (*++name != '\0'); + + return true; +} + +/* + * Gets the parent of an item. Returns its id (<0 because the + * parent is always a bucket), type id (>0 for the same reason, + * via @parent_type_id) and location (via @parent_loc). If no + * parent, returns 0. + * + * Does a linear search, as there are no parent pointers of any + * kind. Note that the result is ambigous for items that occur + * multiple times in the map. + */ +static int get_immediate_parent(struct crush_map *c, int id, + u16 *parent_type_id, + struct crush_loc *parent_loc) +{ + struct crush_bucket *b; + struct crush_name_node *type_cn, *cn; + int i, j; + + for (i = 0; i < c->max_buckets; i++) { + b = c->buckets[i]; + if (!b) + continue; + + /* ignore per-class shadow hierarchy */ + cn = lookup_crush_name(&c->names, b->id); + if (!cn || !is_valid_crush_name(cn->cn_name)) + continue; + + for (j = 0; j < b->size; j++) { + if (b->items[j] != id) + continue; + + *parent_type_id = b->type; + type_cn = lookup_crush_name(&c->type_names, b->type); + parent_loc->cl_type_name = type_cn->cn_name; + parent_loc->cl_name = cn->cn_name; + return b->id; + } + } + + return 0; /* no parent */ +} + +/* + * Calculates the locality/distance from an item to a client + * location expressed in terms of CRUSH hierarchy as a set of + * (bucket type name, bucket name) pairs. Specifically, looks + * for the lowest-valued bucket type for which the location of + * @id matches one of the locations in @locs, so for standard + * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9) + * a matching host is closer than a matching rack and a matching + * data center is closer than a matching zone. + * + * Specifying multiple locations (a "multipath" location) such + * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs + * is a multimap. The locality will be: + * + * - 3 for OSDs in racks foo1 and foo2 + * - 8 for OSDs in data center bar + * - -1 for all other OSDs + * + * The lowest possible bucket type is 1, so the best locality + * for an OSD is 1 (i.e. a matching host). Locality 0 would be + * the OSD itself. + */ +int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, + struct rb_root *locs) +{ + struct crush_loc loc; + u16 type_id; + + /* + * Instead of repeated get_immediate_parent() calls, + * the location of @id could be obtained with a single + * depth-first traversal. + */ + for (;;) { + id = get_immediate_parent(osdmap->crush, id, &type_id, &loc); + if (id >= 0) + return -1; /* not local */ + + if (lookup_crush_loc(locs, &loc)) + return type_id; + } +} From 8ad44d5e0d1eda7e4a0ed382174888476dc81789 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 23 May 2020 11:47:33 +0200 Subject: [PATCH 294/574] libceph: read_from_replica option Expose replica reads through read_from_replica=balance and read_from_replica=localize. The default is to read from primary (read_from_replica=no). Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- include/linux/ceph/libceph.h | 2 ++ net/ceph/ceph_common.c | 39 ++++++++++++++++++++++++++++++++++++ net/ceph/osd_client.c | 5 ++++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 4733959f1ec7..2247e71beb83 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -53,6 +53,8 @@ struct ceph_options { unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_request_timeout; /* jiffies */ + u32 osd_req_flags; /* CEPH_OSD_FLAG_*, applied to each OSD request */ + /* * any type that can't be simply compared or doesn't need * to be compared should go beyond this point, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 44770b60bc38..9bab3e9a039b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -265,6 +265,7 @@ enum { Opt_key, Opt_ip, Opt_crush_location, + Opt_read_from_replica, /* string args above */ Opt_share, Opt_crc, @@ -274,6 +275,19 @@ enum { Opt_abort_on_full, }; +enum { + Opt_read_from_replica_no, + Opt_read_from_replica_balance, + Opt_read_from_replica_localize, +}; + +static const struct constant_table ceph_param_read_from_replica[] = { + {"no", Opt_read_from_replica_no}, + {"balance", Opt_read_from_replica_balance}, + {"localize", Opt_read_from_replica_localize}, + {} +}; + static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), @@ -290,6 +304,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout), __fsparam (fs_param_is_s32, "osdtimeout", Opt_osdtimeout, fs_param_deprecated, NULL), + fsparam_enum ("read_from_replica", Opt_read_from_replica, + ceph_param_read_from_replica), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -472,6 +488,24 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, return err; } break; + case Opt_read_from_replica: + switch (result.uint_32) { + case Opt_read_from_replica_no: + opt->osd_req_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + break; + case Opt_read_from_replica_balance: + opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS; + opt->osd_req_flags &= ~CEPH_OSD_FLAG_LOCALIZE_READS; + break; + case Opt_read_from_replica_localize: + opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS; + opt->osd_req_flags &= ~CEPH_OSD_FLAG_BALANCE_READS; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -580,6 +614,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } seq_putc(m, ','); } + if (opt->osd_req_flags & CEPH_OSD_FLAG_BALANCE_READS) { + seq_puts(m, "read_from_replica=balance,"); + } else if (opt->osd_req_flags & CEPH_OSD_FLAG_LOCALIZE_READS) { + seq_puts(m, "read_from_replica=localize,"); + } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 4ce6cdc744e4..22733e844be1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2425,11 +2425,14 @@ promote: static void account_request(struct ceph_osd_request *req) { + struct ceph_osd_client *osdc = req->r_osdc; + WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); req->r_flags |= CEPH_OSD_FLAG_ONDISK; - atomic_inc(&req->r_osdc->num_requests); + req->r_flags |= osdc->client->options->osd_req_flags; + atomic_inc(&osdc->num_requests); req->r_start_stamp = jiffies; req->r_start_latency = ktime_get(); From 1b94b3aed367ff2cdc84d325e0aa9d7cc9e3cf2a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 3 Apr 2020 14:31:19 -0500 Subject: [PATCH 295/574] tracing: Check state.disabled in synth event trace functions Since trace_state.disabled is set in __synth_event_trace_start() at the same time -ENOENT is returned, don't bother returning -ENOENT - just have callers check trace_state.disabled instead, and avoid the extra return val munging. Link: http://lkml.kernel.org/r/87315c3889af870e8370e82b76cf48b426d70130.1585941485.git.zanussi@kernel.org Suggested-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fcab11cc6833..5cfe8f998b3e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1845,7 +1845,6 @@ __synth_event_trace_start(struct trace_event_file *file, if (!(file->flags & EVENT_FILE_FL_ENABLED) || trace_trigger_soft_disabled(file)) { trace_state->disabled = true; - ret = -ENOENT; goto out; } @@ -1907,11 +1906,8 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) int ret; ret = __synth_event_trace_start(file, &state); - if (ret) { - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + if (ret || state.disabled) return ret; - } if (n_vals != state.event->n_fields) { ret = -EINVAL; @@ -1987,11 +1983,8 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, int ret; ret = __synth_event_trace_start(file, &state); - if (ret) { - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + if (ret || state.disabled) return ret; - } if (n_vals != state.event->n_fields) { ret = -EINVAL; @@ -2067,16 +2060,10 @@ EXPORT_SYMBOL_GPL(synth_event_trace_array); int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state) { - int ret; - if (!trace_state) return -EINVAL; - ret = __synth_event_trace_start(file, trace_state); - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ - - return ret; + return __synth_event_trace_start(file, trace_state); } EXPORT_SYMBOL_GPL(synth_event_trace_start); From 16b585fe71924b3aebaef5548a291021efaf7c7f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 3 Apr 2020 14:31:20 -0500 Subject: [PATCH 296/574] tracing: Add histogram-design document Add a new Documentation/trace/histogram-design.rst file describing the ftrace histogram low-level design, meant to help developers trying to understand the internals when extending or making use of the hist triggers for higher-level tools. This documentation refers to the hist_debug files implemented by 'tracing: Add hist_debug trace event files for histogram debugging' so users wishing to try out the test examples here should make sure CONFIG_HIST_TRIGGERS_DEBUG is enabled. Link: http://lkml.kernel.org/r/256b29c3274bb89a10157c4a8d1a8bce7e74849e.1585941485.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram-design.rst | 2115 ++++++++++++++++++++++ 1 file changed, 2115 insertions(+) create mode 100644 Documentation/trace/histogram-design.rst diff --git a/Documentation/trace/histogram-design.rst b/Documentation/trace/histogram-design.rst new file mode 100644 index 000000000000..c246753f0ffc --- /dev/null +++ b/Documentation/trace/histogram-design.rst @@ -0,0 +1,2115 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Histogram Design Notes +====================== + +:Author: Tom Zanussi + +This document attempts to provide a description of how the ftrace +histograms work and how the individual pieces map to the data +structures used to implement them in trace_events_hist.c and +tracing_map.c. + +Note: All the ftrace histogram command examples assume the working + directory is the ftrace /tracing directory. For example:: + + # cd /sys/kernel/debug/tracing + +Also, the histogram output displayed for those commands will be +generally be truncated - only enough to make the point is displayed. + +'hist_debug' trace event files +============================== + +If the kernel is compiled with CONFIG_HIST_TRIGGERS_DEBUG set, an +event file named 'hist_debug' will appear in each event's +subdirectory. This file can be read at any time and will display some +of the hist trigger internals described in this document. Specific +examples and output will be described in test cases below. + +Basic histograms +================ + +First, basic histograms. Below is pretty much the simplest thing you +can do with histograms - create one with a single key on a single +event and cat the output:: + + # echo 'hist:keys=pid' >> events/sched/sched_waking/trigger + + # cat events/sched/sched_waking/hist + + { pid: 18249 } hitcount: 1 + { pid: 13399 } hitcount: 1 + { pid: 17973 } hitcount: 1 + { pid: 12572 } hitcount: 1 + ... + { pid: 10 } hitcount: 921 + { pid: 18255 } hitcount: 1444 + { pid: 25526 } hitcount: 2055 + { pid: 5257 } hitcount: 2055 + { pid: 27367 } hitcount: 2055 + { pid: 1728 } hitcount: 2161 + + Totals: + Hits: 21305 + Entries: 183 + Dropped: 0 + +What this does is create a histogram on the sched_waking event using +pid as a key and with a single value, hitcount, which even if not +explicitly specified, exists for every histogram regardless. + +The hitcount value is a per-bucket value that's automatically +incremented on every hit for the given key, which in this case is the +pid. + +So in this histogram, there's a separate bucket for each pid, and each +bucket contains a value for that bucket, counting the number of times +sched_waking was called for that pid. + +Each histogram is represented by a hist_data struct. + +To keep track of each key and value field in the histogram, hist_data +keeps an array of these fields named fields[]. The fields[] array is +an array containing struct hist_field representations of each +histogram val and key in the histogram (variables are also included +here, but are discussed later). So for the above histogram we have one +key and one value; in this case the one value is the hitcount value, +which all histograms have, regardless of whether they define that +value or not, which the above histogram does not. + +Each struct hist_field contains a pointer to the ftrace_event_field +from the event's trace_event_file along with various bits related to +that such as the size, offset, type, and a hist_field_fn_t function, +which is used to grab the field's data from the ftrace event buffer +(in most cases - some hist_fields such as hitcount don't directly map +to an event field in the trace buffer - in these cases the function +implementation gets its value from somewhere else). The flags field +indicates which type of field it is - key, value, variable, variable +reference, etc., with value being the default. + +The other important hist_data data structure in addition to the +fields[] array is the tracing_map instance created for the histogram, +which is held in the .map member. The tracing_map implements the +lock-free hash table used to implement histograms (see +kernel/trace/tracing_map.h for much more discussion about the +low-level data structures implementing the tracing_map). For the +purposes of this discussion, the tracing_map contains a number of +buckets, each bucket corresponding to a particular tracing_map_elt +object hashed by a given histogram key. + +Below is a diagram the first part of which describes the hist_data and +associated key and value fields for the histogram described above. As +you can see, there are two fields in the fields array, one val field +for the hitcount and one key field for the pid key. + +Below that is a diagram of a run-time snapshot of what the tracing_map +might look like for a given run. It attempts to show the +relationships between the hist_data fields and the tracing_map +elements for a couple hypothetical keys and values. + + +------------------+ + | hist_data | + +------------------+ +----------------+ + | .fields[] |---->| val = hitcount |----------------------------+ + +----------------+ +----------------+ | + | .map | | .size | | + +----------------+ +--------------+ | + | .offset | | + +--------------+ | + | .fn() | | + +--------------+ | + . | + . | + . | + +----------------+ <--- n_vals | + | key = pid |----------------------------|--+ + +----------------+ | | + | .size | | | + +--------------+ | | + | .offset | | | + +--------------+ | | + | .fn() | | | + +----------------+ <--- n_fields | | + | unused | | | + +----------------+ | | + | | | | + +--------------+ | | + | | | | + +--------------+ | | + | | | | + +--------------+ | | + n_keys = n_fields - n_vals | | + | | +The hist_data n_vals and n_fields delineate the extent of the fields[] | | +array and separate keys from values for the rest of the code. | | + | | +Below is a run-time representation of the tracing_map part of the | | +histogram, with pointers from various parts of the fields[] array | | +to corresponding parts of the tracing_map. | | + | | +The tracing_map consists of an array of tracing_map_entrys and a set | | +of preallocated tracing_map_elts (abbreviated below as map_entry and | | +map_elt). The total number of map_entrys in the hist_data.map array = | | +map->max_elts (actually map->map_size but only max_elts of those are | | +used. This is a property required by the map_insert() algorithm). | | + | | +If a map_entry is unused, meaning no key has yet hashed into it, its | | +.key value is 0 and its .val pointer is NULL. Once a map_entry has | | +been claimed, the .key value contains the key's hash value and the | | +.val member points to a map_elt containing the full key and an entry | | +for each key or value in the map_elt.fields[] array. There is an | | +entry in the map_elt.fields[] array corresponding to each hist_field | | +in the histogram, and this is where the continually aggregated sums | | +corresponding to each histogram value are kept. | | + | | +The diagram attempts to show the relationship between the | | +hist_data.fields[] and the map_elt.fields[] with the links drawn | | +between diagrams:: | | + | | + +-----------+ | | + | hist_data | | | + +-----------+ | | + | .fields | | | + +---------+ +-----------+ | | + | .map |---->| map_entry | | | + +---------+ +-----------+ | | + | .key |---> 0 | | + +---------+ | | + | .val |---> NULL | | + +-----------+ | | + | map_entry | | | + +-----------+ | | + | .key |---> pid = 999 | | + +---------+ +-----------+ | | + | .val |--->| map_elt | | | + +---------+ +-----------+ | | + . | .key |---> full key * | | + . +---------+ +---------------+ | | + . | .fields |--->| .sum (val) |<-+ | + +-----------+ +---------+ | 2345 | | | + | map_entry | +---------------+ | | + +-----------+ | .offset (key) |<----+ + | .key |---> 0 | 0 | | | + +---------+ +---------------+ | | + | .val |---> NULL . | | + +-----------+ . | | + | map_entry | . | | + +-----------+ +---------------+ | | + | .key | | .sum (val) or | | | + +---------+ +---------+ | .offset (key) | | | + | .val |--->| map_elt | +---------------+ | | + +-----------+ +---------+ | .sum (val) or | | | + | map_entry | | .offset (key) | | | + +-----------+ +---------------+ | | + | .key |---> pid = 4444 | | + +---------+ +-----------+ | | + | .val | | map_elt | | | + +---------+ +-----------+ | | + | .key |---> full key * | | + +---------+ +---------------+ | | + | .fields |--->| .sum (val) |<-+ | + +---------+ | 65523 | | + +---------------+ | + | .offset (key) |<----+ + | 0 | + +---------------+ + . + . + . + +---------------+ + | .sum (val) or | + | .offset (key) | + +---------------+ + | .sum (val) or | + | .offset (key) | + +---------------+ + +Abbreviations used in the diagrams:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + fn = hist_field_fn_t + map_entry = struct tracing_map_entry + map_elt = struct tracing_map_elt + map_elt.fields = struct tracing_map_field + +Whenever a new event occurs and it has a hist trigger associated with +it, event_hist_trigger() is called. event_hist_trigger() first deals +with the key: for each subkey in the key (in the above example, there +is just one subkey corresponding to pid), the hist_field that +represents that subkey is retrieved from hist_data.fields[] and the +hist_field_fn_t fn() associated with that field, along with the +field's size and offset, is used to grab that subkey's data from the +current trace record. + +Once the complete key has been retrieved, it's used to look that key +up in the tracing_map. If there's no tracing_map_elt associated with +that key, an empty one is claimed and inserted in the map for the new +key. In either case, the tracing_map_elt associated with that key is +returned. + +Once a tracing_map_elt available, hist_trigger_elt_update() is called. +As the name implies, this updates the element, which basically means +updating the element's fields. There's a tracing_map_field associated +with each key and value in the histogram, and each of these correspond +to the key and value hist_fields created when the histogram was +created. hist_trigger_elt_update() goes through each value hist_field +and, as for the keys, uses the hist_field's fn() and size and offset +to grab the field's value from the current trace record. Once it has +that value, it simply adds that value to that field's +continually-updated tracing_map_field.sum member. Some hist_field +fn()s, such as for the hitcount, don't actually grab anything from the +trace record (the hitcount fn() just increments the counter sum by 1), +but the idea is the same. + +Once all the values have been updated, hist_trigger_elt_update() is +done and returns. Note that there are also tracing_map_fields for +each subkey in the key, but hist_trigger_elt_update() doesn't look at +them or update anything - those exist only for sorting, which can +happen later. + +Basic histogram test +-------------------- + +This is a good example to try. It produces 3 value fields and 2 key +fields in the output:: + + # echo 'hist:keys=common_pid,call_site.sym:values=bytes_req,bytes_alloc,hitcount' >> events/kmem/kmalloc/trigger + +To see the debug data, cat the kmem/kmalloc's 'hist_debug' file. It +will show the trigger info of the histogram it corresponds to, along +with the address of the hist_data associated with the histogram, which +will become useful in later examples. It then displays the number of +total hist_fields associated with the histogram along with a count of +how many of those correspond to keys and how many correspond to values. + +It then goes on to display details for each field, including the +field's flags and the position of each field in the hist_data's +fields[] array, which is useful information for verifying that things +internally appear correct or not, and which again will become even +more useful in further examples:: + + # cat events/kmem/kmalloc/hist_debug + + # event histogram + # + # trigger info: hist:keys=common_pid,call_site.sym:vals=hitcount,bytes_req,bytes_alloc:sort=hitcount:size=2048 [active] + # + + hist_data: 000000005e48c9a5 + + n_vals: 3 + n_keys: 2 + n_fields: 5 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + VAL: normal u64 value + ftrace_event_field name: bytes_req + type: size_t + size: 8 + is_signed: 0 + + hist_data->fields[2]: + flags: + VAL: normal u64 value + ftrace_event_field name: bytes_alloc + type: size_t + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: common_pid + type: int + size: 8 + is_signed: 1 + + hist_data->fields[4]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: call_site + type: unsigned long + size: 8 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=common_pid,call_site.sym:values=bytes_req,bytes_alloc,hitcount' >> events/kmem/kmalloc/trigger + +Variables +========= + +Variables allow data from one hist trigger to be saved by one hist +trigger and retrieved by another hist trigger. For example, a trigger +on the sched_waking event can capture a timestamp for a particular +pid, and later a sched_switch event that switches to that pid event +can grab the timestamp and use it to calculate a time delta between +the two events:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> + events/sched/sched_waking/trigger + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> + events/sched/sched_switch/trigger + +In terms of the histogram data structures, variables are implemented +as another type of hist_field and for a given hist trigger are added +to the hist_data.fields[] array just after all the val fields. To +distinguish them from the existing key and val fields, they're given a +new flag type, HIST_FIELD_FL_VAR (abbreviated FL_VAR) and they also +make use of a new .var.idx field member in struct hist_field, which +maps them to an index in a new map_elt.vars[] array added to the +map_elt specifically designed to store and retrieve variable values. +The diagram below shows those new elements and adds a new variable +entry, ts0, corresponding to the ts0 variable in the sched_waking +trigger above. + +sched_waking histogram +---------------------- + + +------------------+ + | hist_data |<-------------------------------------------------------+ + +------------------+ +-------------------+ | + | .fields[] |-->| val = hitcount | | + +----------------+ +-------------------+ | + | .map | | .size | | + +----------------+ +-----------------+ | + | .offset | | + +-----------------+ | + | .fn() | | + +-----------------+ | + | .flags | | + +-----------------+ | + | .var.idx | | + +-------------------+ | + | var = ts0 | | + +-------------------+ | + | .size | | + +-----------------+ | + | .offset | | + +-----------------+ | + | .fn() | | + +-----------------+ | + | .flags & FL_VAR | | + +-----------------+ | + | .var.idx |----------------------------+-+ | + +-----------------+ | | | + . | | | + . | | | + . | | | + +-------------------+ <--- n_vals | | | + | key = pid | | | | + +-------------------+ | | | + | .size | | | | + +-----------------+ | | | + | .offset | | | | + +-----------------+ | | | + | .fn() | | | | + +-----------------+ | | | + | .flags & FL_KEY | | | | + +-----------------+ | | | + | .var.idx | | | | + +-------------------+ <--- n_fields | | | + | unused | | | | + +-------------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + n_keys = n_fields - n_vals | | | + | | | + | | | +This is very similar to the basic case. In the above diagram, we can | | | +see a new .flags member has been added to the struct hist_field | | | +struct, and a new entry added to hist_data.fields representing the ts0 | | | +variable. For a normal val hist_field, .flags is just 0 (modulo | | | +modifier flags), but if the value is defined as a variable, the .flags | | | +contains a set FL_VAR bit. | | | + | | | +As you can see, the ts0 entry's .var.idx member contains the index | | | +into the tracing_map_elts' .vars[] array containing variable values. | | | +This idx is used whenever the value of the variable is set or read. | | | +The map_elt.vars idx assigned to the given variable is assigned and | | | +saved in .var.idx by create_tracing_map_fields() after it calls | | | +tracing_map_add_var(). | | | + | | | +Below is a representation of the histogram at run-time, which | | | +populates the map, along with correspondence to the above hist_data and | | | +hist_field data structures. | | | + | | | +The diagram attempts to show the relationship between the | | | +hist_data.fields[] and the map_elt.fields[] and map_elt.vars[] with | | | +the links drawn between diagrams. For each of the map_elts, you can | | | +see that the .fields[] members point to the .sum or .offset of a key | | | +or val and the .vars[] members point to the value of a variable. The | | | +arrows between the two diagrams show the linkages between those | | | +tracing_map members and the field definitions in the corresponding | | | +hist_data fields[] members. | | | + | | | + +-----------+ | | | + | hist_data | | | | + +-----------+ | | | + | .fields | | | | + +---------+ +-----------+ | | | + | .map |---->| map_entry | | | | + +---------+ +-----------+ | | | + | .key |---> 0 | | | + +---------+ | | | + | .val |---> NULL | | | + +-----------+ | | | + | map_entry | | | | + +-----------+ | | | + | .key |---> pid = 999 | | | + +---------+ +-----------+ | | | + | .val |--->| map_elt | | | | + +---------+ +-----------+ | | | + . | .key |---> full key * | | | + . +---------+ +---------------+ | | | + . | .fields |--->| .sum (val) | | | | + . +---------+ | 2345 | | | | + . +--| .vars | +---------------+ | | | + . | +---------+ | .offset (key) | | | | + . | | 0 | | | | + . | +---------------+ | | | + . | . | | | + . | . | | | + . | . | | | + . | +---------------+ | | | + . | | .sum (val) or | | | | + . | | .offset (key) | | | | + . | +---------------+ | | | + . | | .sum (val) or | | | | + . | | .offset (key) | | | | + . | +---------------+ | | | + . | | | | + . +---------------->+---------------+ | | | + . | ts0 |<--+ | | + . | 113345679876 | | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . . | | | + . . | | | + . . | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . | | | + +-----------+ | | | + | map_entry | | | | + +-----------+ | | | + | .key |---> pid = 4444 | | | + +---------+ +-----------+ | | | + | .val |--->| map_elt | | | | + +---------+ +-----------+ | | | + . | .key |---> full key * | | | + . +---------+ +---------------+ | | | + . | .fields |--->| .sum (val) | | | | + +---------+ | 2345 | | | | + +--| .vars | +---------------+ | | | + | +---------+ | .offset (key) | | | | + | | 0 | | | | + | +---------------+ | | | + | . | | | + | . | | | + | . | | | + | +---------------+ | | | + | | .sum (val) or | | | | + | | .offset (key) | | | | + | +---------------+ | | | + | | .sum (val) or | | | | + | | .offset (key) | | | | + | +---------------+ | | | + | | | | + | +---------------+ | | | + +---------------->| ts0 |<--+ | | + | 213499240729 | | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + . | | + . | | + . | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + | | +For each used map entry, there's a map_elt pointing to an array of | | +.vars containing the current value of the variables associated with | | +that histogram entry. So in the above, the timestamp associated with | | +pid 999 is 113345679876, and the timestamp variable in the same | | +.var.idx for pid 4444 is 213499240729. | | + | | +sched_switch histogram | | +---------------------- | | + | | +The sched_switch histogram paired with the above sched_waking | | +histogram is shown below. The most important aspect of the | | +sched_switch histogram is that it references a variable on the | | +sched_waking histogram above. | | + | | +The histogram diagram is very similar to the others so far displayed, | | +but it adds variable references. You can see the normal hitcount and | | +key fields along with a new wakeup_lat variable implemented in the | | +same way as the sched_waking ts0 variable, but in addition there's an | | +entry with the new FL_VAR_REF (short for HIST_FIELD_FL_VAR_REF) flag. | | + | | +Associated with the new var ref field are a couple of new hist_field | | +members, var.hist_data and var_ref_idx. For a variable reference, the | | +var.hist_data goes with the var.idx, which together uniquely identify | | +a particular variable on a particular histogram. The var_ref_idx is | | +just the index into the var_ref_vals[] array that caches the values of | | +each variable whenever a hist trigger is updated. Those resulting | | +values are then finally accessed by other code such as trace action | | +code that uses the var_ref_idx values to assign param values. | | + | | +The diagram below describes the situation for the sched_switch | | +histogram referred to before: | | + | | + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> | | + events/sched/sched_switch/trigger | | + | | + +------------------+ | | + | hist_data | | | + +------------------+ +-----------------------+ | | + | .fields[] |-->| val = hitcount | | | + +----------------+ +-----------------------+ | | + | .map | | .size | | | + +----------------+ +---------------------+ | | + +--| .var_refs[] | | .offset | | | + | +----------------+ +---------------------+ | | + | | .fn() | | | + | var_ref_vals[] +---------------------+ | | + | +-------------+ | .flags | | | + | | $ts0 |<---+ +---------------------+ | | + | +-------------+ | | .var.idx | | | + | | | | +---------------------+ | | + | +-------------+ | | .var.hist_data | | | + | | | | +---------------------+ | | + | +-------------+ | | .var_ref_idx | | | + | | | | +-----------------------+ | | + | +-------------+ | | var = wakeup_lat | | | + | . | +-----------------------+ | | + | . | | .size | | | + | . | +---------------------+ | | + | +-------------+ | | .offset | | | + | | | | +---------------------+ | | + | +-------------+ | | .fn() | | | + | | | | +---------------------+ | | + | +-------------+ | | .flags & FL_VAR | | | + | | +---------------------+ | | + | | | .var.idx | | | + | | +---------------------+ | | + | | | .var.hist_data | | | + | | +---------------------+ | | + | | | .var_ref_idx | | | + | | +---------------------+ | | + | | . | | + | | . | | + | | . | | + | | +-----------------------+ <--- n_vals | | + | | | key = pid | | | + | | +-----------------------+ | | + | | | .size | | | + | | +---------------------+ | | + | | | .offset | | | + | | +---------------------+ | | + | | | .fn() | | | + | | +---------------------+ | | + | | | .flags | | | + | | +---------------------+ | | + | | | .var.idx | | | + | | +-----------------------+ <--- n_fields | | + | | | unused | | | + | | +-----------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | n_keys = n_fields - n_vals | | + | | | | + | | | | + | | +-----------------------+ | | + +---------------------->| var_ref = $ts0 | | | + | +-----------------------+ | | + | | .size | | | + | +---------------------+ | | + | | .offset | | | + | +---------------------+ | | + | | .fn() | | | + | +---------------------+ | | + | | .flags & FL_VAR_REF | | | + | +---------------------+ | | + | | .var.idx |--------------------------+ | + | +---------------------+ | + | | .var.hist_data |----------------------------+ + | +---------------------+ + +---| .var_ref_idx | + +---------------------+ + +Abbreviations used in the diagrams:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + fn = hist_field_fn_t + FL_KEY = HIST_FIELD_FL_KEY + FL_VAR = HIST_FIELD_FL_VAR + FL_VAR_REF = HIST_FIELD_FL_VAR_REF + +When a hist trigger makes use of a variable, a new hist_field is +created with flag HIST_FIELD_FL_VAR_REF. For a VAR_REF field, the +var.idx and var.hist_data take the same values as the referenced +variable, as well as the referenced variable's size, type, and +is_signed values. The VAR_REF field's .name is set to the name of the +variable it references. If a variable reference was created using the +explicit system.event.$var_ref notation, the hist_field's system and +event_name variabls are also set. + +So, in order to handle an event for the sched_switch histogram, +because we have a reference to a variable on another histogram, we +need to resolve all variable references first. This is done via the +resolve_var_refs() calls made from event_hist_trigger(). What this +does is grabs the var_refs[] array from the hist_data representing the +sched_switch histogram. For each one of those, the referenced +variable's var.hist_data along with the current key is used to look up +the corresponding tracing_map_elt in that histogram. Once found, the +referenced variable's var.idx is used to look up the variable's value +using tracing_map_read_var(elt, var.idx), which yields the value of +the variable for that element, ts0 in the case above. Note that both +the hist_fields representing both the variable and the variable +reference have the same var.idx, so this is straightforward. + +Variable and variable reference test +------------------------------------ + +This example creates a variable on the sched_waking event, ts0, and +uses it in the sched_switch trigger. The sched_switch trigger also +creates its own variable, wakeup_lat, but nothing yet uses it:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> events/sched/sched_switch/trigger + +Looking at the sched_waking 'hist_debug' output, in addition to the +normal key and value hist_fields, in the val fields section we see a +field with the HIST_FIELD_FL_VAR flag, which indicates that that field +represents a variable. Note that in addition to the variable name, +contained in the var.name field, it includes the var.idx, which is the +index into the tracing_map_elt.vars[] array of the actual variable +location. Note also that the output shows that variables live in the +same part of the hist_data->fields[] array as normal values:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 000000009536f554 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +Moving on to the sched_switch trigger hist_debug output, in addition +to the unused wakeup_lat variable, we see a new section displaying +variable references. Variable references are displayed in a separate +section because in addition to to being logically separate from +variables and values, they actually live in a separate hist_data +array, var_refs[]. + +In this example, the sched_switch trigger has a reference to a +variable on the sched_waking trigger, $ts0. Looking at the details, +we can see that the var.hist_data value of the referenced variable +matches the previously displayed sched_waking trigger, and the var.idx +value matches the previously displayed var.idx value for that +variable. Also displayed is the var_ref_idx value for that variable +reference, which is where the value for that variable is cached for +use when the trigger is invoked:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000f4ee8006 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 000000009536f554 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Actions and Handlers +==================== + +Adding onto the previous example, we will now do something with that +wakeup_lat variable, namely send it and another field as a synthetic +event. + +The onmatch() action below basically says that whenever we have a +sched_switch event, if we have a matching sched_waking event, in this +case if we have a pid in the sched_waking histogram that matches the +the next_pid field on this sched_switch event, we retrieve the +variables specified in the wakeup_latency() trace action, and use +them to generate a new wakeup_latency event into the trace stream. + +Note that the way the trace handlers such as wakeup_latency() (which +could equivalently be written trace(wakeup_latency,$wakeup_lat,next_pid) +are implemented, the parameters specified to the trace handler must be +variables. In this case, $wakeup_lat is obviously a variable, but +next_pid isn't, since it's just naming a field in the sched_switch +trace event. Since this is something that almost every trace() and +save() action does, a special shortcut is implemented to allow field +names to be used directly in those cases. How it works is that under +the covers, a temporary variable is created for the named field, and +this variable is what is actually passed to the trace handler. In the +code and documentation, this type of variable is called a 'field +variable'. + +Fields on other trace event's histograms can be used as well. In that +case we have to generate a new histogram and an unfortunately named +'synthetic_field' (the use of synthetic here has nothing to do with +synthetic events) and use that special histogram field as a variable. + +The diagram below illustrates the new elements described above in the +context of the sched_switch histogram using the onmatch() handler and +the trace() action. + +First, we define the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid' >> synthetic_events + +Next, the sched_waking hist trigger as before:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> + events/sched/sched_waking/trigger + +Finally, we create a hist trigger on the sched_switch event that +generates a wakeup_latency() trace event. In this case we pass +next_pid into the wakeup_latency synthetic event invocation, which +means it will be automatically converted into a field variable:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid)' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +The diagram for the sched_switch event is similar to previous examples +but shows the additional field_vars[] array for hist_data and shows +the linkages between the field_vars and the variables and references +created to implement the field variables. The details are discussed +below:: + + +------------------+ + | hist_data | + +------------------+ +-----------------------+ + | .fields[] |-->| val = hitcount | + +----------------+ +-----------------------+ + | .map | | .size | + +----------------+ +---------------------+ + +---| .field_vars[] | | .offset | + | +----------------+ +---------------------+ + |+--| .var_refs[] | | .offset | + || +----------------+ +---------------------+ + || | .fn() | + || var_ref_vals[] +---------------------+ + || +-------------+ | .flags | + || | $ts0 |<---+ +---------------------+ + || +-------------+ | | .var.idx | + || | $next_pid |<-+ | +---------------------+ + || +-------------+ | | | .var.hist_data | + ||+>| $wakeup_lat | | | +---------------------+ + ||| +-------------+ | | | .var_ref_idx | + ||| | | | | +-----------------------+ + ||| +-------------+ | | | var = wakeup_lat | + ||| . | | +-----------------------+ + ||| . | | | .size | + ||| . | | +---------------------+ + ||| +-------------+ | | | .offset | + ||| | | | | +---------------------+ + ||| +-------------+ | | | .fn() | + ||| | | | | +---------------------+ + ||| +-------------+ | | | .flags & FL_VAR | + ||| | | +---------------------+ + ||| | | | .var.idx | + ||| | | +---------------------+ + ||| | | | .var.hist_data | + ||| | | +---------------------+ + ||| | | | .var_ref_idx | + ||| | | +---------------------+ + ||| | | . + ||| | | . + ||| | | . + ||| | | . + ||| +--------------+ | | . + +-->| field_var | | | . + || +--------------+ | | . + || | var | | | . + || +------------+ | | . + || | val | | | . + || +--------------+ | | . + || | field_var | | | . + || +--------------+ | | . + || | var | | | . + || +------------+ | | . + || | val | | | . + || +------------+ | | . + || . | | . + || . | | . + || . | | +-----------------------+ <--- n_vals + || +--------------+ | | | key = pid | + || | field_var | | | +-----------------------+ + || +--------------+ | | | .size | + || | var |--+| +---------------------+ + || +------------+ ||| | .offset | + || | val |-+|| +---------------------+ + || +------------+ ||| | .fn() | + || ||| +---------------------+ + || ||| | .flags | + || ||| +---------------------+ + || ||| | .var.idx | + || ||| +---------------------+ <--- n_fields + || ||| + || ||| n_keys = n_fields - n_vals + || ||| +-----------------------+ + || |+->| var = next_pid | + || | | +-----------------------+ + || | | | .size | + || | | +---------------------+ + || | | | .offset | + || | | +---------------------+ + || | | | .flags & FL_VAR | + || | | +---------------------+ + || | | | .var.idx | + || | | +---------------------+ + || | | | .var.hist_data | + || | | +-----------------------+ + || +-->| val for next_pid | + || | | +-----------------------+ + || | | | .size | + || | | +---------------------+ + || | | | .offset | + || | | +---------------------+ + || | | | .fn() | + || | | +---------------------+ + || | | | .flags | + || | | +---------------------+ + || | | | | + || | | +---------------------+ + || | | + || | | + || | | +-----------------------+ + +|------------------|-|>| var_ref = $ts0 | + | | | +-----------------------+ + | | | | .size | + | | | +---------------------+ + | | | | .offset | + | | | +---------------------+ + | | | | .fn() | + | | | +---------------------+ + | | | | .flags & FL_VAR_REF | + | | | +---------------------+ + | | +---| .var_ref_idx | + | | +-----------------------+ + | | | var_ref = $next_pid | + | | +-----------------------+ + | | | .size | + | | +---------------------+ + | | | .offset | + | | +---------------------+ + | | | .fn() | + | | +---------------------+ + | | | .flags & FL_VAR_REF | + | | +---------------------+ + | +-----| .var_ref_idx | + | +-----------------------+ + | | var_ref = $wakeup_lat | + | +-----------------------+ + | | .size | + | +---------------------+ + | | .offset | + | +---------------------+ + | | .fn() | + | +---------------------+ + | | .flags & FL_VAR_REF | + | +---------------------+ + +------------------------| .var_ref_idx | + +---------------------+ + +As you can see, for a field variable, two hist_fields are created: one +representing the variable, in this case next_pid, and one to actually +get the value of the field from the trace stream, like a normal val +field does. These are created separately from normal variable +creation and are saved in the hist_data->field_vars[] array. See +below for how these are used. In addition, a reference hist_field is +also created, which is needed to reference the field variables such as +$next_pid variable in the trace() action. + +Note that $wakeup_lat is also a variable reference, referencing the +value of the expression common_timestamp-$ts0, and so also needs to +have a hist field entry representing that reference created. + +When hist_trigger_elt_update() is called to get the normal key and +value fields, it also calls update_field_vars(), which goes through +each field_var created for the histogram, and available from +hist_data->field_vars and calls val->fn() to get the data from the +current trace record, and then uses the var's var.idx to set the +variable at the var.idx offset in the appropriate tracing_map_elt's +variable at elt->vars[var.idx]. + +Once all the variables have been updated, resolve_var_refs() can be +called from event_hist_trigger(), and not only can our $ts0 and +$next_pid references be resolved but the $wakeup_lat reference as +well. At this point, the trace() action can simply access the values +assembled in the var_ref_vals[] array and generate the trace event. + +The same process occurs for the field variables associated with the +save() action. + +Abbreviations used in the diagram:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + field_var = struct field_var + fn = hist_field_fn_t + FL_KEY = HIST_FIELD_FL_KEY + FL_VAR = HIST_FIELD_FL_VAR + FL_VAR_REF = HIST_FIELD_FL_VAR_REF + +trace() action field variable test +---------------------------------- + +This example adds to the previous test example by finally making use +of the wakeup_lat variable, but in addition also creates a couple of +field variables that then are all passed to the wakeup_latency() trace +action via the onmatch() handler. + +First, we create the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +Next, the sched_waking trigger from previous examples:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Finally, as in the previous test example, we calculate and assign the +wakeup latency using the $ts0 reference from the sched_waking trigger +to the wakeup_lat variable, and finally use it along with a couple +sched_switch event fields, next_pid and next_comm, to generate a +wakeup_latency trace event. The next_pid and next_comm event fields +are automatically converted into field variables for this purpose:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm)' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +The sched_waking hist_debug output shows the same data as in the +previous test example:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000d60ff61f + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The sched_switch hist_debug output shows the same key and value fields +as in the previous test example - note that wakeup_lat is still in the +val fields section, but that the new field variables are not there - +although the field variables are variables, they're held separately in +the hist_data's field_vars[] array. Although the field variables and +the normal variables are located in separate places, you can see that +the actual variable locations for those variables in the +tracing_map_elt.vars[] do have increasing indices as expected: +wakeup_lat takes the var.idx = 0 slot, while the field variables for +next_pid and next_comm have values var.idx = 1, and var.idx = 2. Note +also that those are the same values displayed for the variable +references corresponding to those variables in the variable reference +fields section. Since there are two triggers and thus two hist_data +addresses, those addresses also need to be accounted for when doing +the matching - you can see that the first variable refers to the 0 +var.idx on the previous hist trigger (see the hist_data address +associated with that trigger), while the second variable refers to the +0 var.idx on the sched_switch hist trigger, as do all the remaining +variable references. + +Finally, the action tracking variables section just shows the system +and event name for the onmatch() handler:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm) [active] + # + + hist_data: 0000000008f551b7 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000d60ff61f + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 2 + type: pid_t + size: 4 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 3 + type: char[16] + size: 256 + is_signed: 0 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + + field_vars[0].val: + ftrace_event_field name: next_pid + type: pid_t + size: 4 + is_signed: 1 + + hist_data->field_vars[1]: + + field_vars[1].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + field_vars[1].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm)' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +action_data and the trace() action +---------------------------------- + +As mentioned above, when the trace() action generates a synthetic +event, all the parameters to the synthetic event either already are +variables or are converted into variables (via field variables), and +finally all those variable values are collected via references to them +into a var_ref_vals[] array. + +The values in the var_ref_vals[] array, however, don't necessarily +follow the same ordering as the synthetic event params. To address +that, struct action_data contains another array, var_ref_idx[] that +maps the trace action params to the var_ref_vals[] values. Below is a +diagram illustrating that for the wakeup_latency() synthetic event:: + + +------------------+ wakeup_latency() + | action_data | event params var_ref_vals[] + +------------------+ +-----------------+ +-----------------+ + | .var_ref_idx[] |--->| $wakeup_lat idx |---+ | | + +----------------+ +-----------------+ | +-----------------+ + | .synth_event | | $next_pid idx |---|-+ | $wakeup_lat val | + +----------------+ +-----------------+ | | +-----------------+ + . | +->| $next_pid val | + . | +-----------------+ + . | . + +-----------------+ | . + | | | . + +-----------------+ | +-----------------+ + +--->| $wakeup_lat val | + +-----------------+ + +Basically, how this ends up getting used in the synthetic event probe +function, trace_event_raw_event_synth(), is as follows:: + + for each field i in .synth_event + val_idx = .var_ref_idx[i] + val = var_ref_vals[val_idx] + +action_data and the onXXX() handlers +------------------------------------ + +The hist trigger onXXX() actions other than onmatch(), such as onmax() +and onchange(), also make use of and internally create hidden +variables. This information is contained in the +action_data.track_data struct, and is also visible in the hist_debug +output as will be described in the example below. + +Typically, the onmax() or onchange() handlers are used in conjunction +with the save() and snapshot() actions. For example:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +or:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmax($wakeup_lat).snapshot()' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +save() action field variable test +--------------------------------- + +For this example, instead of generating a synthetic event, the save() +action is used to save field values whenever an onmax() handler +detects that a new max latency has been hit. As in the previous +example, the values being saved are also field values, but in this +case, are kept in a separate hist_data array named save_vars[]. + +As in previous test examples, we set up the sched_waking trigger:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +In this case, however, we set up the sched_switch trigger to save some +sched_switch field values whenever we hit a new maximum latency. For +both the onmax() handler and save() action, variables will be created, +which we can use the hist_debug files to examine:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> events/sched/sched_switch/trigger + +The sched_waking hist_debug output shows the same data as in the +previous test examples:: + + # cat events/sched/sched_waking/hist_debug + + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000e6290f48 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The output of the sched_switch trigger shows the same val and key +values as before, but also shows a couple new sections. + +First, the action tracking variables section now shows the +actions[].track_data information describing the special tracking +variables and references used to track, in this case, the running +maximum value. The actions[].track_data.var_ref member contains the +reference to the variable being tracked, in this case the $wakeup_lat +variable. In order to perform the onmax() handler function, there +also needs to be a variable that tracks the current maximum by getting +updated whenever a new maximum is hit. In this case, we can see that +an autogenerated veriable named ' __max' has been created and is +visible in the actions[].track_data.track_var variable. + +Finally, in the new 'save action variables' section, we can see that +the 4 params to the save() function have resulted in 4 field variables +being created for the purposes of saving the values of the named +fields when the max is hit. These variables are kept in a separate +save_vars[] array off of hist_data, so are displayed in a separate +section:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) [active] + # + + hist_data: 0000000057bcd28d + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000e6290f48 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000057bcd28d + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].track_data.var_ref: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000057bcd28d + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->actions[0].track_data.track_var: + flags: + HIST_FIELD_FL_VAR + var.name: __max + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 8 + is_signed: 0 + + save action variables (save() params): + + hist_data->save_vars[0]: + + save_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + save_vars[0].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + hist_data->save_vars[1]: + + save_vars[1].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_pid + var.idx (into tracing_map_elt.vars[]): 3 + + save_vars[1].val: + ftrace_event_field name: prev_pid + type: pid_t + size: 4 + is_signed: 1 + + hist_data->save_vars[2]: + + save_vars[2].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_prio + var.idx (into tracing_map_elt.vars[]): 4 + + save_vars[2].val: + ftrace_event_field name: prev_prio + type: int + size: 4 + is_signed: 1 + + hist_data->save_vars[3]: + + save_vars[3].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_comm + var.idx (into tracing_map_elt.vars[]): 5 + + save_vars[3].val: + ftrace_event_field name: prev_comm + type: char[16] + size: 256 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +A couple special cases +====================== + +While the above covers the basics of the histogram internals, there +are a couple of special cases that should be discussed, since they +tend to creae even more confusion. Those are field variables on other +histograms, and aliases, both described below through example tests +using the hist_debug files. + +Test of field variables on other histograms +------------------------------------------- + +This example is similar to the previous examples, but in this case, +the sched_switch trigger references a hist trigger field on another +event, namely the sched_waking event. In order to accomplish this, a +field variable is created for the other event, but since an existing +histogram can't be used, as existing histograms are immutable, a new +histogram with a matching variable is created and used, and we'll see +that reflected in the hist_debug output shown below. + +First, we create the wakeup_latency synthetic event. Note the +addition of the prio field:: + + # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> synthetic_events + +As in previous test examples, we set up the sched_waking trigger:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Here we set up a hist trigger on sched_switch to send a wakeup_latency +event using an onmatch handler naming the sched_waking event. Note +that the third param being passed to the wakeup_latency() is prio, +which is a field name that needs to have a field variable created for +it. There isn't however any prio field on the sched_switch event so +it would seem that it wouldn't be possible to create a field variable +for it. The matching sched_waking event does have a prio field, so it +should be possible to make use of it for this purpose. The problem +with that is that it's not currently possible to define a new variable +on an existing histogram, so it's not possible to add a new prio field +variable to the existing sched_waking histogram. It is however +possible to create an additional new 'matching' sched_waking histogram +for the same event, meaning that it uses the same key and filters, and +define the new prio field variable on that. + +Here's the sched_switch trigger:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio)' >> events/sched/sched_switch/trigger + +And here's the output of the hist_debug information for the +sched_waking hist trigger. Note that there are two histograms +displayed in the output: the first is the normal sched_waking +histogram we've seen in the previous examples, and the second is the +special histogram we created to provide the prio field variable. + +Looking at the second histogram below, we see a variable with the name +synthetic_prio. This is the field variable created for the prio field +on that sched_waking histogram:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000349570e4 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:synthetic_prio=prio:sort=hitcount:size=2048 [active] + # + + hist_data: 000000006920cf38 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + ftrace_event_field name: prio + var.name: synthetic_prio + var.idx (into tracing_map_elt.vars[]): 0 + type: int + size: 4 + is_signed: 1 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +Looking at the sched_switch histogram below, we can see a reference to +the synthetic_prio variable on sched_waking, and looking at the +associated hist_data address we see that it is indeed associated with +the new histogram. Note also that the other references are to a +normal variable, wakeup_lat, and to a normal field variable, next_pid, +the details of which are in the field variables section:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio) [active] + # + + hist_data: 00000000a73b67df + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000349570e4 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000a73b67df + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 00000000a73b67df + var_ref_idx (into hist_data->var_refs[]): 2 + type: pid_t + size: 4 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: synthetic_prio + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 000000006920cf38 + var_ref_idx (into hist_data->var_refs[]): 3 + type: int + size: 4 + is_signed: 1 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + + field_vars[0].val: + ftrace_event_field name: next_pid + type: pid_t + size: 4 + is_signed: 1 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; int prio' >> synthetic_events + +Alias test +---------- + +This example is very similar to previous examples, but demonstrates +the alias flag. + +First, we create the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +Next, we create a sched_waking trigger similar to previous examples, +but in this case we save the pid in the waking_pid variable:: + + # echo 'hist:keys=pid:waking_pid=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +For the sched_switch trigger, instead of using $waking_pid directly in +the wakeup_latency synthetic event invocation, we create an alias of +$waking_pid named $woken_pid, and use that in the synthetic event +invocation instead:: + + # echo 'hist:keys=next_pid:woken_pid=$waking_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm)' >> events/sched/sched_switch/trigger + +Looking at the sched_waking hist_debug output, in addition to the +normal fields, we can see the waking_pid variable:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:waking_pid=pid,ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000a250528c + + n_vals: 3 + n_keys: 1 + n_fields: 4 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + ftrace_event_field name: pid + var.name: waking_pid + var.idx (into tracing_map_elt.vars[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The sched_switch hist_debug output shows that a variable named +woken_pid has been created but that it also has the +HIST_FIELD_FL_ALIAS flag set. It also has the HIST_FIELD_FL_VAR flag +set, which is why it appears in the val field section. + +Despite that implementation detail, an alias variable is actually more +like a variable reference; in fact it can be thought of as a reference +to a reference. The implementation copies the var_ref->fn() from the +variable reference being referenced, in this case, the waking_pid +fn(), which is hist_field_var_ref() and makes that the fn() of the +alias. The hist_field_var_ref() fn() requires the var_ref_idx of the +variable reference it's using, so waking_pid's var_ref_idx is also +copied to the alias. The end result is that when the value of alias +is retrieved, in the end it just does the same thing the original +reference would have done and retrieves the same value from the +var_ref_vals[] array. You can verify this in the output by noting +that the var_ref_idx of the alias, in this case woken_pid, is the same +as the var_ref_idx of the reference, waking_pid, in the variable +reference fields section. + +Additionally, once it gets that value, since it is also a variable, it +then saves that value into its var.idx. So the var.idx of the +woken_pid alias is 0, which it fills with the value from var_ref_idx 0 +when its fn() is called to update itself. You'll also notice that +there's a woken_pid var_ref in the variable refs section. That is the +reference to the woken_pid alias variable, and you can see that it +retrieves the value from the same var.idx as the woken_pid alias, 0, +and then in turn saves that value in its own var_ref_idx slot, 3, and +the value at this position is finally what gets assigned to the +$woken_pid slot in the trace event invocation:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:woken_pid=$waking_pid,wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm) [active] + # + + hist_data: 0000000055d65ed0 + + n_vals: 3 + n_keys: 1 + n_fields: 4 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + HIST_FIELD_FL_ALIAS + var.name: woken_pid + var.idx (into tracing_map_elt.vars[]): 0 + var_ref_idx (into hist_data->var_refs[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: waking_pid + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000a250528c + var_ref_idx (into hist_data->var_refs[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 00000000a250528c + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 2 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: woken_pid + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 3 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->var_refs[4]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 4 + type: char[16] + size: 256 + is_signed: 0 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + field_vars[0].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:woken_pid=$waking_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events From dd873dd51d8d7868887048545d48befdd6bea7ad Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 26 May 2020 10:07:52 +0100 Subject: [PATCH 297/574] drm/i915: Reorder await_execution before await_request Reorder the code so that we can reuse the await_execution from a special case in await_request in the next patch. Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200526090753.11329-1-chris@chris-wilson.co.uk (cherry picked from commit ffb0c600c240103f6f34e07892a7e0a75502b243) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_request.c | 284 ++++++++++++++-------------- 1 file changed, 142 insertions(+), 142 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index c282719ad3ac..33bbad623e02 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1053,148 +1053,6 @@ await_fence: I915_FENCE_GFP); } -static int -i915_request_await_request(struct i915_request *to, struct i915_request *from) -{ - int ret; - - GEM_BUG_ON(to == from); - GEM_BUG_ON(to->timeline == from->timeline); - - if (i915_request_completed(from)) { - i915_sw_fence_set_error_once(&to->submit, from->fence.error); - return 0; - } - - if (to->engine->schedule) { - ret = i915_sched_node_add_dependency(&to->sched, - &from->sched, - I915_DEPENDENCY_EXTERNAL); - if (ret < 0) - return ret; - } - - if (to->engine == from->engine) - ret = i915_sw_fence_await_sw_fence_gfp(&to->submit, - &from->submit, - I915_FENCE_GFP); - else - ret = emit_semaphore_wait(to, from, I915_FENCE_GFP); - if (ret < 0) - return ret; - - return 0; -} - -static void mark_external(struct i915_request *rq) -{ - /* - * The downside of using semaphores is that we lose metadata passing - * along the signaling chain. This is particularly nasty when we - * need to pass along a fatal error such as EFAULT or EDEADLK. For - * fatal errors we want to scrub the request before it is executed, - * which means that we cannot preload the request onto HW and have - * it wait upon a semaphore. - */ - rq->sched.flags |= I915_SCHED_HAS_EXTERNAL_CHAIN; -} - -static int -__i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) -{ - mark_external(rq); - return i915_sw_fence_await_dma_fence(&rq->submit, fence, - i915_fence_context_timeout(rq->i915, - fence->context), - I915_FENCE_GFP); -} - -static int -i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) -{ - struct dma_fence *iter; - int err = 0; - - if (!to_dma_fence_chain(fence)) - return __i915_request_await_external(rq, fence); - - dma_fence_chain_for_each(iter, fence) { - struct dma_fence_chain *chain = to_dma_fence_chain(iter); - - if (!dma_fence_is_i915(chain->fence)) { - err = __i915_request_await_external(rq, iter); - break; - } - - err = i915_request_await_dma_fence(rq, chain->fence); - if (err < 0) - break; - } - - dma_fence_put(iter); - return err; -} - -int -i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) -{ - struct dma_fence **child = &fence; - unsigned int nchild = 1; - int ret; - - /* - * Note that if the fence-array was created in signal-on-any mode, - * we should *not* decompose it into its individual fences. However, - * we don't currently store which mode the fence-array is operating - * in. Fortunately, the only user of signal-on-any is private to - * amdgpu and we should not see any incoming fence-array from - * sync-file being in signal-on-any mode. - */ - if (dma_fence_is_array(fence)) { - struct dma_fence_array *array = to_dma_fence_array(fence); - - child = array->fences; - nchild = array->num_fences; - GEM_BUG_ON(!nchild); - } - - do { - fence = *child++; - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { - i915_sw_fence_set_error_once(&rq->submit, fence->error); - continue; - } - - /* - * Requests on the same timeline are explicitly ordered, along - * with their dependencies, by i915_request_add() which ensures - * that requests are submitted in-order through each ring. - */ - if (fence->context == rq->fence.context) - continue; - - /* Squash repeated waits to the same timelines */ - if (fence->context && - intel_timeline_sync_is_later(i915_request_timeline(rq), - fence)) - continue; - - if (dma_fence_is_i915(fence)) - ret = i915_request_await_request(rq, to_request(fence)); - else - ret = i915_request_await_external(rq, fence); - if (ret < 0) - return ret; - - /* Record the latest fence used against each timeline */ - if (fence->context) - intel_timeline_sync_set(i915_request_timeline(rq), - fence); - } while (--nchild); - - return 0; -} - static bool intel_timeline_sync_has_start(struct intel_timeline *tl, struct dma_fence *fence) { @@ -1282,6 +1140,55 @@ __i915_request_await_execution(struct i915_request *to, &from->fence); } +static void mark_external(struct i915_request *rq) +{ + /* + * The downside of using semaphores is that we lose metadata passing + * along the signaling chain. This is particularly nasty when we + * need to pass along a fatal error such as EFAULT or EDEADLK. For + * fatal errors we want to scrub the request before it is executed, + * which means that we cannot preload the request onto HW and have + * it wait upon a semaphore. + */ + rq->sched.flags |= I915_SCHED_HAS_EXTERNAL_CHAIN; +} + +static int +__i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) +{ + mark_external(rq); + return i915_sw_fence_await_dma_fence(&rq->submit, fence, + i915_fence_context_timeout(rq->i915, + fence->context), + I915_FENCE_GFP); +} + +static int +i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) +{ + struct dma_fence *iter; + int err = 0; + + if (!to_dma_fence_chain(fence)) + return __i915_request_await_external(rq, fence); + + dma_fence_chain_for_each(iter, fence) { + struct dma_fence_chain *chain = to_dma_fence_chain(iter); + + if (!dma_fence_is_i915(chain->fence)) { + err = __i915_request_await_external(rq, iter); + break; + } + + err = i915_request_await_dma_fence(rq, chain->fence); + if (err < 0) + break; + } + + dma_fence_put(iter); + return err; +} + int i915_request_await_execution(struct i915_request *rq, struct dma_fence *fence, @@ -1330,6 +1237,99 @@ i915_request_await_execution(struct i915_request *rq, return 0; } +static int +i915_request_await_request(struct i915_request *to, struct i915_request *from) +{ + int ret; + + GEM_BUG_ON(to == from); + GEM_BUG_ON(to->timeline == from->timeline); + + if (i915_request_completed(from)) { + i915_sw_fence_set_error_once(&to->submit, from->fence.error); + return 0; + } + + if (to->engine->schedule) { + ret = i915_sched_node_add_dependency(&to->sched, + &from->sched, + I915_DEPENDENCY_EXTERNAL); + if (ret < 0) + return ret; + } + + if (to->engine == READ_ONCE(from->engine)) + ret = i915_sw_fence_await_sw_fence_gfp(&to->submit, + &from->submit, + I915_FENCE_GFP); + else + ret = emit_semaphore_wait(to, from, I915_FENCE_GFP); + if (ret < 0) + return ret; + + return 0; +} + +int +i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) +{ + struct dma_fence **child = &fence; + unsigned int nchild = 1; + int ret; + + /* + * Note that if the fence-array was created in signal-on-any mode, + * we should *not* decompose it into its individual fences. However, + * we don't currently store which mode the fence-array is operating + * in. Fortunately, the only user of signal-on-any is private to + * amdgpu and we should not see any incoming fence-array from + * sync-file being in signal-on-any mode. + */ + if (dma_fence_is_array(fence)) { + struct dma_fence_array *array = to_dma_fence_array(fence); + + child = array->fences; + nchild = array->num_fences; + GEM_BUG_ON(!nchild); + } + + do { + fence = *child++; + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { + i915_sw_fence_set_error_once(&rq->submit, fence->error); + continue; + } + + /* + * Requests on the same timeline are explicitly ordered, along + * with their dependencies, by i915_request_add() which ensures + * that requests are submitted in-order through each ring. + */ + if (fence->context == rq->fence.context) + continue; + + /* Squash repeated waits to the same timelines */ + if (fence->context && + intel_timeline_sync_is_later(i915_request_timeline(rq), + fence)) + continue; + + if (dma_fence_is_i915(fence)) + ret = i915_request_await_request(rq, to_request(fence)); + else + ret = i915_request_await_external(rq, fence); + if (ret < 0) + return ret; + + /* Record the latest fence used against each timeline */ + if (fence->context) + intel_timeline_sync_set(i915_request_timeline(rq), + fence); + } while (--nchild); + + return 0; +} + /** * i915_request_await_object - set this request to (async) wait upon a bo * @to: request we are wishing to use From 2d19bd79ae6509858582a9cade739c2e9a4fdca8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 3 Apr 2020 14:31:21 -0500 Subject: [PATCH 298/574] tracing: Add hist_debug trace event files for histogram debugging Add a new "hist_debug" file for each trace event, which when read will dump out a bunch of internal details about the hist triggers defined on that event. This is normally off but can be enabled by saying 'y' to the new CONFIG_HIST_TRIGGERS_DEBUG config option. This is in support of the new Documentation file describing histogram internals, Documentation/trace/histogram-design.rst, which was requested by developers trying to understand the internals when extending or making use of the hist triggers for higher-level tools. The histogram-design.rst documentation refers to the hist_debug files and demonstrates their use with output in the test examples. Link: http://lkml.kernel.org/r/77914c22b0ba493d9783c53bbfbc6087d6a7e1b1.1585941485.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 23 +++ kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 4 + kernel/trace/trace_events_hist.c | 273 +++++++++++++++++++++++++++++++ 4 files changed, 301 insertions(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 743647005f64..2aec9376825f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -847,6 +847,29 @@ config KPROBE_EVENT_GEN_TEST If unsure, say N. +config HIST_TRIGGERS_DEBUG + bool "Hist trigger debug support" + depends on HIST_TRIGGERS + help + Add "hist_debug" file for each event, which when read will + dump out a bunch of internal details about the hist triggers + defined on that event. + + The hist_debug file serves a couple of purposes: + + - Helps developers verify that nothing is broken. + + - Provides educational information to support the details + of the hist trigger internals as described by + Documentation/trace/histogram-design.rst. + + The hist_debug output only covers the data structures + related to the histogram definitions themselves and doesn't + display the internals of map buckets or variable values of + running histograms. + + If unsure, say N. + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4eb1d004d5f2..def769df5bf1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1661,6 +1661,7 @@ extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; +extern const struct file_operations event_hist_debug_fops; extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 242f59e7f17d..f6f55682d3e2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2208,6 +2208,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) #ifdef CONFIG_HIST_TRIGGERS trace_create_file("hist", 0444, file->dir, file, &event_hist_fops); +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + trace_create_file("hist_debug", 0444, file->dir, file, + &event_hist_debug_fops); #endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5cfe8f998b3e..313227da4925 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6478,6 +6478,279 @@ const struct file_operations event_hist_fops = { .release = single_release, }; +#ifdef CONFIG_HIST_TRIGGERS_DEBUG +static void hist_field_debug_show_flags(struct seq_file *m, + unsigned long flags) +{ + seq_puts(m, " flags:\n"); + + if (flags & HIST_FIELD_FL_KEY) + seq_puts(m, " HIST_FIELD_FL_KEY\n"); + else if (flags & HIST_FIELD_FL_HITCOUNT) + seq_puts(m, " VAL: HIST_FIELD_FL_HITCOUNT\n"); + else if (flags & HIST_FIELD_FL_VAR) + seq_puts(m, " HIST_FIELD_FL_VAR\n"); + else if (flags & HIST_FIELD_FL_VAR_REF) + seq_puts(m, " HIST_FIELD_FL_VAR_REF\n"); + else + seq_puts(m, " VAL: normal u64 value\n"); + + if (flags & HIST_FIELD_FL_ALIAS) + seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); +} + +static int hist_field_debug_show(struct seq_file *m, + struct hist_field *field, unsigned long flags) +{ + if ((field->flags & flags) != flags) { + seq_printf(m, "ERROR: bad flags - %lx\n", flags); + return -EINVAL; + } + + hist_field_debug_show_flags(m, field->flags); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + + if (field->flags & HIST_FIELD_FL_VAR) { + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + } + + if (field->flags & HIST_FIELD_FL_ALIAS) + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + + if (field->flags & HIST_FIELD_FL_VAR_REF) { + seq_printf(m, " name: %s\n", field->name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + seq_printf(m, " var.hist_data: %p\n", field->var.hist_data); + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + if (field->system) + seq_printf(m, " system: %s\n", field->system); + if (field->event_name) + seq_printf(m, " event_name: %s\n", field->event_name); + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); + + return 0; +} + +static int field_var_debug_show(struct seq_file *m, + struct field_var *field_var, unsigned int i, + bool save_vars) +{ + const char *vars_name = save_vars ? "save_vars" : "field_vars"; + struct hist_field *field; + int ret = 0; + + seq_printf(m, "\n hist_data->%s[%d]:\n", vars_name, i); + + field = field_var->var; + + seq_printf(m, "\n %s[%d].var:\n", vars_name, i); + + hist_field_debug_show_flags(m, field->flags); + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + + field = field_var->val; + + seq_printf(m, "\n %s[%d].val:\n", vars_name, i); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + else { + ret = -EINVAL; + goto out; + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); +out: + return ret; +} + +static int hist_action_debug_show(struct seq_file *m, + struct action_data *data, int i) +{ + int ret = 0; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + seq_printf(m, "\n hist_data->actions[%d].track_data.var_ref:\n", i); + ret = hist_field_debug_show(m, data->track_data.var_ref, + HIST_FIELD_FL_VAR_REF); + if (ret) + goto out; + + seq_printf(m, "\n hist_data->actions[%d].track_data.track_var:\n", i); + ret = hist_field_debug_show(m, data->track_data.track_var, + HIST_FIELD_FL_VAR); + if (ret) + goto out; + } + + if (data->handler == HANDLER_ONMATCH) { + seq_printf(m, "\n hist_data->actions[%d].match_data.event_system: %s\n", + i, data->match_data.event_system); + seq_printf(m, " hist_data->actions[%d].match_data.event: %s\n", + i, data->match_data.event); + } +out: + return ret; +} + +static int hist_actions_debug_show(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + int i, ret = 0; + + if (hist_data->n_actions) + seq_puts(m, "\n action tracking variables (for onmax()/onchange()/onmatch()):\n"); + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *action = hist_data->actions[i]; + + ret = hist_action_debug_show(m, action, i); + if (ret) + goto out; + } + + if (hist_data->n_save_vars) + seq_puts(m, "\n save action variables (save() params):\n"); + + for (i = 0; i < hist_data->n_save_vars; i++) { + ret = field_var_debug_show(m, hist_data->save_vars[i], i, true); + if (ret) + goto out; + } +out: + return ret; +} + +static void hist_trigger_debug_show(struct seq_file *m, + struct event_trigger_data *data, int n) +{ + struct hist_trigger_data *hist_data; + int i, ret; + + if (n > 0) + seq_puts(m, "\n\n"); + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "#\n\n"); + + hist_data = data->private_data; + + seq_printf(m, "hist_data: %p\n\n", hist_data); + seq_printf(m, " n_vals: %u\n", hist_data->n_vals); + seq_printf(m, " n_keys: %u\n", hist_data->n_keys); + seq_printf(m, " n_fields: %u\n", hist_data->n_fields); + + seq_puts(m, "\n val fields:\n\n"); + + seq_puts(m, " hist_data->fields[0]:\n"); + ret = hist_field_debug_show(m, hist_data->fields[0], + HIST_FIELD_FL_HITCOUNT); + if (ret) + return; + + for (i = 1; i < hist_data->n_vals; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], 0); + if (ret) + return; + } + + seq_puts(m, "\n key fields:\n"); + + for (i = hist_data->n_vals; i < hist_data->n_fields; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], + HIST_FIELD_FL_KEY); + if (ret) + return; + } + + if (hist_data->n_var_refs) + seq_puts(m, "\n variable reference fields:\n"); + + for (i = 0; i < hist_data->n_var_refs; i++) { + seq_printf(m, "\n hist_data->var_refs[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->var_refs[i], + HIST_FIELD_FL_VAR_REF); + if (ret) + return; + } + + if (hist_data->n_field_vars) + seq_puts(m, "\n field variables:\n"); + + for (i = 0; i < hist_data->n_field_vars; i++) { + ret = field_var_debug_show(m, hist_data->field_vars[i], i, false); + if (ret) + return; + } + + ret = hist_actions_debug_show(m, hist_data); + if (ret) + return; +} + +static int hist_debug_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_debug_show(m, data, n++); + } + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_debug_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + return single_open(file, hist_debug_show, file); +} + +const struct file_operations event_hist_debug_fops = { + .open = event_hist_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { const char *field_name = hist_field_name(hist_field, 0); From 0906844545a2cbe0fabcff1c78ab9c6b0d0c2ca0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 28 May 2020 19:27:08 -0400 Subject: [PATCH 299/574] tracing/doc: Fix typos in histogram-design.rst There's a few typos in the histogram-design.rst document that need need to be fixed. Cc: Tom Zanussi Acked-by: Randy Dunlap Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram-design.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/trace/histogram-design.rst b/Documentation/trace/histogram-design.rst index c246753f0ffc..06f5c7e5f2ee 100644 --- a/Documentation/trace/histogram-design.rst +++ b/Documentation/trace/histogram-design.rst @@ -700,7 +700,7 @@ variable, as well as the referenced variable's size, type, and is_signed values. The VAR_REF field's .name is set to the name of the variable it references. If a variable reference was created using the explicit system.event.$var_ref notation, the hist_field's system and -event_name variabls are also set. +event_name variables are also set. So, in order to handle an event for the sched_switch histogram, because we have a reference to a variable on another histogram, we @@ -1445,7 +1445,7 @@ reference to the variable being tracked, in this case the $wakeup_lat variable. In order to perform the onmax() handler function, there also needs to be a variable that tracks the current maximum by getting updated whenever a new maximum is hit. In this case, we can see that -an autogenerated veriable named ' __max' has been created and is +an auto-generated variable named ' __max' has been created and is visible in the actions[].track_data.track_var variable. Finally, in the new 'save action variables' section, we can see that @@ -1611,7 +1611,7 @@ A couple special cases While the above covers the basics of the histogram internals, there are a couple of special cases that should be discussed, since they -tend to creae even more confusion. Those are field variables on other +tend to create even more confusion. Those are field variables on other histograms, and aliases, both described below through example tests using the hist_debug files. From 631a6582b75ffac82bee76a65217caa856fafb5e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 26 May 2020 10:07:53 +0100 Subject: [PATCH 300/574] drm/i915/gt: Do not schedule normal requests immediately along virtual When we push a virtual request onto the HW, we update the rq->engine to point to the physical engine. A request that is then submitted by the user that waits upon the virtual engine, but along the physical engine in use, will then see that it is due to be submitted to the same engine and take a shortcut (and be queued without waiting for the completion fence). However, the virtual request may be preempted (either by higher priority users, or by timeslicing) and removed from the physical engine to be migrated over to one of its siblings. The dependent normal request however is oblivious to the removal of the virtual request and remains queued to execute on HW, believing that once it reaches the head of its queue all of its predecessors will have completed executing! v2: Beware restriction of signal->execution_mask prior to submission. Fixes: 6d06779e8672 ("drm/i915: Load balancing across a virtual engine") Testcase: igt/gem_exec_balancer/sliced Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: # v5.3+ Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200526090753.11329-2-chris@chris-wilson.co.uk (cherry picked from commit 511b6d9aed417739b6aa49d0b6b4354ad21020f1) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_request.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 33bbad623e02..0b07ccc7e9bc 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1237,6 +1237,25 @@ i915_request_await_execution(struct i915_request *rq, return 0; } +static int +await_request_submit(struct i915_request *to, struct i915_request *from) +{ + /* + * If we are waiting on a virtual engine, then it may be + * constrained to execute on a single engine *prior* to submission. + * When it is submitted, it will be first submitted to the virtual + * engine and then passed to the physical engine. We cannot allow + * the waiter to be submitted immediately to the physical engine + * as it may then bypass the virtual request. + */ + if (to->engine == READ_ONCE(from->engine)) + return i915_sw_fence_await_sw_fence_gfp(&to->submit, + &from->submit, + I915_FENCE_GFP); + else + return __i915_request_await_execution(to, from, NULL); +} + static int i915_request_await_request(struct i915_request *to, struct i915_request *from) { @@ -1258,10 +1277,8 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from) return ret; } - if (to->engine == READ_ONCE(from->engine)) - ret = i915_sw_fence_await_sw_fence_gfp(&to->submit, - &from->submit, - I915_FENCE_GFP); + if (is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask))) + ret = await_request_submit(to, from); else ret = emit_semaphore_wait(to, from, I915_FENCE_GFP); if (ret < 0) From 0e386959272e4adf192867681a65cc0aa580c247 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 29 May 2020 15:39:26 +0100 Subject: [PATCH 301/574] drm/i915: Check for awaits on still currently executing requests With the advent of preempt-to-busy, a request may still be on the GPU as we unwind. And in the case of a unpreemptible [due to HW] request, that request will remain indefinitely on the GPU even though we have returned it back to our submission queue, and cleared the active bit. We only run the execution callbacks on transferring the request from our submission queue to the execution queue, but if this is a bonded request that the HW is waiting for, we will not submit it (as we wait for a fresh execution) even though it is still being executed. As we know that there are always preemption points between requests, we know that only the currently executing request may be still active even though we have cleared the flag. However, we do not precisely know which request is in ELSP[0] due to a delay in processing events, and furthermore we only store the last request in a context in our state tracker. Fixes: 22b7a426bbe1 ("drm/i915/execlists: Preempt-to-busy") Testcase: igt/gem_exec_balancer/bonded-dual Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200529143926.3245-1-chris@chris-wilson.co.uk (cherry picked from commit b55230e5e800868961fc271b26d9ce53ae1f691e) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_request.c | 49 ++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 0b07ccc7e9bc..def62100e666 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -357,6 +357,53 @@ void i915_request_retire_upto(struct i915_request *rq) } while (i915_request_retire(tmp) && tmp != rq); } +static struct i915_request * const * +__engine_active(struct intel_engine_cs *engine) +{ + return READ_ONCE(engine->execlists.active); +} + +static bool __request_in_flight(const struct i915_request *signal) +{ + struct i915_request * const *port, *rq; + bool inflight = false; + + if (!i915_request_is_ready(signal)) + return false; + + /* + * Even if we have unwound the request, it may still be on + * the GPU (preempt-to-busy). If that request is inside an + * unpreemptible critical section, it will not be removed. Some + * GPU functions may even be stuck waiting for the paired request + * (__await_execution) to be submitted and cannot be preempted + * until the bond is executing. + * + * As we know that there are always preemption points between + * requests, we know that only the currently executing request + * may be still active even though we have cleared the flag. + * However, we can't rely on our tracking of ELSP[0] to known + * which request is currently active and so maybe stuck, as + * the tracking maybe an event behind. Instead assume that + * if the context is still inflight, then it is still active + * even if the active flag has been cleared. + */ + if (!intel_context_inflight(signal->context)) + return false; + + rcu_read_lock(); + for (port = __engine_active(signal->engine); (rq = *port); port++) { + if (rq->context == signal->context) { + inflight = i915_seqno_passed(rq->fence.seqno, + signal->fence.seqno); + break; + } + } + rcu_read_unlock(); + + return inflight; +} + static int __await_execution(struct i915_request *rq, struct i915_request *signal, @@ -387,7 +434,7 @@ __await_execution(struct i915_request *rq, } spin_lock_irq(&signal->lock); - if (i915_request_is_active(signal)) { + if (i915_request_is_active(signal) || __request_in_flight(signal)) { if (hook) { hook(rq, &signal->fence); i915_request_put(signal); From 5bbf959de408640a99a378d05107d510b866b3ca Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 18 May 2020 13:29:24 -0500 Subject: [PATCH 302/574] tracing: Fix events.rst section numbering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The in-kernel trace event API should have its own section, and the duplicate section numbers need fixing as well. Link: https://lkml.kernel.org/r/90ea854dfb728390b50ddf8a8675238973ee014a.camel@kernel.org Reported-by: Li Xinhai Reviewed-by: Li Xinhai Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/events.rst | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 4a2ebe0bd19b..f792b1959a33 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -527,8 +527,8 @@ The following commands are supported: See Documentation/trace/histogram.rst for details and examples. -6.3 In-kernel trace event API ------------------------------ +7. In-kernel trace event API +============================ In most cases, the command-line interface to trace events is more than sufficient. Sometimes, however, applications might find the need for @@ -560,8 +560,8 @@ following: - tracing synthetic events from in-kernel code - the low-level "dynevent_cmd" API -6.3.1 Dyamically creating synthetic event definitions ------------------------------------------------------ +7.1 Dyamically creating synthetic event definitions +--------------------------------------------------- There are a couple ways to create a new synthetic event from a kernel module or other kernel code. @@ -666,8 +666,8 @@ registered by calling the synth_event_gen_cmd_end() function:: At this point, the event object is ready to be used for tracing new events. -6.3.3 Tracing synthetic events from in-kernel code --------------------------------------------------- +7.2 Tracing synthetic events from in-kernel code +------------------------------------------------ To trace a synthetic event, there are several options. The first option is to trace the event in one call, using synth_event_trace() @@ -678,8 +678,8 @@ synth_event_trace_start() and synth_event_trace_end() along with synth_event_add_next_val() or synth_event_add_val() to add the values piecewise. -6.3.3.1 Tracing a synthetic event all at once ---------------------------------------------- +7.2.1 Tracing a synthetic event all at once +------------------------------------------- To trace a synthetic event all at once, the synth_event_trace() or synth_event_trace_array() functions can be used. @@ -780,8 +780,8 @@ remove the event:: ret = synth_event_delete("schedtest"); -6.3.3.1 Tracing a synthetic event piecewise -------------------------------------------- +7.2.2 Tracing a synthetic event piecewise +----------------------------------------- To trace a synthetic using the piecewise method described above, the synth_event_trace_start() function is used to 'open' the synthetic @@ -864,8 +864,8 @@ Note that synth_event_trace_end() must be called at the end regardless of whether any of the add calls failed (say due to a bad field name being passed in). -6.3.4 Dyamically creating kprobe and kretprobe event definitions ----------------------------------------------------------------- +7.3 Dyamically creating kprobe and kretprobe event definitions +-------------------------------------------------------------- To create a kprobe or kretprobe trace event from kernel code, the kprobe_event_gen_cmd_start() or kretprobe_event_gen_cmd_start() @@ -941,8 +941,8 @@ used to give the kprobe event file back and delete the event:: ret = kprobe_event_delete("gen_kprobe_test"); -6.3.4 The "dynevent_cmd" low-level API --------------------------------------- +7.4 The "dynevent_cmd" low-level API +------------------------------------ Both the in-kernel synthetic event and kprobe interfaces are built on top of a lower-level "dynevent_cmd" interface. This interface is From 726721a51838e3983023f906580722fc83f804ee Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 28 May 2020 14:32:37 -0500 Subject: [PATCH 303/574] tracing: Move synthetic events to a separate file With the addition of the in-kernel synthetic event API, synthetic events are no longer specifically tied to the histogram triggers. The synthetic event code is also making trace_event_hist.c very bloated, so for those reasons, move it to a separate file, trace_events_synth.c, along with a new trace_synth.h header file. Because synthetic events are now independent from hist triggers, add a new CONFIG_SYNTH_EVENTS config option, and have CONFIG_HIST_TRIGGERS select it, and have CONFIG_SYNTH_EVENT_GEN_TEST depend on it. Link: http://lkml.kernel.org/r/4d1fa1f85ed5982706ac44844ac92451dcb04715.1590693308.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 20 +- kernel/trace/Makefile | 1 + kernel/trace/trace_events_hist.c | 1787 +--------------------------- kernel/trace/trace_events_synth.c | 1789 +++++++++++++++++++++++++++++ kernel/trace/trace_synth.h | 36 + 5 files changed, 1847 insertions(+), 1786 deletions(-) create mode 100644 kernel/trace/trace_events_synth.c create mode 100644 kernel/trace/trace_synth.h diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2aec9376825f..75407d5dc83a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -623,12 +623,30 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. +config SYNTH_EVENTS + bool "Synthetic trace events" + select TRACING + select DYNAMIC_EVENTS + default n + help + Synthetic events are user-defined trace events that can be + used to combine data from other trace events or in fact any + data source. Synthetic events can be generated indirectly + via the trace() action of histogram triggers or directly + by way of an in-kernel API. + + See Documentation/trace/events.rst or + Documentation/trace/histogram.rst for details and examples. + + If in doubt, say N. + config HIST_TRIGGERS bool "Histogram triggers" depends on ARCH_HAVE_NMI_SAFE_CMPXCHG select TRACING_MAP select TRACING select DYNAMIC_EVENTS + select SYNTH_EVENTS default n help Hist triggers allow one or more arbitrary trace event fields @@ -824,7 +842,7 @@ config PREEMPTIRQ_DELAY_TEST config SYNTH_EVENT_GEN_TEST tristate "Test module for in-kernel synthetic event generation" - depends on HIST_TRIGGERS + depends on SYNTH_EVENTS help This option creates a test module to check the base functionality of in-kernel synthetic event definition and diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f9dcd19165fa..1d8aaa546412 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -72,6 +72,7 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o +obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 313227da4925..0b933546142e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -19,13 +19,7 @@ #include #include "tracing_map.h" -#include "trace.h" -#include "trace_dynevent.h" - -#define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 32 - -#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#include "trace_synth.h" #define ERRORS \ C(NONE, "No error"), \ @@ -380,69 +374,6 @@ struct hist_trigger_data { unsigned int n_save_var_str; }; -static int create_synth_event(int argc, const char **argv); -static int synth_event_show(struct seq_file *m, struct dyn_event *ev); -static int synth_event_release(struct dyn_event *ev); -static bool synth_event_is_busy(struct dyn_event *ev); -static bool synth_event_match(const char *system, const char *event, - int argc, const char **argv, struct dyn_event *ev); - -static struct dyn_event_operations synth_event_ops = { - .create = create_synth_event, - .show = synth_event_show, - .is_busy = synth_event_is_busy, - .free = synth_event_release, - .match = synth_event_match, -}; - -struct synth_field { - char *type; - char *name; - size_t size; - unsigned int offset; - bool is_signed; - bool is_string; -}; - -struct synth_event { - struct dyn_event devent; - int ref; - char *name; - struct synth_field **fields; - unsigned int n_fields; - unsigned int n_u64; - struct trace_event_class class; - struct trace_event_call call; - struct tracepoint *tp; - struct module *mod; -}; - -static bool is_synth_event(struct dyn_event *ev) -{ - return ev->ops == &synth_event_ops; -} - -static struct synth_event *to_synth_event(struct dyn_event *ev) -{ - return container_of(ev, struct synth_event, devent); -} - -static bool synth_event_is_busy(struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - - return event->ref != 0; -} - -static bool synth_event_match(const char *system, const char *event, - int argc, const char **argv, struct dyn_event *ev) -{ - struct synth_event *sev = to_synth_event(ev); - - return strcmp(sev->name, event) == 0 && - (!system || strcmp(system, SYNTH_SYSTEM) == 0); -} - struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, @@ -589,6 +520,7 @@ static struct track_data *track_data_alloc(unsigned int key_len, track_data_free(data); return ERR_PTR(-ENOMEM); } + data->elt.private_data = elt_data; elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); @@ -621,7 +553,6 @@ static void last_cmd_set(struct trace_event_file *file, char *str) if (file) { call = file->event_call; - system = call->class->system; if (system) { name = trace_event_name(call); @@ -646,510 +577,6 @@ static void hist_err_clear(void) last_cmd_loc[0] = '\0'; } -struct synth_trace_event { - struct trace_entry ent; - u64 fields[]; -}; - -static int synth_event_define_fields(struct trace_event_call *call) -{ - struct synth_trace_event trace; - int offset = offsetof(typeof(trace), fields); - struct synth_event *event = call->data; - unsigned int i, size, n_u64; - char *name, *type; - bool is_signed; - int ret = 0; - - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - size = event->fields[i]->size; - is_signed = event->fields[i]->is_signed; - type = event->fields[i]->type; - name = event->fields[i]->name; - ret = trace_define_field(call, type, name, offset, size, - is_signed, FILTER_OTHER); - if (ret) - break; - - event->fields[i]->offset = n_u64; - - if (event->fields[i]->is_string) { - offset += STR_VAR_LEN_MAX; - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - offset += sizeof(u64); - n_u64++; - } - } - - event->n_u64 = n_u64; - - return ret; -} - -static bool synth_field_signed(char *type) -{ - if (str_has_prefix(type, "u")) - return false; - if (strcmp(type, "gfp_t") == 0) - return false; - - return true; -} - -static int synth_field_is_string(char *type) -{ - if (strstr(type, "char[") != NULL) - return true; - - return false; -} - -static int synth_field_string_size(char *type) -{ - char buf[4], *end, *start; - unsigned int len; - int size, err; - - start = strstr(type, "char["); - if (start == NULL) - return -EINVAL; - start += sizeof("char[") - 1; - - end = strchr(type, ']'); - if (!end || end < start) - return -EINVAL; - - len = end - start; - if (len > 3) - return -EINVAL; - - strncpy(buf, start, len); - buf[len] = '\0'; - - err = kstrtouint(buf, 0, &size); - if (err) - return err; - - if (size > STR_VAR_LEN_MAX) - return -EINVAL; - - return size; -} - -static int synth_field_size(char *type) -{ - int size = 0; - - if (strcmp(type, "s64") == 0) - size = sizeof(s64); - else if (strcmp(type, "u64") == 0) - size = sizeof(u64); - else if (strcmp(type, "s32") == 0) - size = sizeof(s32); - else if (strcmp(type, "u32") == 0) - size = sizeof(u32); - else if (strcmp(type, "s16") == 0) - size = sizeof(s16); - else if (strcmp(type, "u16") == 0) - size = sizeof(u16); - else if (strcmp(type, "s8") == 0) - size = sizeof(s8); - else if (strcmp(type, "u8") == 0) - size = sizeof(u8); - else if (strcmp(type, "char") == 0) - size = sizeof(char); - else if (strcmp(type, "unsigned char") == 0) - size = sizeof(unsigned char); - else if (strcmp(type, "int") == 0) - size = sizeof(int); - else if (strcmp(type, "unsigned int") == 0) - size = sizeof(unsigned int); - else if (strcmp(type, "long") == 0) - size = sizeof(long); - else if (strcmp(type, "unsigned long") == 0) - size = sizeof(unsigned long); - else if (strcmp(type, "pid_t") == 0) - size = sizeof(pid_t); - else if (strcmp(type, "gfp_t") == 0) - size = sizeof(gfp_t); - else if (synth_field_is_string(type)) - size = synth_field_string_size(type); - - return size; -} - -static const char *synth_field_fmt(char *type) -{ - const char *fmt = "%llu"; - - if (strcmp(type, "s64") == 0) - fmt = "%lld"; - else if (strcmp(type, "u64") == 0) - fmt = "%llu"; - else if (strcmp(type, "s32") == 0) - fmt = "%d"; - else if (strcmp(type, "u32") == 0) - fmt = "%u"; - else if (strcmp(type, "s16") == 0) - fmt = "%d"; - else if (strcmp(type, "u16") == 0) - fmt = "%u"; - else if (strcmp(type, "s8") == 0) - fmt = "%d"; - else if (strcmp(type, "u8") == 0) - fmt = "%u"; - else if (strcmp(type, "char") == 0) - fmt = "%d"; - else if (strcmp(type, "unsigned char") == 0) - fmt = "%u"; - else if (strcmp(type, "int") == 0) - fmt = "%d"; - else if (strcmp(type, "unsigned int") == 0) - fmt = "%u"; - else if (strcmp(type, "long") == 0) - fmt = "%ld"; - else if (strcmp(type, "unsigned long") == 0) - fmt = "%lu"; - else if (strcmp(type, "pid_t") == 0) - fmt = "%d"; - else if (strcmp(type, "gfp_t") == 0) - fmt = "%x"; - else if (synth_field_is_string(type)) - fmt = "%s"; - - return fmt; -} - -static void print_synth_event_num_val(struct trace_seq *s, - char *print_fmt, char *name, - int size, u64 val, char *space) -{ - switch (size) { - case 1: - trace_seq_printf(s, print_fmt, name, (u8)val, space); - break; - - case 2: - trace_seq_printf(s, print_fmt, name, (u16)val, space); - break; - - case 4: - trace_seq_printf(s, print_fmt, name, (u32)val, space); - break; - - default: - trace_seq_printf(s, print_fmt, name, val, space); - break; - } -} - -static enum print_line_t print_synth_event(struct trace_iterator *iter, - int flags, - struct trace_event *event) -{ - struct trace_array *tr = iter->tr; - struct trace_seq *s = &iter->seq; - struct synth_trace_event *entry; - struct synth_event *se; - unsigned int i, n_u64; - char print_fmt[32]; - const char *fmt; - - entry = (struct synth_trace_event *)iter->ent; - se = container_of(event, struct synth_event, call.event); - - trace_seq_printf(s, "%s: ", se->name); - - for (i = 0, n_u64 = 0; i < se->n_fields; i++) { - if (trace_seq_has_overflowed(s)) - goto end; - - fmt = synth_field_fmt(se->fields[i]->type); - - /* parameter types */ - if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) - trace_seq_printf(s, "%s ", fmt); - - snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); - - /* parameter values */ - if (se->fields[i]->is_string) { - trace_seq_printf(s, print_fmt, se->fields[i]->name, - (char *)&entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; - char *space = (i == se->n_fields - 1 ? "" : " "); - - print_synth_event_num_val(s, print_fmt, - se->fields[i]->name, - se->fields[i]->size, - entry->fields[n_u64], - space); - - if (strcmp(se->fields[i]->type, "gfp_t") == 0) { - trace_seq_puts(s, " ("); - trace_print_flags_seq(s, "|", - entry->fields[n_u64], - __flags); - trace_seq_putc(s, ')'); - } - n_u64++; - } - } -end: - trace_seq_putc(s, '\n'); - - return trace_handle_return(s); -} - -static struct trace_event_functions synth_event_funcs = { - .trace = print_synth_event -}; - -static notrace void trace_event_raw_event_synth(void *__data, - u64 *var_ref_vals, - unsigned int *var_ref_idx) -{ - struct trace_event_file *trace_file = __data; - struct synth_trace_event *entry; - struct trace_event_buffer fbuffer; - struct trace_buffer *buffer; - struct synth_event *event; - unsigned int i, n_u64, val_idx; - int fields_size = 0; - - event = trace_file->event_call->data; - - if (trace_trigger_soft_disabled(trace_file)) - return; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - - entry = trace_event_buffer_reserve(&fbuffer, trace_file, - sizeof(*entry) + fields_size); - if (!entry) - goto out; - - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - val_idx = var_ref_idx[i]; - if (event->fields[i]->is_string) { - char *str_val = (char *)(long)var_ref_vals[val_idx]; - char *str_field = (char *)&entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = event->fields[i]; - u64 val = var_ref_vals[val_idx]; - - switch (field->size) { - case 1: - *(u8 *)&entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&entry->fields[n_u64] = (u32)val; - break; - - default: - entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } - - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); -} - -static void free_synth_event_print_fmt(struct trace_event_call *call) -{ - if (call) { - kfree(call->print_fmt); - call->print_fmt = NULL; - } -} - -static int __set_synth_event_print_fmt(struct synth_event *event, - char *buf, int len) -{ - const char *fmt; - int pos = 0; - int i; - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - for (i = 0; i < event->n_fields; i++) { - fmt = synth_field_fmt(event->fields[i]->type); - pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", - event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); - } - pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - - for (i = 0; i < event->n_fields; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", REC->%s", event->fields[i]->name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_synth_event_print_fmt(struct trace_event_call *call) -{ - struct synth_event *event = call->data; - char *print_fmt; - int len; - - /* First: called with 0 length to calculate the needed length */ - len = __set_synth_event_print_fmt(event, NULL, 0); - - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_synth_event_print_fmt(event, print_fmt, len + 1); - call->print_fmt = print_fmt; - - return 0; -} - -static void free_synth_field(struct synth_field *field) -{ - kfree(field->type); - kfree(field->name); - kfree(field); -} - -static struct synth_field *parse_synth_field(int argc, const char **argv, - int *consumed) -{ - struct synth_field *field; - const char *prefix = NULL, *field_type = argv[0], *field_name, *array; - int len, ret = 0; - - if (field_type[0] == ';') - field_type++; - - if (!strcmp(field_type, "unsigned")) { - if (argc < 3) - return ERR_PTR(-EINVAL); - prefix = "unsigned "; - field_type = argv[1]; - field_name = argv[2]; - *consumed = 3; - } else { - field_name = argv[1]; - *consumed = 2; - } - - field = kzalloc(sizeof(*field), GFP_KERNEL); - if (!field) - return ERR_PTR(-ENOMEM); - - len = strlen(field_name); - array = strchr(field_name, '['); - if (array) - len -= strlen(array); - else if (field_name[len - 1] == ';') - len--; - - field->name = kmemdup_nul(field_name, len, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; - goto free; - } - - if (field_type[0] == ';') - field_type++; - len = strlen(field_type) + 1; - if (array) - len += strlen(array); - if (prefix) - len += strlen(prefix); - - field->type = kzalloc(len, GFP_KERNEL); - if (!field->type) { - ret = -ENOMEM; - goto free; - } - if (prefix) - strcat(field->type, prefix); - strcat(field->type, field_type); - if (array) { - strcat(field->type, array); - if (field->type[len - 1] == ';') - field->type[len - 1] = '\0'; - } - - field->size = synth_field_size(field->type); - if (!field->size) { - ret = -EINVAL; - goto free; - } - - if (synth_field_is_string(field->type)) - field->is_string = true; - - field->is_signed = synth_field_signed(field->type); - - out: - return field; - free: - free_synth_field(field); - field = ERR_PTR(ret); - goto out; -} - -static void free_synth_tracepoint(struct tracepoint *tp) -{ - if (!tp) - return; - - kfree(tp->name); - kfree(tp); -} - -static struct tracepoint *alloc_synth_tracepoint(char *name) -{ - struct tracepoint *tp; - - tp = kzalloc(sizeof(*tp), GFP_KERNEL); - if (!tp) - return ERR_PTR(-ENOMEM); - - tp->name = kstrdup(name, GFP_KERNEL); - if (!tp->name) { - kfree(tp); - return ERR_PTR(-ENOMEM); - } - - return tp; -} - typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, unsigned int *var_ref_idx); @@ -1177,145 +604,6 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, } } -static struct synth_event *find_synth_event(const char *name) -{ - struct dyn_event *pos; - struct synth_event *event; - - for_each_dyn_event(pos) { - if (!is_synth_event(pos)) - continue; - event = to_synth_event(pos); - if (strcmp(event->name, name) == 0) - return event; - } - - return NULL; -} - -static struct trace_event_fields synth_event_fields_array[] = { - { .type = TRACE_FUNCTION_TYPE, - .define_fields = synth_event_define_fields }, - {} -}; - -static int register_synth_event(struct synth_event *event) -{ - struct trace_event_call *call = &event->call; - int ret = 0; - - event->call.class = &event->class; - event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); - if (!event->class.system) { - ret = -ENOMEM; - goto out; - } - - event->tp = alloc_synth_tracepoint(event->name); - if (IS_ERR(event->tp)) { - ret = PTR_ERR(event->tp); - event->tp = NULL; - goto out; - } - - INIT_LIST_HEAD(&call->class->fields); - call->event.funcs = &synth_event_funcs; - call->class->fields_array = synth_event_fields_array; - - ret = register_trace_event(&call->event); - if (!ret) { - ret = -ENODEV; - goto out; - } - call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; - call->class->probe = trace_event_raw_event_synth; - call->data = event; - call->tp = event->tp; - - ret = trace_add_event_call(call); - if (ret) { - pr_warn("Failed to register synthetic event: %s\n", - trace_event_name(call)); - goto err; - } - - ret = set_synth_event_print_fmt(call); - if (ret < 0) { - trace_remove_event_call(call); - goto err; - } - out: - return ret; - err: - unregister_trace_event(&call->event); - goto out; -} - -static int unregister_synth_event(struct synth_event *event) -{ - struct trace_event_call *call = &event->call; - int ret; - - ret = trace_remove_event_call(call); - - return ret; -} - -static void free_synth_event(struct synth_event *event) -{ - unsigned int i; - - if (!event) - return; - - for (i = 0; i < event->n_fields; i++) - free_synth_field(event->fields[i]); - - kfree(event->fields); - kfree(event->name); - kfree(event->class.system); - free_synth_tracepoint(event->tp); - free_synth_event_print_fmt(&event->call); - kfree(event); -} - -static struct synth_event *alloc_synth_event(const char *name, int n_fields, - struct synth_field **fields) -{ - struct synth_event *event; - unsigned int i; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) { - event = ERR_PTR(-ENOMEM); - goto out; - } - - event->name = kstrdup(name, GFP_KERNEL); - if (!event->name) { - kfree(event); - event = ERR_PTR(-ENOMEM); - goto out; - } - - event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); - if (!event->fields) { - free_synth_event(event); - event = ERR_PTR(-ENOMEM); - goto out; - } - - dyn_event_init(&event->devent, &synth_event_ops); - - for (i = 0; i < n_fields; i++) - event->fields[i] = fields[i]; - - event->n_fields = n_fields; - out: - return event; -} - static void action_trace(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, struct ring_buffer_event *rbe, void *key, @@ -1331,1043 +619,6 @@ struct hist_var_data { struct hist_trigger_data *hist_data; }; -static int synth_event_check_arg_fn(void *data) -{ - struct dynevent_arg_pair *arg_pair = data; - int size; - - size = synth_field_size((char *)arg_pair->lhs); - - return size ? 0 : -EINVAL; -} - -/** - * synth_event_add_field - Add a new field to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @type: The type of the new field to add - * @name: The name of the new field to add - * - * Add a new field to a synthetic event cmd object. Field ordering is in - * the same order the fields are added. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, - const char *name) -{ - struct dynevent_arg_pair arg_pair; - int ret; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (!type || !name) - return -EINVAL; - - dynevent_arg_pair_init(&arg_pair, 0, ';'); - - arg_pair.lhs = type; - arg_pair.rhs = name; - - ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); - if (ret) - return ret; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) - ret = -EINVAL; - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_field); - -/** - * synth_event_add_field_str - Add a new field to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @type_name: The type and name of the new field to add, as a single string - * - * Add a new field to a synthetic event cmd object, as a single - * string. The @type_name string is expected to be of the form 'type - * name', which will be appended by ';'. No sanity checking is done - - * what's passed in is assumed to already be well-formed. Field - * ordering is in the same order the fields are added. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) -{ - struct dynevent_arg arg; - int ret; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (!type_name) - return -EINVAL; - - dynevent_arg_init(&arg, ';'); - - arg.str = type_name; - - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) - ret = -EINVAL; - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_field_str); - -/** - * synth_event_add_fields - Add multiple fields to a synthetic event cmd - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * - * Add a new set of fields to a synthetic event cmd object. The event - * fields that will be defined for the event should be passed in as an - * array of struct synth_field_desc, and the number of elements in the - * array passed in as n_fields. Field ordering will retain the - * ordering given in the fields array. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_add_fields(struct dynevent_cmd *cmd, - struct synth_field_desc *fields, - unsigned int n_fields) -{ - unsigned int i; - int ret = 0; - - for (i = 0; i < n_fields; i++) { - if (fields[i].type == NULL || fields[i].name == NULL) { - ret = -EINVAL; - break; - } - - ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_add_fields); - -/** - * __synth_event_gen_cmd_start - Start a synthetic event command from arg list - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @name: The name of the synthetic event - * @mod: The module creating the event, NULL if not created from a module - * @args: Variable number of arg (pairs), one pair for each field - * - * NOTE: Users normally won't want to call this function directly, but - * rather use the synth_event_gen_cmd_start() wrapper, which - * automatically adds a NULL to the end of the arg list. If this - * function is used directly, make sure the last arg in the variable - * arg list is NULL. - * - * Generate a synthetic event command to be executed by - * synth_event_gen_cmd_end(). This function can be used to generate - * the complete command or only the first part of it; in the latter - * case, synth_event_add_field(), synth_event_add_field_str(), or - * synth_event_add_fields() can be used to add more fields following - * this. - * - * There should be an even number variable args, each pair consisting - * of a type followed by a field name. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, - struct module *mod, ...) -{ - struct dynevent_arg arg; - va_list args; - int ret; - - cmd->event_name = name; - cmd->private_data = mod; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - dynevent_arg_init(&arg, 0); - arg.str = name; - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - va_start(args, mod); - for (;;) { - const char *type, *name; - - type = va_arg(args, const char *); - if (!type) - break; - name = va_arg(args, const char *); - if (!name) - break; - - if (++cmd->n_fields > SYNTH_FIELDS_MAX) { - ret = -EINVAL; - break; - } - - ret = synth_event_add_field(cmd, type, name); - if (ret) - break; - } - va_end(args); - - return ret; -} -EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); - -/** - * synth_event_gen_cmd_array_start - Start synthetic event command from an array - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @name: The name of the synthetic event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * - * Generate a synthetic event command to be executed by - * synth_event_gen_cmd_end(). This function can be used to generate - * the complete command or only the first part of it; in the latter - * case, synth_event_add_field(), synth_event_add_field_str(), or - * synth_event_add_fields() can be used to add more fields following - * this. - * - * The event fields that will be defined for the event should be - * passed in as an array of struct synth_field_desc, and the number of - * elements in the array passed in as n_fields. Field ordering will - * retain the ordering given in the fields array. - * - * See synth_field_size() for available types. If field_name contains - * [n] the field is considered to be an array. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, - struct module *mod, - struct synth_field_desc *fields, - unsigned int n_fields) -{ - struct dynevent_arg arg; - unsigned int i; - int ret = 0; - - cmd->event_name = name; - cmd->private_data = mod; - - if (cmd->type != DYNEVENT_TYPE_SYNTH) - return -EINVAL; - - if (n_fields > SYNTH_FIELDS_MAX) - return -EINVAL; - - dynevent_arg_init(&arg, 0); - arg.str = name; - ret = dynevent_arg_add(cmd, &arg, NULL); - if (ret) - return ret; - - for (i = 0; i < n_fields; i++) { - if (fields[i].type == NULL || fields[i].name == NULL) - return -EINVAL; - - ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); - if (ret) - break; - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); - -static int __create_synth_event(int argc, const char *name, const char **argv) -{ - struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; - struct synth_event *event = NULL; - int i, consumed = 0, n_fields = 0, ret = 0; - - /* - * Argument syntax: - * - Add synthetic event: field[;field] ... - * - Remove synthetic event: ! field[;field] ... - * where 'field' = type field_name - */ - - if (name[0] == '\0' || argc < 1) - return -EINVAL; - - mutex_lock(&event_mutex); - - event = find_synth_event(name); - if (event) { - ret = -EEXIST; - goto out; - } - - for (i = 0; i < argc - 1; i++) { - if (strcmp(argv[i], ";") == 0) - continue; - if (n_fields == SYNTH_FIELDS_MAX) { - ret = -EINVAL; - goto err; - } - - field = parse_synth_field(argc - i, &argv[i], &consumed); - if (IS_ERR(field)) { - ret = PTR_ERR(field); - goto err; - } - fields[n_fields++] = field; - i += consumed - 1; - } - - if (i < argc && strcmp(argv[i], ";") != 0) { - ret = -EINVAL; - goto err; - } - - event = alloc_synth_event(name, n_fields, fields); - if (IS_ERR(event)) { - ret = PTR_ERR(event); - event = NULL; - goto err; - } - ret = register_synth_event(event); - if (!ret) - dyn_event_add(&event->devent); - else - free_synth_event(event); - out: - mutex_unlock(&event_mutex); - - return ret; - err: - for (i = 0; i < n_fields; i++) - free_synth_field(fields[i]); - - goto out; -} - -/** - * synth_event_create - Create a new synthetic event - * @name: The name of the new sythetic event - * @fields: An array of type/name field descriptions - * @n_fields: The number of field descriptions contained in the fields array - * @mod: The module creating the event, NULL if not created from a module - * - * Create a new synthetic event with the given name under the - * trace/events/synthetic/ directory. The event fields that will be - * defined for the event should be passed in as an array of struct - * synth_field_desc, and the number elements in the array passed in as - * n_fields. Field ordering will retain the ordering given in the - * fields array. - * - * If the new synthetic event is being created from a module, the mod - * param must be non-NULL. This will ensure that the trace buffer - * won't contain unreadable events. - * - * The new synth event should be deleted using synth_event_delete() - * function. The new synthetic event can be generated from modules or - * other kernel code using trace_synth_event() and related functions. - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_create(const char *name, struct synth_field_desc *fields, - unsigned int n_fields, struct module *mod) -{ - struct dynevent_cmd cmd; - char *buf; - int ret; - - buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); - - ret = synth_event_gen_cmd_array_start(&cmd, name, mod, - fields, n_fields); - if (ret) - goto out; - - ret = synth_event_gen_cmd_end(&cmd); - out: - kfree(buf); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_create); - -static int destroy_synth_event(struct synth_event *se) -{ - int ret; - - if (se->ref) - ret = -EBUSY; - else { - ret = unregister_synth_event(se); - if (!ret) { - dyn_event_remove(&se->devent); - free_synth_event(se); - } - } - - return ret; -} - -/** - * synth_event_delete - Delete a synthetic event - * @event_name: The name of the new sythetic event - * - * Delete a synthetic event that was created with synth_event_create(). - * - * Return: 0 if successful, error otherwise. - */ -int synth_event_delete(const char *event_name) -{ - struct synth_event *se = NULL; - struct module *mod = NULL; - int ret = -ENOENT; - - mutex_lock(&event_mutex); - se = find_synth_event(event_name); - if (se) { - mod = se->mod; - ret = destroy_synth_event(se); - } - mutex_unlock(&event_mutex); - - if (mod) { - mutex_lock(&trace_types_lock); - /* - * It is safest to reset the ring buffer if the module - * being unloaded registered any events that were - * used. The only worry is if a new module gets - * loaded, and takes on the same id as the events of - * this module. When printing out the buffer, traced - * events left over from this module may be passed to - * the new module events and unexpected results may - * occur. - */ - tracing_reset_all_online_cpus(); - mutex_unlock(&trace_types_lock); - } - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_delete); - -static int create_or_delete_synth_event(int argc, char **argv) -{ - const char *name = argv[0]; - int ret; - - /* trace_run_command() ensures argc != 0 */ - if (name[0] == '!') { - ret = synth_event_delete(name + 1); - return ret; - } - - ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); - return ret == -ECANCELED ? -EINVAL : ret; -} - -static int synth_event_run_command(struct dynevent_cmd *cmd) -{ - struct synth_event *se; - int ret; - - ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); - if (ret) - return ret; - - se = find_synth_event(cmd->event_name); - if (WARN_ON(!se)) - return -ENOENT; - - se->mod = cmd->private_data; - - return ret; -} - -/** - * synth_event_cmd_init - Initialize a synthetic event command object - * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @buf: A pointer to the buffer used to build the command - * @maxlen: The length of the buffer passed in @buf - * - * Initialize a synthetic event command object. Use this before - * calling any of the other dyenvent_cmd functions. - */ -void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) -{ - dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, - synth_event_run_command); -} -EXPORT_SYMBOL_GPL(synth_event_cmd_init); - -static inline int -__synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) -{ - int entry_size, fields_size = 0; - int ret = 0; - - memset(trace_state, '\0', sizeof(*trace_state)); - - /* - * Normal event tracing doesn't get called at all unless the - * ENABLED bit is set (which attaches the probe thus allowing - * this code to be called, etc). Because this is called - * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated - * trace case, we save the enabed state upon start and just - * ignore the following data calls. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) { - trace_state->disabled = true; - goto out; - } - - trace_state->event = file->event_call->data; - - fields_size = trace_state->event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - trace_state->buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(trace_state->buffer); - - entry_size = sizeof(*trace_state->entry) + fields_size; - trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, - file, - entry_size); - if (!trace_state->entry) { - ring_buffer_nest_end(trace_state->buffer); - ret = -EINVAL; - } -out: - return ret; -} - -static inline void -__synth_event_trace_end(struct synth_event_trace_state *trace_state) -{ - trace_event_buffer_commit(&trace_state->fbuffer); - - ring_buffer_nest_end(trace_state->buffer); -} - -/** - * synth_event_trace - Trace a synthetic event - * @file: The trace_event_file representing the synthetic event - * @n_vals: The number of values in vals - * @args: Variable number of args containing the event values - * - * Trace a synthetic event using the values passed in the variable - * argument list. - * - * The argument list should be a list 'n_vals' u64 values. The number - * of vals must match the number of field in the synthetic event, and - * must be in the same order as the synthetic event fields. - * - * All vals should be cast to u64, and string vals are just pointers - * to strings, cast to u64. Strings will be copied into space - * reserved in the event for the string, using these pointers. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) -{ - struct synth_event_trace_state state; - unsigned int i, n_u64; - va_list args; - int ret; - - ret = __synth_event_trace_start(file, &state); - if (ret || state.disabled) - return ret; - - if (n_vals != state.event->n_fields) { - ret = -EINVAL; - goto out; - } - - va_start(args, n_vals); - for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { - u64 val; - - val = va_arg(args, u64); - - if (state.event->fields[i]->is_string) { - char *str_val = (char *)(long)val; - char *str_field = (char *)&state.entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = state.event->fields[i]; - - switch (field->size) { - case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; - break; - - default: - state.entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } - va_end(args); -out: - __synth_event_trace_end(&state); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace); - -/** - * synth_event_trace_array - Trace a synthetic event from an array - * @file: The trace_event_file representing the synthetic event - * @vals: Array of values - * @n_vals: The number of values in vals - * - * Trace a synthetic event using the values passed in as 'vals'. - * - * The 'vals' array is just an array of 'n_vals' u64. The number of - * vals must match the number of field in the synthetic event, and - * must be in the same order as the synthetic event fields. - * - * All vals should be cast to u64, and string vals are just pointers - * to strings, cast to u64. Strings will be copied into space - * reserved in the event for the string, using these pointers. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_array(struct trace_event_file *file, u64 *vals, - unsigned int n_vals) -{ - struct synth_event_trace_state state; - unsigned int i, n_u64; - int ret; - - ret = __synth_event_trace_start(file, &state); - if (ret || state.disabled) - return ret; - - if (n_vals != state.event->n_fields) { - ret = -EINVAL; - goto out; - } - - for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { - if (state.event->fields[i]->is_string) { - char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&state.entry->fields[n_u64]; - - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); - } else { - struct synth_field *field = state.event->fields[i]; - u64 val = vals[i]; - - switch (field->size) { - case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; - break; - - case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; - break; - - case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; - break; - - default: - state.entry->fields[n_u64] = val; - break; - } - n_u64++; - } - } -out: - __synth_event_trace_end(&state); - - return ret; -} -EXPORT_SYMBOL_GPL(synth_event_trace_array); - -/** - * synth_event_trace_start - Start piecewise synthetic event trace - * @file: The trace_event_file representing the synthetic event - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Start the trace of a synthetic event field-by-field rather than all - * at once. - * - * This function 'opens' an event trace, which means space is reserved - * for the event in the trace buffer, after which the event's - * individual field values can be set through either - * synth_event_add_next_val() or synth_event_add_val(). - * - * A pointer to a trace_state object is passed in, which will keep - * track of the current event trace state until the event trace is - * closed (and the event finally traced) using - * synth_event_trace_end(). - * - * Note that synth_event_trace_end() must be called after all values - * have been added for each event trace, regardless of whether adding - * all field values succeeded or not. - * - * Note also that for a given event trace, all fields must be added - * using either synth_event_add_next_val() or synth_event_add_val() - * but not both together or interleaved. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) -{ - if (!trace_state) - return -EINVAL; - - return __synth_event_trace_start(file, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_trace_start); - -static int __synth_event_add_val(const char *field_name, u64 val, - struct synth_event_trace_state *trace_state) -{ - struct synth_field *field = NULL; - struct synth_trace_event *entry; - struct synth_event *event; - int i, ret = 0; - - if (!trace_state) { - ret = -EINVAL; - goto out; - } - - /* can't mix add_next_synth_val() with add_synth_val() */ - if (field_name) { - if (trace_state->add_next) { - ret = -EINVAL; - goto out; - } - trace_state->add_name = true; - } else { - if (trace_state->add_name) { - ret = -EINVAL; - goto out; - } - trace_state->add_next = true; - } - - if (trace_state->disabled) - goto out; - - event = trace_state->event; - if (trace_state->add_name) { - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; - if (strcmp(field->name, field_name) == 0) - break; - } - if (!field) { - ret = -EINVAL; - goto out; - } - } else { - if (trace_state->cur_field >= event->n_fields) { - ret = -EINVAL; - goto out; - } - field = event->fields[trace_state->cur_field++]; - } - - entry = trace_state->entry; - if (field->is_string) { - char *str_val = (char *)(long)val; - char *str_field; - - if (!str_val) { - ret = -EINVAL; - goto out; - } - - str_field = (char *)&entry->fields[field->offset]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else { - switch (field->size) { - case 1: - *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; - break; - - case 2: - *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; - break; - - case 4: - *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; - break; - - default: - trace_state->entry->fields[field->offset] = val; - break; - } - } - out: - return ret; -} - -/** - * synth_event_add_next_val - Add the next field's value to an open synth trace - * @val: The value to set the next field to - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Set the value of the next field in an event that's been opened by - * synth_event_trace_start(). - * - * The val param should be the value cast to u64. If the value points - * to a string, the val param should be a char * cast to u64. - * - * This function assumes all the fields in an event are to be set one - * after another - successive calls to this function are made, one for - * each field, in the order of the fields in the event, until all - * fields have been set. If you'd rather set each field individually - * without regard to ordering, synth_event_add_val() can be used - * instead. - * - * Note however that synth_event_add_next_val() and - * synth_event_add_val() can't be intermixed for a given event trace - - * one or the other but not both can be used at the same time. - * - * Note also that synth_event_trace_end() must be called after all - * values have been added for each event trace, regardless of whether - * adding all field values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_add_next_val(u64 val, - struct synth_event_trace_state *trace_state) -{ - return __synth_event_add_val(NULL, val, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_add_next_val); - -/** - * synth_event_add_val - Add a named field's value to an open synth trace - * @field_name: The name of the synthetic event field value to set - * @val: The value to set the next field to - * @trace_state: A pointer to object tracking the piecewise trace state - * - * Set the value of the named field in an event that's been opened by - * synth_event_trace_start(). - * - * The val param should be the value cast to u64. If the value points - * to a string, the val param should be a char * cast to u64. - * - * This function looks up the field name, and if found, sets the field - * to the specified value. This lookup makes this function more - * expensive than synth_event_add_next_val(), so use that or the - * none-piecewise synth_event_trace() instead if efficiency is more - * important. - * - * Note however that synth_event_add_next_val() and - * synth_event_add_val() can't be intermixed for a given event trace - - * one or the other but not both can be used at the same time. - * - * Note also that synth_event_trace_end() must be called after all - * values have been added for each event trace, regardless of whether - * adding all field values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_add_val(const char *field_name, u64 val, - struct synth_event_trace_state *trace_state) -{ - return __synth_event_add_val(field_name, val, trace_state); -} -EXPORT_SYMBOL_GPL(synth_event_add_val); - -/** - * synth_event_trace_end - End piecewise synthetic event trace - * @trace_state: A pointer to object tracking the piecewise trace state - * - * End the trace of a synthetic event opened by - * synth_event_trace__start(). - * - * This function 'closes' an event trace, which basically means that - * it commits the reserved event and cleans up other loose ends. - * - * A pointer to a trace_state object is passed in, which will keep - * track of the current event trace state opened with - * synth_event_trace_start(). - * - * Note that this function must be called after all values have been - * added for each event trace, regardless of whether adding all field - * values succeeded or not. - * - * Return: 0 on success, err otherwise. - */ -int synth_event_trace_end(struct synth_event_trace_state *trace_state) -{ - if (!trace_state) - return -EINVAL; - - __synth_event_trace_end(trace_state); - - return 0; -} -EXPORT_SYMBOL_GPL(synth_event_trace_end); - -static int create_synth_event(int argc, const char **argv) -{ - const char *name = argv[0]; - int len; - - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; - - /* This interface accepts group name prefix */ - if (strchr(name, '/')) { - len = str_has_prefix(name, SYNTH_SYSTEM "/"); - if (len == 0) - return -EINVAL; - name += len; - } - return __create_synth_event(argc - 1, name, argv + 1); -} - -static int synth_event_release(struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - int ret; - - if (event->ref) - return -EBUSY; - - ret = unregister_synth_event(event); - if (ret) - return ret; - - dyn_event_remove(ev); - free_synth_event(event); - return 0; -} - -static int __synth_event_show(struct seq_file *m, struct synth_event *event) -{ - struct synth_field *field; - unsigned int i; - - seq_printf(m, "%s\t", event->name); - - for (i = 0; i < event->n_fields; i++) { - field = event->fields[i]; - - /* parameter values */ - seq_printf(m, "%s %s%s", field->type, field->name, - i == event->n_fields - 1 ? "" : "; "); - } - - seq_putc(m, '\n'); - - return 0; -} - -static int synth_event_show(struct seq_file *m, struct dyn_event *ev) -{ - struct synth_event *event = to_synth_event(ev); - - seq_printf(m, "s:%s/", event->class.system); - - return __synth_event_show(m, event); -} - -static int synth_events_seq_show(struct seq_file *m, void *v) -{ - struct dyn_event *ev = v; - - if (!is_synth_event(ev)) - return 0; - - return __synth_event_show(m, to_synth_event(ev)); -} - -static const struct seq_operations synth_events_seq_op = { - .start = dyn_event_seq_start, - .next = dyn_event_seq_next, - .stop = dyn_event_seq_stop, - .show = synth_events_seq_show, -}; - -static int synth_events_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = security_locked_down(LOCKDOWN_TRACEFS); - if (ret) - return ret; - - if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = dyn_events_release_all(&synth_event_ops); - if (ret < 0) - return ret; - } - - return seq_open(file, &synth_events_seq_op); -} - -static ssize_t synth_events_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - return trace_parse_run_command(file, buffer, count, ppos, - create_or_delete_synth_event); -} - -static const struct file_operations synth_events_fops = { - .open = synth_events_open, - .write = synth_events_write, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static u64 hist_field_timestamp(struct hist_field *hist_field, struct tracing_map_elt *elt, struct ring_buffer_event *rbe, @@ -7653,37 +5904,3 @@ __init int register_trigger_hist_enable_disable_cmds(void) return ret; } - -static __init int trace_events_hist_init(void) -{ - struct dentry *entry = NULL; - struct dentry *d_tracer; - int err = 0; - - err = dyn_event_register(&synth_event_ops); - if (err) { - pr_warn("Could not register synth_event_ops\n"); - return err; - } - - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) { - err = PTR_ERR(d_tracer); - goto err; - } - - entry = tracefs_create_file("synthetic_events", 0644, d_tracer, - NULL, &synth_events_fops); - if (!entry) { - err = -ENODEV; - goto err; - } - - return err; - err: - pr_warn("Could not create tracefs 'synthetic_events' entry\n"); - - return err; -} - -fs_initcall(trace_events_hist_init); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c new file mode 100644 index 000000000000..c6cca0d1d584 --- /dev/null +++ b/kernel/trace/trace_events_synth.c @@ -0,0 +1,1789 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_synth - synthetic trace events + * + * Copyright (C) 2015, 2020 Tom Zanussi + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* for gfp flag names */ +#include +#include + +#include "trace_synth.h" + +static int create_synth_event(int argc, const char **argv); +static int synth_event_show(struct seq_file *m, struct dyn_event *ev); +static int synth_event_release(struct dyn_event *ev); +static bool synth_event_is_busy(struct dyn_event *ev); +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations synth_event_ops = { + .create = create_synth_event, + .show = synth_event_show, + .is_busy = synth_event_is_busy, + .free = synth_event_release, + .match = synth_event_match, +}; + +static bool is_synth_event(struct dyn_event *ev) +{ + return ev->ops == &synth_event_ops; +} + +static struct synth_event *to_synth_event(struct dyn_event *ev) +{ + return container_of(ev, struct synth_event, devent); +} + +static bool synth_event_is_busy(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + return event->ref != 0; +} + +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct synth_event *sev = to_synth_event(ev); + + return strcmp(sev->name, event) == 0 && + (!system || strcmp(system, SYNTH_SYSTEM) == 0); +} + +struct synth_trace_event { + struct trace_entry ent; + u64 fields[]; +}; + +static int synth_event_define_fields(struct trace_event_call *call) +{ + struct synth_trace_event trace; + int offset = offsetof(typeof(trace), fields); + struct synth_event *event = call->data; + unsigned int i, size, n_u64; + char *name, *type; + bool is_signed; + int ret = 0; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + size = event->fields[i]->size; + is_signed = event->fields[i]->is_signed; + type = event->fields[i]->type; + name = event->fields[i]->name; + ret = trace_define_field(call, type, name, offset, size, + is_signed, FILTER_OTHER); + if (ret) + break; + + event->fields[i]->offset = n_u64; + + if (event->fields[i]->is_string) { + offset += STR_VAR_LEN_MAX; + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + offset += sizeof(u64); + n_u64++; + } + } + + event->n_u64 = n_u64; + + return ret; +} + +static bool synth_field_signed(char *type) +{ + if (str_has_prefix(type, "u")) + return false; + if (strcmp(type, "gfp_t") == 0) + return false; + + return true; +} + +static int synth_field_is_string(char *type) +{ + if (strstr(type, "char[") != NULL) + return true; + + return false; +} + +static int synth_field_string_size(char *type) +{ + char buf[4], *end, *start; + unsigned int len; + int size, err; + + start = strstr(type, "char["); + if (start == NULL) + return -EINVAL; + start += sizeof("char[") - 1; + + end = strchr(type, ']'); + if (!end || end < start) + return -EINVAL; + + len = end - start; + if (len > 3) + return -EINVAL; + + strncpy(buf, start, len); + buf[len] = '\0'; + + err = kstrtouint(buf, 0, &size); + if (err) + return err; + + if (size > STR_VAR_LEN_MAX) + return -EINVAL; + + return size; +} + +static int synth_field_size(char *type) +{ + int size = 0; + + if (strcmp(type, "s64") == 0) + size = sizeof(s64); + else if (strcmp(type, "u64") == 0) + size = sizeof(u64); + else if (strcmp(type, "s32") == 0) + size = sizeof(s32); + else if (strcmp(type, "u32") == 0) + size = sizeof(u32); + else if (strcmp(type, "s16") == 0) + size = sizeof(s16); + else if (strcmp(type, "u16") == 0) + size = sizeof(u16); + else if (strcmp(type, "s8") == 0) + size = sizeof(s8); + else if (strcmp(type, "u8") == 0) + size = sizeof(u8); + else if (strcmp(type, "char") == 0) + size = sizeof(char); + else if (strcmp(type, "unsigned char") == 0) + size = sizeof(unsigned char); + else if (strcmp(type, "int") == 0) + size = sizeof(int); + else if (strcmp(type, "unsigned int") == 0) + size = sizeof(unsigned int); + else if (strcmp(type, "long") == 0) + size = sizeof(long); + else if (strcmp(type, "unsigned long") == 0) + size = sizeof(unsigned long); + else if (strcmp(type, "pid_t") == 0) + size = sizeof(pid_t); + else if (strcmp(type, "gfp_t") == 0) + size = sizeof(gfp_t); + else if (synth_field_is_string(type)) + size = synth_field_string_size(type); + + return size; +} + +static const char *synth_field_fmt(char *type) +{ + const char *fmt = "%llu"; + + if (strcmp(type, "s64") == 0) + fmt = "%lld"; + else if (strcmp(type, "u64") == 0) + fmt = "%llu"; + else if (strcmp(type, "s32") == 0) + fmt = "%d"; + else if (strcmp(type, "u32") == 0) + fmt = "%u"; + else if (strcmp(type, "s16") == 0) + fmt = "%d"; + else if (strcmp(type, "u16") == 0) + fmt = "%u"; + else if (strcmp(type, "s8") == 0) + fmt = "%d"; + else if (strcmp(type, "u8") == 0) + fmt = "%u"; + else if (strcmp(type, "char") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned char") == 0) + fmt = "%u"; + else if (strcmp(type, "int") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned int") == 0) + fmt = "%u"; + else if (strcmp(type, "long") == 0) + fmt = "%ld"; + else if (strcmp(type, "unsigned long") == 0) + fmt = "%lu"; + else if (strcmp(type, "pid_t") == 0) + fmt = "%d"; + else if (strcmp(type, "gfp_t") == 0) + fmt = "%x"; + else if (synth_field_is_string(type)) + fmt = "%s"; + + return fmt; +} + +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + +static enum print_line_t print_synth_event(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct synth_trace_event *entry; + struct synth_event *se; + unsigned int i, n_u64; + char print_fmt[32]; + const char *fmt; + + entry = (struct synth_trace_event *)iter->ent; + se = container_of(event, struct synth_event, call.event); + + trace_seq_printf(s, "%s: ", se->name); + + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { + if (trace_seq_has_overflowed(s)) + goto end; + + fmt = synth_field_fmt(se->fields[i]->type); + + /* parameter types */ + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", fmt); + + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); + + /* parameter values */ + if (se->fields[i]->is_string) { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct trace_print_flags __flags[] = { + __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); + + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); + + if (strcmp(se->fields[i]->type, "gfp_t") == 0) { + trace_seq_puts(s, " ("); + trace_print_flags_seq(s, "|", + entry->fields[n_u64], + __flags); + trace_seq_putc(s, ')'); + } + n_u64++; + } + } +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static struct trace_event_functions synth_event_funcs = { + .trace = print_synth_event +}; + +static notrace void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct trace_buffer *buffer; + struct synth_event *event; + unsigned int i, n_u64, val_idx; + int fields_size = 0; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + buffer = trace_file->tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + goto out; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + val_idx = var_ref_idx[i]; + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)var_ref_vals[val_idx]; + char *str_field = (char *)&entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = event->fields[i]; + u64 val = var_ref_vals[val_idx]; + + switch (field->size) { + case 1: + *(u8 *)&entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&entry->fields[n_u64] = (u32)val; + break; + + default: + entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); +} + +static void free_synth_event_print_fmt(struct trace_event_call *call) +{ + if (call) { + kfree(call->print_fmt); + call->print_fmt = NULL; + } +} + +static int __set_synth_event_print_fmt(struct synth_event *event, + char *buf, int len) +{ + const char *fmt; + int pos = 0; + int i; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < event->n_fields; i++) { + fmt = synth_field_fmt(event->fields[i]->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", + event->fields[i]->name, fmt, + i == event->n_fields - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < event->n_fields; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_synth_event_print_fmt(struct trace_event_call *call) +{ + struct synth_event *event = call->data; + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_synth_event_print_fmt(event, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_synth_event_print_fmt(event, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_synth_field(struct synth_field *field) +{ + kfree(field->type); + kfree(field->name); + kfree(field); +} + +static struct synth_field *parse_synth_field(int argc, const char **argv, + int *consumed) +{ + struct synth_field *field; + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; + int len, ret = 0; + + if (field_type[0] == ';') + field_type++; + + if (!strcmp(field_type, "unsigned")) { + if (argc < 3) + return ERR_PTR(-EINVAL); + prefix = "unsigned "; + field_type = argv[1]; + field_name = argv[2]; + *consumed = 3; + } else { + field_name = argv[1]; + *consumed = 2; + } + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + return ERR_PTR(-ENOMEM); + + len = strlen(field_name); + array = strchr(field_name, '['); + if (array) + len -= strlen(array); + else if (field_name[len - 1] == ';') + len--; + + field->name = kmemdup_nul(field_name, len, GFP_KERNEL); + if (!field->name) { + ret = -ENOMEM; + goto free; + } + + if (field_type[0] == ';') + field_type++; + len = strlen(field_type) + 1; + if (array) + len += strlen(array); + if (prefix) + len += strlen(prefix); + + field->type = kzalloc(len, GFP_KERNEL); + if (!field->type) { + ret = -ENOMEM; + goto free; + } + if (prefix) + strcat(field->type, prefix); + strcat(field->type, field_type); + if (array) { + strcat(field->type, array); + if (field->type[len - 1] == ';') + field->type[len - 1] = '\0'; + } + + field->size = synth_field_size(field->type); + if (!field->size) { + ret = -EINVAL; + goto free; + } + + if (synth_field_is_string(field->type)) + field->is_string = true; + + field->is_signed = synth_field_signed(field->type); + + out: + return field; + free: + free_synth_field(field); + field = ERR_PTR(ret); + goto out; +} + +static void free_synth_tracepoint(struct tracepoint *tp) +{ + if (!tp) + return; + + kfree(tp->name); + kfree(tp); +} + +static struct tracepoint *alloc_synth_tracepoint(char *name) +{ + struct tracepoint *tp; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + tp->name = kstrdup(name, GFP_KERNEL); + if (!tp->name) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + + return tp; +} + +struct synth_event *find_synth_event(const char *name) +{ + struct dyn_event *pos; + struct synth_event *event; + + for_each_dyn_event(pos) { + if (!is_synth_event(pos)) + continue; + event = to_synth_event(pos); + if (strcmp(event->name, name) == 0) + return event; + } + + return NULL; +} + +static struct trace_event_fields synth_event_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = synth_event_define_fields }, + {} +}; + +static int register_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret = 0; + + event->call.class = &event->class; + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); + if (!event->class.system) { + ret = -ENOMEM; + goto out; + } + + event->tp = alloc_synth_tracepoint(event->name); + if (IS_ERR(event->tp)) { + ret = PTR_ERR(event->tp); + event->tp = NULL; + goto out; + } + + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &synth_event_funcs; + call->class->fields_array = synth_event_fields_array; + + ret = register_trace_event(&call->event); + if (!ret) { + ret = -ENODEV; + goto out; + } + call->flags = TRACE_EVENT_FL_TRACEPOINT; + call->class->reg = trace_event_reg; + call->class->probe = trace_event_raw_event_synth; + call->data = event; + call->tp = event->tp; + + ret = trace_add_event_call(call); + if (ret) { + pr_warn("Failed to register synthetic event: %s\n", + trace_event_name(call)); + goto err; + } + + ret = set_synth_event_print_fmt(call); + if (ret < 0) { + trace_remove_event_call(call); + goto err; + } + out: + return ret; + err: + unregister_trace_event(&call->event); + goto out; +} + +static int unregister_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret; + + ret = trace_remove_event_call(call); + + return ret; +} + +static void free_synth_event(struct synth_event *event) +{ + unsigned int i; + + if (!event) + return; + + for (i = 0; i < event->n_fields; i++) + free_synth_field(event->fields[i]); + + kfree(event->fields); + kfree(event->name); + kfree(event->class.system); + free_synth_tracepoint(event->tp); + free_synth_event_print_fmt(&event->call); + kfree(event); +} + +static struct synth_event *alloc_synth_event(const char *name, int n_fields, + struct synth_field **fields) +{ + struct synth_event *event; + unsigned int i; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) { + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->name = kstrdup(name, GFP_KERNEL); + if (!event->name) { + kfree(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + if (!event->fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + dyn_event_init(&event->devent, &synth_event_ops); + + for (i = 0; i < n_fields; i++) + event->fields[i] = fields[i]; + + event->n_fields = n_fields; + out: + return event; +} + +static int synth_event_check_arg_fn(void *data) +{ + struct dynevent_arg_pair *arg_pair = data; + int size; + + size = synth_field_size((char *)arg_pair->lhs); + + return size ? 0 : -EINVAL; +} + +/** + * synth_event_add_field - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type: The type of the new field to add + * @name: The name of the new field to add + * + * Add a new field to a synthetic event cmd object. Field ordering is in + * the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, + const char *name) +{ + struct dynevent_arg_pair arg_pair; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type || !name) + return -EINVAL; + + dynevent_arg_pair_init(&arg_pair, 0, ';'); + + arg_pair.lhs = type; + arg_pair.rhs = name; + + ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field); + +/** + * synth_event_add_field_str - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type_name: The type and name of the new field to add, as a single string + * + * Add a new field to a synthetic event cmd object, as a single + * string. The @type_name string is expected to be of the form 'type + * name', which will be appended by ';'. No sanity checking is done - + * what's passed in is assumed to already be well-formed. Field + * ordering is in the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) +{ + struct dynevent_arg arg; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type_name) + return -EINVAL; + + dynevent_arg_init(&arg, ';'); + + arg.str = type_name; + + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field_str); + +/** + * synth_event_add_fields - Add multiple fields to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Add a new set of fields to a synthetic event cmd object. The event + * fields that will be defined for the event should be passed in as an + * array of struct synth_field_desc, and the number of elements in the + * array passed in as n_fields. Field ordering will retain the + * ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_fields(struct dynevent_cmd *cmd, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_fields); + +/** + * __synth_event_gen_cmd_start - Start a synthetic event command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module + * @args: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the synth_event_gen_cmd_start() wrapper, which + * automatically adds a NULL to the end of the arg list. If this + * function is used directly, make sure the last arg in the variable + * arg list is NULL. + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * There should be an even number variable args, each pair consisting + * of a type followed by a field name. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, ...) +{ + struct dynevent_arg arg; + va_list args; + int ret; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + va_start(args, mod); + for (;;) { + const char *type, *name; + + type = va_arg(args, const char *); + if (!type) + break; + name = va_arg(args, const char *); + if (!name) + break; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, type, name); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); + +/** + * synth_event_gen_cmd_array_start - Start synthetic event command from an array + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * The event fields that will be defined for the event should be + * passed in as an array of struct synth_field_desc, and the number of + * elements in the array passed in as n_fields. Field ordering will + * retain the ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + struct dynevent_arg arg; + unsigned int i; + int ret = 0; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (n_fields > SYNTH_FIELDS_MAX) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) + return -EINVAL; + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); + +static int __create_synth_event(int argc, const char *name, const char **argv) +{ + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + struct synth_event *event = NULL; + int i, consumed = 0, n_fields = 0, ret = 0; + + /* + * Argument syntax: + * - Add synthetic event: field[;field] ... + * - Remove synthetic event: ! field[;field] ... + * where 'field' = type field_name + */ + + if (name[0] == '\0' || argc < 1) + return -EINVAL; + + mutex_lock(&event_mutex); + + event = find_synth_event(name); + if (event) { + ret = -EEXIST; + goto out; + } + + for (i = 0; i < argc - 1; i++) { + if (strcmp(argv[i], ";") == 0) + continue; + if (n_fields == SYNTH_FIELDS_MAX) { + ret = -EINVAL; + goto err; + } + + field = parse_synth_field(argc - i, &argv[i], &consumed); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto err; + } + fields[n_fields++] = field; + i += consumed - 1; + } + + if (i < argc && strcmp(argv[i], ";") != 0) { + ret = -EINVAL; + goto err; + } + + event = alloc_synth_event(name, n_fields, fields); + if (IS_ERR(event)) { + ret = PTR_ERR(event); + event = NULL; + goto err; + } + ret = register_synth_event(event); + if (!ret) + dyn_event_add(&event->devent); + else + free_synth_event(event); + out: + mutex_unlock(&event_mutex); + + return ret; + err: + for (i = 0; i < n_fields; i++) + free_synth_field(fields[i]); + + goto out; +} + +/** + * synth_event_create - Create a new synthetic event + * @name: The name of the new sythetic event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * @mod: The module creating the event, NULL if not created from a module + * + * Create a new synthetic event with the given name under the + * trace/events/synthetic/ directory. The event fields that will be + * defined for the event should be passed in as an array of struct + * synth_field_desc, and the number elements in the array passed in as + * n_fields. Field ordering will retain the ordering given in the + * fields array. + * + * If the new synthetic event is being created from a module, the mod + * param must be non-NULL. This will ensure that the trace buffer + * won't contain unreadable events. + * + * The new synth event should be deleted using synth_event_delete() + * function. The new synthetic event can be generated from modules or + * other kernel code using trace_synth_event() and related functions. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_create(const char *name, struct synth_field_desc *fields, + unsigned int n_fields, struct module *mod) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + ret = synth_event_gen_cmd_array_start(&cmd, name, mod, + fields, n_fields); + if (ret) + goto out; + + ret = synth_event_gen_cmd_end(&cmd); + out: + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_create); + +static int destroy_synth_event(struct synth_event *se) +{ + int ret; + + if (se->ref) + ret = -EBUSY; + else { + ret = unregister_synth_event(se); + if (!ret) { + dyn_event_remove(&se->devent); + free_synth_event(se); + } + } + + return ret; +} + +/** + * synth_event_delete - Delete a synthetic event + * @event_name: The name of the new sythetic event + * + * Delete a synthetic event that was created with synth_event_create(). + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_delete(const char *event_name) +{ + struct synth_event *se = NULL; + struct module *mod = NULL; + int ret = -ENOENT; + + mutex_lock(&event_mutex); + se = find_synth_event(event_name); + if (se) { + mod = se->mod; + ret = destroy_synth_event(se); + } + mutex_unlock(&event_mutex); + + if (mod) { + mutex_lock(&trace_types_lock); + /* + * It is safest to reset the ring buffer if the module + * being unloaded registered any events that were + * used. The only worry is if a new module gets + * loaded, and takes on the same id as the events of + * this module. When printing out the buffer, traced + * events left over from this module may be passed to + * the new module events and unexpected results may + * occur. + */ + tracing_reset_all_online_cpus(); + mutex_unlock(&trace_types_lock); + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_delete); + +static int create_or_delete_synth_event(int argc, char **argv) +{ + const char *name = argv[0]; + int ret; + + /* trace_run_command() ensures argc != 0 */ + if (name[0] == '!') { + ret = synth_event_delete(name + 1); + return ret; + } + + ret = __create_synth_event(argc - 1, name, (const char **)argv + 1); + return ret == -ECANCELED ? -EINVAL : ret; +} + +static int synth_event_run_command(struct dynevent_cmd *cmd) +{ + struct synth_event *se; + int ret; + + ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event); + if (ret) + return ret; + + se = find_synth_event(cmd->event_name); + if (WARN_ON(!se)) + return -ENOENT; + + se->mod = cmd->private_data; + + return ret; +} + +/** + * synth_event_cmd_init - Initialize a synthetic event command object + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @buf: A pointer to the buffer used to build the command + * @maxlen: The length of the buffer passed in @buf + * + * Initialize a synthetic event command object. Use this before + * calling any of the other dyenvent_cmd functions. + */ +void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) +{ + dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, + synth_event_run_command); +} +EXPORT_SYMBOL_GPL(synth_event_cmd_init); + +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int entry_size, fields_size = 0; + int ret = 0; + + memset(trace_state, '\0', sizeof(*trace_state)); + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } +out: + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + +/** + * synth_event_trace - Trace a synthetic event + * @file: The trace_event_file representing the synthetic event + * @n_vals: The number of values in vals + * @args: Variable number of args containing the event values + * + * Trace a synthetic event using the values passed in the variable + * argument list. + * + * The argument list should be a list 'n_vals' u64 values. The number + * of vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) +{ + struct synth_event_trace_state state; + unsigned int i, n_u64; + va_list args; + int ret; + + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + va_start(args, n_vals); + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + u64 val; + + val = va_arg(args, u64); + + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)val; + char *str_field = (char *)&state.entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } + va_end(args); +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace); + +/** + * synth_event_trace_array - Trace a synthetic event from an array + * @file: The trace_event_file representing the synthetic event + * @vals: Array of values + * @n_vals: The number of values in vals + * + * Trace a synthetic event using the values passed in as 'vals'. + * + * The 'vals' array is just an array of 'n_vals' u64. The number of + * vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_array(struct trace_event_file *file, u64 *vals, + unsigned int n_vals) +{ + struct synth_event_trace_state state; + unsigned int i, n_u64; + int ret; + + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)vals[i]; + char *str_field = (char *)&state.entry->fields[n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } else { + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } + n_u64++; + } + } +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_array); + +/** + * synth_event_trace_start - Start piecewise synthetic event trace + * @file: The trace_event_file representing the synthetic event + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Start the trace of a synthetic event field-by-field rather than all + * at once. + * + * This function 'opens' an event trace, which means space is reserved + * for the event in the trace buffer, after which the event's + * individual field values can be set through either + * synth_event_add_next_val() or synth_event_add_val(). + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state until the event trace is + * closed (and the event finally traced) using + * synth_event_trace_end(). + * + * Note that synth_event_trace_end() must be called after all values + * have been added for each event trace, regardless of whether adding + * all field values succeeded or not. + * + * Note also that for a given event trace, all fields must be added + * using either synth_event_add_next_val() or synth_event_add_val() + * but not both together or interleaved. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int ret; + + if (!trace_state) + return -EINVAL; + + ret = __synth_event_trace_start(file, trace_state); + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_start); + +static int __synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + struct synth_field *field = NULL; + struct synth_trace_event *entry; + struct synth_event *event; + int i, ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + /* can't mix add_next_synth_val() with add_synth_val() */ + if (field_name) { + if (trace_state->add_next) { + ret = -EINVAL; + goto out; + } + trace_state->add_name = true; + } else { + if (trace_state->add_name) { + ret = -EINVAL; + goto out; + } + trace_state->add_next = true; + } + + if (trace_state->disabled) + goto out; + + event = trace_state->event; + if (trace_state->add_name) { + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + if (strcmp(field->name, field_name) == 0) + break; + } + if (!field) { + ret = -EINVAL; + goto out; + } + } else { + if (trace_state->cur_field >= event->n_fields) { + ret = -EINVAL; + goto out; + } + field = event->fields[trace_state->cur_field++]; + } + + entry = trace_state->entry; + if (field->is_string) { + char *str_val = (char *)(long)val; + char *str_field; + + if (!str_val) { + ret = -EINVAL; + goto out; + } + + str_field = (char *)&entry->fields[field->offset]; + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } + out: + return ret; +} + +/** + * synth_event_add_next_val - Add the next field's value to an open synth trace + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the next field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function assumes all the fields in an event are to be set one + * after another - successive calls to this function are made, one for + * each field, in the order of the fields in the event, until all + * fields have been set. If you'd rather set each field individually + * without regard to ordering, synth_event_add_val() can be used + * instead. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_next_val(u64 val, + struct synth_event_trace_state *trace_state) +{ + return __synth_event_add_val(NULL, val, trace_state); +} +EXPORT_SYMBOL_GPL(synth_event_add_next_val); + +/** + * synth_event_add_val - Add a named field's value to an open synth trace + * @field_name: The name of the synthetic event field value to set + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the named field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function looks up the field name, and if found, sets the field + * to the specified value. This lookup makes this function more + * expensive than synth_event_add_next_val(), so use that or the + * none-piecewise synth_event_trace() instead if efficiency is more + * important. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + return __synth_event_add_val(field_name, val, trace_state); +} +EXPORT_SYMBOL_GPL(synth_event_add_val); + +/** + * synth_event_trace_end - End piecewise synthetic event trace + * @trace_state: A pointer to object tracking the piecewise trace state + * + * End the trace of a synthetic event opened by + * synth_event_trace__start(). + * + * This function 'closes' an event trace, which basically means that + * it commits the reserved event and cleans up other loose ends. + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state opened with + * synth_event_trace_start(). + * + * Note that this function must be called after all values have been + * added for each event trace, regardless of whether adding all field + * values succeeded or not. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + if (!trace_state) + return -EINVAL; + + __synth_event_trace_end(trace_state); + + return 0; +} +EXPORT_SYMBOL_GPL(synth_event_trace_end); + +static int create_synth_event(int argc, const char **argv) +{ + const char *name = argv[0]; + int len; + + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + + /* This interface accepts group name prefix */ + if (strchr(name, '/')) { + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) + return -EINVAL; + name += len; + } + return __create_synth_event(argc - 1, name, argv + 1); +} + +static int synth_event_release(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + int ret; + + if (event->ref) + return -EBUSY; + + ret = unregister_synth_event(event); + if (ret) + return ret; + + dyn_event_remove(ev); + free_synth_event(event); + return 0; +} + +static int __synth_event_show(struct seq_file *m, struct synth_event *event) +{ + struct synth_field *field; + unsigned int i; + + seq_printf(m, "%s\t", event->name); + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + + /* parameter values */ + seq_printf(m, "%s %s%s", field->type, field->name, + i == event->n_fields - 1 ? "" : "; "); + } + + seq_putc(m, '\n'); + + return 0; +} + +static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + seq_printf(m, "s:%s/", event->class.system); + + return __synth_event_show(m, event); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_synth_event(ev)) + return 0; + + return __synth_event_show(m, to_synth_event(ev)); +} + +static const struct seq_operations synth_events_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = synth_events_seq_show, +}; + +static int synth_events_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(&synth_event_ops); + if (ret < 0) + return ret; + } + + return seq_open(file, &synth_events_seq_op); +} + +static ssize_t synth_events_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_synth_event); +} + +static const struct file_operations synth_events_fops = { + .open = synth_events_open, + .write = synth_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int trace_events_synth_init(void) +{ + struct dentry *entry = NULL; + struct dentry *d_tracer; + int err = 0; + + err = dyn_event_register(&synth_event_ops); + if (err) { + pr_warn("Could not register synth_event_ops\n"); + return err; + } + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) { + err = PTR_ERR(d_tracer); + goto err; + } + + entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + NULL, &synth_events_fops); + if (!entry) { + err = -ENODEV; + goto err; + } + + return err; + err: + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); + + return err; +} + +fs_initcall(trace_events_synth_init); diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h new file mode 100644 index 000000000000..ac35c45207c4 --- /dev/null +++ b/kernel/trace/trace_synth.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_SYNTH_H +#define __TRACE_SYNTH_H + +#include "trace_dynevent.h" + +#define SYNTH_SYSTEM "synthetic" +#define SYNTH_FIELDS_MAX 32 + +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ + +struct synth_field { + char *type; + char *name; + size_t size; + unsigned int offset; + bool is_signed; + bool is_string; +}; + +struct synth_event { + struct dyn_event devent; + int ref; + char *name; + struct synth_field **fields; + unsigned int n_fields; + unsigned int n_u64; + struct trace_event_class class; + struct trace_event_call call; + struct tracepoint *tp; + struct module *mod; +}; + +extern struct synth_event *find_synth_event(const char *name); + +#endif /* __TRACE_SYNTH_H */ From bea24f766efceb5322f704015c56596ff94642f7 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 28 May 2020 14:32:38 -0500 Subject: [PATCH 304/574] selftests/ftrace: Distinguish between hist and synthetic event checks With synthetic events now a separate config item as a result of 'tracing: Move synthetic events to a separate file', tests that use both need to explicitly check for hist trigger support rather than relying on hist triggers to pull in synthetic events. Add an additional hist trigger check to all the trigger tests that now require it, otherwise they'll fail if synthetic events but not hist triggers are enabled. Link: http://lkml.kernel.org/r/af36c539006ef2768114b4ed38e6b054f7c7a3bd.1590693308.git.zanussi@kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- .../trigger/inter-event/trigger-field-variable-support.tc | 5 +++++ .../trigger/inter-event/trigger-inter-event-combined-hist.tc | 5 +++++ .../trigger/inter-event/trigger-multi-actions-accept.tc | 5 +++++ .../trigger/inter-event/trigger-onmatch-action-hist.tc | 5 +++++ .../trigger/inter-event/trigger-onmatch-onmax-action-hist.tc | 5 +++++ .../test.d/trigger/inter-event/trigger-onmax-action-hist.tc | 5 +++++ .../trigger/inter-event/trigger-snapshot-action-hist.tc | 5 +++++ .../test.d/trigger/inter-event/trigger-trace-action-hist.tc | 5 +++++ 8 files changed, 40 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc index 77be6e1f6e7b..e232059a8ab2 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test field variable support" echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc index f3eb8aacec0e..07cfcb8157b6 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test create synthetic event" echo 'waking_latency u64 lat pid_t pid' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc index d281f056f980..73e413c2ca26 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test multiple actions on hist trigger" echo 'wakeup_latency u64 lat; pid_t pid' >> synthetic_events TRIGGER1=events/sched/sched_wakeup/trigger diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc index a708f0e7858a..ebe0ad827f9f 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test create synthetic event" echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc index dfce6932d8be..2a2ef767249e 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test create synthetic event" echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc index 0035995c2194..98d73bfb0296 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + echo "Test create synthetic event" echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc index f546c1b66a9b..01b01b9c4e07 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc @@ -12,6 +12,11 @@ if [ ! -f set_event ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + if [ ! -f snapshot ]; then echo "snapshot is not supported" exit_unsupported diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc index 8021d60aafec..c3baa486aeb4 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc @@ -17,6 +17,11 @@ if [ ! -f synthetic_events ]; then exit_unsupported fi +if [ ! -f events/sched/sched_process_fork/hist ]; then + echo "hist trigger is not supported" + exit_unsupported +fi + grep -q "trace(" README || exit_unsupported # version issue echo "Test create synthetic event" From 58f6e384480ec97b902e44399a44862907840ba9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 May 2020 16:52:40 +0200 Subject: [PATCH 305/574] ftrace,bug: Improve traceoff_on_warn While doing some tracing, I found a huge portion of the per-cpu buffer was taken by printk/serial output because we're disabling the trace far too late (after printing the CUT string). Improve matters for architectures that have GENERIC_BUG + _BUG_FLAGS by killing the tracer in the exception handler before printing anything much. Link: https://lkml.kernel.org/r/20200528145240.GF706495@hirez.programming.kicks-ass.net Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- lib/bug.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/bug.c b/lib/bug.c index 8c98af0bf585..7103440c0ee1 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -47,6 +47,7 @@ #include #include #include +#include extern struct bug_entry __start___bug_table[], __stop___bug_table[]; @@ -153,6 +154,8 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (!bug) return BUG_TRAP_TYPE_NONE; + disable_trace_on_warning(); + file = NULL; line = 0; warning = 0; From c200784a08d4ea82f82a30678955b7f2c7550af4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 29 May 2020 10:46:32 -0400 Subject: [PATCH 306/574] tracing: Add a trace print when traceoff_on_warning is triggered When "traceoff_on_warning" is enabled and a warning happens, there can still be many trace events happening on other CPUs between the time the warning occurred and the last trace event on that same CPU. This can cause confusion in examining the trace, as it may not be obvious where the warning happened. By adding a trace print into the trace just before disabling tracing, it makes it obvious where the warning occurred, and the developer doesn't have to look at other means to see what CPU it occurred on. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..760fd102dbe2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1299,8 +1299,11 @@ EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { - if (__disable_trace_on_warning) + if (__disable_trace_on_warning) { + trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, + "Disabling tracing due to warning\n"); tracing_off(); + } } /** From 01397e822af42f8485e876ba9d1309b63646d886 Mon Sep 17 00:00:00 2001 From: Vitor Massaru Iha Date: Fri, 29 May 2020 16:28:45 -0300 Subject: [PATCH 307/574] kunit: Fix TabError, remove defconfig code and handle when there is no kunitconfig The identation before this code (`if not os.path.exists(cli_args.build_dir):``) was with spaces instead of tabs after fixed up merge conflits, this commit revert spaces to tabs: [iha@bbking linux]$ tools/testing/kunit/kunit.py run File "tools/testing/kunit/kunit.py", line 247 if not linux: ^ TabError: inconsistent use of tabs and spaces in indentation [iha@bbking linux]$ tools/testing/kunit/kunit.py run Traceback (most recent call last): File "tools/testing/kunit/kunit.py", line 338, in main(sys.argv[1:]) File "tools/testing/kunit/kunit.py", line 215, in main add_config_opts(config_parser) [iha@bbking linux]$ tools/testing/kunit/kunit.py run Traceback (most recent call last): File "tools/testing/kunit/kunit.py", line 337, in main(sys.argv[1:]) File "tools/testing/kunit/kunit.py", line 255, in main result = run_tests(linux, request) File "tools/testing/kunit/kunit.py", line 133, in run_tests request.defconfig, AttributeError: 'KunitRequest' object has no attribute 'defconfig' Handles when there is no .kunitconfig, the error due to merge conflicts between the following: commit 9bdf64b35117 ("kunit: use KUnit defconfig by default") commit 45ba7a893ad8 ("kunit: kunit_tool: Separate out config/build/exec/parse") [iha@bbking linux]$ tools/testing/kunit/kunit.py run Traceback (most recent call last): File "tools/testing/kunit/kunit.py", line 335, in main(sys.argv[1:]) File "tools/testing/kunit/kunit.py", line 246, in main linux = kunit_kernel.LinuxSourceTree() File "../tools/testing/kunit/kunit_kernel.py", line 109, in __init__ self._kconfig.read_from_file(kunitconfig_path) File "t../ools/testing/kunit/kunit_config.py", line 88, in read_from_file with open(path, 'r') as f: FileNotFoundError: [Errno 2] No such file or directory: '.kunit/.kunitconfig' Signed-off-by: Vitor Massaru Iha Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index ec73b07d1265..787b6d4ad716 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -23,7 +23,7 @@ import kunit_parser KunitResult = namedtuple('KunitResult', ['status','result','elapsed_time']) KunitConfigRequest = namedtuple('KunitConfigRequest', - ['build_dir', 'defconfig', 'make_options']) + ['build_dir', 'make_options']) KunitBuildRequest = namedtuple('KunitBuildRequest', ['jobs', 'build_dir', 'alltests', 'make_options']) @@ -130,7 +130,6 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree, run_start = time.time() config_request = KunitConfigRequest(request.build_dir, - request.defconfig, request.make_options) config_result = config_tests(linux, config_request) if config_result.status != KunitStatus.SUCCESS: @@ -212,7 +211,6 @@ def main(argv, linux=None): help='Ensures that .config contains all of ' 'the options in .kunitconfig') add_common_opts(config_parser) - add_config_opts(config_parser) build_parser = subparser.add_parser('build', help='Builds a kernel with KUnit tests') add_common_opts(build_parser) @@ -238,11 +236,14 @@ def main(argv, linux=None): cli_args = parser.parse_args(argv) if cli_args.subcommand == 'run': - if not os.path.exists(cli_args.build_dir): - os.mkdir(cli_args.build_dir) - kunit_kernel.kunitconfig_path = os.path.join( - cli_args.build_dir, - kunit_kernel.kunitconfig_path) + if not os.path.exists(cli_args.build_dir): + os.mkdir(cli_args.build_dir) + kunit_kernel.kunitconfig_path = os.path.join( + cli_args.build_dir, + kunit_kernel.kunitconfig_path) + + if not os.path.exists(kunit_kernel.kunitconfig_path): + create_default_kunitconfig() if not linux: linux = kunit_kernel.LinuxSourceTree() @@ -264,11 +265,13 @@ def main(argv, linux=None): cli_args.build_dir, kunit_kernel.kunitconfig_path) + if not os.path.exists(kunit_kernel.kunitconfig_path): + create_default_kunitconfig() + if not linux: linux = kunit_kernel.LinuxSourceTree() request = KunitConfigRequest(cli_args.build_dir, - cli_args.defconfig, cli_args.make_options) result = config_tests(linux, request) kunit_parser.print_with_timestamp(( @@ -284,6 +287,9 @@ def main(argv, linux=None): cli_args.build_dir, kunit_kernel.kunitconfig_path) + if not os.path.exists(kunit_kernel.kunitconfig_path): + create_default_kunitconfig() + if not linux: linux = kunit_kernel.LinuxSourceTree() @@ -305,6 +311,9 @@ def main(argv, linux=None): cli_args.build_dir, kunit_kernel.kunitconfig_path) + if not os.path.exists(kunit_kernel.kunitconfig_path): + create_default_kunitconfig() + if not linux: linux = kunit_kernel.LinuxSourceTree() From 92238b31bd054cc2a046df96507575f3f10c277f Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 11 May 2020 15:14:20 +0200 Subject: [PATCH 308/574] kunit: Kconfig: enable a KUNIT_ALL_TESTS fragment Make it easier to enable all KUnit fragments. This is useful for kernel devs or testers, so its easy to get all KUnit tests enabled and if new gets added they will be enabled as well. Fragments that has to be builtin will be missed if CONFIG_KUNIT_ALL_TESTS is set as a module. Signed-off-by: Anders Roxell Reviewed-by: Brendan Higgins Signed-off-by: Shuah Khan --- lib/kunit/Kconfig | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index 95d12e3d6d95..bdeee7639005 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -41,4 +41,18 @@ config KUNIT_EXAMPLE_TEST is intended for curious hackers who would like to understand how to use KUnit for kernel development. +config KUNIT_ALL_TESTS + tristate "All KUnit tests with satisfied dependencies" + help + Enables all KUnit tests, if they can be enabled. + KUnit tests run during boot and output the results to the debug log + in TAP format (http://testanything.org/). Only useful for kernel devs + running the KUnit test harness, and not intended for inclusion into a + production build. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + endif # KUNIT From beaed42c427dbe8fedba4518b2baf5203f5e398b Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 11 May 2020 15:14:25 +0200 Subject: [PATCH 309/574] kunit: default KUNIT_* fragments to KUNIT_ALL_TESTS This makes it easier to enable all KUnit fragments. Adding 'if !KUNIT_ALL_TESTS' so individual tests can not be turned off. Therefore if KUNIT_ALL_TESTS is enabled that will hide the prompt in menuconfig. Reviewed-by: David Gow Signed-off-by: Anders Roxell Signed-off-by: Shuah Khan --- lib/kunit/Kconfig | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index bdeee7639005..00909e6a2443 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -15,7 +15,8 @@ menuconfig KUNIT if KUNIT config KUNIT_DEBUGFS - bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" + bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS help Enable debugfs representation for kunit. Currently this consists of /sys/kernel/debug/kunit//results files for each @@ -23,7 +24,8 @@ config KUNIT_DEBUGFS run that occurred. config KUNIT_TEST - tristate "KUnit test for KUnit" + tristate "KUnit test for KUnit" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS help Enables the unit tests for the KUnit test framework. These tests test the KUnit test framework itself; the tests are both written using @@ -32,7 +34,8 @@ config KUNIT_TEST expected. config KUNIT_EXAMPLE_TEST - tristate "Example test for KUnit" + tristate "Example test for KUnit" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS help Enables an example unit test that illustrates some of the basic features of KUnit. This test only exists to help new users understand From 5f215aab4ea0cf39e9fcdb721733a7814f3a3a50 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 11 May 2020 15:14:29 +0200 Subject: [PATCH 310/574] lib: Kconfig.debug: default KUNIT_* fragments to KUNIT_ALL_TESTS This makes it easier to enable all KUnit fragments. Adding 'if !KUNIT_ALL_TESTS' so individual tests can not be turned off. Therefore if KUNIT_ALL_TESTS is enabled that will hide the prompt in menuconfig. Reviewed-by: David Gow Signed-off-by: Anders Roxell Signed-off-by: Shuah Khan --- lib/Kconfig.debug | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 21d9c5f6e7ec..1f4ab7a2bdee 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2064,8 +2064,9 @@ config TEST_SYSCTL If unsure, say N. config SYSCTL_KUNIT_TEST - tristate "KUnit test for sysctl" + tristate "KUnit test for sysctl" if !KUNIT_ALL_TESTS depends on KUNIT + default KUNIT_ALL_TESTS help This builds the proc sysctl unit test, which runs on boot. Tests the API contract and implementation correctness of sysctl. @@ -2075,8 +2076,9 @@ config SYSCTL_KUNIT_TEST If unsure, say N. config LIST_KUNIT_TEST - tristate "KUnit Test for Kernel Linked-list structures" + tristate "KUnit Test for Kernel Linked-list structures" if !KUNIT_ALL_TESTS depends on KUNIT + default KUNIT_ALL_TESTS help This builds the linked list KUnit test suite. It tests that the API and basic functionality of the list_head type From bebe94b53eb7ee28f436f7b45324e37713690c0f Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 11 May 2020 15:14:33 +0200 Subject: [PATCH 311/574] drivers: base: default KUNIT_* fragments to KUNIT_ALL_TESTS This makes it easier to enable all KUnit fragments. Adding 'if !KUNIT_ALL_TESTS' so individual tests can not be turned off. Therefore if KUNIT_ALL_TESTS is enabled that will hide the prompt in menuconfig. Reviewed-by: David Gow Signed-off-by: Anders Roxell Signed-off-by: Shuah Khan --- drivers/base/Kconfig | 3 ++- drivers/base/test/Kconfig | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 5f0bc74d2409..8d7001712062 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -149,8 +149,9 @@ config DEBUG_TEST_DRIVER_REMOVE test this functionality. config PM_QOS_KUNIT_TEST - bool "KUnit Test for PM QoS features" + bool "KUnit Test for PM QoS features" if !KUNIT_ALL_TESTS depends on KUNIT=y + default KUNIT_ALL_TESTS config HMEM_REPORTING bool diff --git a/drivers/base/test/Kconfig b/drivers/base/test/Kconfig index 305c7751184a..ba225eb1b761 100644 --- a/drivers/base/test/Kconfig +++ b/drivers/base/test/Kconfig @@ -9,5 +9,6 @@ config TEST_ASYNC_DRIVER_PROBE If unsure say N. config KUNIT_DRIVER_PE_TEST - bool "KUnit Tests for property entry API" + bool "KUnit Tests for property entry API" if !KUNIT_ALL_TESTS depends on KUNIT=y + default KUNIT_ALL_TESTS From d194e12b3ed3c4799951072271f001cbd104b5e9 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 11 May 2020 15:14:38 +0200 Subject: [PATCH 312/574] fs: ext4: default KUNIT_* fragments to KUNIT_ALL_TESTS This makes it easier to enable all KUnit fragments. Adding 'if !KUNIT_ALL_TESTS' so individual tests can not be turned off. Therefore if KUNIT_ALL_TESTS is enabled that will hide the prompt in menuconfig. Reviewed-by: David Gow Signed-off-by: Anders Roxell Signed-off-by: Shuah Khan --- fs/ext4/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 2a592e38cdfe..bc7815158503 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -103,9 +103,10 @@ config EXT4_DEBUG echo 1 > /sys/module/ext4/parameters/mballoc_debug config EXT4_KUNIT_TESTS - tristate "KUnit tests for ext4" + tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS select EXT4_FS depends on KUNIT + default KUNIT_ALL_TESTS help This builds the ext4 KUnit tests. From 6d6861d45e38d42a7df9db244c871ee3856acf57 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 11 May 2020 15:14:42 +0200 Subject: [PATCH 313/574] security: apparmor: default KUNIT_* fragments to KUNIT_ALL_TESTS This makes it easier to enable all KUnit fragments. Adding 'if !KUNIT_ALL_TESTS' so individual tests can not be turned off. Therefore if KUNIT_ALL_TESTS is enabled that will hide the prompt in menuconfig. Reviewed-by: David Gow Signed-off-by: Anders Roxell Acked-by: John Johansen Signed-off-by: Shuah Khan --- security/apparmor/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/apparmor/Kconfig b/security/apparmor/Kconfig index 0fe336860773..03fae1bd48a6 100644 --- a/security/apparmor/Kconfig +++ b/security/apparmor/Kconfig @@ -70,8 +70,9 @@ config SECURITY_APPARMOR_DEBUG_MESSAGES the kernel message buffer. config SECURITY_APPARMOR_KUNIT_TEST - bool "Build KUnit tests for policy_unpack.c" + bool "Build KUnit tests for policy_unpack.c" if !KUNIT_ALL_TESTS depends on KUNIT=y && SECURITY_APPARMOR + default KUNIT_ALL_TESTS help This builds the AppArmor KUnit tests. From d3798acc094c8ff2406e9acc7a9b2c09da994616 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 May 2020 20:31:37 +0200 Subject: [PATCH 314/574] libceph: support for alloc hint flags Allow indicating future I/O pattern via flags. This is supported since Kraken (and bluestore persists flags together with expected_object_size and expected_write_size). Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 3 ++- include/linux/ceph/osd_client.h | 4 +++- include/linux/ceph/rados.h | 14 ++++++++++++++ net/ceph/osd_client.c | 8 +++++++- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 67d65ac785e9..97e102ea03e0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2253,7 +2253,8 @@ static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) { osd_req_op_alloc_hint_init(osd_req, which++, rbd_dev->layout.object_size, - rbd_dev->layout.object_size); + rbd_dev->layout.object_size, + 0); } if (rbd_obj_is_entire(obj_req)) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 671fb93e8c60..c60b59e9291b 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -136,6 +136,7 @@ struct ceph_osd_req_op { struct { u64 expected_object_size; u64 expected_write_size; + u32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ } alloc_hint; struct { u64 snapid; @@ -472,7 +473,8 @@ extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, - u64 expected_write_size); + u64 expected_write_size, + u32 flags); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 88ed3c5c04c5..3a518fd0eaad 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -464,6 +464,19 @@ enum { const char *ceph_osd_watch_op_name(int o); +enum { + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8, + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32, + CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64, + CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128, + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; + enum { CEPH_OSD_BACKOFF_OP_BLOCK = 1, CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, @@ -517,6 +530,7 @@ struct ceph_osd_op { struct { __le64 expected_object_size; __le64 expected_write_size; + __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ } __attribute__ ((packed)) alloc_hint; struct { __le64 snapid; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 22733e844be1..4fea3c33af2a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -932,10 +932,14 @@ static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, op->watch.gen = 0; } +/* + * @flags: CEPH_OSD_OP_ALLOC_HINT_FLAG_* + */ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, - u64 expected_write_size) + u64 expected_write_size, + u32 flags) { struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, @@ -943,6 +947,7 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, op->alloc_hint.expected_object_size = expected_object_size; op->alloc_hint.expected_write_size = expected_write_size; + op->alloc_hint.flags = flags; /* * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed @@ -1018,6 +1023,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, cpu_to_le64(src->alloc_hint.expected_object_size); dst->alloc_hint.expected_write_size = cpu_to_le64(src->alloc_hint.expected_write_size); + dst->alloc_hint.flags = cpu_to_le32(src->alloc_hint.flags); break; case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: From dc1dad8e1a612650b1e786e992cb0c6e101e226a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 May 2020 20:51:23 +0200 Subject: [PATCH 315/574] rbd: compression_hint option Allow hinting to bluestore if the data should/should not be compressed. The default is to not hint (compression_hint=none). Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 97e102ea03e0..7420648a1de6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -836,6 +836,7 @@ enum { Opt_lock_timeout, /* int args above */ Opt_pool_ns, + Opt_compression_hint, /* string args above */ Opt_read_only, Opt_read_write, @@ -844,8 +845,23 @@ enum { Opt_notrim, }; +enum { + Opt_compression_hint_none, + Opt_compression_hint_compressible, + Opt_compression_hint_incompressible, +}; + +static const struct constant_table rbd_param_compression_hint[] = { + {"none", Opt_compression_hint_none}, + {"compressible", Opt_compression_hint_compressible}, + {"incompressible", Opt_compression_hint_incompressible}, + {} +}; + static const struct fs_parameter_spec rbd_parameters[] = { fsparam_u32 ("alloc_size", Opt_alloc_size), + fsparam_enum ("compression_hint", Opt_compression_hint, + rbd_param_compression_hint), fsparam_flag ("exclusive", Opt_exclusive), fsparam_flag ("lock_on_read", Opt_lock_on_read), fsparam_u32 ("lock_timeout", Opt_lock_timeout), @@ -867,6 +883,8 @@ struct rbd_options { bool lock_on_read; bool exclusive; bool trim; + + u32 alloc_hint_flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ }; #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ @@ -2254,7 +2272,7 @@ static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, osd_req_op_alloc_hint_init(osd_req, which++, rbd_dev->layout.object_size, rbd_dev->layout.object_size, - 0); + rbd_dev->opts->alloc_hint_flags); } if (rbd_obj_is_entire(obj_req)) @@ -6332,6 +6350,29 @@ static int rbd_parse_param(struct fs_parameter *param, pctx->spec->pool_ns = param->string; param->string = NULL; break; + case Opt_compression_hint: + switch (result.uint_32) { + case Opt_compression_hint_none: + opt->alloc_hint_flags &= + ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE | + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE); + break; + case Opt_compression_hint_compressible: + opt->alloc_hint_flags |= + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; + opt->alloc_hint_flags &= + ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; + break; + case Opt_compression_hint_incompressible: + opt->alloc_hint_flags |= + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; + opt->alloc_hint_flags &= + ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; + break; + default: + BUG(); + } + break; case Opt_read_only: opt->read_only = true; break; From 1cb2c4a2c89b2004a36399860c85a1af9b3fcba7 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 1 Jun 2020 20:54:50 -0700 Subject: [PATCH 316/574] Revert "drm/msm/dpu: add support for clk and bw scaling for display" This is causing multiple armv7 missing do_div() errors, so lets drop it for now. This reverts commit 04d9044f6c577948609c03b4e33b8fbc8b87c4b1. Cc: Kalyan Thota Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c | 106 +++--------------- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 5 +- .../gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h | 4 - drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 37 +----- drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h | 4 - drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c | 9 +- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 82 -------------- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h | 4 - 8 files changed, 23 insertions(+), 228 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c index 9697abcbec3f..7c230f719ad3 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_core_perf.c @@ -29,73 +29,6 @@ enum dpu_perf_mode { DPU_PERF_MODE_MAX }; -/** - * @_dpu_core_perf_calc_bw() - to calculate BW per crtc - * @kms - pointer to the dpu_kms - * @crtc - pointer to a crtc - * Return: returns aggregated BW for all planes in crtc. - */ -static u64 _dpu_core_perf_calc_bw(struct dpu_kms *kms, - struct drm_crtc *crtc) -{ - struct drm_plane *plane; - struct dpu_plane_state *pstate; - u64 crtc_plane_bw = 0; - u32 bw_factor; - - drm_atomic_crtc_for_each_plane(plane, crtc) { - pstate = to_dpu_plane_state(plane->state); - - if (!pstate) - continue; - - crtc_plane_bw += pstate->plane_fetch_bw; - } - - bw_factor = kms->catalog->perf.bw_inefficiency_factor; - if (bw_factor) - crtc_plane_bw = mult_frac(crtc_plane_bw, bw_factor, 100); - - return crtc_plane_bw; -} - -/** - * _dpu_core_perf_calc_clk() - to calculate clock per crtc - * @kms - pointer to the dpu_kms - * @crtc - pointer to a crtc - * @state - pointer to a crtc state - * Return: returns max clk for all planes in crtc. - */ -static u64 _dpu_core_perf_calc_clk(struct dpu_kms *kms, - struct drm_crtc *crtc, struct drm_crtc_state *state) -{ - struct drm_plane *plane; - struct dpu_plane_state *pstate; - struct drm_display_mode *mode; - u64 crtc_clk; - u32 clk_factor; - - mode = &state->adjusted_mode; - - crtc_clk = mode->vtotal * mode->hdisplay * drm_mode_vrefresh(mode); - - drm_atomic_crtc_for_each_plane(plane, crtc) { - pstate = to_dpu_plane_state(plane->state); - - if (!pstate) - continue; - - crtc_clk = max(pstate->plane_clk, crtc_clk); - } - - clk_factor = kms->catalog->perf.clk_inefficiency_factor; - if (clk_factor) - crtc_clk = mult_frac(crtc_clk, clk_factor, 100); - - return crtc_clk; -} - - static struct dpu_kms *_dpu_crtc_get_kms(struct drm_crtc *crtc) { struct msm_drm_private *priv; @@ -118,7 +51,12 @@ static void _dpu_core_perf_calc_crtc(struct dpu_kms *kms, dpu_cstate = to_dpu_crtc_state(state); memset(perf, 0, sizeof(struct dpu_core_perf_params)); - if (kms->perf.perf_tune.mode == DPU_PERF_MODE_MINIMUM) { + if (!dpu_cstate->bw_control) { + perf->bw_ctl = kms->catalog->perf.max_bw_high * + 1000ULL; + perf->max_per_pipe_ib = perf->bw_ctl; + perf->core_clk_rate = kms->perf.max_core_clk_rate; + } else if (kms->perf.perf_tune.mode == DPU_PERF_MODE_MINIMUM) { perf->bw_ctl = 0; perf->max_per_pipe_ib = 0; perf->core_clk_rate = 0; @@ -126,10 +64,6 @@ static void _dpu_core_perf_calc_crtc(struct dpu_kms *kms, perf->bw_ctl = kms->perf.fix_core_ab_vote; perf->max_per_pipe_ib = kms->perf.fix_core_ib_vote; perf->core_clk_rate = kms->perf.fix_core_clk_rate; - } else { - perf->bw_ctl = _dpu_core_perf_calc_bw(kms, crtc); - perf->max_per_pipe_ib = kms->catalog->perf.min_dram_ib; - perf->core_clk_rate = _dpu_core_perf_calc_clk(kms, crtc, state); } DPU_DEBUG( @@ -181,7 +115,11 @@ int dpu_core_perf_crtc_check(struct drm_crtc *crtc, DPU_DEBUG("crtc:%d bw:%llu ctrl:%d\n", tmp_crtc->base.id, tmp_cstate->new_perf.bw_ctl, tmp_cstate->bw_control); - + /* + * For bw check only use the bw if the + * atomic property has been already set + */ + if (tmp_cstate->bw_control) bw_sum_of_intfs += tmp_cstate->new_perf.bw_ctl; } @@ -193,7 +131,9 @@ int dpu_core_perf_crtc_check(struct drm_crtc *crtc, DPU_DEBUG("final threshold bw limit = %d\n", threshold); - if (!threshold) { + if (!dpu_cstate->bw_control) { + DPU_DEBUG("bypass bandwidth check\n"); + } else if (!threshold) { DPU_ERROR("no bandwidth limits specified\n"); return -E2BIG; } else if (bw > threshold) { @@ -214,8 +154,7 @@ static int _dpu_core_perf_crtc_update_bus(struct dpu_kms *kms, = dpu_crtc_get_client_type(crtc); struct drm_crtc *tmp_crtc; struct dpu_crtc_state *dpu_cstate; - int i, ret = 0; - u64 avg_bw; + int ret = 0; drm_for_each_crtc(tmp_crtc, crtc->dev) { if (tmp_crtc->enabled && @@ -226,21 +165,10 @@ static int _dpu_core_perf_crtc_update_bus(struct dpu_kms *kms, perf.max_per_pipe_ib = max(perf.max_per_pipe_ib, dpu_cstate->new_perf.max_per_pipe_ib); - perf.bw_ctl += dpu_cstate->new_perf.bw_ctl; - - DPU_DEBUG("crtc=%d bw=%llu paths:%d\n", - tmp_crtc->base.id, - dpu_cstate->new_perf.bw_ctl, kms->num_paths); + DPU_DEBUG("crtc=%d bw=%llu\n", tmp_crtc->base.id, + dpu_cstate->new_perf.bw_ctl); } } - - avg_bw = kms->num_paths ? - perf.bw_ctl / kms->num_paths : 0; - - for (i = 0; i < kms->num_paths; i++) - icc_set_bw(kms->path[i], - Bps_to_icc(avg_bw), (perf.max_per_pipe_ib)); - return ret; } diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index 8f2357d9960a..29d4fde3172b 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -541,8 +541,7 @@ static const struct dpu_perf_cfg sc7180_perf_data = { .max_bw_high = 6800000, .min_core_ib = 2400000, .min_llcc_ib = 800000, - .min_dram_ib = 1600000, - .min_prefill_lines = 24, + .min_dram_ib = 800000, .danger_lut_tbl = {0xff, 0xffff, 0x0}, .qos_lut_tbl = { {.nentry = ARRAY_SIZE(sc7180_qos_linear), @@ -559,8 +558,6 @@ static const struct dpu_perf_cfg sc7180_perf_data = { {.rd_enable = 1, .wr_enable = 1}, {.rd_enable = 1, .wr_enable = 0} }, - .clk_inefficiency_factor = 105, - .bw_inefficiency_factor = 120, }; /************************************************************* diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h index f2a5fe2d9d62..f7de43838c69 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h @@ -651,8 +651,6 @@ struct dpu_perf_cdp_cfg { * @downscaling_prefill_lines downscaling latency in lines * @amortizable_theshold minimum y position for traffic shaping prefill * @min_prefill_lines minimum pipeline latency in lines - * @clk_inefficiency_factor DPU src clock inefficiency factor - * @bw_inefficiency_factor DPU axi bus bw inefficiency factor * @safe_lut_tbl: LUT tables for safe signals * @danger_lut_tbl: LUT tables for danger signals * @qos_lut_tbl: LUT tables for QoS signals @@ -677,8 +675,6 @@ struct dpu_perf_cfg { u32 downscaling_prefill_lines; u32 amortizable_threshold; u32 min_prefill_lines; - u32 clk_inefficiency_factor; - u32 bw_inefficiency_factor; u32 safe_lut_tbl[DPU_QOS_LUT_USAGE_MAX]; u32 danger_lut_tbl[DPU_QOS_LUT_USAGE_MAX]; struct dpu_qos_lut_tbl qos_lut_tbl[DPU_QOS_LUT_USAGE_MAX]; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c index a5da7aacddba..b8615d4fe8a3 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c @@ -303,28 +303,6 @@ static int dpu_kms_global_obj_init(struct dpu_kms *dpu_kms) return 0; } -static int dpu_kms_parse_data_bus_icc_path(struct dpu_kms *dpu_kms) -{ - struct icc_path *path0; - struct icc_path *path1; - struct drm_device *dev = dpu_kms->dev; - - path0 = of_icc_get(dev->dev, "mdp0-mem"); - path1 = of_icc_get(dev->dev, "mdp1-mem"); - - if (IS_ERR_OR_NULL(path0)) - return PTR_ERR_OR_ZERO(path0); - - dpu_kms->path[0] = path0; - dpu_kms->num_paths = 1; - - if (!IS_ERR_OR_NULL(path1)) { - dpu_kms->path[1] = path1; - dpu_kms->num_paths++; - } - return 0; -} - static int dpu_kms_enable_vblank(struct msm_kms *kms, struct drm_crtc *crtc) { return dpu_crtc_vblank(crtc, true); @@ -994,9 +972,6 @@ static int dpu_kms_hw_init(struct msm_kms *kms) dpu_vbif_init_memtypes(dpu_kms); - if (of_device_is_compatible(dev->dev->of_node, "qcom,sc7180-mdss")) - dpu_kms_parse_data_bus_icc_path(dpu_kms); - pm_runtime_put_sync(&dpu_kms->pdev->dev); return 0; @@ -1102,7 +1077,7 @@ static int dpu_dev_remove(struct platform_device *pdev) static int __maybe_unused dpu_runtime_suspend(struct device *dev) { - int i, rc = -1; + int rc = -1; struct platform_device *pdev = to_platform_device(dev); struct dpu_kms *dpu_kms = platform_get_drvdata(pdev); struct dss_module_power *mp = &dpu_kms->mp; @@ -1111,9 +1086,6 @@ static int __maybe_unused dpu_runtime_suspend(struct device *dev) if (rc) DPU_ERROR("clock disable failed rc:%d\n", rc); - for (i = 0; i < dpu_kms->num_paths; i++) - icc_set_bw(dpu_kms->path[i], 0, 0); - return rc; } @@ -1125,15 +1097,8 @@ static int __maybe_unused dpu_runtime_resume(struct device *dev) struct drm_encoder *encoder; struct drm_device *ddev; struct dss_module_power *mp = &dpu_kms->mp; - int i; ddev = dpu_kms->dev; - - /* Min vote of BW is required before turning on AXI clk */ - for (i = 0; i < dpu_kms->num_paths; i++) - icc_set_bw(dpu_kms->path[i], 0, - dpu_kms->catalog->perf.min_dram_ib); - rc = msm_dss_enable_clk(mp->clk_config, mp->num_clk, true); if (rc) { DPU_ERROR("clock enable failed rc:%d\n", rc); diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h index 94410ca9bd70..4e32d040f1e6 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.h @@ -8,8 +8,6 @@ #ifndef __DPU_KMS_H__ #define __DPU_KMS_H__ -#include - #include #include "msm_drv.h" @@ -139,8 +137,6 @@ struct dpu_kms { * when disabled. */ atomic_t bandwidth_ref; - struct icc_path *path[2]; - u32 num_paths; }; struct vsync_info { diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c index df0a9835bfb1..80d3cfc14007 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c @@ -8,6 +8,7 @@ #include #include #include "dpu_kms.h" +#include #define to_dpu_mdss(x) container_of(x, struct dpu_mdss, base) @@ -314,11 +315,9 @@ int dpu_mdss_init(struct drm_device *dev) } dpu_mdss->mmio_len = resource_size(res); - if (!of_device_is_compatible(dev->dev->of_node, "qcom,sc7180-mdss")) { - ret = dpu_mdss_parse_data_bus_icc_path(dev, dpu_mdss); - if (ret) - return ret; - } + ret = dpu_mdss_parse_data_bus_icc_path(dev, dpu_mdss); + if (ret) + return ret; mp = &dpu_mdss->mp; ret = msm_dss_parse_clock(pdev, mp); diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index c2a6e3dacd68..3b9c33e694bf 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -131,84 +131,6 @@ static struct dpu_kms *_dpu_plane_get_kms(struct drm_plane *plane) return to_dpu_kms(priv->kms); } -/** - * _dpu_plane_calc_bw - calculate bandwidth required for a plane - * @Plane: Pointer to drm plane. - * Result: Updates calculated bandwidth in the plane state. - * BW Equation: src_w * src_h * bpp * fps * (v_total / v_dest) - * Prefill BW Equation: line src bytes * line_time - */ -static void _dpu_plane_calc_bw(struct drm_plane *plane, - struct drm_framebuffer *fb) -{ - struct dpu_plane *pdpu = to_dpu_plane(plane); - struct dpu_plane_state *pstate; - struct drm_display_mode *mode; - const struct dpu_format *fmt = NULL; - struct dpu_kms *dpu_kms = _dpu_plane_get_kms(plane); - int src_width, src_height, dst_height, fps; - u64 plane_prefill_bw; - u64 plane_bw; - u32 hw_latency_lines; - u32 scale_factor; - int vbp, vpw; - - pstate = to_dpu_plane_state(plane->state); - mode = &plane->state->crtc->mode; - - fmt = dpu_get_dpu_format_ext(fb->format->format, fb->modifier); - - src_width = drm_rect_width(&pdpu->pipe_cfg.src_rect); - src_height = drm_rect_height(&pdpu->pipe_cfg.src_rect); - dst_height = drm_rect_height(&pdpu->pipe_cfg.dst_rect); - fps = drm_mode_vrefresh(mode); - vbp = mode->vtotal - mode->vsync_end; - vpw = mode->vsync_end - mode->vsync_start; - hw_latency_lines = dpu_kms->catalog->perf.min_prefill_lines; - scale_factor = src_height > dst_height ? - mult_frac(src_height, 1, dst_height) : 1; - - plane_bw = - src_width * mode->vtotal * fps * fmt->bpp * scale_factor; - - plane_prefill_bw = - src_width * hw_latency_lines * fps * fmt->bpp * scale_factor; - - plane_prefill_bw = mult_frac(plane_prefill_bw, mode->vtotal, (vbp+vpw)); - - pstate->plane_fetch_bw = max(plane_bw, plane_prefill_bw); -} - - -/** - * _dpu_plane_calc_clk - calculate clock required for a plane - * @Plane: Pointer to drm plane. - * Result: Updates calculated clock in the plane state. - * Clock equation: dst_w * v_total * fps * (src_h / dst_h) - */ -static void _dpu_plane_calc_clk(struct drm_plane *plane) -{ - struct dpu_plane *pdpu = to_dpu_plane(plane); - struct dpu_plane_state *pstate; - struct drm_display_mode *mode; - int dst_width, src_height, dst_height, fps; - - pstate = to_dpu_plane_state(plane->state); - mode = &plane->state->crtc->mode; - - src_height = drm_rect_height(&pdpu->pipe_cfg.src_rect); - dst_width = drm_rect_width(&pdpu->pipe_cfg.dst_rect); - dst_height = drm_rect_height(&pdpu->pipe_cfg.dst_rect); - fps = drm_mode_vrefresh(mode); - - pstate->plane_clk = - dst_width * mode->vtotal * fps; - - if (src_height > dst_height) - pstate->plane_clk = mult_frac(pstate->plane_clk, - src_height, dst_height); -} - /** * _dpu_plane_calc_fill_level - calculate fill level of the given source format * @plane: Pointer to drm plane @@ -1180,10 +1102,6 @@ static void dpu_plane_sspp_atomic_update(struct drm_plane *plane) } _dpu_plane_set_qos_remap(plane); - - _dpu_plane_calc_bw(plane, fb); - - _dpu_plane_calc_clk(plane); } static void _dpu_plane_atomic_disable(struct drm_plane *plane) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h index ca83b8753d59..456949713e90 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h @@ -25,8 +25,6 @@ * @scaler3_cfg: configuration data for scaler3 * @pixel_ext: configuration data for pixel extensions * @cdp_cfg: CDP configuration - * @plane_fetch_bw: calculated BW per plane - * @plane_clk: calculated clk per plane */ struct dpu_plane_state { struct drm_plane_state base; @@ -41,8 +39,6 @@ struct dpu_plane_state { struct dpu_hw_pixel_ext pixel_ext; struct dpu_hw_pipe_cdp_cfg cdp_cfg; - u64 plane_fetch_bw; - u64 plane_clk; }; /** From 725b94d74128cd208bfdd446ad6b5f4b38cf5395 Mon Sep 17 00:00:00 2001 From: Jared Rossi Date: Wed, 6 May 2020 17:24:40 -0400 Subject: [PATCH 317/574] vfio-ccw: Enable transparent CCW IPL from DASD Remove the explicit prefetch check when using vfio-ccw devices. This check does not trigger in practice as all Linux channel programs are intended to use prefetch. It is expected that all ORBs issued by Linux will request prefetch. Although non-prefetching ORBs are not rejected, they will prefetch nonetheless. A warning is issued up to once per 5 seconds when a forced prefetch occurs. A non-prefetch ORB does not necessarily result in an error, however frequent encounters with non-prefetch ORBs indicate that channel programs are being executed in a way that is inconsistent with what the guest is requesting. While there is currently no known case of an error caused by forced prefetch, it is possible in theory that forced prefetch could result in an error if applied to a channel program that is dependent on non-prefetch. Signed-off-by: Jared Rossi Reviewed-by: Eric Farman Message-Id: <20200506212440.31323-2-jrossi@linux.ibm.com> Signed-off-by: Cornelia Huck --- Documentation/s390/vfio-ccw.rst | 6 ++++++ drivers/s390/cio/vfio_ccw_cp.c | 19 ++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index fca9c4f5bd9c..23e7d136f8b4 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -335,6 +335,12 @@ device. The current code allows the guest to start channel programs via START SUBCHANNEL, and to issue HALT SUBCHANNEL and CLEAR SUBCHANNEL. +Currently all channel programs are prefetched, regardless of the +p-bit setting in the ORB. As a result, self modifying channel +programs are not supported. For this reason, IPL has to be handled as +a special case by a userspace/guest program; this has been implemented +in QEMU's s390-ccw bios as of QEMU 4.1. + vfio-ccw supports classic (command mode) channel I/O only. Transport mode (HPF) is not supported. diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 3645d1720c4b..b9febc581b1f 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -8,6 +8,7 @@ * Xiao Feng Ren */ +#include #include #include #include @@ -625,23 +626,27 @@ static int ccwchain_fetch_one(struct ccwchain *chain, * the target channel program from @orb->cmd.iova to the new ccwchain(s). * * Limitations: - * 1. Supports only prefetch enabled mode. - * 2. Supports idal(c64) ccw chaining. - * 3. Supports 4k idaw. + * 1. Supports idal(c64) ccw chaining. + * 2. Supports 4k idaw. * * Returns: * %0 on success and a negative error value on failure. */ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) { + /* custom ratelimit used to avoid flood during guest IPL */ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 1); int ret; /* - * XXX: - * Only support prefetch enable mode now. + * We only support prefetching the channel program. We assume all channel + * programs executed by supported guests likewise support prefetching. + * Executing a channel program that does not specify prefetching will + * typically not cause an error, but a warning is issued to help identify + * the problem if something does break. */ - if (!orb->cmd.pfch) - return -EOPNOTSUPP; + if (!orb->cmd.pfch && __ratelimit(&ratelimit_state)) + dev_warn(mdev, "Prefetching channel program even though prefetch not specified in ORB"); INIT_LIST_HEAD(&cp->ccwchain_list); memcpy(&cp->orb, orb, sizeof(*orb)); From 430220b0bbcbaaaa03718111ff541ee8cd97c781 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 7 Apr 2020 13:16:05 +0200 Subject: [PATCH 318/574] vfio-ccw: document possible errors Interacting with the I/O and the async regions can yield a number of errors, which had been undocumented so far. These are part of the api, so remedy that. Signed-off-by: Cornelia Huck Reviewed-by: Eric Farman Message-Id: <20200407111605.1795-1-cohuck@redhat.com> --- Documentation/s390/vfio-ccw.rst | 56 +++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index 23e7d136f8b4..3a946fd45562 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -204,15 +204,44 @@ definition of the region is:: __u32 ret_code; } __packed; +This region is always available. + While starting an I/O request, orb_area should be filled with the guest ORB, and scsw_area should be filled with the SCSW of the Virtual Subchannel. irb_area stores the I/O result. -ret_code stores a return code for each access of the region. +ret_code stores a return code for each access of the region. The following +values may occur: + +``0`` + The operation was successful. + +``-EOPNOTSUPP`` + The orb specified transport mode or an unidentified IDAW format, or the + scsw specified a function other than the start function. + +``-EIO`` + A request was issued while the device was not in a state ready to accept + requests, or an internal error occurred. + +``-EBUSY`` + The subchannel was status pending or busy, or a request is already active. + +``-EAGAIN`` + A request was being processed, and the caller should retry. + +``-EACCES`` + The channel path(s) used for the I/O were found to be not operational. + +``-ENODEV`` + The device was found to be not operational. + +``-EINVAL`` + The orb specified a chain longer than 255 ccws, or an internal error + occurred. -This region is always available. vfio-ccw cmd region ------------------- @@ -231,6 +260,29 @@ This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD. Currently, CLEAR SUBCHANNEL and HALT SUBCHANNEL use this region. +command specifies the command to be issued; ret_code stores a return code +for each access of the region. The following values may occur: + +``0`` + The operation was successful. + +``-ENODEV`` + The device was found to be not operational. + +``-EINVAL`` + A command other than halt or clear was specified. + +``-EIO`` + A request was issued while the device was not in a state ready to accept + requests. + +``-EAGAIN`` + A request was being processed, and the caller should retry. + +``-EBUSY`` + The subchannel was status pending or busy while processing a halt request. + + vfio-ccw operation details -------------------------- From 9a44ce6c9b69e15c87c87ef1f1a6972837cff3db Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Tue, 5 May 2020 14:27:38 +0200 Subject: [PATCH 319/574] vfio-ccw: Introduce new helper functions to free/destroy regions Consolidate some of the cleanup code for the regions, so that as more are added we reduce code duplication. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-2-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_drv.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 339a6bc0339b..8715c1c2f1e1 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -116,6 +116,14 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } +static void vfio_ccw_free_regions(struct vfio_ccw_private *private) +{ + if (private->cmd_region) + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); + if (private->io_region) + kmem_cache_free(vfio_ccw_io_region, private->io_region); +} + static int vfio_ccw_sch_probe(struct subchannel *sch) { struct pmcw *pmcw = &sch->schib.pmcw; @@ -181,10 +189,7 @@ out_disable: cio_disable_subchannel(sch); out_free: dev_set_drvdata(&sch->dev, NULL); - if (private->cmd_region) - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); - if (private->io_region) - kmem_cache_free(vfio_ccw_io_region, private->io_region); + vfio_ccw_free_regions(private); kfree(private->cp.guest_cp); kfree(private); return ret; @@ -200,8 +205,7 @@ static int vfio_ccw_sch_remove(struct subchannel *sch) dev_set_drvdata(&sch->dev, NULL); - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); - kmem_cache_free(vfio_ccw_io_region, private->io_region); + vfio_ccw_free_regions(private); kfree(private->cp.guest_cp); kfree(private); @@ -304,6 +308,12 @@ static void vfio_ccw_debug_exit(void) debug_unregister(vfio_ccw_debug_trace_id); } +static void vfio_ccw_destroy_regions(void) +{ + kmem_cache_destroy(vfio_ccw_cmd_region); + kmem_cache_destroy(vfio_ccw_io_region); +} + static int __init vfio_ccw_sch_init(void) { int ret; @@ -346,8 +356,7 @@ static int __init vfio_ccw_sch_init(void) return ret; out_err: - kmem_cache_destroy(vfio_ccw_cmd_region); - kmem_cache_destroy(vfio_ccw_io_region); + vfio_ccw_destroy_regions(); destroy_workqueue(vfio_ccw_work_q); vfio_ccw_debug_exit(); return ret; @@ -357,8 +366,7 @@ static void __exit vfio_ccw_sch_exit(void) { css_driver_unregister(&vfio_ccw_sch_driver); isc_unregister(VFIO_CCW_ISC); - kmem_cache_destroy(vfio_ccw_io_region); - kmem_cache_destroy(vfio_ccw_cmd_region); + vfio_ccw_destroy_regions(); destroy_workqueue(vfio_ccw_work_q); vfio_ccw_debug_exit(); } From b7701dfbf98327f34966a68c2532730353245d7b Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Tue, 5 May 2020 14:27:39 +0200 Subject: [PATCH 320/574] vfio-ccw: Register a chp_event callback for vfio-ccw Register the chp_event callback to receive channel path related events for the subchannels managed by vfio-ccw. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-3-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_drv.c | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 8715c1c2f1e1..fb1275a7d1f5 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -19,6 +19,7 @@ #include +#include "chp.h" #include "ioasm.h" #include "css.h" #include "vfio_ccw_private.h" @@ -262,6 +263,51 @@ out_unlock: return rc; } +static int vfio_ccw_chp_event(struct subchannel *sch, + struct chp_link *link, int event) +{ + struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + int mask = chp_ssd_get_mask(&sch->ssd_info, link); + int retry = 255; + + if (!private || !mask) + return 0; + + VFIO_CCW_MSG_EVENT(2, "%pUl (%x.%x.%04x): mask=0x%x event=%d\n", + mdev_uuid(private->mdev), sch->schid.cssid, + sch->schid.ssid, sch->schid.sch_no, + mask, event); + + if (cio_update_schib(sch)) + return -ENODEV; + + switch (event) { + case CHP_VARY_OFF: + /* Path logically turned off */ + sch->opm &= ~mask; + sch->lpm &= ~mask; + if (sch->schib.pmcw.lpum & mask) + cio_cancel_halt_clear(sch, &retry); + break; + case CHP_OFFLINE: + /* Path is gone */ + if (sch->schib.pmcw.lpum & mask) + cio_cancel_halt_clear(sch, &retry); + break; + case CHP_VARY_ON: + /* Path logically turned on */ + sch->opm |= mask; + sch->lpm |= mask; + break; + case CHP_ONLINE: + /* Path became available */ + sch->lpm |= mask & sch->opm; + break; + } + + return 0; +} + static struct css_device_id vfio_ccw_sch_ids[] = { { .match_flags = 0x1, .type = SUBCHANNEL_TYPE_IO, }, { /* end of list */ }, @@ -279,6 +325,7 @@ static struct css_driver vfio_ccw_sch_driver = { .remove = vfio_ccw_sch_remove, .shutdown = vfio_ccw_sch_shutdown, .sch_event = vfio_ccw_sch_event, + .chp_event = vfio_ccw_chp_event, }; static int __init vfio_ccw_debug_init(void) From 600279b52659c8211c6625fb2c9cd8bf8de15061 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 5 May 2020 14:27:40 +0200 Subject: [PATCH 321/574] vfio-ccw: Refactor the unregister of the async regions This is mostly for the purposes of a later patch, since we'll need to do the same thing later. While we are at it, move the resulting function call to ahead of the unregistering of the IOMMU notifier, so that it's done in the reverse order of how it was created. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-4-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_ops.c | 20 ++++++++++++-------- drivers/s390/cio/vfio_ccw_private.h | 1 + 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index f0d71ab77c50..d4fc84b8867f 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -181,7 +181,6 @@ static void vfio_ccw_mdev_release(struct mdev_device *mdev) { struct vfio_ccw_private *private = dev_get_drvdata(mdev_parent_dev(mdev)); - int i; if ((private->state != VFIO_CCW_STATE_NOT_OPER) && (private->state != VFIO_CCW_STATE_STANDBY)) { @@ -191,15 +190,9 @@ static void vfio_ccw_mdev_release(struct mdev_device *mdev) } cp_free(&private->cp); + vfio_ccw_unregister_dev_regions(private); vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &private->nb); - - for (i = 0; i < private->num_regions; i++) - private->region[i].ops->release(private, &private->region[i]); - - private->num_regions = 0; - kfree(private->region); - private->region = NULL; } static ssize_t vfio_ccw_mdev_read_io_region(struct vfio_ccw_private *private, @@ -482,6 +475,17 @@ int vfio_ccw_register_dev_region(struct vfio_ccw_private *private, return 0; } +void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private) +{ + int i; + + for (i = 0; i < private->num_regions; i++) + private->region[i].ops->release(private, &private->region[i]); + private->num_regions = 0; + kfree(private->region); + private->region = NULL; +} + static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, unsigned int cmd, unsigned long arg) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 9b9bb4982972..ce3834159d98 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -53,6 +53,7 @@ int vfio_ccw_register_dev_region(struct vfio_ccw_private *private, unsigned int subtype, const struct vfio_ccw_regops *ops, size_t size, u32 flags, void *data); +void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); From 24c986748ba670c903a9d6a11ee96de2b3f5f1b8 Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Tue, 5 May 2020 14:27:41 +0200 Subject: [PATCH 322/574] vfio-ccw: Introduce a new schib region The schib region can be used by userspace to get the subchannel- information block (SCHIB) for the passthrough subchannel. This can be useful to get information such as channel path information via the SCHIB.PMCW fields. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-5-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- Documentation/s390/vfio-ccw.rst | 18 ++++++- drivers/s390/cio/Makefile | 2 +- drivers/s390/cio/vfio_ccw_chp.c | 76 +++++++++++++++++++++++++++++ drivers/s390/cio/vfio_ccw_drv.c | 20 ++++++++ drivers/s390/cio/vfio_ccw_ops.c | 14 +++++- drivers/s390/cio/vfio_ccw_private.h | 3 ++ include/uapi/linux/vfio.h | 1 + include/uapi/linux/vfio_ccw.h | 10 ++++ 8 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 drivers/s390/cio/vfio_ccw_chp.c diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index 3a946fd45562..32310df525ba 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -282,6 +282,21 @@ for each access of the region. The following values may occur: ``-EBUSY`` The subchannel was status pending or busy while processing a halt request. +vfio-ccw schib region +--------------------- + +The vfio-ccw schib region is used to return Subchannel-Information +Block (SCHIB) data to userspace:: + + struct ccw_schib_region { + #define SCHIB_AREA_SIZE 52 + __u8 schib_area[SCHIB_AREA_SIZE]; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_SCHIB. + +Reading this region triggers a STORE SUBCHANNEL to be issued to the +associated hardware. vfio-ccw operation details -------------------------- @@ -385,7 +400,8 @@ through DASD/ECKD device online in a guest now and use it as a block device. The current code allows the guest to start channel programs via -START SUBCHANNEL, and to issue HALT SUBCHANNEL and CLEAR SUBCHANNEL. +START SUBCHANNEL, and to issue HALT SUBCHANNEL, CLEAR SUBCHANNEL, +and STORE SUBCHANNEL. Currently all channel programs are prefetched, regardless of the p-bit setting in the ORB. As a result, self modifying channel diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile index 23eae4188876..a9235f111e79 100644 --- a/drivers/s390/cio/Makefile +++ b/drivers/s390/cio/Makefile @@ -21,5 +21,5 @@ qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o obj-$(CONFIG_QDIO) += qdio.o vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o vfio_ccw_fsm.o \ - vfio_ccw_async.o vfio_ccw_trace.o + vfio_ccw_async.o vfio_ccw_trace.o vfio_ccw_chp.o obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c new file mode 100644 index 000000000000..18f3b3e873a9 --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Channel path related status regions for vfio_ccw + * + * Copyright IBM Corp. 2020 + * + * Author(s): Farhan Ali + * Eric Farman + */ + +#include +#include "vfio_ccw_private.h" + +static ssize_t vfio_ccw_schib_region_read(struct vfio_ccw_private *private, + char __user *buf, size_t count, + loff_t *ppos) +{ + unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; + loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; + struct ccw_schib_region *region; + int ret; + + if (pos + count > sizeof(*region)) + return -EINVAL; + + mutex_lock(&private->io_mutex); + region = private->region[i].data; + + if (cio_update_schib(private->sch)) { + ret = -ENODEV; + goto out; + } + + memcpy(region, &private->sch->schib, sizeof(*region)); + + if (copy_to_user(buf, (void *)region + pos, count)) { + ret = -EFAULT; + goto out; + } + + ret = count; + +out: + mutex_unlock(&private->io_mutex); + return ret; +} + +static ssize_t vfio_ccw_schib_region_write(struct vfio_ccw_private *private, + const char __user *buf, size_t count, + loff_t *ppos) +{ + return -EINVAL; +} + + +static void vfio_ccw_schib_region_release(struct vfio_ccw_private *private, + struct vfio_ccw_region *region) +{ + +} + +const struct vfio_ccw_regops vfio_ccw_schib_region_ops = { + .read = vfio_ccw_schib_region_read, + .write = vfio_ccw_schib_region_write, + .release = vfio_ccw_schib_region_release, +}; + +int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private) +{ + return vfio_ccw_register_dev_region(private, + VFIO_REGION_SUBTYPE_CCW_SCHIB, + &vfio_ccw_schib_region_ops, + sizeof(struct ccw_schib_region), + VFIO_REGION_INFO_FLAG_READ, + private->schib_region); +} diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index fb1275a7d1f5..7aeff42f370d 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -27,6 +27,7 @@ struct workqueue_struct *vfio_ccw_work_q; static struct kmem_cache *vfio_ccw_io_region; static struct kmem_cache *vfio_ccw_cmd_region; +static struct kmem_cache *vfio_ccw_schib_region; debug_info_t *vfio_ccw_debug_msg_id; debug_info_t *vfio_ccw_debug_trace_id; @@ -119,6 +120,8 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) static void vfio_ccw_free_regions(struct vfio_ccw_private *private) { + if (private->schib_region) + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); if (private->cmd_region) kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); if (private->io_region) @@ -156,6 +159,12 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (!private->cmd_region) goto out_free; + private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, + GFP_KERNEL | GFP_DMA); + + if (!private->schib_region) + goto out_free; + private->sch = sch; dev_set_drvdata(&sch->dev, private); mutex_init(&private->io_mutex); @@ -357,6 +366,7 @@ static void vfio_ccw_debug_exit(void) static void vfio_ccw_destroy_regions(void) { + kmem_cache_destroy(vfio_ccw_schib_region); kmem_cache_destroy(vfio_ccw_cmd_region); kmem_cache_destroy(vfio_ccw_io_region); } @@ -393,6 +403,16 @@ static int __init vfio_ccw_sch_init(void) goto out_err; } + vfio_ccw_schib_region = kmem_cache_create_usercopy("vfio_ccw_schib_region", + sizeof(struct ccw_schib_region), 0, + SLAB_ACCOUNT, 0, + sizeof(struct ccw_schib_region), NULL); + + if (!vfio_ccw_schib_region) { + ret = -ENOMEM; + goto out_err; + } + isc_register(VFIO_CCW_ISC); ret = css_driver_register(&vfio_ccw_sch_driver); if (ret) { diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index d4fc84b8867f..22988d67b6bb 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -172,8 +172,18 @@ static int vfio_ccw_mdev_open(struct mdev_device *mdev) ret = vfio_ccw_register_async_dev_regions(private); if (ret) - vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, - &private->nb); + goto out_unregister; + + ret = vfio_ccw_register_schib_dev_regions(private); + if (ret) + goto out_unregister; + + return ret; + +out_unregister: + vfio_ccw_unregister_dev_regions(private); + vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &private->nb); return ret; } diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index ce3834159d98..d6601a8adf13 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -56,6 +56,7 @@ int vfio_ccw_register_dev_region(struct vfio_ccw_private *private, void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); +int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private); /** * struct vfio_ccw_private @@ -69,6 +70,7 @@ int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); * @io_mutex: protect against concurrent update of I/O regions * @region: additional regions for other subchannel operations * @cmd_region: MMIO region for asynchronous I/O commands other than START + * @schib_region: MMIO region for SCHIB information * @num_regions: number of additional regions * @cp: channel program for the current I/O operation * @irb: irb info received from interrupt @@ -87,6 +89,7 @@ struct vfio_ccw_private { struct mutex io_mutex; struct vfio_ccw_region *region; struct ccw_cmd_region *cmd_region; + struct ccw_schib_region *schib_region; int num_regions; struct channel_program cp; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 015516bcfaa3..7a1abbd889bd 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -378,6 +378,7 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) +#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2) /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h index cbecbf0cd54f..758bf214898d 100644 --- a/include/uapi/linux/vfio_ccw.h +++ b/include/uapi/linux/vfio_ccw.h @@ -34,4 +34,14 @@ struct ccw_cmd_region { __u32 ret_code; } __packed; +/* + * Used for processing commands that read the subchannel-information block + * Reading this region triggers a stsch() to hardware + * Note: this is controlled by a capability + */ +struct ccw_schib_region { +#define SCHIB_AREA_SIZE 52 + __u8 schib_area[SCHIB_AREA_SIZE]; +} __packed; + #endif From 4296151d231eeee78514bf7f495c46683785255d Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 5 May 2020 14:27:42 +0200 Subject: [PATCH 323/574] vfio-ccw: Refactor IRQ handlers To simplify future expansion. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-6-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_ops.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 22988d67b6bb..c3a74ab7bb86 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -387,17 +387,21 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_region_info *info, static int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info) { - if (info->index != VFIO_CCW_IO_IRQ_INDEX) + switch (info->index) { + case VFIO_CCW_IO_IRQ_INDEX: + info->count = 1; + info->flags = VFIO_IRQ_INFO_EVENTFD; + break; + default: return -EINVAL; - - info->count = 1; - info->flags = VFIO_IRQ_INFO_EVENTFD; + } return 0; } static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev, uint32_t flags, + uint32_t index, void __user *data) { struct vfio_ccw_private *private; @@ -407,7 +411,14 @@ static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev, return -EINVAL; private = dev_get_drvdata(mdev_parent_dev(mdev)); - ctx = &private->io_trigger; + + switch (index) { + case VFIO_CCW_IO_IRQ_INDEX: + ctx = &private->io_trigger; + break; + default: + return -EINVAL; + } switch (flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { case VFIO_IRQ_SET_DATA_NONE: @@ -579,7 +590,7 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, return ret; data = (void __user *)(arg + minsz); - return vfio_ccw_mdev_set_irqs(mdev, hdr.flags, data); + return vfio_ccw_mdev_set_irqs(mdev, hdr.flags, hdr.index, data); } case VFIO_DEVICE_RESET: return vfio_ccw_mdev_reset(mdev); From 882f38b7f6c406e7229e72ee4328b7f1d5938ae7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 27 May 2020 23:02:45 +0300 Subject: [PATCH 324/574] drm/i915: Fix global state use-after-frees with a refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While the current locking/serialization of the global state suffices for protecting the obj->state access and the actual hardware reprogramming, we do have a problem with accessing the old/new states during nonblocking commits. The state computation and swap will be protected by the crtc locks, but the commit_tails can finish out of order, thus also causing the atomic states to be cleaned up out of order. This would mean the commit that started first but finished last has had its new state freed as the no-longer-needed old state by the other commit. To fix this let's just refcount the states. obj->state amounts to one reference, and the intel_atomic_state holds extra references to both its new and old global obj states. Fixes: 0ef1905ecf2e ("drm/i915: Introduce better global state handling") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20200527200245.13184-1-ville.syrjala@linux.intel.com Reviewed-by: Stanislav Lisovskiy (cherry picked from commit f8c86ffa2800adc80adc679c84c45e0c6b027374) Signed-off-by: Joonas Lahtinen --- .../gpu/drm/i915/display/intel_global_state.c | 45 ++++++++++++++++--- .../gpu/drm/i915/display/intel_global_state.h | 3 ++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_global_state.c b/drivers/gpu/drm/i915/display/intel_global_state.c index 212d4ee68205..7a19215ad844 100644 --- a/drivers/gpu/drm/i915/display/intel_global_state.c +++ b/drivers/gpu/drm/i915/display/intel_global_state.c @@ -10,6 +10,28 @@ #include "intel_display_types.h" #include "intel_global_state.h" +static void __intel_atomic_global_state_free(struct kref *kref) +{ + struct intel_global_state *obj_state = + container_of(kref, struct intel_global_state, ref); + struct intel_global_obj *obj = obj_state->obj; + + obj->funcs->atomic_destroy_state(obj, obj_state); +} + +static void intel_atomic_global_state_put(struct intel_global_state *obj_state) +{ + kref_put(&obj_state->ref, __intel_atomic_global_state_free); +} + +static struct intel_global_state * +intel_atomic_global_state_get(struct intel_global_state *obj_state) +{ + kref_get(&obj_state->ref); + + return obj_state; +} + void intel_atomic_global_obj_init(struct drm_i915_private *dev_priv, struct intel_global_obj *obj, struct intel_global_state *state, @@ -17,6 +39,10 @@ void intel_atomic_global_obj_init(struct drm_i915_private *dev_priv, { memset(obj, 0, sizeof(*obj)); + state->obj = obj; + + kref_init(&state->ref); + obj->state = state; obj->funcs = funcs; list_add_tail(&obj->head, &dev_priv->global_obj_list); @@ -28,7 +54,9 @@ void intel_atomic_global_obj_cleanup(struct drm_i915_private *dev_priv) list_for_each_entry_safe(obj, next, &dev_priv->global_obj_list, head) { list_del(&obj->head); - obj->funcs->atomic_destroy_state(obj, obj->state); + + drm_WARN_ON(&dev_priv->drm, kref_read(&obj->state->ref) != 1); + intel_atomic_global_state_put(obj->state); } } @@ -97,10 +125,14 @@ intel_atomic_get_global_obj_state(struct intel_atomic_state *state, if (!obj_state) return ERR_PTR(-ENOMEM); + obj_state->obj = obj; obj_state->changed = false; + kref_init(&obj_state->ref); + state->global_objs[index].state = obj_state; - state->global_objs[index].old_state = obj->state; + state->global_objs[index].old_state = + intel_atomic_global_state_get(obj->state); state->global_objs[index].new_state = obj_state; state->global_objs[index].ptr = obj; obj_state->state = state; @@ -163,7 +195,9 @@ void intel_atomic_swap_global_state(struct intel_atomic_state *state) new_obj_state->state = NULL; state->global_objs[i].state = old_obj_state; - obj->state = new_obj_state; + + intel_atomic_global_state_put(obj->state); + obj->state = intel_atomic_global_state_get(new_obj_state); } } @@ -172,10 +206,9 @@ void intel_atomic_clear_global_state(struct intel_atomic_state *state) int i; for (i = 0; i < state->num_global_objs; i++) { - struct intel_global_obj *obj = state->global_objs[i].ptr; + intel_atomic_global_state_put(state->global_objs[i].old_state); + intel_atomic_global_state_put(state->global_objs[i].new_state); - obj->funcs->atomic_destroy_state(obj, - state->global_objs[i].state); state->global_objs[i].ptr = NULL; state->global_objs[i].state = NULL; state->global_objs[i].old_state = NULL; diff --git a/drivers/gpu/drm/i915/display/intel_global_state.h b/drivers/gpu/drm/i915/display/intel_global_state.h index e6163a469029..1f16fa3073c9 100644 --- a/drivers/gpu/drm/i915/display/intel_global_state.h +++ b/drivers/gpu/drm/i915/display/intel_global_state.h @@ -6,6 +6,7 @@ #ifndef __INTEL_GLOBAL_STATE_H__ #define __INTEL_GLOBAL_STATE_H__ +#include #include struct drm_i915_private; @@ -54,7 +55,9 @@ struct intel_global_obj { for_each_if(obj) struct intel_global_state { + struct intel_global_obj *obj; struct intel_atomic_state *state; + struct kref ref; bool changed; }; From 273500ae71711c040d258a7b3f4b6f44c368fff2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 1 Jun 2020 17:19:42 +0100 Subject: [PATCH 325/574] drm/i915: Whitelist context-local timestamp in the gen9 cmdparser Allow batch buffers to read their own _local_ cumulative HW runtime of their logical context. Fixes: 0f2f39758341 ("drm/i915: Add gen9 BCS cmdparsing") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: # v5.4+ Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200601161942.30854-1-chris@chris-wilson.co.uk (cherry picked from commit f9496520df11de00fbafc3cbd693b9570d600ab3) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_cmd_parser.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 189b573d02be..372354d33f55 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -572,6 +572,9 @@ struct drm_i915_reg_descriptor { #define REG32(_reg, ...) \ { .addr = (_reg), __VA_ARGS__ } +#define REG32_IDX(_reg, idx) \ + { .addr = _reg(idx) } + /* * Convenience macro for adding 64-bit registers. * @@ -669,6 +672,7 @@ static const struct drm_i915_reg_descriptor gen9_blt_regs[] = { REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE), REG32(BCS_SWCTRL), REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), + REG32_IDX(RING_CTX_TIMESTAMP, BLT_RING_BASE), REG64_IDX(BCS_GPR, 0), REG64_IDX(BCS_GPR, 1), REG64_IDX(BCS_GPR, 2), From ea2b383ded0f84f310fe330eb43d758f2807e728 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 2 Jun 2020 00:55:08 +0300 Subject: [PATCH 326/574] drm/i915/params: don't expose inject_probe_failure in debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The parameter only makes sense as a module parameter only. Fixes: c43c5a8818d4 ("drm/i915/params: add i915 parameters to debugfs") Cc: Juha-Pekka Heikkilä Cc: Venkata Sandeep Dhanalakota Reviewed-by: Juha-Pekka Heikkila Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200601215510.18379-1-jani.nikula@intel.com (cherry picked from commit dbf4081ffb68c0d9b518a34c715a8d8681658411) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_params.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h index 45323732f099..4f21bfffbf0e 100644 --- a/drivers/gpu/drm/i915/i915_params.h +++ b/drivers/gpu/drm/i915/i915_params.h @@ -64,7 +64,7 @@ struct drm_printer; param(int, mmio_debug, -IS_ENABLED(CONFIG_DRM_I915_DEBUG_MMIO), 0600) \ param(int, edp_vswing, 0, 0400) \ param(unsigned int, reset, 3, 0600) \ - param(unsigned int, inject_probe_failure, 0, 0600) \ + param(unsigned int, inject_probe_failure, 0, 0) \ param(int, fastboot, -1, 0600) \ param(int, enable_dpcd_backlight, -1, 0600) \ param(char *, force_probe, CONFIG_DRM_I915_FORCE_PROBE, 0400) \ From 62b6e899a0ad4863b2e799f3c10a77116e3cfadc Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 2 Jun 2020 00:55:09 +0300 Subject: [PATCH 327/574] drm/i915/params: fix i915.fake_lmem_start module param sysfs permissions fake_lmem_start does not need to be mutable via module param sysfs. It's only used during driver probe. Fixes: 1629224324b6 ("drm/i915/lmem: add the fake lmem region") Cc: Matthew Auld Cc: Joonas Lahtinen Cc: Chris Wilson Reviewed-by: Rodrigo Vivi Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200601215510.18379-2-jani.nikula@intel.com (cherry picked from commit f322e851f20e534cf5305332a9ad5eefadb55d56) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c index add00ec1f787..a3dde770226d 100644 --- a/drivers/gpu/drm/i915/i915_params.c +++ b/drivers/gpu/drm/i915/i915_params.c @@ -173,7 +173,7 @@ i915_param_named(enable_gvt, bool, 0400, #endif #if IS_ENABLED(CONFIG_DRM_I915_UNSTABLE_FAKE_LMEM) -i915_param_named_unsafe(fake_lmem_start, ulong, 0600, +i915_param_named_unsafe(fake_lmem_start, ulong, 0400, "Fake LMEM start offset (default: 0)"); #endif From d0676871fd52f575cc2bfec4faa2fcbc8af370e8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 28 May 2020 23:52:06 +0900 Subject: [PATCH 328/574] lib: Make prime number generator independently selectable Make prime number generator independently selectable from kconfig. This allows us to enable CONFIG_PRIME_NUMBERS=m and run the tools/testing/selftests/lib/prime_numbers.sh without other DRM selftest modules. Signed-off-by: Masami Hiramatsu Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Signed-off-by: Shuah Khan --- lib/math/Kconfig | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/math/Kconfig b/lib/math/Kconfig index 15bd50d92308..f19bc9734fa7 100644 --- a/lib/math/Kconfig +++ b/lib/math/Kconfig @@ -6,7 +6,12 @@ config CORDIC calculations are in fixed point. Module will be called cordic. config PRIME_NUMBERS - tristate + tristate "Simple prime number generator for testing" + help + This option provides a simple prime number generator for test + modules. + + If unsure, say N. config RATIONAL bool From 2f56f84511367ce5ffd4a1c88dcd770ef049ea96 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 28 May 2020 23:52:16 +0900 Subject: [PATCH 329/574] lib: Make test_sysctl initialized as module test_sysctl.c is expected to be used as a module, but since it does not use module_init(), it never be registered as a module and not appeared under /sys/module/. In the result, the selftests/sysctl/sysctl.sh always fails to find the test module and is skipped. This makes test_sysctl.c initialized as a module by module_init() and allow sysctl.sh to find the test module is loaded. Signed-off-by: Masami Hiramatsu Reviewed-by: Luis Chamberlain Signed-off-by: Shuah Khan --- lib/test_sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 566dad3f4196..ec4d0f03475d 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -149,7 +149,7 @@ static int __init test_sysctl_init(void) } return 0; } -late_initcall(test_sysctl_init); +module_init(test_sysctl_init); static void __exit test_sysctl_exit(void) { From eee470e0739a9d8e29460f6d355cefa1c9a0384a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 28 May 2020 23:52:26 +0900 Subject: [PATCH 330/574] selftests/sysctl: Fix to load test_sysctl module Fix to load test_sysctl.ko module correctly. sysctl.sh checks whether the test module is embedded (or loaded already) or not at first, and if not, it returns skip error instead of trying modprobe. Thus, there is no chance to load the test_sysctl test module. Instead, this removes that module embedded check and returns skip error only if it ensures that there is no embedded test module *and* no loadable test module. This also avoid referring config file since that is not installed. Signed-off-by: Masami Hiramatsu Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Signed-off-by: Shuah Khan --- tools/testing/selftests/sysctl/sysctl.sh | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 6a970b127c9b..c3459f9f2429 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -40,16 +40,6 @@ ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" -test_modprobe() -{ - if [ ! -d $DIR ]; then - echo "$0: $DIR not present" >&2 - echo "You must have the following enabled in your kernel:" >&2 - cat $TEST_DIR/config >&2 - exit $ksft_skip - fi -} - function allow_user_defaults() { if [ -z $DIR ]; then @@ -125,10 +115,12 @@ function load_req_mod() if [ ! -d $DIR ]; then if ! modprobe -q -n $TEST_DRIVER; then echo "$0: module $TEST_DRIVER not found [SKIP]" + echo "You must set CONFIG_TEST_SYSCTL=m in your kernel" >&2 exit $ksft_skip fi modprobe $TEST_DRIVER if [ $? -ne 0 ]; then + echo "$0: modprobe $TEST_DRIVER failed." exit fi fi @@ -929,7 +921,6 @@ test_reqs allow_user_defaults check_production_sysctl_writes_strict load_req_mod -test_modprobe trap "test_finish" EXIT From 382561a0f11c4995d48ab82670412f8d6c418430 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 28 May 2020 23:52:37 +0900 Subject: [PATCH 331/574] selftests/sysctl: Make sysctl test driver as a module Fix config file to require CONFIG_TEST_SYSCTL=m instead of y because this driver introduces a test sysctl interfaces which are normally not used, and only used for the selftest. Signed-off-by: Masami Hiramatsu Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Signed-off-by: Shuah Khan --- tools/testing/selftests/sysctl/config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sysctl/config b/tools/testing/selftests/sysctl/config index 6ca14800d755..fc263efd1fad 100644 --- a/tools/testing/selftests/sysctl/config +++ b/tools/testing/selftests/sysctl/config @@ -1 +1 @@ -CONFIG_TEST_SYSCTL=y +CONFIG_TEST_SYSCTL=m From 3700bec3323ebe90924156775be0124e93094f78 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 15 Apr 2020 16:32:31 +0200 Subject: [PATCH 332/574] docs: filesystems: convert gfs2-glocks.txt to ReST - Add a SPDX header; - Adjust document and section titles; - Some whitespace fixes and new line breaks; - Add table markups; - Use notes markups; - Add it to filesystems/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Andreas Gruenbacher --- .../{gfs2-glocks.txt => gfs2-glocks.rst} | 147 ++++++++++-------- Documentation/filesystems/index.rst | 1 + MAINTAINERS | 2 +- 3 files changed, 86 insertions(+), 64 deletions(-) rename Documentation/filesystems/{gfs2-glocks.txt => gfs2-glocks.rst} (63%) diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.rst similarity index 63% rename from Documentation/filesystems/gfs2-glocks.txt rename to Documentation/filesystems/gfs2-glocks.rst index 7059623635b2..d14f230f0b12 100644 --- a/Documentation/filesystems/gfs2-glocks.txt +++ b/Documentation/filesystems/gfs2-glocks.rst @@ -1,5 +1,8 @@ - Glock internal locking rules - ------------------------------ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Glock internal locking rules +============================ This documents the basic principles of the glock state machine internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h) @@ -24,24 +27,28 @@ There are three lock states that users of the glock layer can request, namely shared (SH), deferred (DF) and exclusive (EX). Those translate to the following DLM lock modes: -Glock mode | DLM lock mode ------------------------------- - UN | IV/NL Unlocked (no DLM lock associated with glock) or NL - SH | PR (Protected read) - DF | CW (Concurrent write) - EX | EX (Exclusive) +========== ====== ===================================================== +Glock mode DLM lock mode +========== ====== ===================================================== + UN IV/NL Unlocked (no DLM lock associated with glock) or NL + SH PR (Protected read) + DF CW (Concurrent write) + EX EX (Exclusive) +========== ====== ===================================================== Thus DF is basically a shared mode which is incompatible with the "normal" shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O operations. The glocks are basically a lock plus some routines which deal with cache management. The following rules apply for the cache: -Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata --------------------------------------------------------------------------- - UN | No | No | No | No - SH | Yes | Yes | No | No - DF | No | Yes | No | No - EX | Yes | Yes | Yes | Yes +========== ========== ============== ========== ============== +Glock mode Cache data Cache Metadata Dirty Data Dirty Metadata +========== ========== ============== ========== ============== + UN No No No No + SH Yes Yes No No + DF No Yes No No + EX Yes Yes Yes Yes +========== ========== ============== ========== ============== These rules are implemented using the various glock operations which are defined for each type of glock. Not all types of glocks use @@ -49,21 +56,23 @@ all the modes. Only inode glocks use the DF mode for example. Table of glock operations and per type constants: -Field | Purpose ----------------------------------------------------------------------------- -go_xmote_th | Called before remote state change (e.g. to sync dirty data) -go_xmote_bh | Called after remote state change (e.g. to refill cache) -go_inval | Called if remote state change requires invalidating the cache -go_demote_ok | Returns boolean value of whether its ok to demote a glock - | (e.g. checks timeout, and that there is no cached data) -go_lock | Called for the first local holder of a lock -go_unlock | Called on the final local unlock of a lock -go_dump | Called to print content of object for debugfs file, or on - | error to dump glock to the log. -go_type | The type of the glock, LM_TYPE_..... -go_callback | Called if the DLM sends a callback to drop this lock -go_flags | GLOF_ASPACE is set, if the glock has an address space - | associated with it +============= ============================================================= +Field Purpose +============= ============================================================= +go_xmote_th Called before remote state change (e.g. to sync dirty data) +go_xmote_bh Called after remote state change (e.g. to refill cache) +go_inval Called if remote state change requires invalidating the cache +go_demote_ok Returns boolean value of whether its ok to demote a glock + (e.g. checks timeout, and that there is no cached data) +go_lock Called for the first local holder of a lock +go_unlock Called on the final local unlock of a lock +go_dump Called to print content of object for debugfs file, or on + error to dump glock to the log. +go_type The type of the glock, ``LM_TYPE_*`` +go_callback Called if the DLM sends a callback to drop this lock +go_flags GLOF_ASPACE is set, if the glock has an address space + associated with it +============= ============================================================= The minimum hold time for each lock is the time after a remote lock grant for which we ignore remote demote requests. This is in order to @@ -82,21 +91,25 @@ rather than via the glock. Locking rules for glock operations: -Operation | GLF_LOCK bit lock held | gl_lockref.lock spinlock held -------------------------------------------------------------------------- -go_xmote_th | Yes | No -go_xmote_bh | Yes | No -go_inval | Yes | No -go_demote_ok | Sometimes | Yes -go_lock | Yes | No -go_unlock | Yes | No -go_dump | Sometimes | Yes -go_callback | Sometimes (N/A) | Yes +============= ====================== ============================= +Operation GLF_LOCK bit lock held gl_lockref.lock spinlock held +============= ====================== ============================= +go_xmote_th Yes No +go_xmote_bh Yes No +go_inval Yes No +go_demote_ok Sometimes Yes +go_lock Yes No +go_unlock Yes No +go_dump Sometimes Yes +go_callback Sometimes (N/A) Yes +============= ====================== ============================= -N.B. Operations must not drop either the bit lock or the spinlock -if its held on entry. go_dump and do_demote_ok must never block. -Note that go_dump will only be called if the glock's state -indicates that it is caching uptodate data. +.. Note:: + + Operations must not drop either the bit lock or the spinlock + if its held on entry. go_dump and do_demote_ok must never block. + Note that go_dump will only be called if the glock's state + indicates that it is caching uptodate data. Glock locking order within GFS2: @@ -104,7 +117,7 @@ Glock locking order within GFS2: 2. Rename glock (for rename only) 3. Inode glock(s) (Parents before children, inodes at "same level" with same parent in - lock number order) + lock number order) 4. Rgrp glock(s) (for (de)allocation operations) 5. Transaction glock (via gfs2_trans_begin) for non-read operations 6. i_rw_mutex (if required) @@ -117,8 +130,8 @@ determine the lifetime of the inode in question. Locking of inodes is on a per-inode basis. Locking of rgrps is on a per rgrp basis. In general we prefer to lock local locks prior to cluster locks. - Glock Statistics - ------------------ +Glock Statistics +---------------- The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The @@ -173,8 +186,8 @@ we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for -allocation (to base it on lock wait time, rather than blindly -using a "try lock") + allocation (to base it on lock wait time, rather than blindly + using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken @@ -195,10 +208,13 @@ as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. -Per sb stats can be found here: -/sys/kernel/debug/gfs2//sbstats -Per glock stats can be found here: -/sys/kernel/debug/gfs2//glstats +Per sb stats can be found here:: + + /sys/kernel/debug/gfs2//sbstats + +Per glock stats can be found here:: + + /sys/kernel/debug/gfs2//glstats Assuming that debugfs is mounted on /sys/kernel/debug and also that is replaced with the name of the gfs2 filesystem @@ -206,14 +222,16 @@ in question. The abbreviations used in the output as are follows: -srtt - Smoothed round trip time for non-blocking dlm requests -srttvar - Variance estimate for srtt -srttb - Smoothed round trip time for (potentially) blocking dlm requests -srttvarb - Variance estimate for srttb -sirt - Smoothed inter-request time (for dlm requests) -sirtvar - Variance estimate for sirt -dlm - Number of dlm requests made (dcnt in glstats file) -queue - Number of glock requests queued (qcnt in glstats file) +========= ================================================================ +srtt Smoothed round trip time for non blocking dlm requests +srttvar Variance estimate for srtt +srttb Smoothed round trip time for (potentially) blocking dlm requests +srttvarb Variance estimate for srttb +sirt Smoothed inter request time (for dlm requests) +sirtvar Variance estimate for sirt +dlm Number of dlm requests made (dcnt in glstats file) +queue Number of glock requests queued (qcnt in glstats file) +========= ================================================================ The sbstats file contains a set of these stats for each glock type (so 8 lines for each type) and for each cpu (one column per cpu). The glstats file contains @@ -224,9 +242,12 @@ The gfs2_glock_lock_time tracepoint prints out the current values of the stats for the glock in question, along with some addition information on each dlm reply that is received: -status - The status of the dlm request -flags - The dlm request flags -tdiff - The time taken by this specific request +====== ======================================= +status The status of the dlm request +flags The dlm request flags +tdiff The time taken by this specific request +====== ======================================= + (remaining fields as per above list) diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e7b46dac7079..0641429f53a2 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -69,6 +69,7 @@ Documentation for filesystem implementations. f2fs gfs2 gfs2-uevents + gfs2-glocks hfs hfsplus hpfs diff --git a/MAINTAINERS b/MAINTAINERS index 50659d76976b..3354ed247d0b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7179,7 +7179,7 @@ L: cluster-devel@redhat.com S: Supported W: http://sources.redhat.com/cluster/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git -F: Documentation/filesystems/gfs2*.txt +F: Documentation/filesystems/gfs2* F: fs/gfs2/ F: include/uapi/linux/gfs2_ondisk.h From bbae10fac2dceb6c8585d385cc689f4e7ce81b1f Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 8 May 2020 09:18:03 -0500 Subject: [PATCH 333/574] gfs2: Don't ignore inode write errors during inode_go_sync Before for this patch, function inode_go_sync ignored io errors during inode_go_sync, overwriting them with metadata write errors: error = filemap_fdatawait(mapping); mapping_set_error(mapping, error); } error = filemap_fdatawait(metamapping); ... return error; So any errors returned by the inode write would be forgotten if the metadata write succeeded. This patch still does both writes, but only sets error if it's still zero. That way, any errors will be reported by to the caller, do_xmote, which will take appropriate action and report the error. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 9e9c7a4b8c66..4862dae868a2 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -268,7 +268,7 @@ static int inode_go_sync(struct gfs2_glock *gl) struct gfs2_inode *ip = gfs2_glock2inode(gl); int isreg = ip && S_ISREG(ip->i_inode.i_mode); struct address_space *metamapping = gfs2_glock2aspace(gl); - int error = 0; + int error = 0, ret; if (isreg) { if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) @@ -289,8 +289,10 @@ static int inode_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(mapping); mapping_set_error(mapping, error); } - error = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, error); + ret = filemap_fdatawait(metamapping); + mapping_set_error(metamapping, ret); + if (!error) + error = ret; gfs2_ail_empty_gl(gl); /* * Writeback of the data mapping may cause the dirty flag to be set From ea22eee4e6027d8927099de344f7fff43c507ef9 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 29 Apr 2020 08:45:54 -0500 Subject: [PATCH 334/574] gfs2: Allow lock_nolock mount to specify jid=X Before this patch, a simple typo accidentally added \n to the jid= string for lock_nolock mounts. This made it impossible to mount a gfs2 file system with a journal other than journal0. Thus: mount -tgfs2 -o hostdata="jid=1" Resulted in: mount: wrong fs type, bad option, bad superblock on In most cases this is not a problem. However, for debugging and testing purposes we sometimes want to test the integrity of other journals. This patch removes the unnecessary \n and thus allows lock_nolock users to specify an alternate journal. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/ops_fstype.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e2b69ffcc6a8..094f5fe7c009 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -880,7 +880,7 @@ fail: } static const match_table_t nolock_tokens = { - { Opt_jid, "jid=%d\n", }, + { Opt_jid, "jid=%d", }, { Opt_err, NULL }, }; From 1a0b00d15d4005390b74154e98822fc7d36d36cd Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 1 Jun 2020 11:37:09 -0400 Subject: [PATCH 335/574] gfs2: Only do glock put in gfs2_create_inode for free inodes Before this patch, the error path of function gfs2_create_inode would always calls gfs2_glock_put for the inode glock. That's good for inodes that are free. But after they've been added to the vfs inodes, errors will cause the inode to be evicted, and the evict will do the glock put for us. If we do a glock put again, we can try to free the glock while there are still references to it, e.g. revokes pending for the transaction that created it. This patch adds a check: if (free_vfs_inode) before the put, thus solving the problem. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5acd3ce30759..e3a27fd284dd 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -780,7 +780,8 @@ fail_gunlock2: fail_free_inode: if (ip->i_gl) { glock_clear_object(ip->i_gl, ip); - gfs2_glock_put(ip->i_gl); + if (free_vfs_inode) /* else evict will do the put for us */ + gfs2_glock_put(ip->i_gl); } gfs2_rs_delete(ip, NULL); gfs2_qa_put(ip); From 243f5908b13509d5c246e1ce660dbd2cf7051a96 Mon Sep 17 00:00:00 2001 From: Anupam Aggarwal Date: Fri, 29 May 2020 17:56:00 +0530 Subject: [PATCH 336/574] sparc: remove unused header file nfs_fs.h Remove unused header file linux/nfs_fs.h Signed-off-by: Anupam Aggarwal Signed-off-by: Vivek Trivedi Signed-off-by: Amit Sahrawat Signed-off-by: David S. Miller --- arch/sparc/kernel/sys_sparc32.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index b5da3bfdc225..f84a02ab6bf9 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include From ee7c7953134e23af41055c41a124b4f776d813c3 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 24 May 2020 19:53:58 +0300 Subject: [PATCH 337/574] sparc32: register memory occupied by kernel as memblock.memory sparc32 never registered the memory occupied by the kernel image with memblock_add() and it only reserved this memory with meblock_reserve(). With openbios as system firmware, the memory occupied by the kernel is reserved in openbios and removed from mem.available. The prom setup code in the kernel uses mem.available to set up the memory banks and essentially there is a hole for the memory occupied by the kernel image. Later in bootmem_init() this memory is memblock_reserve()d. Up until recently, memmap initialization would call __init_single_page() for the pages in that hole, the free_low_memory_core_early() would mark them as reserved and everything would be Ok. After the change in memmap initialization introduced by the commit "mm: memmap_init: iterate over memblock regions rather that check each PFN", the hole is skipped and the page structs for it are not initialized. And when they are passed from memblock to page allocator as reserved, the latter gets confused. Simply registering the memory occupied by the kernel with memblock_add() resolves this issue. Tested on qemu-system-sparc with Debian Etch [1] userspace. [1] https://people.debian.org/~aurel32/qemu/sparc/debian_etch_sparc_small.qcow2 Signed-off-by: Mike Rapoport Link: https://lkml.kernel.org/r/20200517000050.GA87467@roeck-us.nlllllet/ Acked-by: David S. Miller Signed-off-by: David S. Miller --- arch/sparc/mm/init_32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 906eda1158b4..3cb3dffcbcdc 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -193,6 +193,7 @@ unsigned long __init bootmem_init(unsigned long *pages_avail) /* Reserve the kernel text/data/bss. */ size = (start_pfn << PAGE_SHIFT) - phys_base; memblock_reserve(phys_base, size); + memblock_add(phys_base, size); size = memblock_phys_mem_size() - memblock_reserved_size(); *pages_avail = (size >> PAGE_SHIFT) - high_pages; From 454b0289c6b5f2c66164654b80212d15fbef7a03 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 26 May 2020 18:33:00 +0100 Subject: [PATCH 338/574] sparc32: mm: Don't try to free page-table pages if ctor() fails The pages backing page-table allocations for SRMMU are allocated via memblock as part of the "nocache" region initialisation during srmmu_paging_init() and should not be freed even if a later call to pgtable_pte_page_ctor() fails. Remove the broken call to __free_page(). Cc: David S. Miller Cc: Kirill A. Shutemov Fixes: 1ae9ae5f7df7 ("sparc: handle pgtable_page_ctor() fail") Signed-off-by: Will Deacon Signed-off-by: David S. Miller --- arch/sparc/mm/srmmu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index c861c0f0df73..589370a21b12 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -365,7 +365,6 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) return NULL; page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); if (!pgtable_pte_page_ctor(page)) { - __free_page(page); return NULL; } return ptep; From 60bccaa671bce7ed040d9511b0738ca3b0a0e9ff Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 26 May 2020 18:33:01 +0100 Subject: [PATCH 339/574] sparc32: mm: Disable SPLIT_PTLOCK_CPUS The SRMMU page-table allocator is not compatible with SPLIT_PTLOCK_CPUS for two major reasons: 1. Pages are allocated via memblock, and therefore the ptl is not cleared by prep_new_page(), which is expected by ptlock_init() 2. Multiple PTE tables can exist in a single page, causing them to share the same ptl and deadlock when attempting to take the same lock twice (e.g. as part of copy_page_range()). Ensure that SPLIT_PTLOCK_CPUS is not selected for SPARC32. Cc: David S. Miller Signed-off-by: Will Deacon Signed-off-by: David S. Miller --- mm/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index c1acc34c1c35..97458119cce8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -192,6 +192,9 @@ config MEMORY_HOTREMOVE # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# SPARC32 allocates multiple pte tables within a single page, and therefore +# a per-page lock leads to problems when multiple tables need to be locked +# at the same time (e.g. copy_page_range()). # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # config SPLIT_PTLOCK_CPUS @@ -199,6 +202,7 @@ config SPLIT_PTLOCK_CPUS default "999999" if !MMU default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 + default "999999" if SPARC32 default "4" config ARCH_ENABLE_SPLIT_PMD_PTLOCK From 1996d47a0db27a3071f1a6587123185ba98c4585 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 26 May 2020 18:33:02 +0100 Subject: [PATCH 340/574] sparc32: mm: Only call ctor()/dtor() functions for first and last user The SRMMU page-table allocator allocates multiple PTE tables per page, since they are only 1K in size. However, this means that calls to pgtable_pte_page_{ctor,dtor}() must be serialised and performed only by the first and last page-table allocation for the page respectively. Use the page reference count to track how many PTE tables we have allocated for a given page returned by the SRMMU allocator and only call the ctor()/dtor() functions for the first and last user respectively. Cc: David S. Miller Fixes: 8c8f3156dd40 ("sparc32: mm: Reduce allocation size for PMD and PTE tables") Signed-off-by: Will Deacon Signed-off-by: David S. Miller --- arch/sparc/mm/srmmu.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 589370a21b12..116d19a390f2 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -364,9 +364,13 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) if ((ptep = pte_alloc_one_kernel(mm)) == 0) return NULL; page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); - if (!pgtable_pte_page_ctor(page)) { - return NULL; + spin_lock(&mm->page_table_lock); + if (page_ref_inc_return(page) == 2 && !pgtable_pte_page_ctor(page)) { + page_ref_dec(page); + ptep = NULL; } + spin_unlock(&mm->page_table_lock); + return ptep; } @@ -375,7 +379,11 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep) struct page *page; page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); - pgtable_pte_page_dtor(page); + spin_lock(&mm->page_table_lock); + if (page_ref_dec_return(page) == 1) + pgtable_pte_page_dtor(page); + spin_unlock(&mm->page_table_lock); + srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE); } From d8cac29b1d52204e6632d2887eff766acd02b9aa Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Tue, 5 May 2020 14:27:43 +0200 Subject: [PATCH 341/574] vfio-ccw: Introduce a new CRW region This region provides a mechanism to pass a Channel Report Word that affect vfio-ccw devices, and needs to be passed to the guest for its awareness and/or processing. The base driver (see crw_collect_info()) provides space for two CRWs, as a subchannel event may have two CRWs chained together (one for the ssid, one for the subchannel). As vfio-ccw will deal with everything at the subchannel level, provide space for a single CRW to be transferred in one shot. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-7-farman@linux.ibm.com> [CH: added padding to ccw_crw_region] Signed-off-by: Cornelia Huck --- Documentation/s390/vfio-ccw.rst | 20 +++++++++++ drivers/s390/cio/vfio_ccw_chp.c | 55 +++++++++++++++++++++++++++++ drivers/s390/cio/vfio_ccw_drv.c | 20 +++++++++++ drivers/s390/cio/vfio_ccw_ops.c | 8 +++++ drivers/s390/cio/vfio_ccw_private.h | 4 +++ include/uapi/linux/vfio.h | 2 ++ include/uapi/linux/vfio_ccw.h | 9 +++++ 7 files changed, 118 insertions(+) diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index 32310df525ba..8aad08a8b8a5 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -298,6 +298,26 @@ This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_SCHIB. Reading this region triggers a STORE SUBCHANNEL to be issued to the associated hardware. +vfio-ccw crw region +--------------------- + +The vfio-ccw crw region is used to return Channel Report Word (CRW) +data to userspace:: + + struct ccw_crw_region { + __u32 crw; + __u32 pad; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_CRW. + +Reading this region returns a CRW if one that is relevant for this +subchannel (e.g. one reporting changes in channel path state) is +pending, or all zeroes if not. If multiple CRWs are pending (including +possibly chained CRWs), reading this region again will return the next +one, until no more CRWs are pending and zeroes are returned. This is +similar to how STORE CHANNEL REPORT WORD works. + vfio-ccw operation details -------------------------- diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c index 18f3b3e873a9..37ea344a4d72 100644 --- a/drivers/s390/cio/vfio_ccw_chp.c +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -74,3 +74,58 @@ int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private) VFIO_REGION_INFO_FLAG_READ, private->schib_region); } + +static ssize_t vfio_ccw_crw_region_read(struct vfio_ccw_private *private, + char __user *buf, size_t count, + loff_t *ppos) +{ + unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; + loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; + struct ccw_crw_region *region; + int ret; + + if (pos + count > sizeof(*region)) + return -EINVAL; + + mutex_lock(&private->io_mutex); + region = private->region[i].data; + + if (copy_to_user(buf, (void *)region + pos, count)) + ret = -EFAULT; + else + ret = count; + + region->crw = 0; + + mutex_unlock(&private->io_mutex); + return ret; +} + +static ssize_t vfio_ccw_crw_region_write(struct vfio_ccw_private *private, + const char __user *buf, size_t count, + loff_t *ppos) +{ + return -EINVAL; +} + +static void vfio_ccw_crw_region_release(struct vfio_ccw_private *private, + struct vfio_ccw_region *region) +{ + +} + +const struct vfio_ccw_regops vfio_ccw_crw_region_ops = { + .read = vfio_ccw_crw_region_read, + .write = vfio_ccw_crw_region_write, + .release = vfio_ccw_crw_region_release, +}; + +int vfio_ccw_register_crw_dev_regions(struct vfio_ccw_private *private) +{ + return vfio_ccw_register_dev_region(private, + VFIO_REGION_SUBTYPE_CCW_CRW, + &vfio_ccw_crw_region_ops, + sizeof(struct ccw_crw_region), + VFIO_REGION_INFO_FLAG_READ, + private->crw_region); +} diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 7aeff42f370d..e4deae6fd525 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -28,6 +28,7 @@ struct workqueue_struct *vfio_ccw_work_q; static struct kmem_cache *vfio_ccw_io_region; static struct kmem_cache *vfio_ccw_cmd_region; static struct kmem_cache *vfio_ccw_schib_region; +static struct kmem_cache *vfio_ccw_crw_region; debug_info_t *vfio_ccw_debug_msg_id; debug_info_t *vfio_ccw_debug_trace_id; @@ -120,6 +121,8 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) static void vfio_ccw_free_regions(struct vfio_ccw_private *private) { + if (private->crw_region) + kmem_cache_free(vfio_ccw_crw_region, private->crw_region); if (private->schib_region) kmem_cache_free(vfio_ccw_schib_region, private->schib_region); if (private->cmd_region) @@ -165,6 +168,12 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (!private->schib_region) goto out_free; + private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, + GFP_KERNEL | GFP_DMA); + + if (!private->crw_region) + goto out_free; + private->sch = sch; dev_set_drvdata(&sch->dev, private); mutex_init(&private->io_mutex); @@ -366,6 +375,7 @@ static void vfio_ccw_debug_exit(void) static void vfio_ccw_destroy_regions(void) { + kmem_cache_destroy(vfio_ccw_crw_region); kmem_cache_destroy(vfio_ccw_schib_region); kmem_cache_destroy(vfio_ccw_cmd_region); kmem_cache_destroy(vfio_ccw_io_region); @@ -413,6 +423,16 @@ static int __init vfio_ccw_sch_init(void) goto out_err; } + vfio_ccw_crw_region = kmem_cache_create_usercopy("vfio_ccw_crw_region", + sizeof(struct ccw_crw_region), 0, + SLAB_ACCOUNT, 0, + sizeof(struct ccw_crw_region), NULL); + + if (!vfio_ccw_crw_region) { + ret = -ENOMEM; + goto out_err; + } + isc_register(VFIO_CCW_ISC); ret = css_driver_register(&vfio_ccw_sch_driver); if (ret) { diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index c3a74ab7bb86..8b3ed5b45277 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -178,6 +178,10 @@ static int vfio_ccw_mdev_open(struct mdev_device *mdev) if (ret) goto out_unregister; + ret = vfio_ccw_register_crw_dev_regions(private); + if (ret) + goto out_unregister; + return ret; out_unregister: @@ -389,6 +393,7 @@ static int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info) { switch (info->index) { case VFIO_CCW_IO_IRQ_INDEX: + case VFIO_CCW_CRW_IRQ_INDEX: info->count = 1; info->flags = VFIO_IRQ_INFO_EVENTFD; break; @@ -416,6 +421,9 @@ static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev, case VFIO_CCW_IO_IRQ_INDEX: ctx = &private->io_trigger; break; + case VFIO_CCW_CRW_IRQ_INDEX: + ctx = &private->crw_trigger; + break; default: return -EINVAL; } diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index d6601a8adf13..97131b4df0b9 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -57,6 +57,7 @@ void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private); +int vfio_ccw_register_crw_dev_regions(struct vfio_ccw_private *private); /** * struct vfio_ccw_private @@ -71,6 +72,7 @@ int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private); * @region: additional regions for other subchannel operations * @cmd_region: MMIO region for asynchronous I/O commands other than START * @schib_region: MMIO region for SCHIB information + * @crw_region: MMIO region for getting channel report words * @num_regions: number of additional regions * @cp: channel program for the current I/O operation * @irb: irb info received from interrupt @@ -90,6 +92,7 @@ struct vfio_ccw_private { struct vfio_ccw_region *region; struct ccw_cmd_region *cmd_region; struct ccw_schib_region *schib_region; + struct ccw_crw_region *crw_region; int num_regions; struct channel_program cp; @@ -97,6 +100,7 @@ struct vfio_ccw_private { union scsw scsw; struct eventfd_ctx *io_trigger; + struct eventfd_ctx *crw_trigger; struct work_struct io_work; } __aligned(8); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 7a1abbd889bd..907758cf6d60 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -379,6 +379,7 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) #define VFIO_REGION_SUBTYPE_CCW_SCHIB (2) +#define VFIO_REGION_SUBTYPE_CCW_CRW (3) /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped @@ -578,6 +579,7 @@ enum { enum { VFIO_CCW_IO_IRQ_INDEX, + VFIO_CCW_CRW_IRQ_INDEX, VFIO_CCW_NUM_IRQS }; diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h index 758bf214898d..aa04f3aa6db0 100644 --- a/include/uapi/linux/vfio_ccw.h +++ b/include/uapi/linux/vfio_ccw.h @@ -44,4 +44,13 @@ struct ccw_schib_region { __u8 schib_area[SCHIB_AREA_SIZE]; } __packed; +/* + * Used for returning a Channel Report Word to userspace. + * Note: this is controlled by a capability + */ +struct ccw_crw_region { + __u32 crw; + __u32 pad; +} __packed; + #endif From 3f02cb2fd9d2d9e8762102886e3e4b51285797ee Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Tue, 5 May 2020 14:27:44 +0200 Subject: [PATCH 342/574] vfio-ccw: Wire up the CRW irq and CRW region Use the IRQ to notify userspace that there is a CRW pending in the region, related to path-availability changes on the passthrough subchannel. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-8-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_chp.c | 17 ++++++++++ drivers/s390/cio/vfio_ccw_drv.c | 49 +++++++++++++++++++++++++++++ drivers/s390/cio/vfio_ccw_private.h | 8 +++++ 3 files changed, 74 insertions(+) diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c index 37ea344a4d72..876f6ade51cc 100644 --- a/drivers/s390/cio/vfio_ccw_chp.c +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -82,14 +82,24 @@ static ssize_t vfio_ccw_crw_region_read(struct vfio_ccw_private *private, unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; struct ccw_crw_region *region; + struct vfio_ccw_crw *crw; int ret; if (pos + count > sizeof(*region)) return -EINVAL; + crw = list_first_entry_or_null(&private->crw, + struct vfio_ccw_crw, next); + + if (crw) + list_del(&crw->next); + mutex_lock(&private->io_mutex); region = private->region[i].data; + if (crw) + memcpy(®ion->crw, &crw->crw, sizeof(region->crw)); + if (copy_to_user(buf, (void *)region + pos, count)) ret = -EFAULT; else @@ -98,6 +108,13 @@ static ssize_t vfio_ccw_crw_region_read(struct vfio_ccw_private *private, region->crw = 0; mutex_unlock(&private->io_mutex); + + kfree(crw); + + /* Notify the guest if more CRWs are on our queue */ + if (!list_empty(&private->crw) && private->crw_trigger) + eventfd_signal(private->crw_trigger, 1); + return ret; } diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index e4deae6fd525..9144360851ed 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -108,6 +108,16 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work) eventfd_signal(private->io_trigger, 1); } +static void vfio_ccw_crw_todo(struct work_struct *work) +{ + struct vfio_ccw_private *private; + + private = container_of(work, struct vfio_ccw_private, crw_work); + + if (!list_empty(&private->crw) && private->crw_trigger) + eventfd_signal(private->crw_trigger, 1); +} + /* * Css driver callbacks */ @@ -186,7 +196,9 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) if (ret) goto out_free; + INIT_LIST_HEAD(&private->crw); INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); + INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); atomic_set(&private->avail, 1); private->state = VFIO_CCW_STATE_STANDBY; @@ -217,9 +229,15 @@ out_free: static int vfio_ccw_sch_remove(struct subchannel *sch) { struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); + struct vfio_ccw_crw *crw, *temp; vfio_ccw_sch_quiesce(sch); + list_for_each_entry_safe(crw, temp, &private->crw, next) { + list_del(&crw->next); + kfree(crw); + } + vfio_ccw_mdev_unreg(sch); dev_set_drvdata(&sch->dev, NULL); @@ -281,6 +299,33 @@ out_unlock: return rc; } +static void vfio_ccw_queue_crw(struct vfio_ccw_private *private, + unsigned int rsc, + unsigned int erc, + unsigned int rsid) +{ + struct vfio_ccw_crw *crw; + + /* + * If unable to allocate a CRW, just drop the event and + * carry on. The guest will either see a later one or + * learn when it issues its own store subchannel. + */ + crw = kzalloc(sizeof(*crw), GFP_ATOMIC); + if (!crw) + return; + + /* + * Build the CRW based on the inputs given to us. + */ + crw->crw.rsc = rsc; + crw->crw.erc = erc; + crw->crw.rsid = rsid; + + list_add_tail(&crw->next, &private->crw); + queue_work(vfio_ccw_work_q, &private->crw_work); +} + static int vfio_ccw_chp_event(struct subchannel *sch, struct chp_link *link, int event) { @@ -311,6 +356,8 @@ static int vfio_ccw_chp_event(struct subchannel *sch, /* Path is gone */ if (sch->schib.pmcw.lpum & mask) cio_cancel_halt_clear(sch, &retry); + vfio_ccw_queue_crw(private, CRW_RSC_CPATH, CRW_ERC_PERRN, + link->chpid.id); break; case CHP_VARY_ON: /* Path logically turned on */ @@ -320,6 +367,8 @@ static int vfio_ccw_chp_event(struct subchannel *sch, case CHP_ONLINE: /* Path became available */ sch->lpm |= mask & sch->opm; + vfio_ccw_queue_crw(private, CRW_RSC_CPATH, CRW_ERC_INIT, + link->chpid.id); break; } diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 97131b4df0b9..8723156b29ea 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "css.h" @@ -59,6 +60,11 @@ int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_schib_dev_regions(struct vfio_ccw_private *private); int vfio_ccw_register_crw_dev_regions(struct vfio_ccw_private *private); +struct vfio_ccw_crw { + struct list_head next; + struct crw crw; +}; + /** * struct vfio_ccw_private * @sch: pointer to the subchannel @@ -98,10 +104,12 @@ struct vfio_ccw_private { struct channel_program cp; struct irb irb; union scsw scsw; + struct list_head crw; struct eventfd_ctx *io_trigger; struct eventfd_ctx *crw_trigger; struct work_struct io_work; + struct work_struct crw_work; } __aligned(8); extern int vfio_ccw_mdev_reg(struct subchannel *sch); From b2dd9a44a1098c96935c495570b663bd223a087e Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Tue, 5 May 2020 14:27:45 +0200 Subject: [PATCH 343/574] vfio-ccw: Add trace for CRW event Since CRW events are (should be) rare, let's put a trace in that routine too. Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Message-Id: <20200505122745.53208-9-farman@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_drv.c | 1 + drivers/s390/cio/vfio_ccw_trace.c | 1 + drivers/s390/cio/vfio_ccw_trace.h | 30 ++++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 9144360851ed..8c625b530035 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -336,6 +336,7 @@ static int vfio_ccw_chp_event(struct subchannel *sch, if (!private || !mask) return 0; + trace_vfio_ccw_chp_event(private->sch->schid, mask, event); VFIO_CCW_MSG_EVENT(2, "%pUl (%x.%x.%04x): mask=0x%x event=%d\n", mdev_uuid(private->mdev), sch->schid.cssid, sch->schid.ssid, sch->schid.sch_no, diff --git a/drivers/s390/cio/vfio_ccw_trace.c b/drivers/s390/cio/vfio_ccw_trace.c index 8c671d2519f6..4a0205905afc 100644 --- a/drivers/s390/cio/vfio_ccw_trace.c +++ b/drivers/s390/cio/vfio_ccw_trace.c @@ -9,6 +9,7 @@ #define CREATE_TRACE_POINTS #include "vfio_ccw_trace.h" +EXPORT_TRACEPOINT_SYMBOL(vfio_ccw_chp_event); EXPORT_TRACEPOINT_SYMBOL(vfio_ccw_fsm_async_request); EXPORT_TRACEPOINT_SYMBOL(vfio_ccw_fsm_event); EXPORT_TRACEPOINT_SYMBOL(vfio_ccw_fsm_io_request); diff --git a/drivers/s390/cio/vfio_ccw_trace.h b/drivers/s390/cio/vfio_ccw_trace.h index f5d31887d413..62fb30598d47 100644 --- a/drivers/s390/cio/vfio_ccw_trace.h +++ b/drivers/s390/cio/vfio_ccw_trace.h @@ -17,6 +17,36 @@ #include +TRACE_EVENT(vfio_ccw_chp_event, + TP_PROTO(struct subchannel_id schid, + int mask, + int event), + TP_ARGS(schid, mask, event), + + TP_STRUCT__entry( + __field(u8, cssid) + __field(u8, ssid) + __field(u16, sch_no) + __field(int, mask) + __field(int, event) + ), + + TP_fast_assign( + __entry->cssid = schid.cssid; + __entry->ssid = schid.ssid; + __entry->sch_no = schid.sch_no; + __entry->mask = mask; + __entry->event = event; + ), + + TP_printk("schid=%x.%x.%04x mask=0x%x event=%d", + __entry->cssid, + __entry->ssid, + __entry->sch_no, + __entry->mask, + __entry->event) +); + TRACE_EVENT(vfio_ccw_fsm_async_request, TP_PROTO(struct subchannel_id schid, int command, From b7f839d292948142eaab77cedd031aad0bfec872 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 2 Jun 2020 17:22:48 -0400 Subject: [PATCH 344/574] drm/amdgpu/display: use blanked rather than plane state for sync groups We may end up with no planes set yet, depending on the ordering, but we should have the proper blanking state which is either handled by either DPG or TG depending on the hardware generation. Check both to determine the proper blanked state. Bug: https://gitlab.freedesktop.org/drm/amd/issues/781 Fixes: 5fc0cbfad45648 ("drm/amd/display: determine if a pipe is synced by plane state") Cc: nicholas.kazlauskas@amd.com Reviewed-by: Nicholas Kazlauskas Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/core/dc.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 45cfb7c45566..b4e2053bca9f 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -1016,9 +1016,17 @@ static void program_timing_sync( } } - /* set first pipe with plane as master */ + /* set first unblanked pipe as master */ for (j = 0; j < group_size; j++) { - if (pipe_set[j]->plane_state) { + bool is_blanked; + + if (pipe_set[j]->stream_res.opp->funcs->dpg_is_blanked) + is_blanked = + pipe_set[j]->stream_res.opp->funcs->dpg_is_blanked(pipe_set[j]->stream_res.opp); + else + is_blanked = + pipe_set[j]->stream_res.tg->funcs->is_blanked(pipe_set[j]->stream_res.tg); + if (!is_blanked) { if (j == 0) break; @@ -1039,9 +1047,17 @@ static void program_timing_sync( status->timing_sync_info.master = false; } - /* remove any other pipes with plane as they have already been synced */ + /* remove any other unblanked pipes as they have already been synced */ for (j = j + 1; j < group_size; j++) { - if (pipe_set[j]->plane_state) { + bool is_blanked; + + if (pipe_set[j]->stream_res.opp->funcs->dpg_is_blanked) + is_blanked = + pipe_set[j]->stream_res.opp->funcs->dpg_is_blanked(pipe_set[j]->stream_res.opp); + else + is_blanked = + pipe_set[j]->stream_res.tg->funcs->is_blanked(pipe_set[j]->stream_res.tg); + if (!is_blanked) { group_size--; pipe_set[j] = pipe_set[group_size]; j--; From a24eaa5c51255b344d5a321f1eeb3205f2775498 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Tue, 2 Jun 2020 20:42:33 -0400 Subject: [PATCH 345/574] drm/amd/display: Revalidate bandwidth before commiting DC updates [Why] Whenever we switch between tiled formats without also switching pixel formats or doing anything else that recreates the DC plane state we can run into underflow or hangs since we're not updating the DML parameters before committing to the hardware. [How] If the update type is FULL then call validate_bandwidth again to update the DML parmeters before committing the state. This is basically just a workaround and protective measure against update types being added DC where we could run into this issue in the future. We can only fully validate the state in advance before applying it to the hardware if we recreate all the plane and stream states since we can't modify what's currently in use. The next step is to update DM to ensure that we're creating the plane and stream states for whatever could potentially be a full update in DC to pre-emptively recreate the state for DC global validation. The workaround can stay until this has been fixed in DM. Signed-off-by: Nicholas Kazlauskas Reviewed-by: Hersen Wu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index b4e2053bca9f..6f93a6ca4cf0 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2538,6 +2538,12 @@ void dc_commit_updates_for_stream(struct dc *dc, copy_stream_update_to_stream(dc, context, stream, stream_update); + if (!dc->res_pool->funcs->validate_bandwidth(dc, context, false)) { + DC_ERROR("Mode validation failed for stream update!\n"); + dc_release_state(context); + return; + } + commit_planes_for_stream( dc, srf_updates, From daceabf1b494c9e40a93fcc323b1258f557f65a1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 3 Jun 2020 10:21:24 -0500 Subject: [PATCH 346/574] tracing/doc: Fix ascii-art in histogram-design.rst This fixes the Sphinx parallel build error when building htmldocs: docutils.utils.SystemMessage: /home/sfr/next/next/Documentation/trace/histogram-design.rst:219: (SEVERE/4) Unexpected section title. It also fixes a bunch of other warnings I noticed when fixing the above, caused by mixing ascii-art and text. Link: https://lkml.kernel.org/r/69c291c76964642a417e5dd170d183ba6b552010.camel@kernel.org Reported-by: Stephen Rothwell Tested-by: Stephen Rothwell Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram-design.rst | 48 ++++++++++++------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/Documentation/trace/histogram-design.rst b/Documentation/trace/histogram-design.rst index 06f5c7e5f2ee..eef840043da9 100644 --- a/Documentation/trace/histogram-design.rst +++ b/Documentation/trace/histogram-design.rst @@ -14,7 +14,7 @@ tracing_map.c. Note: All the ftrace histogram command examples assume the working directory is the ftrace /tracing directory. For example:: - # cd /sys/kernel/debug/tracing + # cd /sys/kernel/debug/tracing Also, the histogram output displayed for those commands will be generally be truncated - only enough to make the point is displayed. @@ -107,7 +107,7 @@ for the hitcount and one key field for the pid key. Below that is a diagram of a run-time snapshot of what the tracing_map might look like for a given run. It attempts to show the relationships between the hist_data fields and the tracing_map -elements for a couple hypothetical keys and values. +elements for a couple hypothetical keys and values.:: +------------------+ | hist_data | @@ -141,20 +141,20 @@ elements for a couple hypothetical keys and values. | | | | +--------------+ | | n_keys = n_fields - n_vals | | - | | + The hist_data n_vals and n_fields delineate the extent of the fields[] | | array and separate keys from values for the rest of the code. | | - | | + Below is a run-time representation of the tracing_map part of the | | histogram, with pointers from various parts of the fields[] array | | to corresponding parts of the tracing_map. | | - | | + The tracing_map consists of an array of tracing_map_entrys and a set | | of preallocated tracing_map_elts (abbreviated below as map_entry and | | map_elt). The total number of map_entrys in the hist_data.map array = | | map->max_elts (actually map->map_size but only max_elts of those are | | used. This is a property required by the map_insert() algorithm). | | - | | + If a map_entry is unused, meaning no key has yet hashed into it, its | | .key value is 0 and its .val pointer is NULL. Once a map_entry has | | been claimed, the .key value contains the key's hash value and the | | @@ -163,11 +163,11 @@ for each key or value in the map_elt.fields[] array. There is an | | entry in the map_elt.fields[] array corresponding to each hist_field | | in the histogram, and this is where the continually aggregated sums | | corresponding to each histogram value are kept. | | - | | + The diagram attempts to show the relationship between the | | hist_data.fields[] and the map_elt.fields[] with the links drawn | | -between diagrams:: | | - | | +between diagrams:: + +-----------+ | | | hist_data | | | +-----------+ | | @@ -380,7 +380,7 @@ entry, ts0, corresponding to the ts0 variable in the sched_waking trigger above. sched_waking histogram ----------------------- +----------------------:: +------------------+ | hist_data |<-------------------------------------------------------+ @@ -439,25 +439,25 @@ sched_waking histogram +-----------------+ | | | n_keys = n_fields - n_vals | | | | | | - | | | + This is very similar to the basic case. In the above diagram, we can | | | see a new .flags member has been added to the struct hist_field | | | struct, and a new entry added to hist_data.fields representing the ts0 | | | variable. For a normal val hist_field, .flags is just 0 (modulo | | | modifier flags), but if the value is defined as a variable, the .flags | | | contains a set FL_VAR bit. | | | - | | | + As you can see, the ts0 entry's .var.idx member contains the index | | | into the tracing_map_elts' .vars[] array containing variable values. | | | This idx is used whenever the value of the variable is set or read. | | | The map_elt.vars idx assigned to the given variable is assigned and | | | saved in .var.idx by create_tracing_map_fields() after it calls | | | tracing_map_add_var(). | | | - | | | + Below is a representation of the histogram at run-time, which | | | populates the map, along with correspondence to the above hist_data and | | | hist_field data structures. | | | - | | | + The diagram attempts to show the relationship between the | | | hist_data.fields[] and the map_elt.fields[] and map_elt.vars[] with | | | the links drawn between diagrams. For each of the map_elts, you can | | | @@ -465,8 +465,8 @@ see that the .fields[] members point to the .sum or .offset of a key | | | or val and the .vars[] members point to the value of a variable. The | | | arrows between the two diagrams show the linkages between those | | | tracing_map members and the field definitions in the corresponding | | | -hist_data fields[] members. | | | - | | | +hist_data fields[] members.:: + +-----------+ | | | | hist_data | | | | +-----------+ | | | @@ -564,27 +564,27 @@ hist_data fields[] members. | | | | unused | | | | | | | +---------------+ | | - | | + For each used map entry, there's a map_elt pointing to an array of | | .vars containing the current value of the variables associated with | | that histogram entry. So in the above, the timestamp associated with | | pid 999 is 113345679876, and the timestamp variable in the same | | .var.idx for pid 4444 is 213499240729. | | - | | + sched_switch histogram | | ---------------------- | | - | | + The sched_switch histogram paired with the above sched_waking | | histogram is shown below. The most important aspect of the | | sched_switch histogram is that it references a variable on the | | sched_waking histogram above. | | - | | + The histogram diagram is very similar to the others so far displayed, | | but it adds variable references. You can see the normal hitcount and | | key fields along with a new wakeup_lat variable implemented in the | | same way as the sched_waking ts0 variable, but in addition there's an | | entry with the new FL_VAR_REF (short for HIST_FIELD_FL_VAR_REF) flag. | | - | | + Associated with the new var ref field are a couple of new hist_field | | members, var.hist_data and var_ref_idx. For a variable reference, the | | var.hist_data goes with the var.idx, which together uniquely identify | | @@ -593,10 +593,10 @@ just the index into the var_ref_vals[] array that caches the values of | | each variable whenever a hist trigger is updated. Those resulting | | values are then finally accessed by other code such as trace action | | code that uses the var_ref_idx values to assign param values. | | - | | + The diagram below describes the situation for the sched_switch | | -histogram referred to before: | | - | | +histogram referred to before:: + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> | | events/sched/sched_switch/trigger | | | | From 6f8dbcf1c9cec3ec5efcf4c17b29a5b1732d1491 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 3 Jun 2020 11:37:56 +1000 Subject: [PATCH 347/574] drm/nouveau/disp: provide hint to OR allocation about HDA requirements Will be used by a subsequent commit to influence SOR allocation policy. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/dispnv50/disp.c | 17 ++++++++++++----- drivers/gpu/drm/nouveau/include/nvif/cl5070.h | 3 ++- drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c | 2 +- drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.h | 2 +- .../gpu/drm/nouveau/nvkm/engine/disp/rootnv50.c | 4 ++-- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c index e8ac510f8298..d472942102f5 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c @@ -277,7 +277,7 @@ nv50_outp_release(struct nouveau_encoder *nv_encoder) } static int -nv50_outp_acquire(struct nouveau_encoder *nv_encoder) +nv50_outp_acquire(struct nouveau_encoder *nv_encoder, bool hda) { struct nouveau_drm *drm = nouveau_drm(nv_encoder->base.base.dev); struct nv50_disp *disp = nv50_disp(drm->dev); @@ -289,6 +289,7 @@ nv50_outp_acquire(struct nouveau_encoder *nv_encoder) .base.method = NV50_DISP_MTHD_V1_ACQUIRE, .base.hasht = nv_encoder->dcb->hasht, .base.hashm = nv_encoder->dcb->hashm, + .info.hda = hda, }; int ret; @@ -393,7 +394,7 @@ nv50_dac_enable(struct drm_encoder *encoder) struct nv50_head_atom *asyh = nv50_head_atom(nv_crtc->base.state); struct nv50_core *core = nv50_disp(encoder->dev)->core; - nv50_outp_acquire(nv_encoder); + nv50_outp_acquire(nv_encoder, false); core->func->dac->ctrl(core, nv_encoder->or, 1 << nv_crtc->index, asyh); asyh->or.depth = 0; @@ -968,7 +969,7 @@ nv50_msto_enable(struct drm_encoder *encoder) DRM_DEBUG_KMS("Failed to allocate VCPI\n"); if (!mstm->links++) - nv50_outp_acquire(mstm->outp); + nv50_outp_acquire(mstm->outp, false /*XXX: MST audio.*/); if (mstm->outp->link & 1) proto = 0x8; @@ -1562,12 +1563,18 @@ nv50_sor_enable(struct drm_encoder *encoder) struct nouveau_drm *drm = nouveau_drm(dev); struct nouveau_connector *nv_connector; struct nvbios *bios = &drm->vbios; + bool hda = false; u8 proto = 0xf; u8 depth = 0x0; nv_connector = nouveau_encoder_connector_get(nv_encoder); nv_encoder->crtc = encoder->crtc; - nv50_outp_acquire(nv_encoder); + + if ((disp->disp->object.oclass == GT214_DISP || + disp->disp->object.oclass >= GF110_DISP) && + drm_detect_monitor_audio(nv_connector->edid)) + hda = true; + nv50_outp_acquire(nv_encoder, hda); switch (nv_encoder->dcb->type) { case DCB_OUTPUT_TMDS: @@ -1777,7 +1784,7 @@ nv50_pior_enable(struct drm_encoder *encoder) u8 owner = 1 << nv_crtc->index; u8 proto; - nv50_outp_acquire(nv_encoder); + nv50_outp_acquire(nv_encoder, false); switch (asyh->or.bpc) { case 10: asyh->or.depth = 0x6; break; diff --git a/drivers/gpu/drm/nouveau/include/nvif/cl5070.h b/drivers/gpu/drm/nouveau/include/nvif/cl5070.h index 38bf4f38e869..53800fb46582 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/cl5070.h +++ b/drivers/gpu/drm/nouveau/include/nvif/cl5070.h @@ -46,7 +46,8 @@ struct nv50_disp_acquire_v0 { __u8 version; __u8 or; __u8 link; - __u8 pad03[5]; + __u8 hda; + __u8 pad04[4]; }; struct nv50_disp_dac_load_v0 { diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c index c62030c96fba..1b1c6ff0e1bc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c @@ -112,7 +112,7 @@ nvkm_outp_acquire_ior(struct nvkm_outp *outp, u8 user, struct nvkm_ior *ior) } int -nvkm_outp_acquire(struct nvkm_outp *outp, u8 user) +nvkm_outp_acquire(struct nvkm_outp *outp, u8 user, bool hda) { struct nvkm_ior *ior = outp->ior; enum nvkm_ior_proto proto; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.h b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.h index 721b068b87ef..ee028d30cfe7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.h @@ -32,7 +32,7 @@ int nvkm_outp_new(struct nvkm_disp *, int index, struct dcb_output *, void nvkm_outp_del(struct nvkm_outp **); void nvkm_outp_init(struct nvkm_outp *); void nvkm_outp_fini(struct nvkm_outp *); -int nvkm_outp_acquire(struct nvkm_outp *, u8 user); +int nvkm_outp_acquire(struct nvkm_outp *, u8 user, bool hda); void nvkm_outp_release(struct nvkm_outp *, u8 user); void nvkm_outp_route(struct nvkm_disp *); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootnv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootnv50.c index a7672ef17d3b..fb5de44e4b8d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootnv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootnv50.c @@ -99,7 +99,7 @@ nv50_disp_root_mthd_(struct nvkm_object *object, u32 mthd, void *data, u32 size) } *args = data; int ret = -ENOSYS; if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { - ret = nvkm_outp_acquire(outp, NVKM_OUTP_USER); + ret = nvkm_outp_acquire(outp, NVKM_OUTP_USER, args->v0.hda); if (ret == 0) { args->v0.or = outp->ior->id; args->v0.link = outp->ior->asy.link; @@ -119,7 +119,7 @@ nv50_disp_root_mthd_(struct nvkm_object *object, u32 mthd, void *data, u32 size) if (!(ret = nvif_unpack(ret, &data, &size, args->v0, 0, 0, false))) { if (args->v0.data & 0xfff00000) return -EINVAL; - ret = nvkm_outp_acquire(outp, NVKM_OUTP_PRIV); + ret = nvkm_outp_acquire(outp, NVKM_OUTP_PRIV, false); if (ret) return ret; ret = outp->ior->func->sense(outp->ior, args->v0.data); From f24b6ae19fa257ea9b10a9389d0b7046b3efd97c Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 3 Jun 2020 11:40:47 +1000 Subject: [PATCH 348/574] drm/nouveau/disp: split part of OR allocation logic into a function No logical changes here, this is just moving the code to make the changes in the next commit more obvious. Signed-off-by: Ben Skeggs --- .../gpu/drm/nouveau/nvkm/engine/disp/outp.c | 63 +++++++++++-------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c index 1b1c6ff0e1bc..b56224558a05 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c @@ -111,6 +111,42 @@ nvkm_outp_acquire_ior(struct nvkm_outp *outp, u8 user, struct nvkm_ior *ior) return 0; } +static inline int +nvkm_outp_acquire_hda(struct nvkm_outp *outp, enum nvkm_ior_type type, + u8 user, bool hda) +{ + struct nvkm_ior *ior; + + /* First preference is to reuse the OR that is currently armed + * on HW, if any, in order to prevent unnecessary switching. + */ + list_for_each_entry(ior, &outp->disp->ior, head) { + if (!ior->identity && + !ior->asy.outp && ior->arm.outp == outp) + return nvkm_outp_acquire_ior(outp, user, ior); + } + + /* Failing that, a completely unused OR is the next best thing. */ + list_for_each_entry(ior, &outp->disp->ior, head) { + if (!ior->identity && + !ior->asy.outp && ior->type == type && !ior->arm.outp && + (ior->func->route.set || ior->id == __ffs(outp->info.or))) + return nvkm_outp_acquire_ior(outp, user, ior); + } + + /* Last resort is to assign an OR that's already active on HW, + * but will be released during the next modeset. + */ + list_for_each_entry(ior, &outp->disp->ior, head) { + if (!ior->identity && + !ior->asy.outp && ior->type == type && + (ior->func->route.set || ior->id == __ffs(outp->info.or))) + return nvkm_outp_acquire_ior(outp, user, ior); + } + + return -ENOSPC; +} + int nvkm_outp_acquire(struct nvkm_outp *outp, u8 user, bool hda) { @@ -137,32 +173,7 @@ nvkm_outp_acquire(struct nvkm_outp *outp, u8 user, bool hda) return nvkm_outp_acquire_ior(outp, user, ior); } - /* First preference is to reuse the OR that is currently armed - * on HW, if any, in order to prevent unnecessary switching. - */ - list_for_each_entry(ior, &outp->disp->ior, head) { - if (!ior->identity && !ior->asy.outp && ior->arm.outp == outp) - return nvkm_outp_acquire_ior(outp, user, ior); - } - - /* Failing that, a completely unused OR is the next best thing. */ - list_for_each_entry(ior, &outp->disp->ior, head) { - if (!ior->identity && - !ior->asy.outp && ior->type == type && !ior->arm.outp && - (ior->func->route.set || ior->id == __ffs(outp->info.or))) - return nvkm_outp_acquire_ior(outp, user, ior); - } - - /* Last resort is to assign an OR that's already active on HW, - * but will be released during the next modeset. - */ - list_for_each_entry(ior, &outp->disp->ior, head) { - if (!ior->identity && !ior->asy.outp && ior->type == type && - (ior->func->route.set || ior->id == __ffs(outp->info.or))) - return nvkm_outp_acquire_ior(outp, user, ior); - } - - return -ENOSPC; + return nvkm_outp_acquire_hda(outp, type, user, true); } void From e6867ffa34340636252efe8e6b82be625c43d9b1 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 3 Jun 2020 11:43:23 +1000 Subject: [PATCH 349/574] drm/nouveau/disp: modify OR allocation policy to account for HDA requirements Since GM200, SORs are no longer tied to a specific connector, and we allocate them instead, with the assumption that all SORs are equally capable. However, there's a 1<->1 mapping between SOR and HDA pin widget, and it turns out that it's possible for some widgets to be disabled... In order to avoid picking a SOR without a valid pin widget, some new rules need to be added. Signed-off-by: Ben Skeggs --- .../gpu/drm/nouveau/nvkm/engine/disp/outp.c | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c index b56224558a05..dcf08249374a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/outp.c @@ -121,14 +121,14 @@ nvkm_outp_acquire_hda(struct nvkm_outp *outp, enum nvkm_ior_type type, * on HW, if any, in order to prevent unnecessary switching. */ list_for_each_entry(ior, &outp->disp->ior, head) { - if (!ior->identity && + if (!ior->identity && !!ior->func->hda.hpd == hda && !ior->asy.outp && ior->arm.outp == outp) return nvkm_outp_acquire_ior(outp, user, ior); } /* Failing that, a completely unused OR is the next best thing. */ list_for_each_entry(ior, &outp->disp->ior, head) { - if (!ior->identity && + if (!ior->identity && !!ior->func->hda.hpd == hda && !ior->asy.outp && ior->type == type && !ior->arm.outp && (ior->func->route.set || ior->id == __ffs(outp->info.or))) return nvkm_outp_acquire_ior(outp, user, ior); @@ -138,7 +138,7 @@ nvkm_outp_acquire_hda(struct nvkm_outp *outp, enum nvkm_ior_type type, * but will be released during the next modeset. */ list_for_each_entry(ior, &outp->disp->ior, head) { - if (!ior->identity && + if (!ior->identity && !!ior->func->hda.hpd == hda && !ior->asy.outp && ior->type == type && (ior->func->route.set || ior->id == __ffs(outp->info.or))) return nvkm_outp_acquire_ior(outp, user, ior); @@ -173,7 +173,25 @@ nvkm_outp_acquire(struct nvkm_outp *outp, u8 user, bool hda) return nvkm_outp_acquire_ior(outp, user, ior); } - return nvkm_outp_acquire_hda(outp, type, user, true); + /* If we don't need HDA, first try to acquire an OR that doesn't + * support it to leave free the ones that do. + */ + if (!hda) { + if (!nvkm_outp_acquire_hda(outp, type, user, false)) + return 0; + + /* Use a HDA-supporting SOR anyway. */ + return nvkm_outp_acquire_hda(outp, type, user, true); + } + + /* We want HDA, try to acquire an OR that supports it. */ + if (!nvkm_outp_acquire_hda(outp, type, user, true)) + return 0; + + /* There weren't any free ORs that support HDA, grab one that + * doesn't and at least allow display to work still. + */ + return nvkm_outp_acquire_hda(outp, type, user, false); } void From 9f9f54e887adef3dc9b62f372002c421ef4f1921 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Thu, 4 Jun 2020 14:22:32 +1000 Subject: [PATCH 350/574] drm/nouveau/disp/gp100: split SOR implementation from gm200 GP100 needs different HDA detection. Signed-off-by: Ben Skeggs --- .../gpu/drm/nouveau/nvkm/engine/disp/Kbuild | 1 + .../gpu/drm/nouveau/nvkm/engine/disp/gp100.c | 2 +- .../gpu/drm/nouveau/nvkm/engine/disp/gp102.c | 2 +- .../gpu/drm/nouveau/nvkm/engine/disp/ior.h | 1 + .../drm/nouveau/nvkm/engine/disp/sorgp100.c | 59 +++++++++++++++++++ 5 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgp100.c diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/Kbuild b/drivers/gpu/drm/nouveau/nvkm/engine/disp/Kbuild index 571687ba85b8..cf075311cdd2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/Kbuild @@ -39,6 +39,7 @@ nvkm-y += nvkm/engine/disp/sorgf119.o nvkm-y += nvkm/engine/disp/sorgk104.o nvkm-y += nvkm/engine/disp/sorgm107.o nvkm-y += nvkm/engine/disp/sorgm200.o +nvkm-y += nvkm/engine/disp/sorgp100.o nvkm-y += nvkm/engine/disp/sorgv100.o nvkm-y += nvkm/engine/disp/sortu102.o diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/gp100.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/gp100.c index fd6216684f6d..8471de3f3b61 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/gp100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/gp100.c @@ -36,7 +36,7 @@ gp100_disp = { .super = gf119_disp_super, .root = &gp100_disp_root_oclass, .head = { .cnt = gf119_head_cnt, .new = gf119_head_new }, - .sor = { .cnt = gf119_sor_cnt, .new = gm200_sor_new }, + .sor = { .cnt = gf119_sor_cnt, .new = gp100_sor_new }, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/gp102.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/gp102.c index 3468ddec1270..a3779c5046ea 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/gp102.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/gp102.c @@ -63,7 +63,7 @@ gp102_disp = { .super = gf119_disp_super, .root = &gp102_disp_root_oclass, .head = { .cnt = gf119_head_cnt, .new = gf119_head_new }, - .sor = { .cnt = gf119_sor_cnt, .new = gm200_sor_new }, + .sor = { .cnt = gf119_sor_cnt, .new = gp100_sor_new }, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h b/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h index c1d7a36e4d3c..1a200a9ba4e4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h @@ -201,6 +201,7 @@ int gf119_sor_new(struct nvkm_disp *, int); int gk104_sor_new(struct nvkm_disp *, int); int gm107_sor_new(struct nvkm_disp *, int); int gm200_sor_new(struct nvkm_disp *, int); +int gp100_sor_new(struct nvkm_disp *, int); int gv100_sor_cnt(struct nvkm_disp *, unsigned long *); int gv100_sor_new(struct nvkm_disp *, int); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgp100.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgp100.c new file mode 100644 index 000000000000..94feb91b16d5 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgp100.c @@ -0,0 +1,59 @@ +/* + * Copyright 2020 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "ior.h" + +static const struct nvkm_ior_func +gp100_sor = { + .route = { + .get = gm200_sor_route_get, + .set = gm200_sor_route_set, + }, + .state = gf119_sor_state, + .power = nv50_sor_power, + .clock = gf119_sor_clock, + .hdmi = { + .ctrl = gk104_hdmi_ctrl, + .scdc = gm200_hdmi_scdc, + }, + .dp = { + .lanes = { 0, 1, 2, 3 }, + .links = gf119_sor_dp_links, + .power = g94_sor_dp_power, + .pattern = gm107_sor_dp_pattern, + .drive = gm200_sor_dp_drive, + .vcpi = gf119_sor_dp_vcpi, + .audio = gf119_sor_dp_audio, + .audio_sym = gf119_sor_dp_audio_sym, + .watermark = gf119_sor_dp_watermark, + }, + .hda = { + .hpd = gf119_hda_hpd, + .eld = gf119_hda_eld, + .device_entry = gf119_hda_device_entry, + }, +}; + +int +gp100_sor_new(struct nvkm_disp *disp, int id) +{ + return nvkm_ior_new_(&gp100_sor, disp, SOR, id); +} From 9b5ca547bb8fc26c251d457ac55cc8e3d9baa6bf Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 3 Jun 2020 12:51:03 +1000 Subject: [PATCH 351/574] drm/nouveau/disp/gm200-: detect and potentially disable HDA support on some SORs Some HDA pin widgets may be disabled by BIOS, and unavailable from a SOR. Our SOR allocation policy uses this information to allocate an appropriate SOR when HDA is supported by a display. Thank you to NVIDIA for providing the information to determine this. Signed-off-by: Ben Skeggs --- .../drm/nouveau/nvkm/engine/disp/sorgm200.c | 36 ++++++++++++++++++- .../drm/nouveau/nvkm/engine/disp/sorgp100.c | 36 ++++++++++++++++++- .../drm/nouveau/nvkm/engine/disp/sorgv100.c | 35 +++++++++++++++++- .../drm/nouveau/nvkm/engine/disp/sortu102.c | 32 ++++++++++++++++- 4 files changed, 135 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgm200.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgm200.c index cf2075db742a..4dd7f382968e 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgm200.c @@ -89,7 +89,7 @@ gm200_sor_route_get(struct nvkm_outp *outp, int *link) } static const struct nvkm_ior_func -gm200_sor = { +gm200_sor_hda = { .route = { .get = gm200_sor_route_get, .set = gm200_sor_route_set, @@ -119,8 +119,42 @@ gm200_sor = { }, }; +static const struct nvkm_ior_func +gm200_sor = { + .route = { + .get = gm200_sor_route_get, + .set = gm200_sor_route_set, + }, + .state = gf119_sor_state, + .power = nv50_sor_power, + .clock = gf119_sor_clock, + .hdmi = { + .ctrl = gk104_hdmi_ctrl, + .scdc = gm200_hdmi_scdc, + }, + .dp = { + .lanes = { 0, 1, 2, 3 }, + .links = gf119_sor_dp_links, + .power = g94_sor_dp_power, + .pattern = gm107_sor_dp_pattern, + .drive = gm200_sor_dp_drive, + .vcpi = gf119_sor_dp_vcpi, + .audio = gf119_sor_dp_audio, + .audio_sym = gf119_sor_dp_audio_sym, + .watermark = gf119_sor_dp_watermark, + }, +}; + int gm200_sor_new(struct nvkm_disp *disp, int id) { + struct nvkm_device *device = disp->engine.subdev.device; + u32 hda; + + if (!((hda = nvkm_rd32(device, 0x08a15c)) & 0x40000000)) + hda = nvkm_rd32(device, 0x101034); + + if (hda & BIT(id)) + return nvkm_ior_new_(&gm200_sor_hda, disp, SOR, id); return nvkm_ior_new_(&gm200_sor, disp, SOR, id); } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgp100.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgp100.c index 94feb91b16d5..c54f88317a07 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgp100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgp100.c @@ -22,7 +22,7 @@ #include "ior.h" static const struct nvkm_ior_func -gp100_sor = { +gp100_sor_hda = { .route = { .get = gm200_sor_route_get, .set = gm200_sor_route_set, @@ -52,8 +52,42 @@ gp100_sor = { }, }; +static const struct nvkm_ior_func +gp100_sor = { + .route = { + .get = gm200_sor_route_get, + .set = gm200_sor_route_set, + }, + .state = gf119_sor_state, + .power = nv50_sor_power, + .clock = gf119_sor_clock, + .hdmi = { + .ctrl = gk104_hdmi_ctrl, + .scdc = gm200_hdmi_scdc, + }, + .dp = { + .lanes = { 0, 1, 2, 3 }, + .links = gf119_sor_dp_links, + .power = g94_sor_dp_power, + .pattern = gm107_sor_dp_pattern, + .drive = gm200_sor_dp_drive, + .vcpi = gf119_sor_dp_vcpi, + .audio = gf119_sor_dp_audio, + .audio_sym = gf119_sor_dp_audio_sym, + .watermark = gf119_sor_dp_watermark, + }, +}; + int gp100_sor_new(struct nvkm_disp *disp, int id) { + struct nvkm_device *device = disp->engine.subdev.device; + u32 hda; + + if (!((hda = nvkm_rd32(device, 0x08a15c)) & 0x40000000)) + hda = nvkm_rd32(device, 0x10ebb0) >> 8; + + if (hda & BIT(id)) + return nvkm_ior_new_(&gp100_sor_hda, disp, SOR, id); return nvkm_ior_new_(&gp100_sor, disp, SOR, id); } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgv100.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgv100.c index d11a0dff10c6..4441187e8ec9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorgv100.c @@ -78,7 +78,7 @@ gv100_sor_state(struct nvkm_ior *sor, struct nvkm_ior_state *state) } static const struct nvkm_ior_func -gv100_sor = { +gv100_sor_hda = { .route = { .get = gm200_sor_route_get, .set = gm200_sor_route_set, @@ -107,9 +107,42 @@ gv100_sor = { }, }; +static const struct nvkm_ior_func +gv100_sor = { + .route = { + .get = gm200_sor_route_get, + .set = gm200_sor_route_set, + }, + .state = gv100_sor_state, + .power = nv50_sor_power, + .clock = gf119_sor_clock, + .hdmi = { + .ctrl = gv100_hdmi_ctrl, + .scdc = gm200_hdmi_scdc, + }, + .dp = { + .lanes = { 0, 1, 2, 3 }, + .links = gf119_sor_dp_links, + .power = g94_sor_dp_power, + .pattern = gm107_sor_dp_pattern, + .drive = gm200_sor_dp_drive, + .audio = gv100_sor_dp_audio, + .audio_sym = gv100_sor_dp_audio_sym, + .watermark = gv100_sor_dp_watermark, + }, +}; + int gv100_sor_new(struct nvkm_disp *disp, int id) { + struct nvkm_device *device = disp->engine.subdev.device; + u32 hda; + + if (!((hda = nvkm_rd32(device, 0x08a15c)) & 0x40000000)) + hda = nvkm_rd32(device, 0x118fb0) >> 8; + + if (hda & BIT(id)) + return nvkm_ior_new_(&gv100_sor_hda, disp, SOR, id); return nvkm_ior_new_(&gv100_sor, disp, SOR, id); } diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sortu102.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sortu102.c index fa6d74251237..59865a934c4b 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sortu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sortu102.c @@ -62,7 +62,7 @@ tu102_sor_dp_links(struct nvkm_ior *sor, struct nvkm_i2c_aux *aux) } static const struct nvkm_ior_func -tu102_sor = { +tu102_sor_hda = { .route = { .get = gm200_sor_route_get, .set = gm200_sor_route_set, @@ -92,8 +92,38 @@ tu102_sor = { }, }; +static const struct nvkm_ior_func +tu102_sor = { + .route = { + .get = gm200_sor_route_get, + .set = gm200_sor_route_set, + }, + .state = gv100_sor_state, + .power = nv50_sor_power, + .clock = gf119_sor_clock, + .hdmi = { + .ctrl = gv100_hdmi_ctrl, + .scdc = gm200_hdmi_scdc, + }, + .dp = { + .lanes = { 0, 1, 2, 3 }, + .links = tu102_sor_dp_links, + .power = g94_sor_dp_power, + .pattern = gm107_sor_dp_pattern, + .drive = gm200_sor_dp_drive, + .vcpi = tu102_sor_dp_vcpi, + .audio = gv100_sor_dp_audio, + .audio_sym = gv100_sor_dp_audio_sym, + .watermark = gv100_sor_dp_watermark, + }, +}; + int tu102_sor_new(struct nvkm_disp *disp, int id) { + struct nvkm_device *device = disp->engine.subdev.device; + u32 hda = nvkm_rd32(device, 0x08a15c); + if (hda & BIT(id)) + return nvkm_ior_new_(&tu102_sor_hda, disp, SOR, id); return nvkm_ior_new_(&tu102_sor, disp, SOR, id); } From 21454fe697fde188ad6fb541f94b9838fa73ab38 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 3 Jun 2020 16:20:02 +0200 Subject: [PATCH 352/574] drm/nouveau: gr/gk20a: Use firmware version 0 Tegra firmware doesn't actually use any version numbers and passing -1 causes the existing firmware binaries not to be found. Use version 0 to find the correct files. Fixes: ef16dc278ec2 ("drm/nouveau/gr/gf100-: select implementation based on available FW") Signed-off-by: Thierry Reding Reviewed-by: Emil Velikov Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c index ec330d791d15..e56880f3e3bd 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c @@ -352,7 +352,7 @@ gk20a_gr_load(struct gf100_gr *gr, int ver, const struct gf100_gr_fwif *fwif) static const struct gf100_gr_fwif gk20a_gr_fwif[] = { - { -1, gk20a_gr_load, &gk20a_gr }, + { 0, gk20a_gr_load, &gk20a_gr }, {} }; From dd67cab5db7e940dad66653a04d780d53bd380d5 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Thu, 4 Jun 2020 11:00:01 +1000 Subject: [PATCH 353/574] drm/nouveau/kms/nv50-: clear SW state of disabled windows harder The most innocuous result of not having done this is that we end up sending unnecessary methods when we next enable the window. However, interactions with the code handling skipping disables when an update immediately follows, and window ownership assignment, can lead to upsetting the display hardware on Volta and newer. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/dispnv50/wndw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c index e25ead56052c..99b9b681736d 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c @@ -192,6 +192,8 @@ nv50_wndw_atomic_check_release(struct nv50_wndw *wndw, wndw->func->release(wndw, asyw, asyh); asyw->ntfy.handle = 0; asyw->sema.handle = 0; + asyw->xlut.handle = 0; + memset(asyw->image.handle, 0x00, sizeof(asyw->image.handle)); } static int @@ -519,7 +521,8 @@ nv50_wndw_prepare_fb(struct drm_plane *plane, struct drm_plane_state *state) return PTR_ERR(ctxdma); } - asyw->image.handle[0] = ctxdma->object.handle; + if (asyw->visible) + asyw->image.handle[0] = ctxdma->object.handle; } asyw->state.fence = dma_resv_get_excl_rcu(nvbo->bo.base.resv); From 431275afdc7155415254aef4bd3816a1b8a2ead0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 4 Jun 2020 11:19:44 +0200 Subject: [PATCH 354/574] iommu: Check for deferred attach in iommu_group_do_dma_attach() The iommu_group_do_dma_attach() must not attach devices which have deferred_attach set. Otherwise devices could cause IOMMU faults when re-initialized in a kdump kernel. Fixes: deac0b3bed26 ("iommu: Split off default domain allocation from group assignment") Reported-by: Jerry Snitselaar Signed-off-by: Joerg Roedel Tested-by: Jerry Snitselaar Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20200604091944.26402-1-joro@8bytes.org --- drivers/iommu/iommu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b5ea203f6c68..d43120eb1dc5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1680,8 +1680,12 @@ static void probe_alloc_default_domain(struct bus_type *bus, static int iommu_group_do_dma_attach(struct device *dev, void *data) { struct iommu_domain *domain = data; + int ret = 0; - return __iommu_attach_device(domain, dev); + if (!iommu_is_attach_deferred(domain, dev)) + ret = __iommu_attach_device(domain, dev); + + return ret; } static int __iommu_group_dma_attach(struct iommu_group *group) From bfa50e1427e4608ce6941d3d0855445fcaa7dbb7 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 4 Jun 2020 13:20:45 +0200 Subject: [PATCH 355/574] vfio-ccw: make vfio_ccw_regops variables declarations static Fixes the following sparse warnings: drivers/s390/cio/vfio_ccw_chp.c:62:30: warning: symbol 'vfio_ccw_schib_region_ops' was not declared. Should it be static? drivers/s390/cio/vfio_ccw_chp.c:117:30: warning: symbol 'vfio_ccw_crw_region_ops' was not declared. Should it be static? Link: https://lkml.kernel.org/r/patch.git-a34be7aede18.your-ad-here.call-01591269421-ext-5655@work.hours Reviewed-by: Cornelia Huck Signed-off-by: Vasily Gorbik --- drivers/s390/cio/vfio_ccw_chp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c index 876f6ade51cc..a646fc81c872 100644 --- a/drivers/s390/cio/vfio_ccw_chp.c +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -59,7 +59,7 @@ static void vfio_ccw_schib_region_release(struct vfio_ccw_private *private, } -const struct vfio_ccw_regops vfio_ccw_schib_region_ops = { +static const struct vfio_ccw_regops vfio_ccw_schib_region_ops = { .read = vfio_ccw_schib_region_read, .write = vfio_ccw_schib_region_write, .release = vfio_ccw_schib_region_release, @@ -131,7 +131,7 @@ static void vfio_ccw_crw_region_release(struct vfio_ccw_private *private, } -const struct vfio_ccw_regops vfio_ccw_crw_region_ops = { +static const struct vfio_ccw_regops vfio_ccw_crw_region_ops = { .read = vfio_ccw_crw_region_read, .write = vfio_ccw_crw_region_write, .release = vfio_ccw_crw_region_release, From ef1c75593e770aff8749e902aa0deb6855a3f485 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Wed, 3 Jun 2020 09:58:13 +0530 Subject: [PATCH 356/574] crypto/chcr: error seen if CONFIG_CHELSIO_TLS_DEVICE isn't set cxgb4_uld_in_use() is used only by cxgb4_ktls_det_feature() which is under CONFIG_CHELSIO_TLS_DEVICE macro. Fixes: a3ac249a1ab5 ("cxgb4/chcr: Enable ktls settings at run time") Signed-off-by: Rohit Maheshwari Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c index 0307e9c69a47..08439e215efe 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -663,6 +663,7 @@ static int uld_attach(struct adapter *adap, unsigned int uld) return 0; } +#ifdef CONFIG_CHELSIO_TLS_DEVICE static bool cxgb4_uld_in_use(struct adapter *adap) { const struct tid_info *t = &adap->tids; @@ -670,7 +671,6 @@ static bool cxgb4_uld_in_use(struct adapter *adap) return (atomic_read(&t->conns_in_use) || t->stids_in_use); } -#ifdef CONFIG_CHELSIO_TLS_DEVICE /* cxgb4_set_ktls_feature: request FW to enable/disable ktls settings. * @adap: adapter info * @enable: 1 to enable / 0 to disable ktls settings. From c36f05559104b66bcd7f617e931e38c680227b74 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 2 Jun 2020 21:49:10 -0700 Subject: [PATCH 357/574] genetlink: fix memory leaks in genl_family_rcv_msg_dumpit() There are two kinds of memory leaks in genl_family_rcv_msg_dumpit(): 1. Before we call ops->start(), whenever an error happens, we forget to free the memory allocated in genl_family_rcv_msg_dumpit(). 2. When ops->start() fails, the 'info' has been already installed on the per socket control block, so we should not free it here. More importantly, nlk->cb_running is still false at this point, so netlink_sock_destruct() cannot free it either. The first kind of memory leaks is easier to resolve, but the second one requires some deeper thoughts. After reviewing how netfilter handles this, the most elegant solution I find is just to use a similar way to allocate the memory, that is, moving memory allocations from caller into ops->start(). With this, we can solve both kinds of memory leaks: for 1), no memory allocation happens before ops->start(); for 2), ops->start() handles its own failures and 'info' is installed to the socket control block only when success. The only ugliness here is we have to pass all local variables on stack via a struct, but this is not hard to understand. Alternatively, we can introduce a ops->free() to solve this too, but it is overkill as only genetlink has this problem so far. Fixes: 1927f41a22a0 ("net: genetlink: introduce dump info struct to be available during dumpit op") Reported-by: syzbot+21f04f481f449c8db840@syzkaller.appspotmail.com Cc: "Jason A. Donenfeld" Cc: Florian Westphal Cc: Pablo Neira Ayuso Cc: Jiri Pirko Cc: YueHaibing Cc: Shaochun Chen Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 94 +++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 36 deletions(-) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2f049692e012..6c19b91bbb86 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -513,15 +513,58 @@ static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, kfree(attrbuf); } -static int genl_lock_start(struct netlink_callback *cb) +struct genl_start_context { + const struct genl_family *family; + struct nlmsghdr *nlh; + struct netlink_ext_ack *extack; + const struct genl_ops *ops; + int hdrlen; +}; + +static int genl_start(struct netlink_callback *cb) { - const struct genl_ops *ops = genl_dumpit_info(cb)->ops; + struct genl_start_context *ctx = cb->data; + const struct genl_ops *ops = ctx->ops; + struct genl_dumpit_info *info; + struct nlattr **attrs = NULL; int rc = 0; + if (ops->validate & GENL_DONT_VALIDATE_DUMP) + goto no_attrs; + + if (ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen)) + return -EINVAL; + + attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack, + ops, ctx->hdrlen, + GENL_DONT_VALIDATE_DUMP_STRICT, + true); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + +no_attrs: + info = genl_dumpit_info_alloc(); + if (!info) { + kfree(attrs); + return -ENOMEM; + } + info->family = ctx->family; + info->ops = ops; + info->attrs = attrs; + + cb->data = info; if (ops->start) { - genl_lock(); + if (!ctx->family->parallel_ops) + genl_lock(); rc = ops->start(cb); - genl_unlock(); + if (!ctx->family->parallel_ops) + genl_unlock(); + } + + if (rc) { + kfree(attrs); + genl_dumpit_info_free(info); + cb->data = NULL; } return rc; } @@ -548,7 +591,7 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } - genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_family_rcv_msg_attrs_free(info->family, info->attrs, false); genl_dumpit_info_free(info); return rc; } @@ -573,43 +616,23 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, const struct genl_ops *ops, int hdrlen, struct net *net) { - struct genl_dumpit_info *info; - struct nlattr **attrs = NULL; + struct genl_start_context ctx; int err; if (!ops->dumpit) return -EOPNOTSUPP; - if (ops->validate & GENL_DONT_VALIDATE_DUMP) - goto no_attrs; - - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) - return -EINVAL; - - attrs = genl_family_rcv_msg_attrs_parse(family, nlh, extack, - ops, hdrlen, - GENL_DONT_VALIDATE_DUMP_STRICT, - true); - if (IS_ERR(attrs)) - return PTR_ERR(attrs); - -no_attrs: - /* Allocate dumpit info. It is going to be freed by done() callback. */ - info = genl_dumpit_info_alloc(); - if (!info) { - genl_family_rcv_msg_attrs_free(family, attrs, true); - return -ENOMEM; - } - - info->family = family; - info->ops = ops; - info->attrs = attrs; + ctx.family = family; + ctx.nlh = nlh; + ctx.extack = extack; + ctx.ops = ops; + ctx.hdrlen = hdrlen; if (!family->parallel_ops) { struct netlink_dump_control c = { .module = family->module, - .data = info, - .start = genl_lock_start, + .data = &ctx, + .start = genl_start, .dump = genl_lock_dumpit, .done = genl_lock_done, }; @@ -617,12 +640,11 @@ no_attrs: genl_unlock(); err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); genl_lock(); - } else { struct netlink_dump_control c = { .module = family->module, - .data = info, - .start = ops->start, + .data = &ctx, + .start = genl_start, .dump = ops->dumpit, .done = genl_parallel_done, }; From 5e9eeccc58f3e6bcc99b929670665d2ce047e9c9 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Wed, 3 Jun 2020 12:06:01 +0700 Subject: [PATCH 358/574] tipc: fix NULL pointer dereference in streaming syzbot found the following crash: general protection fault, probably for non-canonical address 0xdffffc0000000019: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x00000000000000c8-0x00000000000000cf] CPU: 1 PID: 7060 Comm: syz-executor394 Not tainted 5.7.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__tipc_sendstream+0xbde/0x11f0 net/tipc/socket.c:1591 Code: 00 00 00 00 48 39 5c 24 28 48 0f 44 d8 e8 fa 3e db f9 48 b8 00 00 00 00 00 fc ff df 48 8d bb c8 00 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 e2 04 00 00 48 8b 9b c8 00 00 00 48 b8 00 00 00 RSP: 0018:ffffc90003ef7818 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8797fd9d RDX: 0000000000000019 RSI: ffffffff8797fde6 RDI: 00000000000000c8 RBP: ffff888099848040 R08: ffff88809a5f6440 R09: fffffbfff1860b4c R10: ffffffff8c305a5f R11: fffffbfff1860b4b R12: ffff88809984857e R13: 0000000000000000 R14: ffff888086aa4000 R15: 0000000000000000 FS: 00000000009b4880(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000140 CR3: 00000000a7fdf000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tipc_sendstream+0x4c/0x70 net/tipc/socket.c:1533 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x32f/0x810 net/socket.c:2352 ___sys_sendmsg+0x100/0x170 net/socket.c:2406 __sys_sendmmsg+0x195/0x480 net/socket.c:2496 __do_sys_sendmmsg net/socket.c:2525 [inline] __se_sys_sendmmsg net/socket.c:2522 [inline] __x64_sys_sendmmsg+0x99/0x100 net/socket.c:2522 do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 entry_SYSCALL_64_after_hwframe+0x49/0xb3 RIP: 0033:0x440199 ... This bug was bisected to commit 0a3e060f340d ("tipc: add test for Nagle algorithm effectiveness"). However, it is not the case, the trouble was from the base in the case of zero data length message sending, we would unexpectedly make an empty 'txq' queue after the 'tipc_msg_append()' in Nagle mode. A similar crash can be generated even without the bisected patch but at the link layer when it accesses the empty queue. We solve the issues by building at least one buffer to go with socket's header and an optional data section that may be empty like what we had with the 'tipc_msg_build()'. Note: the previous commit 4c21daae3dbc ("tipc: Fix NULL pointer dereference in __tipc_sendstream()") is obsoleted by this one since the 'txq' will be never empty and the check of 'skb != NULL' is unnecessary but it is safe anyway. Reported-by: syzbot+8eac6d030e7807c21d32@syzkaller.appspotmail.com Fixes: c0bceb97db9e ("tipc: add smart nagle feature") Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/msg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index c0afcd627c5e..046e4cb3acea 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -221,7 +221,7 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, accounted = skb ? msg_blocks(buf_msg(skb)) : 0; total = accounted; - while (rem) { + do { if (!skb || skb->len >= mss) { skb = tipc_buf_acquire(mss, GFP_KERNEL); if (unlikely(!skb)) @@ -245,7 +245,7 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, skb_put(skb, cpy); rem -= cpy; total += msg_blocks(hdr) - curr; - } + } while (rem); return total - accounted; } From bb986a50421a11bf31a81afb15b9b8f45a4a3a11 Mon Sep 17 00:00:00 2001 From: Ahmed Abdelsalam Date: Wed, 3 Jun 2020 06:54:42 +0000 Subject: [PATCH 359/574] seg6: fix seg6_validate_srh() to avoid slab-out-of-bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The seg6_validate_srh() is used to validate SRH for three cases: case1: SRH of data-plane SRv6 packets to be processed by the Linux kernel. Case2: SRH of the netlink message received from user-space (iproute2) Case3: SRH injected into packets through setsockopt In case1, the SRH can be encoded in the Reduced way (i.e., first SID is carried in DA only and not represented as SID in the SRH) and the seg6_validate_srh() now handles this case correctly. In case2 and case3, the SRH shouldn’t be encoded in the Reduced way otherwise we lose the first segment (i.e., the first hop). The current implementation of the seg6_validate_srh() allow SRH of case2 and case3 to be encoded in the Reduced way. This leads a slab-out-of-bounds problem. This patch verifies SRH of case1, case2 and case3. Allowing case1 to be reduced while preventing SRH of case2 and case3 from being reduced . Reported-by: syzbot+e8c028b62439eac42073@syzkaller.appspotmail.com Reported-by: YueHaibing Fixes: 0cb7498f234e ("seg6: fix SRH processing to comply with RFC8754") Signed-off-by: Ahmed Abdelsalam Signed-off-by: David S. Miller --- include/net/seg6.h | 2 +- net/core/filter.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/seg6.c | 16 ++++++++++------ net/ipv6/seg6_iptunnel.c | 2 +- net/ipv6/seg6_local.c | 6 +++--- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/include/net/seg6.h b/include/net/seg6.h index 640724b35273..9d19c15e8545 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -57,7 +57,7 @@ extern void seg6_iptunnel_exit(void); extern int seg6_local_init(void); extern void seg6_local_exit(void); -extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); +extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); diff --git a/net/core/filter.c b/net/core/filter.c index d01a244b5087..209482a4eaa2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5050,7 +5050,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len int err; struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; - if (!seg6_validate_srh(srh, len)) + if (!seg6_validate_srh(srh, len, false)) return -EINVAL; switch (type) { diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 2c843ff5e3a9..20576e87a5f7 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -493,7 +493,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *) opt->srcrt; - if (!seg6_validate_srh(srh, optlen)) + if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 37b434293bda..d2f8138e5a73 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -25,7 +25,7 @@ #include #endif -bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) +bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced) { unsigned int tlv_offset; int max_last_entry; @@ -37,13 +37,17 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) if (((srh->hdrlen + 1) << 3) != len) return false; - max_last_entry = (srh->hdrlen / 2) - 1; - - if (srh->first_segment > max_last_entry) + if (!reduced && srh->segments_left > srh->first_segment) { return false; + } else { + max_last_entry = (srh->hdrlen / 2) - 1; - if (srh->segments_left > srh->first_segment + 1) - return false; + if (srh->first_segment > max_last_entry) + return false; + + if (srh->segments_left > srh->first_segment + 1) + return false; + } tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index c7cbfeae94f5..e0e9f48ab14f 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -426,7 +426,7 @@ static int seg6_build_state(struct net *net, struct nlattr *nla, } /* verify that SRH is consistent */ - if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo))) + if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false)) return -EINVAL; newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt)); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 52493423f329..eba23279912d 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -87,7 +87,7 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) */ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - if (!seg6_validate_srh(srh, len)) + if (!seg6_validate_srh(srh, len, true)) return NULL; return srh; @@ -495,7 +495,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb) return false; srh->hdrlen = (u8)(srh_state->hdrlen >> 3); - if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)) + if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3, true)) return false; srh_state->valid = true; @@ -670,7 +670,7 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) if (len < sizeof(*srh) + sizeof(struct in6_addr)) return -EINVAL; - if (!seg6_validate_srh(srh, len)) + if (!seg6_validate_srh(srh, len, false)) return -EINVAL; slwt->srh = kmemdup(srh, len, GFP_KERNEL); From cd07ecccba13b8bd5023ffe7be57363d07e3105f Mon Sep 17 00:00:00 2001 From: Sameeh Jubran Date: Wed, 3 Jun 2020 08:50:22 +0000 Subject: [PATCH 360/574] net: ena: xdp: XDP_TX: fix memory leak When sending very high packet rate, the XDP tx queues can get full and start dropping packets. In this case we don't free the pages which results in ena driver draining the system memory. Fix: Simply free the pages when necessary. Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action") Signed-off-by: Sameeh Jubran Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index a0af74c93971..e101fc934f83 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -355,7 +355,7 @@ error_unmap_dma: ena_unmap_tx_buff(xdp_ring, tx_info); tx_info->xdpf = NULL; error_drop_packet: - + __free_page(tx_info->xdp_rx_page); return NETDEV_TX_OK; } From 3921a81c31df6057183aeb7f7d204003bf699d6f Mon Sep 17 00:00:00 2001 From: Sameeh Jubran Date: Wed, 3 Jun 2020 08:50:23 +0000 Subject: [PATCH 361/574] net: ena: xdp: update napi budget for DROP and ABORTED This patch fixes two issues with XDP: 1. If the XDP verdict is XDP_ABORTED we break the loop, which results in us handling one buffer per napi cycle instead of the total budget (usually 64). To overcome this simply change the xdp_verdict check to != XDP_PASS. When the verdict is XDP_PASS, the skb is not expected to be NULL. 2. Update the residual budget for XDP_DROP and XDP_ABORTED, since packets are handled in these cases. Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action") Signed-off-by: Sameeh Jubran Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index e101fc934f83..dda4b8fc9525 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1646,11 +1646,9 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, &next_to_clean); if (unlikely(!skb)) { - if (xdp_verdict == XDP_TX) { + if (xdp_verdict == XDP_TX) ena_free_rx_page(rx_ring, &rx_ring->rx_buffer_info[rx_ring->ena_bufs[0].req_id]); - res_budget--; - } for (i = 0; i < ena_rx_ctx.descs; i++) { rx_ring->free_ids[next_to_clean] = rx_ring->ena_bufs[i].req_id; @@ -1658,8 +1656,10 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi, ENA_RX_RING_IDX_NEXT(next_to_clean, rx_ring->ring_size); } - if (xdp_verdict == XDP_TX || xdp_verdict == XDP_DROP) + if (xdp_verdict != XDP_PASS) { + res_budget--; continue; + } break; } From 9d149045b3c0e44c049cdbce8a64e19415290017 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Wed, 3 Jun 2020 11:12:14 +0200 Subject: [PATCH 362/574] geneve: change from tx_error to tx_dropped on missing metadata If the geneve interface is in collect_md (external) mode, it can't send any packets submitted directly to its net interface, as such packets won't have metadata attached. This is expected. However, the kernel itself sends some packets to the interface, most notably, IPv6 DAD, IPv6 multicast listener reports, etc. This is not wrong, as tunnel metadata can be specified in routing table (although technically, that has never worked for IPv6, but hopefully will be fixed eventually) and then the interface must correctly participate in IPv6 housekeeping. The problem is that any such attempt increases the tx_error counter. Just bringing up a geneve interface with IPv6 enabled is enough to see a number of tx_errors. That causes confusion among users, prompting them to find a network error where there is none. Change the counter used to tx_dropped. That better conveys the meaning (there's nothing wrong going on, just some packets are getting dropped) and hopefully will make admins panic less. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/geneve.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6b461be1820b..75266580b586 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -987,9 +987,10 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) if (geneve->collect_md) { info = skb_tunnel_info(skb); if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { - err = -EINVAL; netdev_dbg(dev, "no tunnel metadata\n"); - goto tx_error; + dev_kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; } } else { info = &geneve->info; @@ -1006,7 +1007,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) if (likely(!err)) return NETDEV_TX_OK; -tx_error: + dev_kfree_skb(skb); if (err == -ELOOP) From 67122a7910bf2135dc7f7ececfcf16a5bdb362c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Vok=C3=A1=C4=8D?= Date: Wed, 3 Jun 2020 13:31:39 +0200 Subject: [PATCH 363/574] net: dsa: qca8k: Fix "Unexpected gfp" kernel exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7e99e3470172 ("net: dsa: remove dsa_switch_alloc helper") replaced the dsa_switch_alloc helper by devm_kzalloc in all DSA drivers. Unfortunately it introduced a typo in qca8k.c driver and wrong argument is passed to the devm_kzalloc function. This fix mitigates the following kernel exception: Unexpected gfp: 0x6 (__GFP_HIGHMEM|GFP_DMA32). Fixing up to gfp: 0x101 (GFP_DMA|__GFP_ZERO). Fix your code! CPU: 1 PID: 44 Comm: kworker/1:1 Not tainted 5.5.9-yocto-ua #1 Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) Workqueue: events deferred_probe_work_func [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x90/0xa4) [] (dump_stack) from [] (new_slab+0x20c/0x214) [] (new_slab) from [] (___slab_alloc.constprop.0+0x1b8/0x540) [] (___slab_alloc.constprop.0) from [] (__slab_alloc.constprop.0+0x1c/0x24) [] (__slab_alloc.constprop.0) from [] (__kmalloc_track_caller+0x1b0/0x298) [] (__kmalloc_track_caller) from [] (devm_kmalloc+0x24/0x70) [] (devm_kmalloc) from [] (qca8k_sw_probe+0x94/0x1ac) [] (qca8k_sw_probe) from [] (mdio_probe+0x30/0x54) [] (mdio_probe) from [] (really_probe+0x1e0/0x348) [] (really_probe) from [] (driver_probe_device+0x60/0x16c) [] (driver_probe_device) from [] (bus_for_each_drv+0x70/0x94) [] (bus_for_each_drv) from [] (__device_attach+0xb4/0x11c) [] (__device_attach) from [] (bus_probe_device+0x84/0x8c) [] (bus_probe_device) from [] (deferred_probe_work_func+0x64/0x90) [] (deferred_probe_work_func) from [] (process_one_work+0x1d4/0x41c) [] (process_one_work) from [] (worker_thread+0x248/0x528) [] (worker_thread) from [] (kthread+0x124/0x150) [] (kthread) from [] (ret_from_fork+0x14/0x3c) Exception stack(0xee1b5fb0 to 0xee1b5ff8) 5fa0: 00000000 00000000 00000000 00000000 5fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 5fe0: 00000000 00000000 00000000 00000000 00000013 00000000 qca8k 2188000.ethernet-1:0a: Using legacy PHYLIB callbacks. Please migrate to PHYLINK! qca8k 2188000.ethernet-1:0a eth2 (uninitialized): PHY [2188000.ethernet-1:01] driver [Generic PHY] qca8k 2188000.ethernet-1:0a eth1 (uninitialized): PHY [2188000.ethernet-1:02] driver [Generic PHY] Fixes: 7e99e3470172 ("net: dsa: remove dsa_switch_alloc helper") Signed-off-by: Michal Vokáč Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/qca8k.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 9f4205b4439b..d2b5ab403e06 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -1079,8 +1079,7 @@ qca8k_sw_probe(struct mdio_device *mdiodev) if (id != QCA8K_ID_QCA8337) return -ENODEV; - priv->ds = devm_kzalloc(&mdiodev->dev, sizeof(*priv->ds), - QCA8K_NUM_PORTS); + priv->ds = devm_kzalloc(&mdiodev->dev, sizeof(*priv->ds), GFP_KERNEL); if (!priv->ds) return -ENOMEM; From 11d6011c2cf29f7c8181ebde6c8bc0c4d83adcd7 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Wed, 3 Jun 2020 16:49:44 +0200 Subject: [PATCH 364/574] net: core: device_rename: Use rwsem instead of a seqcount Sequence counters write paths are critical sections that must never be preempted, and blocking, even for CONFIG_PREEMPTION=n, is not allowed. Commit 5dbe7c178d3f ("net: fix kernel deadlock with interface rename and netdev name retrieval.") handled a deadlock, observed with CONFIG_PREEMPTION=n, where the devnet_rename seqcount read side was infinitely spinning: it got scheduled after the seqcount write side blocked inside its own critical section. To fix that deadlock, among other issues, the commit added a cond_resched() inside the read side section. While this will get the non-preemptible kernel eventually unstuck, the seqcount reader is fully exhausting its slice just spinning -- until TIF_NEED_RESCHED is set. The fix is also still broken: if the seqcount reader belongs to a real-time scheduling policy, it can spin forever and the kernel will livelock. Disabling preemption over the seqcount write side critical section will not work: inside it are a number of GFP_KERNEL allocations and mutex locking through the drivers/base/ :: device_rename() call chain. >From all the above, replace the seqcount with a rwsem. Fixes: 5dbe7c178d3f (net: fix kernel deadlock with interface rename and netdev name retrieval.) Fixes: 30e6c9fa93cf (net: devnet_rename_seq should be a seqcount) Fixes: c91f6df2db49 (sockopt: Change getsockopt() of SO_BINDTODEVICE to return an interface name) Cc: Reported-by: kbuild test robot [ v1 missing up_read() on error exit ] Reported-by: Dan Carpenter [ v1 missing up_read() on error exit ] Signed-off-by: Ahmed S. Darwish Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- net/core/dev.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 10684833f864..061496a1f640 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -194,7 +195,7 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); -static seqcount_t devnet_rename_seq; +static DECLARE_RWSEM(devnet_rename_sem); static inline void dev_base_seq_inc(struct net *net) { @@ -998,33 +999,28 @@ EXPORT_SYMBOL(dev_get_by_napi_id); * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. - * - * The use of raw_seqcount_begin() and cond_resched() before - * retrying is required as we want to give the writers a chance - * to complete when CONFIG_PREEMPTION is not set. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; - unsigned int seq; + int ret; -retry: - seq = raw_seqcount_begin(&devnet_rename_seq); + down_read(&devnet_rename_sem); rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { - rcu_read_unlock(); - return -ENODEV; + ret = -ENODEV; + goto out; } strcpy(name, dev->name); - rcu_read_unlock(); - if (read_seqcount_retry(&devnet_rename_seq, seq)) { - cond_resched(); - goto retry; - } - return 0; + ret = 0; +out: + rcu_read_unlock(); + up_read(&devnet_rename_sem); + return ret; } /** @@ -1296,10 +1292,10 @@ int dev_change_name(struct net_device *dev, const char *newname) likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) return -EBUSY; - write_seqcount_begin(&devnet_rename_seq); + down_write(&devnet_rename_sem); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return 0; } @@ -1307,7 +1303,7 @@ int dev_change_name(struct net_device *dev, const char *newname) err = dev_get_valid_name(net, dev, newname); if (err < 0) { - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return err; } @@ -1322,11 +1318,11 @@ rollback: if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); dev->name_assign_type = old_assign_type; - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); return ret; } - write_seqcount_end(&devnet_rename_seq); + up_write(&devnet_rename_sem); netdev_adjacent_rename_links(dev, oldname); @@ -1347,7 +1343,7 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - write_seqcount_begin(&devnet_rename_seq); + down_write(&devnet_rename_sem); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); dev->name_assign_type = old_assign_type; From 79cbb6bc3332da7162c2581e151659ab8ebaa528 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Wed, 3 Jun 2020 16:49:45 +0200 Subject: [PATCH 365/574] net: phy: fixed_phy: Remove unused seqcount Commit bf7afb29d545 ("phy: improve safety of fixed-phy MII register reading") protected the fixed PHY status with a sequence counter. Two years later, commit d2b977939b18 ("net: phy: fixed-phy: remove fixed_phy_update_state()") removed the sequence counter's write side critical section -- neutralizing its read side retry loop. Remove the unused seqcount. Signed-off-by: Ahmed S. Darwish Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/fixed_phy.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 4a3d34f40cb9..c4641b1704d6 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -34,7 +33,6 @@ struct fixed_mdio_bus { struct fixed_phy { int addr; struct phy_device *phydev; - seqcount_t seqcount; struct fixed_phy_status status; bool no_carrier; int (*link_update)(struct net_device *, struct fixed_phy_status *); @@ -80,19 +78,17 @@ static int fixed_mdio_read(struct mii_bus *bus, int phy_addr, int reg_num) list_for_each_entry(fp, &fmb->phys, node) { if (fp->addr == phy_addr) { struct fixed_phy_status state; - int s; - do { - s = read_seqcount_begin(&fp->seqcount); - fp->status.link = !fp->no_carrier; - /* Issue callback if user registered it. */ - if (fp->link_update) - fp->link_update(fp->phydev->attached_dev, - &fp->status); - /* Check the GPIO for change in status */ - fixed_phy_update(fp); - state = fp->status; - } while (read_seqcount_retry(&fp->seqcount, s)); + fp->status.link = !fp->no_carrier; + + /* Issue callback if user registered it. */ + if (fp->link_update) + fp->link_update(fp->phydev->attached_dev, + &fp->status); + + /* Check the GPIO for change in status */ + fixed_phy_update(fp); + state = fp->status; return swphy_read_reg(reg_num, &state); } @@ -150,8 +146,6 @@ static int fixed_phy_add_gpiod(unsigned int irq, int phy_addr, if (!fp) return -ENOMEM; - seqcount_init(&fp->seqcount); - if (irq != PHY_POLL) fmb->mii_bus->irq[phy_addr] = irq; From 6501bf87602f799b7e502014f8bc0aa58b868277 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Wed, 3 Jun 2020 16:49:46 +0200 Subject: [PATCH 366/574] u64_stats: Document writer non-preemptibility requirement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The u64_stats mechanism uses sequence counters to protect against 64-bit values tearing on 32-bit architectures. Updating such statistics is a sequence counter write side critical section. Preemption must be disabled before entering this seqcount write critical section. Failing to do so, the seqcount read side can preempt the write side section and spin for the entire scheduler tick. If that reader belongs to a real-time scheduling class, it can spin forever and the kernel will livelock. Document this statistics update side non-preemptibility requirement. Reword the introductory paragraph to highlight u64_stats raison d'être: 64-bit values tearing protection on 32-bit architectures. Divide documentation on a basis of internal design vs. usage constraints. Reword the u64_stats header file top comment to always mention "Reader" or "Writer" at the start of each bullet point, making it easier to follow which side each point is actually for. Clarify the statement "whole thing is a NOOP on 64bit arches or UP kernels". For 32-bit UP kernels, preemption is always disabled for the statistics read side section. Signed-off-by: Ahmed S. Darwish Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/u64_stats_sync.h | 43 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 9de5c10293f5..c6abb79501b3 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -3,33 +3,36 @@ #define _LINUX_U64_STATS_SYNC_H /* - * To properly implement 64bits network statistics on 32bit and 64bit hosts, - * we provide a synchronization point, that is a noop on 64bit or UP kernels. + * Protect against 64-bit values tearing on 32-bit architectures. This is + * typically used for statistics read/update in different subsystems. * * Key points : - * 1) Use a seqcount on SMP 32bits, with low overhead. - * 2) Whole thing is a noop on 64bit arches or UP kernels. - * 3) Write side must ensure mutual exclusion or one seqcount update could + * + * - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP. + * - The whole thing is a no-op on 64-bit architectures. + * + * Usage constraints: + * + * 1) Write side must ensure mutual exclusion, or one seqcount update could * be lost, thus blocking readers forever. - * If this synchronization point is not a mutex, but a spinlock or - * spinlock_bh() or disable_bh() : - * 3.1) Write side should not sleep. - * 3.2) Write side should not allow preemption. - * 3.3) If applicable, interrupts should be disabled. + * + * 2) Write side must disable preemption, or a seqcount reader can preempt the + * writer and also spin forever. + * + * 3) Write side must use the _irqsave() variant if other writers, or a reader, + * can be invoked from an IRQ context. * * 4) If reader fetches several counters, there is no guarantee the whole values - * are consistent (remember point 1) : this is a noop on 64bit arches anyway) + * are consistent w.r.t. each other (remember point #2: seqcounts are not + * used for 64bit architectures). * - * 5) readers are allowed to sleep or be preempted/interrupted : They perform - * pure reads. But if they have to fetch many values, it's better to not allow - * preemptions/interruptions to avoid many retries. + * 5) Readers are allowed to sleep or be preempted/interrupted: they perform + * pure reads. * - * 6) If counter might be written by an interrupt, readers should block interrupts. - * (On UP, there is no seqcount_t protection, a reader allowing interrupts could - * read partial values) - * - * 7) For irq and softirq uses, readers can use u64_stats_fetch_begin_irq() and - * u64_stats_fetch_retry_irq() helpers + * 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats + * might be updated from a hardirq or softirq context (remember point #1: + * seqcounts are not used for UP kernels). 32-bit UP stat readers could read + * corrupted 64-bit values otherwise. * * Usage : * From c7e261d81783387a0502878cd229327e7c54322e Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Wed, 3 Jun 2020 16:49:47 +0200 Subject: [PATCH 367/574] net: mdiobus: Disable preemption upon u64_stats update The u64_stats mechanism uses sequence counters to protect against 64-bit values tearing on 32-bit architectures. Updating u64_stats is thus a sequence counter write side critical section where preemption must be disabled. For mdiobus_stats_acct(), disable preemption upon the u64_stats update. It is called from process context through mdiobus_read() and mdiobus_write(). Reported-by: kernel test robot Signed-off-by: Ahmed S. Darwish Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 255fdfcc13a6..6ceee82b2839 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -764,6 +764,7 @@ EXPORT_SYMBOL(mdiobus_scan); static void mdiobus_stats_acct(struct mdio_bus_stats *stats, bool op, int ret) { + preempt_disable(); u64_stats_update_begin(&stats->syncp); u64_stats_inc(&stats->transfers); @@ -778,6 +779,7 @@ static void mdiobus_stats_acct(struct mdio_bus_stats *stats, bool op, int ret) u64_stats_inc(&stats->writes); out: u64_stats_update_end(&stats->syncp); + preempt_enable(); } /** From f6c1fb0a76d97447ea7f928ee6a113ee15318df1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Jun 2020 20:50:25 +0300 Subject: [PATCH 368/574] net: ethernet: dwmac: Fix an error code in imx_dwmac_probe() The code is return PTR_ERR(NULL) which is zero or success. We should return -ENOMEM instead. Fixes: 94abdad6974a5 ("net: ethernet: dwmac: add ethernet glue logic for NXP imx8 chip") Signed-off-by: Dan Carpenter Acked-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 5010af7dab4a..3c5df5eeed6c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -225,7 +225,7 @@ static int imx_dwmac_probe(struct platform_device *pdev) dwmac = devm_kzalloc(&pdev->dev, sizeof(*dwmac), GFP_KERNEL); if (!dwmac) - return PTR_ERR(dwmac); + return -ENOMEM; plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); if (IS_ERR(plat_dat)) From 120068481405f9e3d60a95bd464496dff6b54669 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 3 Jun 2020 22:29:06 +0200 Subject: [PATCH 369/574] r8169: fix failing WoL Th referenced change added an extra hw reset to rtl8169_net_suspend() what makes WoL fail on few chip versions. Therefore skip the extra reset if we're going down and WoL is enabled. In rtl_shutdown() rtl8169_hw_reset() is called by rtl8169_net_suspend() already if needed, therefore avoid issues issue by removing the extra call. The fix was tested on a system with RTL8168g. Meanwhile rtl8169_hw_reset() does more than a hw reset and should be renamed. But that's net-next material. Fixes: 8ac8e8c64b53 ("r8169: make rtl8169_down central chip quiesce function") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4d2ec9742cee..dad84ecf5a77 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3928,7 +3928,7 @@ static void rtl8169_tx_clear(struct rtl8169_private *tp) netdev_reset_queue(tp->dev); } -static void rtl8169_hw_reset(struct rtl8169_private *tp) +static void rtl8169_hw_reset(struct rtl8169_private *tp, bool going_down) { /* Give a racing hard_start_xmit a few cycles to complete. */ synchronize_rcu(); @@ -3938,6 +3938,9 @@ static void rtl8169_hw_reset(struct rtl8169_private *tp) rtl_rx_close(tp); + if (going_down && tp->dev->wol_enabled) + goto no_reset; + switch (tp->mac_version) { case RTL_GIGA_MAC_VER_27: case RTL_GIGA_MAC_VER_28: @@ -3959,7 +3962,7 @@ static void rtl8169_hw_reset(struct rtl8169_private *tp) } rtl_hw_reset(tp); - +no_reset: rtl8169_tx_clear(tp); rtl8169_init_ring_indexes(tp); } @@ -3972,7 +3975,7 @@ static void rtl_reset_work(struct rtl8169_private *tp) napi_disable(&tp->napi); netif_stop_queue(dev); - rtl8169_hw_reset(tp); + rtl8169_hw_reset(tp, false); for (i = 0; i < NUM_RX_DESC; i++) rtl8169_mark_to_asic(tp->RxDescArray + i); @@ -4637,7 +4640,7 @@ static void rtl8169_down(struct rtl8169_private *tp) phy_stop(tp->phydev); napi_disable(&tp->napi); - rtl8169_hw_reset(tp); + rtl8169_hw_reset(tp, true); rtl_pll_power_down(tp); @@ -4942,8 +4945,6 @@ static void rtl_shutdown(struct pci_dev *pdev) /* Restore original MAC address */ rtl_rar_set(tp, tp->dev->perm_addr); - rtl8169_hw_reset(tp); - if (system_state == SYSTEM_POWER_OFF) { if (tp->saved_wolopts) { rtl_wol_suspend_quirk(tp); From 09820ce88b4d36c452b23e05042542f87ff122d0 Mon Sep 17 00:00:00 2001 From: Valentin Longchamp Date: Wed, 3 Jun 2020 23:28:23 +0200 Subject: [PATCH 370/574] net: ethernet: freescale: remove unneeded include for ucc_geth net/sch_generic.h does not need to be included, remove it. Signed-off-by: Valentin Longchamp Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/ucc_geth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 552e7554a9f8..db791f60b884 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -42,7 +42,6 @@ #include #include #include -#include #include "ucc_geth.h" From 7cdee28c4eeda8242c9fdbb02e315dbf35064afe Mon Sep 17 00:00:00 2001 From: Roelof Berg Date: Wed, 3 Jun 2020 23:54:14 +0200 Subject: [PATCH 371/574] lan743x: Use correct MAC_CR configuration for 1 GBit speed Corrected the MAC_CR configuration bits for 1 GBit operation. The data sheet allows MAC_CR(2:1) to be 10 and also 11 for 1 GBit/s speed, but only 10 works correctly. Devices tested: Microchip Lan7431, fixed-phy mode Microchip Lan7430, normal phy mode Fixes: 6f197fb63850 ("lan743x: Added fixed link and RGMII support") Signed-off-by: Roelof Berg Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan743x_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 36624e3c633b..c5c5c688b7e2 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -985,7 +985,7 @@ static void lan743x_phy_link_status_change(struct net_device *netdev) break; case SPEED_1000: data |= MAC_CR_CFG_H_; - data |= MAC_CR_CFG_L_; + data &= ~MAC_CR_CFG_L_; break; } lan743x_csr_write(adapter, MAC_CR, data); From 98749b7188affbf2900c2aab704a8853901d1139 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Thu, 4 Jun 2020 20:18:51 +0800 Subject: [PATCH 372/574] yam: fix possible memory leak in yam_init_driver If register_netdev(dev) fails, free_netdev(dev) needs to be called, otherwise a memory leak will occur. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: David S. Miller --- drivers/net/hamradio/yam.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 71cdef9fb56b..5ab53e9942f3 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -1133,6 +1133,7 @@ static int __init yam_init_driver(void) err = register_netdev(dev); if (err) { printk(KERN_WARNING "yam: cannot register net device %s\n", dev->name); + free_netdev(dev); goto error; } yam_devs[i] = dev; From 6761893eeaa378321198f06194ef2e1e4e8a4ad4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 4 Jun 2020 18:55:45 +0200 Subject: [PATCH 373/574] inet_connection_sock: clear inet_num out of destroy helper Clearing the 'inet_num' field is necessary and safe if and only if the socket is not bound. The MPTCP protocol calls the destroy helper on bound sockets, as tcp_v{4,6}_syn_recv_sock completed successfully. Move the clearing of such field out of the common code, otherwise the MPTCP MP_JOIN error path will find the wrong 'inet_num' value on socket disposal, __inet_put_port() will acquire the wrong lock and bind_node removal could race with other modifiers possibly corrupting the bind hash table. Reported-and-tested-by: Christoph Paasch Fixes: 729cd6436f35 ("mptcp: cope better with MP_JOIN failure") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 1 - net/ipv4/inet_connection_sock.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 2f1f8c3efb26..e5b388f5fa20 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -292,7 +292,6 @@ static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) /* The below has to be done to allow calling inet_csk_destroy_sock */ sock_set_flag(sk, SOCK_DEAD); percpu_counter_inc(sk->sk_prot->orphan_count); - inet_sk(sk)->inet_num = 0; } void inet_csk_destroy_sock(struct sock *sk); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f40b1b72f979..afaf582a5aa9 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -902,6 +902,7 @@ void inet_csk_prepare_forced_close(struct sock *sk) bh_unlock_sock(sk); sock_put(sk); inet_csk_prepare_for_destroy_sock(sk); + inet_sk(sk)->inet_num = 0; } EXPORT_SYMBOL(inet_csk_prepare_forced_close); From a624a86510adaeefd33aac224751e89348596d2a Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Fri, 5 Jun 2020 01:53:44 +0530 Subject: [PATCH 374/574] crypto/chtls:Fix compile error when CONFIG_IPV6 is disabled Fix compile errors,warnings when CONFIG_IPV6 is disabled and inconsistent indenting. v1->v2: - Corrected errors/warnings reported when used newer gcc version, unused array. Fixes: 6abde0b24122 ("crypto/chtls: IPv6 support for inline TLS") Signed-off-by: Vinay Kumar Yadav Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_algo.h | 4 -- drivers/crypto/chelsio/chtls/chtls_cm.c | 46 +++++++++++++++++------ drivers/crypto/chelsio/chtls/chtls_main.c | 2 + 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.h b/drivers/crypto/chelsio/chcr_algo.h index f58c2b5c7fc5..d4f6e010dc79 100644 --- a/drivers/crypto/chelsio/chcr_algo.h +++ b/drivers/crypto/chelsio/chcr_algo.h @@ -389,10 +389,6 @@ static inline void copy_hash_init_values(char *key, int digestsize) } } -static const u8 sgl_lengths[20] = { - 0, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15 -}; - /* Number of len fields(8) * size of one addr field */ #define PHYSDSGL_MAX_LEN_SIZE 16 diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c index 9a642c79a657..f200fae6f7cb 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.c +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -93,8 +93,10 @@ static struct net_device *chtls_find_netdev(struct chtls_dev *cdev, struct sock *sk) { struct net_device *ndev = cdev->ports[0]; +#if IS_ENABLED(CONFIG_IPV6) struct net_device *temp; int addr_type; +#endif switch (sk->sk_family) { case PF_INET: @@ -102,19 +104,21 @@ static struct net_device *chtls_find_netdev(struct chtls_dev *cdev, return ndev; ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr); break; +#if IS_ENABLED(CONFIG_IPV6) case PF_INET6: addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); if (likely(addr_type == IPV6_ADDR_ANY)) return ndev; - for_each_netdev_rcu(&init_net, temp) { - if (ipv6_chk_addr(&init_net, (struct in6_addr *) - &sk->sk_v6_rcv_saddr, temp, 1)) { - ndev = temp; - break; + for_each_netdev_rcu(&init_net, temp) { + if (ipv6_chk_addr(&init_net, (struct in6_addr *) + &sk->sk_v6_rcv_saddr, temp, 1)) { + ndev = temp; + break; + } } - } break; +#endif default: return NULL; } @@ -476,8 +480,10 @@ void chtls_destroy_sock(struct sock *sk) csk->cdev = NULL; if (sk->sk_family == AF_INET) sk->sk_prot = &tcp_prot; +#if IS_ENABLED(CONFIG_IPV6) else sk->sk_prot = &tcpv6_prot; +#endif sk->sk_prot->destroy(sk); } @@ -629,14 +635,15 @@ static void chtls_reset_synq(struct listen_ctx *listen_ctx) int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) { struct net_device *ndev; +#if IS_ENABLED(CONFIG_IPV6) + bool clip_valid = false; +#endif struct listen_ctx *ctx; struct adapter *adap; struct port_info *pi; - bool clip_valid; + int ret = 0; int stid; - int ret; - clip_valid = false; rcu_read_lock(); ndev = chtls_find_netdev(cdev, sk); rcu_read_unlock(); @@ -674,6 +681,7 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_sport, 0, cdev->lldi->rxq_ids[0]); +#if IS_ENABLED(CONFIG_IPV6) } else { int addr_type; @@ -689,6 +697,7 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_sport, cdev->lldi->rxq_ids[0]); +#endif } if (ret > 0) ret = net_xmit_errno(ret); @@ -696,8 +705,10 @@ int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) goto del_hash; return 0; del_hash: +#if IS_ENABLED(CONFIG_IPV6) if (clip_valid) cxgb4_clip_release(ndev, (const u32 *)&sk->sk_v6_rcv_saddr, 1); +#endif listen_hash_del(cdev, sk); free_stid: cxgb4_free_stid(cdev->tids, stid, sk->sk_family); @@ -711,8 +722,6 @@ free_ctx: void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) { struct listen_ctx *listen_ctx; - struct chtls_sock *csk; - int addr_type = 0; int stid; stid = listen_hash_del(cdev, sk); @@ -725,7 +734,11 @@ void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) cxgb4_remove_server(cdev->lldi->ports[0], stid, cdev->lldi->rxq_ids[0], sk->sk_family == PF_INET6); +#if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6) { + struct chtls_sock *csk; + int addr_type = 0; + csk = rcu_dereference_sk_user_data(sk); addr_type = ipv6_addr_type((const struct in6_addr *) &sk->sk_v6_rcv_saddr); @@ -733,6 +746,7 @@ void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) cxgb4_clip_release(csk->egress_dev, (const u32 *) &sk->sk_v6_rcv_saddr, 1); } +#endif chtls_disconnect_acceptq(sk); } @@ -941,9 +955,11 @@ static unsigned int chtls_select_mss(const struct chtls_sock *csk, tp = tcp_sk(sk); tcpoptsz = 0; +#if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) iphdrsz = sizeof(struct ipv6hdr) + sizeof(struct tcphdr); else +#endif iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr); if (req->tcpopt.tstamp) tcpoptsz += round_up(TCPOLEN_TIMESTAMP, 4); @@ -1091,13 +1107,13 @@ static struct sock *chtls_recv_sock(struct sock *lsk, const struct cpl_pass_accept_req *req, struct chtls_dev *cdev) { + struct neighbour *n = NULL; struct inet_sock *newinet; const struct iphdr *iph; struct tls_context *ctx; struct net_device *ndev; struct chtls_sock *csk; struct dst_entry *dst; - struct neighbour *n; struct tcp_sock *tp; struct sock *newsk; u16 port_id; @@ -1115,6 +1131,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, goto free_sk; n = dst_neigh_lookup(dst, &iph->saddr); +#if IS_ENABLED(CONFIG_IPV6) } else { const struct ipv6hdr *ip6h; struct flowi6 fl6; @@ -1131,6 +1148,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, if (IS_ERR(dst)) goto free_sk; n = dst_neigh_lookup(dst, &ip6h->saddr); +#endif } if (!n) goto free_sk; @@ -1158,6 +1176,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, newinet->inet_daddr = iph->saddr; newinet->inet_rcv_saddr = iph->daddr; newinet->inet_saddr = iph->daddr; +#if IS_ENABLED(CONFIG_IPV6) } else { struct tcp6_sock *newtcp6sk = (struct tcp6_sock *)newsk; struct inet_request_sock *treq = inet_rsk(oreq); @@ -1175,6 +1194,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, newinet->inet_opt = NULL; newinet->inet_daddr = LOOPBACK4_IPV6; newinet->inet_saddr = LOOPBACK4_IPV6; +#endif } oreq->ts_recent = PASS_OPEN_TID_G(ntohl(req->tos_stid)); @@ -1337,10 +1357,12 @@ static void chtls_pass_accept_request(struct sock *sk, if (iph->version == 0x4) { chtls_set_req_addr(oreq, iph->daddr, iph->saddr); ip_dsfield = ipv4_get_dsfield(iph); +#if IS_ENABLED(CONFIG_IPV6) } else { inet_rsk(oreq)->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; inet_rsk(oreq)->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ip_dsfield = ipv6_get_dsfield(ipv6_hdr(skb)); +#endif } if (req->tcpopt.wsf <= 14 && sock_net(sk)->ipv4.sysctl_tcp_window_scaling) { diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c index 7dfffdde9593..d98b89d0fa6e 100644 --- a/drivers/crypto/chelsio/chtls/chtls_main.c +++ b/drivers/crypto/chelsio/chtls/chtls_main.c @@ -608,9 +608,11 @@ static void __init chtls_init_ulp_ops(void) chtls_cpl_prot.recvmsg = chtls_recvmsg; chtls_cpl_prot.setsockopt = chtls_setsockopt; chtls_cpl_prot.getsockopt = chtls_getsockopt; +#if IS_ENABLED(CONFIG_IPV6) chtls_cpl_protv6 = chtls_cpl_prot; chtls_init_rsk_ops(&chtls_cpl_protv6, &chtls_rsk_opsv6, &tcpv6_prot, PF_INET6); +#endif } static int __init chtls_register(void) From 7d877c35ca84cfa634fd63c2b64bf7b6ae9c71cb Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 4 Jun 2020 23:42:59 +0200 Subject: [PATCH 375/574] net/xdp: use shift instead of 64 bit division 64bit division is kind of expensive, and shift should do the job here. Signed-off-by: Pavel Machek (CIP) Signed-off-by: David S. Miller --- net/xdp/xdp_umem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 1bbaf1747e4f..a0d2b757807f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -336,7 +336,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if ((addr + size) < addr) return -EINVAL; - npgs = div_u64(size, PAGE_SIZE); + npgs = size >> PAGE_SHIFT; if (npgs > U32_MAX) return -EINVAL; From 7e901d6e9519db4db0dd30a33ec172dd094cfb11 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 22 May 2020 13:57:41 -0500 Subject: [PATCH 376/574] gfs2: print mapping->nrpages in glock dump for address space glocks This patch makes the glock dumps in debugfs print the number of pages (nrpages) for address space glocks. This will aid in debugging. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index bf70e3b14938..9a5dadc93cfc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1978,7 +1978,13 @@ void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) char gflags_buf[32]; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; + unsigned long nrpages = 0; + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + struct address_space *mapping = gfs2_glock2aspace(gl); + + nrpages = mapping->nrpages; + } memset(fs_id_buf, 0, sizeof(fs_id_buf)); if (fsid && sdp) /* safety precaution */ sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); @@ -1987,15 +1993,16 @@ void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; gfs2_print_dbg(seq, "%sG: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d " - "v:%d r:%d m:%ld\n", fs_id_buf, state2str(gl->gl_state), - gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number, - gflags2str(gflags_buf, gl), - state2str(gl->gl_target), - state2str(gl->gl_demote_state), dtime, - atomic_read(&gl->gl_ail_count), - atomic_read(&gl->gl_revokes), - (int)gl->gl_lockref.count, gl->gl_hold_time); + "v:%d r:%d m:%ld p:%lu\n", + fs_id_buf, state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, gl), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_revokes), + (int)gl->gl_lockref.count, gl->gl_hold_time, nrpages); list_for_each_entry(gh, &gl->gl_holders, gh_list) dump_holder(seq, gh, fs_id_buf); From ea4e61c7f46d33fdf17580a925e47cc83570d658 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Sat, 23 May 2020 08:13:50 -0500 Subject: [PATCH 377/574] gfs2: introduce new gfs2_glock_assert_withdraw Before this patch, asserts based on glocks did not print the glock with the error. This patch introduces a new macro, gfs2_glock_assert_withdraw which first prints the glock, then takes the assert. This also changes a few glock asserts to the new macro. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 7 ++++--- fs/gfs2/glock.h | 9 +++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9a5dadc93cfc..64541d8bf9ad 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -164,7 +164,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - BUG_ON(atomic_read(&gl->gl_revokes)); + gfs2_glock_assert_withdraw(gl, atomic_read(&gl->gl_revokes) == 0); rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); @@ -626,7 +626,8 @@ __acquires(&gl->gl_lockref.lock) */ if ((atomic_read(&gl->gl_ail_count) != 0) && (!cmpxchg(&sdp->sd_log_error, 0, -EIO))) { - gfs2_assert_warn(sdp, !atomic_read(&gl->gl_ail_count)); + gfs2_glock_assert_warn(gl, + !atomic_read(&gl->gl_ail_count)); gfs2_dump_glock(NULL, gl, true); } glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); @@ -1836,7 +1837,7 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip) int ret; ret = gfs2_truncatei_resume(ip); - gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0); + gfs2_glock_assert_withdraw(gl, ret == 0); spin_lock(&gl->gl_lockref.lock); clear_bit(GLF_LOCK, &gl->gl_flags); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index b8adaf80e4c5..1c547dff7c57 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -205,6 +205,15 @@ extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \ gfs2_dump_glock(NULL, gl, true); \ BUG(); } } while(0) +#define gfs2_glock_assert_warn(gl, x) do { if (unlikely(!(x))) { \ + gfs2_dump_glock(NULL, gl, true); \ + gfs2_assert_warn((gl)->gl_name.ln_sbd, (x)); } } \ + while (0) +#define gfs2_glock_assert_withdraw(gl, x) do { if (unlikely(!(x))) { \ + gfs2_dump_glock(NULL, gl, true); \ + gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \ + while (0) + extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); From d5dc3d9677394d4fb4dca61856491df5a37db31a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 22 May 2020 14:03:21 -0500 Subject: [PATCH 378/574] gfs2: instrumentation wrt log_flush stuck This adds checks for gfs2_log_flush being stuck, similarly to the check in gfs2_ail1_flush. To faciliate this and make the strings easy to grep we move the ail1 emptying to its own function, empty_ail1_list. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 0644e58c6191..fcc7f58d74f0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -145,9 +145,6 @@ static void dump_ail_list(struct gfs2_sbd *sdp) struct gfs2_bufdata *bd; struct buffer_head *bh; - fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! t=%d\n", - current->journal_info ? 1 : 0); - list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { list_for_each_entry_reverse(bd, &tr->tr_ail1_list, bd_ail_st_list) { @@ -197,6 +194,8 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) restart: ret = 0; if (time_after(jiffies, flush_start + (HZ * 600))) { + fs_err(sdp, "Error: In %s for ten minutes! t=%d\n", + __func__, current->journal_info ? 1 : 0); dump_ail_list(sdp); goto out; } @@ -876,6 +875,28 @@ static void ail_drain(struct gfs2_sbd *sdp) spin_unlock(&sdp->sd_ail_lock); } +/** + * empty_ail1_list - try to start IO and empty the ail1 list + * @sdp: Pointer to GFS2 superblock + */ +static void empty_ail1_list(struct gfs2_sbd *sdp) +{ + unsigned long start = jiffies; + + for (;;) { + if (time_after(jiffies, start + (HZ * 600))) { + fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n", + __func__, current->journal_info ? 1 : 0); + dump_ail_list(sdp); + return; + } + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp, 0)) + return; + } +} + /** * gfs2_log_flush - flush incore transaction(s) * @sdp: the filesystem @@ -965,12 +986,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) { if (!sdp->sd_log_idle) { - for (;;) { - gfs2_ail1_start(sdp); - gfs2_ail1_wait(sdp); - if (gfs2_ail1_empty(sdp, 0)) - break; - } + empty_ail1_list(sdp); if (gfs2_withdrawn(sdp)) goto out; atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ From 15f2547b4157a1e7e4d75bec5df097c1436f6cbd Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 15 Jan 2020 12:47:46 -0600 Subject: [PATCH 379/574] gfs2: Allow ASPACE glocks to also have an lvb Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index bf70e3b14938..86e9e621f346 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -125,12 +125,11 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); - if (gl->gl_ops->go_flags & GLOF_ASPACE) { + kfree(gl->gl_lksb.sb_lvbptr); + if (gl->gl_ops->go_flags & GLOF_ASPACE) kmem_cache_free(gfs2_glock_aspace_cachep, gl); - } else { - kfree(gl->gl_lksb.sb_lvbptr); + else kmem_cache_free(gfs2_glock_cachep, gl); - } } /** From f286d627ef026a4d04b41ae5917d58ddf243c3c5 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 13 Jan 2020 21:21:49 +0100 Subject: [PATCH 380/574] gfs2: Keep track of deleted inode generations in LVBs When deleting an inode, keep track of the generation of the deleted inode in the inode glock Lock Value Block (LVB). When trying to delete an inode remotely, check the last-known inode generation against the deleted inode generation to skip duplicate remote deletes. This avoids taking the resource group glock in order to verify the block type. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 19 +++++++++++++++++++ fs/gfs2/glock.h | 3 +++ fs/gfs2/glops.c | 2 +- fs/gfs2/super.c | 3 +++ include/uapi/linux/gfs2_ondisk.h | 6 ++++++ 5 files changed, 32 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 86e9e621f346..12681616eb76 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -755,6 +755,25 @@ out_unlock: return; } +void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) +{ + struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; + + if (ri->ri_magic == 0) + ri->ri_magic = cpu_to_be32(GFS2_MAGIC); + if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC)) + ri->ri_generation_deleted = cpu_to_be64(generation); +} + +bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) +{ + struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; + + if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC)) + return false; + return generation <= be64_to_cpu(ri->ri_generation_deleted); +} + static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index b8adaf80e4c5..5c1b60fdedcf 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -306,4 +306,7 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) spin_unlock(&gl->gl_lockref.lock); } +extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); +extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 9e9c7a4b8c66..63ae9e45ce34 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -692,7 +692,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_lock = inode_go_lock, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_flags = GLOF_ASPACE | GLOF_LRU, + .go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB, .go_free = inode_go_free, }; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 956fced0a8ec..e69efed9fb51 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1315,6 +1315,8 @@ static void gfs2_evict_inode(struct inode *inode) goto out; } + if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) + goto out_truncate; error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); if (error) goto out_truncate; @@ -1368,6 +1370,7 @@ alloc_failed: that subsequent inode creates don't see an old gl_object. */ glock_clear_object(ip->i_gl, ip); error = gfs2_dinode_dealloc(ip); + gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); goto out_unlock; out_truncate: diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h index 2dc10a034de1..07e508e6691b 100644 --- a/include/uapi/linux/gfs2_ondisk.h +++ b/include/uapi/linux/gfs2_ondisk.h @@ -171,6 +171,12 @@ struct gfs2_rindex { #define GFS2_RGF_NOALLOC 0x00000008 #define GFS2_RGF_TRIMMED 0x00000010 +struct gfs2_inode_lvb { + __be32 ri_magic; + __be32 __pad; + __be64 ri_generation_deleted; +}; + struct gfs2_rgrp_lvb { __be32 rl_magic; __be32 rl_flags; From a0e3cc65fa29f497cc97a069c318532c2a48d148 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 16 Jan 2020 20:12:26 +0100 Subject: [PATCH 381/574] gfs2: Turn gl_delete into a delayed work This requires flushing delayed work items in gfs2_make_fs_ro (which is called before unmounting a filesystem). When inodes are deleted and then recreated, pending gl_delete work items would have no effect because the inode generations will have changed, so we can cancel any pending gl_delete works before reusing iopen glocks. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- fs/gfs2/glock.h | 4 ++++ fs/gfs2/glops.c | 9 ++++++++- fs/gfs2/incore.h | 5 +++-- fs/gfs2/inode.c | 2 ++ fs/gfs2/rgrp.c | 2 +- fs/gfs2/super.c | 4 ++-- 7 files changed, 65 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 12681616eb76..0332086f7ab9 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -776,11 +776,16 @@ bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) static void delete_work_func(struct work_struct *work) { - struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct delayed_work *dwork = to_delayed_work(work); + struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct inode *inode; u64 no_addr = gl->gl_name.ln_number; + spin_lock(&gl->gl_lockref.lock); + clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); + spin_unlock(&gl->gl_lockref.lock); + /* If someone's using this glock to create a new dinode, the block must have been freed by another node, then re-used, in which case our iopen callback is too late after the fact. Ignore it. */ @@ -949,7 +954,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); - INIT_WORK(&gl->gl_delete, delete_work_func); + INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { @@ -1772,6 +1777,44 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) rhashtable_walk_exit(&iter); } +bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay) +{ + bool queued; + + spin_lock(&gl->gl_lockref.lock); + queued = queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, delay); + if (queued) + set_bit(GLF_PENDING_DELETE, &gl->gl_flags); + spin_unlock(&gl->gl_lockref.lock); + return queued; +} + +void gfs2_cancel_delete_work(struct gfs2_glock *gl) +{ + if (cancel_delayed_work_sync(&gl->gl_delete)) { + clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); + gfs2_glock_put(gl); + } +} + +bool gfs2_delete_work_queued(const struct gfs2_glock *gl) +{ + return test_bit(GLF_PENDING_DELETE, &gl->gl_flags); +} + +static void flush_delete_work(struct gfs2_glock *gl) +{ + flush_delayed_work(&gl->gl_delete); + gfs2_glock_queue_work(gl, 0); +} + +void gfs2_flush_delete_work(struct gfs2_sbd *sdp) +{ + glock_hash_walk(flush_delete_work, sdp); + flush_workqueue(gfs2_delete_workqueue); +} + /** * thaw_glock - thaw out a glock which has an unprocessed reply waiting * @gl: The glock to thaw diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 5c1b60fdedcf..711b3005a7ea 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -235,6 +235,10 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay); +extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); +extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); +extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 63ae9e45ce34..909cd722e46a 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -608,11 +608,17 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) if (gl->gl_demote_state == LM_ST_UNLOCKED && gl->gl_state == LM_ST_SHARED && ip) { gl->gl_lockref.count++; - if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + if (!queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, 0)) gl->gl_lockref.count--; } } +static int iopen_go_demote_ok(const struct gfs2_glock *gl) +{ + return !gfs2_delete_work_queued(gl); +} + /** * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it * @gl: glock being freed @@ -716,6 +722,7 @@ const struct gfs2_glock_operations gfs2_freeze_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, + .go_demote_ok = iopen_go_demote_ok, .go_flags = GLOF_LRU | GLOF_NONDISK, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 84a824293a78..fdcf7a2f06c5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -345,6 +345,7 @@ enum { GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, GLF_INODE_CREATING = 16, /* Inode creation occurring */ + GLF_PENDING_DELETE = 17, GLF_FREEING = 18, /* Wait for glock to be freed */ }; @@ -378,8 +379,8 @@ struct gfs2_glock { atomic_t gl_revokes; struct delayed_work gl_work; union { - /* For inode and iopen glocks only */ - struct work_struct gl_delete; + /* For iopen glocks only */ + struct delayed_work gl_delete; /* For rgrp glocks only */ struct { loff_t start; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5acd3ce30759..a4112906abc2 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -170,6 +170,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail; + gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -724,6 +725,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; + gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_set_iop(inode); insert_inode_hash(inode); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index a321c34e3d6e..074f228ea839 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1835,7 +1835,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip */ ip = gl->gl_object; - if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + if (ip || !gfs2_queue_delete_work(gl, 0)) gfs2_glock_put(gl); else found++; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e69efed9fb51..71218a6fd9b4 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -626,7 +626,7 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) } } - flush_workqueue(gfs2_delete_workqueue); + gfs2_flush_delete_work(sdp); if (!log_write_allowed && current == sdp->sd_quotad_process) fs_warn(sdp, "The quotad daemon is withdrawing.\n"); else if (sdp->sd_quotad_process) @@ -1054,7 +1054,7 @@ static int gfs2_drop_inode(struct inode *inode) struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; gfs2_glock_hold(gl); - if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + if (!gfs2_queue_delete_work(gl, 0)) gfs2_glock_queue_put(gl); return false; } From 8c7b9262a8607636ecd7250f29c7aac17f08901c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 13 Jan 2020 22:16:17 +0100 Subject: [PATCH 382/574] gfs2: Give up the iopen glock on contention When there's contention on the iopen glock, it means that the link count of the corresponding inode has dropped to zero on a remote node which is now trying to delete the inode. In that case, try to evict the inode so that the iopen glock will be released, which will allow the remote node to do its job. When the inode is still open locally, the inode's reference count won't drop to zero and so we'll keep holding the inode and its iopen glock. The remote node will time out its request to grab the iopen glock, and when the inode is finally closed locally, we'll try to delete it ourself. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/incore.h | 1 + fs/gfs2/super.c | 7 +++++-- 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 0332086f7ab9..bf7daa35f73f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -774,6 +774,42 @@ bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) return generation <= be64_to_cpu(ri->ri_generation_deleted); } +static bool gfs2_try_evict(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + bool evicted = false; + + /* + * If there is contention on the iopen glock and we have an inode, try + * to grab and release the inode so that it can be evicted. This will + * allow the remote node to go ahead and delete the inode without us + * having to do it, which will avoid rgrp glock thrashing. + * + * The remote node is likely still holding the corresponding inode + * glock, so it will run before we get to verify that the delete has + * happened below. + */ + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip && !igrab(&ip->i_inode)) + ip = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (ip) { + set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + d_prune_aliases(&ip->i_inode); + iput(&ip->i_inode); + + /* If the inode was evicted, gl->gl_object will now be NULL. */ + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip) + clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + spin_unlock(&gl->gl_lockref.lock); + evicted = !ip; + } + return evicted; +} + static void delete_work_func(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); @@ -792,6 +828,21 @@ static void delete_work_func(struct work_struct *work) if (test_bit(GLF_INODE_CREATING, &gl->gl_flags)) goto out; + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { + /* + * If we can evict the inode, give the remote node trying to + * delete the inode some time before verifying that the delete + * has happened. Otherwise, if we cause contention on the inode glock + * immediately, the remote node will think that we still have + * the inode in use, and so it will give up waiting. + */ + if (gfs2_try_evict(gl)) { + if (gfs2_queue_delete_work(gl, 5 * HZ)) + return; + goto out; + } + } + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); if (!IS_ERR_OR_NULL(inode)) { d_prune_aliases(inode); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index fdcf7a2f06c5..76ac2578e658 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -399,6 +399,7 @@ enum { GIF_ORDERED = 4, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, + GIF_DEFERRED_DELETE = 7, }; struct gfs2_inode { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 71218a6fd9b4..7d8caf169efd 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1299,9 +1299,12 @@ static void gfs2_evict_inode(struct inode *inode) if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); gfs2_holder_mark_uninitialized(&gh); - goto alloc_failed; + goto out_delete; } + if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) + goto out; + /* Deletes should never happen under memory pressure anymore. */ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) goto out; @@ -1333,7 +1336,7 @@ static void gfs2_evict_inode(struct inode *inode) if (inode->i_nlink) goto out_truncate; -alloc_failed: +out_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; From 9e73330f298acf544de72436b7bb825ff3aa1a54 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 14 Jan 2020 14:59:08 +0100 Subject: [PATCH 383/574] gfs2: Try harder to delete inodes locally When an inode's link count drops to zero and the inode is cached on other nodes, the current behavior of gfs2 is to immediately give up and to rely on the other node(s) to delete the inode if there is iopen glock contention. This leads to resource group glock bouncing and the loss of caching. With the previous patches in place, we can fix that by not giving up immediately. When the inode is still open on other nodes, those nodes won't be able to evict the inode and give up the iopen glock. In that case, our lock conversion request will time out. The unlink system call will block for the duration of the iopen lock conversion request. We're also holding the inode glock in EX mode for an extended duration, so other nodes won't be able to make progress on the inode, either. This is worse than what we had before, but we can prevent other nodes from getting stuck by aborting our iopen locking request if there is contention on the inode glock. This will the the subject of a future patch. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 53 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 7d8caf169efd..bcb56f13f50a 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1258,6 +1258,50 @@ static void gfs2_glock_put_eventually(struct gfs2_glock *gl) gfs2_glock_put(gl); } +static bool gfs2_upgrade_iopen_glock(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder *gh = &ip->i_iopen_gh; + long timeout = 5 * HZ; + int error; + + gh->gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(gh); + + /* + * If there are no other lock holders, we'll get the lock immediately. + * Otherwise, the other nodes holding the lock will be notified about + * our locking request. If they don't have the inode open, they'll + * evict the cached inode and release the lock. As a last resort, + * we'll eventually time out. + * + * Note that we're passing the LM_FLAG_TRY_1CB flag to the first + * locking request as an optimization to notify lock holders as soon as + * possible. Without that flag, they'd be notified implicitly by the + * second locking request. + */ + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh); + error = gfs2_glock_nq(gh); + if (error != GLR_TRYFAILED) + return !error; + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh); + error = gfs2_glock_nq(gh); + if (error) + return false; + + timeout = wait_event_interruptible_timeout(sdp->sd_async_glock_wait, + !test_bit(HIF_WAIT, &gh->gh_iflags), + timeout); + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) { + gfs2_glock_dq(gh); + return false; + } + return true; +} + /** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict @@ -1339,13 +1383,10 @@ static void gfs2_evict_inode(struct inode *inode) out_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, - &ip->i_iopen_gh); - error = gfs2_glock_nq(&ip->i_iopen_gh); - if (error) + if (!gfs2_upgrade_iopen_glock(inode)) { + gfs2_holder_uninit(&ip->i_iopen_gh); goto out_truncate; + } } if (S_ISDIR(inode->i_mode) && From 6bdcadea75768bbd1cd8f6f13011978e1e19a53b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 15 Jan 2020 05:31:38 +0100 Subject: [PATCH 384/574] gfs2: Minor gfs2_lookup_by_inum cleanup Use a zero no_formal_ino instead of a NULL pointer to indicate that any inode generation number will qualify: a valid inode never has a zero no_formal_ino. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/export.c | 4 +++- fs/gfs2/glock.c | 2 +- fs/gfs2/inode.c | 11 +++++++++-- fs/gfs2/inode.h | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 3f717285ee48..756d05779200 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -134,7 +134,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_sbd *sdp = sb->s_fs_info; struct inode *inode; - inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + if (!inum->no_formal_ino) + return ERR_PTR(-ESTALE); + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, inum->no_formal_ino, GFS2_BLKST_DINODE); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index bf7daa35f73f..b6078b0e74b9 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -843,7 +843,7 @@ static void delete_work_func(struct work_struct *work) } } - inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + inode = gfs2_lookup_by_inum(sdp, no_addr, 0, GFS2_BLKST_UNLINKED); if (!IS_ERR_OR_NULL(inode)) { d_prune_aliases(inode); iput(inode); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a4112906abc2..812a6ae03f6c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -207,8 +207,15 @@ fail: return ERR_PTR(error); } +/** + * gfs2_lookup_by_inum - look up an inode by inode number + * @sdp: The super block + * @no_addr: The inode number + * @no_formal_ino: The inode generation number (0 for any) + * @blktype: Requested block type (see gfs2_inode_lookup) + */ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, - u64 *no_formal_ino, unsigned int blktype) + u64 no_formal_ino, unsigned int blktype) { struct super_block *sb = sdp->sd_vfs; struct inode *inode; @@ -221,7 +228,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, /* Two extra checks for NFS only */ if (no_formal_ino) { error = -ESTALE; - if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + if (GFS2_I(inode)->i_no_formal_ino != no_formal_ino) goto fail_iput; error = -EIO; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 580adbf0b5e1..b52ecf4ffe63 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -92,7 +92,7 @@ extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino, unsigned int blktype); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, - u64 *no_formal_ino, + u64 no_formal_ino, unsigned int blktype); extern int gfs2_inode_refresh(struct gfs2_inode *ip); From b66648ad6dcfefd9f02b5408c1381987c090cb13 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 15 Jan 2020 06:21:42 +0100 Subject: [PATCH 385/574] gfs2: Move inode generation number check into gfs2_inode_lookup Move the inode generation number check from gfs2_lookup_by_inum into gfs2_inode_lookup: gfs2_inode_lookup may be able to decide that an inode with the given inode generation number cannot exist without having to verify the block type or reading the inode from disk. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 812a6ae03f6c..499f861b6e09 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -114,6 +114,10 @@ static void gfs2_set_iop(struct inode *inode) * placeholder because it doesn't otherwise make sense), the on-disk block type * is verified to be @blktype. * + * When @no_formal_ino is non-zero, this function will return ERR_PTR(-ESTALE) + * if it detects that @no_formal_ino doesn't match the actual inode generation + * number. However, it doesn't always know unless @type is DT_UNKNOWN. + * * Returns: A VFS inode, or an error */ @@ -157,6 +161,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (error) goto fail; + error = -ESTALE; + if (no_formal_ino && + gfs2_inode_already_deleted(ip->i_gl, no_formal_ino)) + goto fail; + if (blktype != GFS2_BLKST_FREE) { error = gfs2_check_blk_type(sdp, no_addr, blktype); @@ -189,13 +198,23 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, inode->i_mode = DT2IF(type); } - gfs2_set_iop(inode); + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); - unlock_new_inode(inode); + gfs2_set_iop(inode); } - if (gfs2_holder_initialized(&i_gh)) - gfs2_glock_dq_uninit(&i_gh); + if (no_formal_ino && ip->i_no_formal_ino && + no_formal_ino != ip->i_no_formal_ino) { + if (inode->i_state & I_NEW) + goto fail; + iput(inode); + return ERR_PTR(-ESTALE); + } + + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + return inode; fail: @@ -221,16 +240,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, struct inode *inode; int error; - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, blktype); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, no_formal_ino, + blktype); if (IS_ERR(inode)) return inode; - /* Two extra checks for NFS only */ if (no_formal_ino) { - error = -ESTALE; - if (GFS2_I(inode)->i_no_formal_ino != no_formal_ino) - goto fail_iput; - error = -EIO; if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) goto fail_iput; From b0dcffd8da3339ad0300587ce7030efdf2e914a9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 15 Jan 2020 09:54:14 +0100 Subject: [PATCH 386/574] gfs2: Check inode generation number in delete_work_func In delete_work_func, if the iopen glock still has an inode attached, limit the inode lookup to that specific generation number: in the likely case that the inode was deleted on the node on which the inode's link count dropped to zero, we can skip verifying the on-disk block type and reading in the inode. The same applies if another node that had the inode open managed to delete the inode before us. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 4 +++- fs/gfs2/incore.h | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b6078b0e74b9..711259f68d55 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -795,6 +795,7 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { + gl->gl_no_formal_ino = ip->i_no_formal_ino; set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); d_prune_aliases(&ip->i_inode); iput(&ip->i_inode); @@ -843,7 +844,8 @@ static void delete_work_func(struct work_struct *work) } } - inode = gfs2_lookup_by_inum(sdp, no_addr, 0, GFS2_BLKST_UNLINKED); + inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, + GFS2_BLKST_UNLINKED); if (!IS_ERR_OR_NULL(inode)) { d_prune_aliases(inode); iput(inode); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 76ac2578e658..03ab11fab962 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -380,7 +380,10 @@ struct gfs2_glock { struct delayed_work gl_work; union { /* For iopen glocks only */ - struct delayed_work gl_delete; + struct { + struct delayed_work gl_delete; + u64 gl_no_formal_ino; + }; /* For rgrp glocks only */ struct { loff_t start; From 35b6f8fbcf9b2ebdee0a0f77143a8d203a9616e1 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 17 Jan 2020 13:48:49 +0100 Subject: [PATCH 387/574] gfs2: Wake up when setting GLF_DEMOTE Wake up the sdp->sd_async_glock_wait wait queue when setting the GLF_DEMOTE flag. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 711259f68d55..7ad06dd49352 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -464,6 +464,15 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) gl->gl_tchange = jiffies; } +static void gfs2_set_demote(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + set_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb(); + wake_up(&sdp->sd_async_glock_wait); +} + static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; @@ -876,7 +885,7 @@ static void glock_work_func(struct work_struct *work) if (!delay) { clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); - set_bit(GLF_DEMOTE, &gl->gl_flags); + gfs2_set_demote(gl); } } run_queue(gl, 0); @@ -1221,9 +1230,10 @@ wait_for_dlm: static void handle_callback(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote) { - int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; - - set_bit(bit, &gl->gl_flags); + if (delay) + set_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + else + gfs2_set_demote(gl); if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; From 9e8990dea9266af68a668b1503dc6f55c56f1bb6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 17 Jan 2020 10:53:23 +0100 Subject: [PATCH 388/574] gfs2: Smarter iopen glock waiting When trying to upgrade the iopen glock from a shared to an exclusive lock in gfs2_evict_inode, abort the wait if there is contention on the corresponding inode glock: in that case, the inode must still be in active use on another node, and we're not guaranteed to get the iopen glock anytime soon. To make this work even better, when we notice contention on the iopen glock and we can't evict the corresponsing inode and release the iopen glock immediately, poke the inode glock. The other node(s) trying to acquire the lock can then abort instead of timing out. Thanks to Heinz Mauelshagen for pointing out a locking bug in a previous version of this patch. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 34 ++++++++++++++++++++++++++++++++-- fs/gfs2/super.c | 11 ++++++++--- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 7ad06dd49352..71091e35f83d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -783,6 +783,17 @@ bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) return generation <= be64_to_cpu(ri->ri_generation_deleted); } +static void gfs2_glock_poke(struct gfs2_glock *gl) +{ + int flags = LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP; + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, flags, &gh); + if (!error) + gfs2_glock_dq(&gh); +} + static bool gfs2_try_evict(struct gfs2_glock *gl) { struct gfs2_inode *ip; @@ -804,6 +815,8 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { + struct gfs2_glock *inode_gl = NULL; + gl->gl_no_formal_ino = ip->i_no_formal_ino; set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); d_prune_aliases(&ip->i_inode); @@ -812,9 +825,16 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) /* If the inode was evicted, gl->gl_object will now be NULL. */ spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; - if (ip) + if (ip) { + inode_gl = ip->i_gl; + lockref_get(&inode_gl->gl_lockref); clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + } spin_unlock(&gl->gl_lockref.lock); + if (inode_gl) { + gfs2_glock_poke(inode_gl); + gfs2_glock_put(inode_gl); + } evicted = !ip; } return evicted; @@ -845,12 +865,22 @@ static void delete_work_func(struct work_struct *work) * has happened. Otherwise, if we cause contention on the inode glock * immediately, the remote node will think that we still have * the inode in use, and so it will give up waiting. + * + * If we can't evict the inode, signal to the remote node that + * the inode is still in use. We'll later try to delete the + * inode locally in gfs2_evict_inode. + * + * FIXME: We only need to verify that the remote node has + * deleted the inode because nodes before this remote delete + * rework won't cooperate. At a later time, when we no longer + * care about compatibility with such nodes, we can skip this + * step entirely. */ if (gfs2_try_evict(gl)) { if (gfs2_queue_delete_work(gl, 5 * HZ)) return; - goto out; } + goto out; } inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index bcb56f13f50a..32d8d26126a1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1273,8 +1273,12 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) * If there are no other lock holders, we'll get the lock immediately. * Otherwise, the other nodes holding the lock will be notified about * our locking request. If they don't have the inode open, they'll - * evict the cached inode and release the lock. As a last resort, - * we'll eventually time out. + * evict the cached inode and release the lock. Otherwise, if they + * poke the inode glock, we'll take this as an indication that they + * still need the iopen glock and that they'll take care of deleting + * the inode when they're done. As a last resort, if another node + * keeps holding the iopen glock without showing any activity on the + * inode glock, we'll eventually time out. * * Note that we're passing the LM_FLAG_TRY_1CB flag to the first * locking request as an optimization to notify lock holders as soon as @@ -1293,7 +1297,8 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) return false; timeout = wait_event_interruptible_timeout(sdp->sd_async_glock_wait, - !test_bit(HIF_WAIT, &gh->gh_iflags), + !test_bit(HIF_WAIT, &gh->gh_iflags) || + test_bit(GLF_DEMOTE, &ip->i_gl->gl_flags), timeout); if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) { gfs2_glock_dq(gh); From cbcc89b630447ec7836aa2b9242d9bb1725f5a61 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 5 Jun 2020 14:12:34 -0500 Subject: [PATCH 389/574] gfs2: initialize transaction tr_ailX_lists earlier Since transactions may be freed shortly after they're created, before a log_flush occurs, we need to initialize their ail1 and ail2 lists earlier. Before this patch, the ail1 list was initialized in gfs2_log_flush(). This moves the initialization to the point when the transaction is first created. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 2 ++ fs/gfs2/log.c | 2 -- fs/gfs2/trans.c | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 4862dae868a2..224fb3bd503c 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -91,6 +91,8 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl) memset(&tr, 0, sizeof(tr)); INIT_LIST_HEAD(&tr.tr_buf); INIT_LIST_HEAD(&tr.tr_databuf); + INIT_LIST_HEAD(&tr.tr_ail1_list); + INIT_LIST_HEAD(&tr.tr_ail2_list); tr.tr_revokes = atomic_read(&gl->gl_ail_count); if (!tr.tr_revokes) { diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index fcc7f58d74f0..a81af1bde1bb 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -933,8 +933,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) tr = sdp->sd_log_tr; if (tr) { sdp->sd_log_tr = NULL; - INIT_LIST_HEAD(&tr->tr_ail1_list); - INIT_LIST_HEAD(&tr->tr_ail2_list); tr->tr_first = sdp->sd_log_flush_head; if (unlikely (state == SFS_FROZEN)) if (gfs2_assert_withdraw_delayed(sdp, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ffe840505082..62a65ed9a9f5 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -52,6 +52,8 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, tr->tr_reserved += gfs2_struct2blk(sdp, revokes); INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); + INIT_LIST_HEAD(&tr->tr_ail1_list); + INIT_LIST_HEAD(&tr->tr_ail2_list); sb_start_intwrite(sdp->sd_vfs); From b839dadae8721eaf7245fcef3d67d82b95d00c77 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 17 Apr 2019 12:04:27 -0600 Subject: [PATCH 390/574] gfs2: new slab for transactions This patch adds a new slab for gfs2 transactions. That allows us to reduce kernel memory fragmentation, have better organization of data for analysis of vmcore dumps. A new centralized function is added to free the slab objects, and it exposes use-after-free by giving warnings if a transaction is freed while it still has bd elements attached to its buffers or ail lists. We make sure to initialize those transaction ail lists so we can check their integrity when freeing. At a later time, we should add a slab initialization function to make it more efficient, but for this initial patch I wanted to minimize the impact. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 9 +++++---- fs/gfs2/main.c | 9 +++++++++ fs/gfs2/trans.c | 19 +++++++++++++++---- fs/gfs2/trans.h | 1 + fs/gfs2/util.c | 1 + fs/gfs2/util.h | 1 + 6 files changed, 32 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index a81af1bde1bb..a7415ab91c5f 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -30,6 +30,7 @@ #include "util.h" #include "dir.h" #include "trace_gfs2.h" +#include "trans.h" static void gfs2_log_shutdown(struct gfs2_sbd *sdp); @@ -378,7 +379,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) list_del(&tr->tr_list); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); - kfree(tr); + gfs2_trans_free(sdp, tr); } spin_unlock(&sdp->sd_ail_lock); @@ -863,14 +864,14 @@ static void ail_drain(struct gfs2_sbd *sdp) gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail1_list); gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); list_del(&tr->tr_list); - kfree(tr); + gfs2_trans_free(sdp, tr); } while (!list_empty(&sdp->sd_ail2_list)) { tr = list_first_entry(&sdp->sd_ail2_list, struct gfs2_trans, tr_list); gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); list_del(&tr->tr_list); - kfree(tr); + gfs2_trans_free(sdp, tr); } spin_unlock(&sdp->sd_ail_lock); } @@ -1008,7 +1009,7 @@ out: trace_gfs2_log_flush(sdp, 0, flags); up_write(&sdp->sd_log_flush_lock); - kfree(tr); + gfs2_trans_free(sdp, tr); } /** diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a1a295b739fb..733470ca6be9 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -143,6 +143,12 @@ static int __init init_gfs2_fs(void) if (!gfs2_qadata_cachep) goto fail_cachep7; + gfs2_trans_cachep = kmem_cache_create("gfs2_trans", + sizeof(struct gfs2_trans), + 0, 0, NULL); + if (!gfs2_trans_cachep) + goto fail_cachep8; + error = register_shrinker(&gfs2_qd_shrinker); if (error) goto fail_shrinker; @@ -194,6 +200,8 @@ fail_fs2: fail_fs1: unregister_shrinker(&gfs2_qd_shrinker); fail_shrinker: + kmem_cache_destroy(gfs2_trans_cachep); +fail_cachep8: kmem_cache_destroy(gfs2_qadata_cachep); fail_cachep7: kmem_cache_destroy(gfs2_quotad_cachep); @@ -236,6 +244,7 @@ static void __exit exit_gfs2_fs(void) rcu_barrier(); mempool_destroy(gfs2_page_pool); + kmem_cache_destroy(gfs2_trans_cachep); kmem_cache_destroy(gfs2_qadata_cachep); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 62a65ed9a9f5..a3dfa3aa87ad 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -37,7 +37,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) return -EROFS; - tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); + tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS); if (!tr) return -ENOMEM; @@ -67,7 +67,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, fail: sb_end_intwrite(sdp->sd_vfs); - kfree(tr); + kmem_cache_free(gfs2_trans_cachep, tr); return error; } @@ -95,7 +95,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (!test_bit(TR_TOUCHED, &tr->tr_flags)) { gfs2_log_release(sdp, tr->tr_reserved); if (alloced) { - kfree(tr); + gfs2_trans_free(sdp, tr); sb_end_intwrite(sdp->sd_vfs); } return; @@ -111,7 +111,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_log_commit(sdp, tr); if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags)) - kfree(tr); + gfs2_trans_free(sdp, tr); up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) @@ -278,3 +278,14 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) gfs2_log_unlock(sdp); } +void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + if (tr == NULL) + return; + + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_databuf)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_buf)); + kmem_cache_free(gfs2_trans_cachep, tr); +} diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 6071334de035..83199ce5a5c5 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -42,5 +42,6 @@ extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); +extern void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index aa087a5675af..1cd0328cae20 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -32,6 +32,7 @@ struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; struct kmem_cache *gfs2_quotad_cachep __read_mostly; struct kmem_cache *gfs2_qadata_cachep __read_mostly; +struct kmem_cache *gfs2_trans_cachep __read_mostly; mempool_t *gfs2_page_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index a3542560da6f..6d9157efe16c 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -172,6 +172,7 @@ extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; extern struct kmem_cache *gfs2_qadata_cachep; +extern struct kmem_cache *gfs2_trans_cachep; extern mempool_t *gfs2_page_pool; extern struct workqueue_struct *gfs2_control_wq; From 83d060ca8d90fa1e3feac227f995c013100862d3 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 4 Jun 2020 14:28:58 -0500 Subject: [PATCH 391/574] gfs2: fix use-after-free on transaction ail lists Before this patch, transactions could be merged into the system transaction by function gfs2_merge_trans(), but the transaction ail lists were never merged. Because the ail flushing mechanism can run separately, bd elements can be attached to the transaction's buffer list during the transaction (trans_add_meta, etc) but quickly moved to its ail lists. Later, in function gfs2_trans_end, the transaction can be freed (by gfs2_trans_end) while it still has bd elements queued to its ail lists, which can cause it to either lose track of the bd elements altogether (memory leak) or worse, reference the bd elements after the parent transaction has been freed. Although I've not seen any serious consequences, the problem becomes apparent with the previous patch's addition of: gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); to function gfs2_trans_free(). This patch adds logic into gfs2_merge_trans() to move the merged transaction's ail lists to the sdp transaction. This prevents the use-after-free. To do this properly, we need to hold the ail lock, so we pass sdp into the function instead of the transaction itself. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index a7415ab91c5f..3e4734431783 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -1018,8 +1018,10 @@ out: * @new: New transaction to be merged */ -static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) +static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new) { + struct gfs2_trans *old = sdp->sd_log_tr; + WARN_ON_ONCE(!test_bit(TR_ATTACHED, &old->tr_flags)); old->tr_num_buf_new += new->tr_num_buf_new; @@ -1031,6 +1033,11 @@ static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); list_splice_tail_init(&new->tr_buf, &old->tr_buf); + + spin_lock(&sdp->sd_ail_lock); + list_splice_tail_init(&new->tr_ail1_list, &old->tr_ail1_list); + list_splice_tail_init(&new->tr_ail2_list, &old->tr_ail2_list); + spin_unlock(&sdp->sd_ail_lock); } static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) @@ -1042,7 +1049,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_log_lock(sdp); if (sdp->sd_log_tr) { - gfs2_merge_trans(sdp->sd_log_tr, tr); + gfs2_merge_trans(sdp, tr); } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, &tr->tr_flags)); sdp->sd_log_tr = tr; From 178f67b1288b6952117fdc4e5ffbd4c4bd4e4a7f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Jun 2020 14:04:13 +0300 Subject: [PATCH 392/574] ethtool: linkinfo: remove an unnecessary NULL check This code generates a Smatch warning: net/ethtool/linkinfo.c:143 ethnl_set_linkinfo() warn: variable dereferenced before check 'info' (see line 119) Fortunately, the "info" pointer is never NULL so the check can be removed. Signed-off-by: Dan Carpenter Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/linkinfo.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 677068deb68c..5eaf173eaaca 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -140,8 +140,7 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { - if (info) - GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); + GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); goto out_ops; } lsettings = &ksettings.base; From e22437207683a9099d5fb3a1b5cc8f5c7b24a583 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 5 Jun 2020 15:53:24 +0300 Subject: [PATCH 393/574] net: ethernet: mvneta: fix MVNETA_SKB_HEADROOM alignment Commit ca23cb0bc50f ("mvneta: MVNETA_SKB_HEADROOM set last 3 bits to zero") added headroom alignment check against 8. Hovewer (if we imagine that NET_SKB_PAD or XDP_PACKET_HEADROOM is not aligned to cacheline size), it actually aligns headroom down, while skb/xdp_buff headroom should be *at least* equal to one of the values (depending on XDP prog presence). So, fix the check to align the value up. This satisfies both hardware/driver and network stack requirements. Fixes: ca23cb0bc50f ("mvneta: MVNETA_SKB_HEADROOM set last 3 bits to zero") Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 011cd26953d9..4cc9abd61c43 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -325,7 +325,7 @@ cache_line_size()) /* Driver assumes that the last 3 bits are 0 */ -#define MVNETA_SKB_HEADROOM (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) & ~0x7) +#define MVNETA_SKB_HEADROOM ALIGN(max(NET_SKB_PAD, XDP_PACKET_HEADROOM), 8) #define MVNETA_SKB_PAD (SKB_DATA_ALIGN(sizeof(struct skb_shared_info) + \ MVNETA_SKB_HEADROOM)) #define MVNETA_SKB_SIZE(len) (SKB_DATA_ALIGN(len) + MVNETA_SKB_PAD) From e9293c982d88b2d859e26ec42b72c06f20235315 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Fri, 5 Jun 2020 09:01:04 -0500 Subject: [PATCH 394/574] net: dp83869: Fix OF_MDIO config check When CONFIG_OF_MDIO is set to be a module the code block is not compiled. Use the IS_ENABLED macro that checks for both built in as well as module. Fixes: 01db923e83779 ("net: phy: dp83869: Add TI dp83869 phy") Signed-off-by: Dan Murphy Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83869.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index cfb22a21a2e6..df85ae5b79e4 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -176,7 +176,7 @@ static int dp83869_set_strapped_mode(struct phy_device *phydev) return 0; } -#ifdef CONFIG_OF_MDIO +#if IS_ENABLED(CONFIG_OF_MDIO) static int dp83869_of_init(struct phy_device *phydev) { struct dp83869_private *dp83869 = phydev->priv; From 506de00677b84dfc6718cbbd3495b1d90df5d098 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Fri, 5 Jun 2020 09:01:05 -0500 Subject: [PATCH 395/574] net: dp83867: Fix OF_MDIO config check When CONFIG_OF_MDIO is set to be a module the code block is not compiled. Use the IS_ENABLED macro that checks for both built in as well as module. Fixes: 2a10154abcb75 ("net: phy: dp83867: Add TI dp83867 phy") Signed-off-by: Dan Murphy Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 4017ae1692d8..f3c04981b8da 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -488,7 +488,7 @@ static int dp83867_verify_rgmii_cfg(struct phy_device *phydev) return 0; } -#ifdef CONFIG_OF_MDIO +#if IS_ENABLED(CONFIG_OF_MDIO) static int dp83867_of_init(struct phy_device *phydev) { struct dp83867_private *dp83867 = phydev->priv; From 5cd119d9a05f1c1a08778a7305b4ca0f16bc1e20 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Fri, 5 Jun 2020 09:01:06 -0500 Subject: [PATCH 396/574] net: marvell: Fix OF_MDIO config check When CONFIG_OF_MDIO is set to be a module the code block is not compiled. Use the IS_ENABLED macro that checks for both built in as well as module. Fixes: cf41a51db8985 ("of/phylib: Use device tree properties to initialize Marvell PHYs.") Signed-off-by: Dan Murphy Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 4ea226566cec..c9ecf3c8c3fd 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -429,7 +429,7 @@ static int m88e1101_config_aneg(struct phy_device *phydev) return marvell_config_aneg(phydev); } -#ifdef CONFIG_OF_MDIO +#if IS_ENABLED(CONFIG_OF_MDIO) /* Set and/or override some configuration registers based on the * marvell,reg-init property stored in the of_node for the phydev. * From ae602786407fef34e1d66b3c8f278a10ed37197e Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Fri, 5 Jun 2020 09:01:07 -0500 Subject: [PATCH 397/574] net: mscc: Fix OF_MDIO config check When CONFIG_OF_MDIO is set to be a module the code block is not compiled. Use the IS_ENABLED macro that checks for both built in as well as module. Fixes: 4f58e6dceb0e4 ("net: phy: Cleanup the Edge-Rate feature in Microsemi PHYs.") Signed-off-by: Dan Murphy Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/mscc/mscc.h | 2 +- drivers/net/phy/mscc/mscc_main.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h index f828c917b9f7..fbcee5fce7b2 100644 --- a/drivers/net/phy/mscc/mscc.h +++ b/drivers/net/phy/mscc/mscc.h @@ -374,7 +374,7 @@ struct vsc8531_private { #endif }; -#ifdef CONFIG_OF_MDIO +#if IS_ENABLED(CONFIG_OF_MDIO) struct vsc8531_edge_rate_table { u32 vddmac; u32 slowdown[8]; diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index 7ed0285206d0..b9d60bc8b6b2 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -98,7 +98,7 @@ static const struct vsc85xx_hw_stat vsc8584_hw_stats[] = { }, }; -#ifdef CONFIG_OF_MDIO +#if IS_ENABLED(CONFIG_OF_MDIO) static const struct vsc8531_edge_rate_table edge_table[] = { {MSCC_VDDMAC_3300, { 0, 2, 4, 7, 10, 17, 29, 53} }, {MSCC_VDDMAC_2500, { 0, 3, 6, 10, 14, 23, 37, 63} }, @@ -382,7 +382,7 @@ out_unlock: mutex_unlock(&phydev->lock); } -#ifdef CONFIG_OF_MDIO +#if IS_ENABLED(CONFIG_OF_MDIO) static int vsc85xx_edge_rate_magic_get(struct phy_device *phydev) { u32 vdd, sd; From 49113d5e0c3f3fbf409075e69c2d1e157c387ac5 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 5 Jun 2020 16:00:09 +0200 Subject: [PATCH 398/574] net: phy: mscc: fix Serdes configuration in vsc8584_config_init When converting the MSCC PHY driver to shared PHY packages, the Serdes configuration in vsc8584_config_init was modified to use 'base_addr' instead of 'base' as the port number. But 'base_addr' isn't equal to 'addr' for all PHYs inside the package, which leads to the Serdes still being enabled on those ports. This patch fixes it. Fixes: deb04e9c0ff2 ("net: phy: mscc: use phy_package_shared") Signed-off-by: Antoine Tenart Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/mscc/mscc_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index b9d60bc8b6b2..5ddc44f87eaf 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -1396,7 +1396,7 @@ static int vsc8584_config_init(struct phy_device *phydev) /* Disable SerDes for 100Base-FX */ ret = vsc8584_cmd(phydev, PROC_CMD_FIBER_MEDIA_CONF | - PROC_CMD_FIBER_PORT(vsc8531->base_addr) | + PROC_CMD_FIBER_PORT(vsc8531->addr) | PROC_CMD_FIBER_DISABLE | PROC_CMD_READ_MOD_WRITE_PORT | PROC_CMD_RST_CONF_PORT | PROC_CMD_FIBER_100BASE_FX); @@ -1405,7 +1405,7 @@ static int vsc8584_config_init(struct phy_device *phydev) /* Disable SerDes for 1000Base-X */ ret = vsc8584_cmd(phydev, PROC_CMD_FIBER_MEDIA_CONF | - PROC_CMD_FIBER_PORT(vsc8531->base_addr) | + PROC_CMD_FIBER_PORT(vsc8531->addr) | PROC_CMD_FIBER_DISABLE | PROC_CMD_READ_MOD_WRITE_PORT | PROC_CMD_RST_CONF_PORT | PROC_CMD_FIBER_1000BASE_X); From 90c56b3877995d948dc382fe833a1c5eeaaee2ad Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Jun 2020 07:21:22 -0700 Subject: [PATCH 399/574] net: ethtool: Fix comment mentioning typo in IS_ENABLED() This has no code changes, but it's a typo noticed in other clean-ups, so we might as well fix it. IS_ENABLED() takes full names, and should have the "CONFIG_" prefix. Reported-by: Joe Perches Link: https://lore.kernel.org/lkml/b08611018fdb6d88757c6008a5c02fa0e07b32fb.camel@perches.com Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/linux/ethtool_netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 8fbe4f97ffad..1e7bf78cb382 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -67,5 +67,5 @@ static inline int ethnl_cable_test_step(struct phy_device *phydev, u32 first, { return -EOPNOTSUPP; } -#endif /* IS_ENABLED(ETHTOOL_NETLINK) */ +#endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ From fdb4276aae1100aa13f955eff6c944a8200a15ac Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 5 Jun 2020 17:12:41 +0200 Subject: [PATCH 400/574] vsock/vmci: make vmci_vsock_transport_cb() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following gcc-9.3 warning when building with 'make W=1': net/vmw_vsock/vmci_transport.c:2058:6: warning: no previous prototype for ‘vmci_vsock_transport_cb’ [-Wmissing-prototypes] 2058 | void vmci_vsock_transport_cb(bool is_host) | ^~~~~~~~~~~~~~~~~~~~~~~ Fixes: b1bba80a4376 ("vsock/vmci: register vmci_transport only when VMCI guest/host are active") Reported-by: kernel test robot Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 4b8b1150a738..8b65323207db 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -2055,7 +2055,7 @@ static bool vmci_check_transport(struct vsock_sock *vsk) return vsk->transport == &vmci_transport; } -void vmci_vsock_transport_cb(bool is_host) +static void vmci_vsock_transport_cb(bool is_host) { int features; From 6da95b52b8ea4944ed1e596ee6db176ee83ddbeb Mon Sep 17 00:00:00 2001 From: Alok Prasad Date: Fri, 5 Jun 2020 16:30:34 +0000 Subject: [PATCH 401/574] net: qed: fixes crash while running driver in kdump kernel This fixes a crash introduced by recent is_kdump_kernel() check. The source of the crash is that kdump kernel can be loaded on a system with already created VFs. But for such VFs, it will follow a logic path of PF and eventually crash. Thus, we are partially reverting back previous changes and instead use is_kdump_kernel is a single init point of PF init, where we disable SRIOV explicitly. Fixes: 37d4f8a6b41f ("net: qed: Disable SRIOV functionality inside kdump kernel") Cc: Bhupesh Sharma Signed-off-by: Igor Russkikh Signed-off-by: Alok Prasad Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_sriov.c | 4 ++++ drivers/net/ethernet/qlogic/qed/qed_sriov.h | 10 +++------- drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 66876af814c4..20679fd4204b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "qed_cxt.h" #include "qed_hsi.h" @@ -607,6 +608,9 @@ int qed_iov_hw_info(struct qed_hwfn *p_hwfn) int pos; int rc; + if (is_kdump_kernel()) + return 0; + if (IS_VF(p_hwfn->cdev)) return 0; diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h index aabeaf03135e..368e88565783 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h @@ -32,7 +32,6 @@ #ifndef _QED_SRIOV_H #define _QED_SRIOV_H -#include #include #include "qed_vf.h" @@ -41,12 +40,9 @@ #define QED_VF_ARRAY_LENGTH (3) #ifdef CONFIG_QED_SRIOV -#define IS_VF(cdev) (is_kdump_kernel() ? \ - (0) : ((cdev)->b_is_vf)) -#define IS_PF(cdev) (is_kdump_kernel() ? \ - (1) : !((cdev)->b_is_vf)) -#define IS_PF_SRIOV(p_hwfn) (is_kdump_kernel() ? \ - (0) : !!((p_hwfn)->cdev->p_iov_info)) +#define IS_VF(cdev) ((cdev)->b_is_vf) +#define IS_PF(cdev) (!((cdev)->b_is_vf)) +#define IS_PF_SRIOV(p_hwfn) (!!((p_hwfn)->cdev->p_iov_info)) #else #define IS_VF(cdev) (0) #define IS_PF(cdev) (1) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index b2d154258b07..756c05eb96f3 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1265,7 +1265,7 @@ static int qede_probe(struct pci_dev *pdev, const struct pci_device_id *id) case QEDE_PRIVATE_VF: if (debug & QED_LOG_VERBOSE_MASK) dev_err(&pdev->dev, "Probing a VF\n"); - is_vf = is_kdump_kernel() ? false : true; + is_vf = true; break; default: if (debug & QED_LOG_VERBOSE_MASK) From 7f89cc07d22a4152f3c649a89aa8fe240effd24a Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 5 Jun 2020 22:11:44 +0300 Subject: [PATCH 402/574] cxgb4: Use kfree() instead kvfree() where appropriate Use kfree(buf) in blocked_fl_read() because the memory is allocated with kzalloc(). Use kfree(t) in blocked_fl_write() because the memory is allocated with kcalloc(). Signed-off-by: Denis Efremov Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index 41315712deb8..828499256004 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -3357,7 +3357,7 @@ static ssize_t blocked_fl_read(struct file *filp, char __user *ubuf, adap->sge.egr_sz, adap->sge.blocked_fl); len += sprintf(buf + len, "\n"); size = simple_read_from_buffer(ubuf, count, ppos, buf, len); - kvfree(buf); + kfree(buf); return size; } @@ -3374,12 +3374,12 @@ static ssize_t blocked_fl_write(struct file *filp, const char __user *ubuf, err = bitmap_parse_user(ubuf, count, t, adap->sge.egr_sz); if (err) { - kvfree(t); + kfree(t); return err; } bitmap_copy(adap->sge.blocked_fl, t, adap->sge.egr_sz); - kvfree(t); + kfree(t); return count; } From 388d8bdb87e01bcea6d0b2bf797b5f6d7b2401fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 9 Apr 2020 11:40:34 +0100 Subject: [PATCH 403/574] tracing: Remove obsolete PREEMPTIRQ_EVENTS kconfig option The PREEMPTIRQ_EVENTS option is unused after commit c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage"). Remove it. Note that this option is hazardous as it stands. It enables TRACE_IRQFLAGS event on non-preempt configurations without the irqsoff tracer enabled. TRACE_IRQFLAGS as it stands incurs significant overhead on each IRQ entry/exit. This is because trace_hardirqs_[on|off] does all the per-cpu manipulations and NMI checks even if tracing is completely disabled for some insane reason. For example, netperf running UDP_STREAM on localhost incurs a 4-6% performance penalty without any tracing if IRQFLAGS is set. It can be put behind a static brach but even the function entry/exit costs a little bit. Link: https://lkml.kernel.org/r/20200409104034.GJ3818@techsingularity.net Signed-off-by: Mel Gorman Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 75407d5dc83a..0c82ac2c5688 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -258,15 +258,6 @@ config TRACE_PREEMPT_TOGGLE Enables hooks which will be called when preemption is first disabled, and last enabled. -config PREEMPTIRQ_EVENTS - bool "Enable trace events for preempt and irq disable/enable" - select TRACE_IRQFLAGS - select TRACE_PREEMPT_TOGGLE if PREEMPTION - select GENERIC_TRACER - default n - help - Enable tracing of disable and enable events for preemption and irqs. - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n From 4a3084aaa88e70a0af0e38b72ca29af32ec6b9c5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 3 Jun 2020 18:12:43 +1000 Subject: [PATCH 404/574] rhashtable: Drop raw RCU deref in nested_table_free This patch replaces some unnecessary uses of rcu_dereference_raw in the rhashtable code with rcu_dereference_protected. The top-level nested table entry is only marked as RCU because it shares the same type as the tree entries underneath it. So it doesn't need any RCU protection. We also don't need RCU protection when we're freeing a nested RCU table because by this stage we've long passed a memory barrier when anyone could change the nested table. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- lib/rhashtable.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index bdb7e4cadf05..9f6890aedd1a 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -63,13 +63,22 @@ EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #define ASSERT_RHT_MUTEX(HT) #endif +static inline union nested_table *nested_table_top( + const struct bucket_table *tbl) +{ + /* The top-level bucket entry does not need RCU protection + * because it's set at the same time as tbl->nest. + */ + return (void *)rcu_dereference_protected(tbl->buckets[0], 1); +} + static void nested_table_free(union nested_table *ntbl, unsigned int size) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); const unsigned int len = 1 << shift; unsigned int i; - ntbl = rcu_dereference_raw(ntbl->table); + ntbl = rcu_dereference_protected(ntbl->table, 1); if (!ntbl) return; @@ -89,7 +98,7 @@ static void nested_bucket_table_free(const struct bucket_table *tbl) union nested_table *ntbl; unsigned int i; - ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + ntbl = nested_table_top(tbl); for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); @@ -1173,7 +1182,7 @@ struct rhash_lock_head **__rht_bucket_nested(const struct bucket_table *tbl, unsigned int subhash = hash; union nested_table *ntbl; - ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + ntbl = nested_table_top(tbl); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); subhash >>= tbl->nest; @@ -1213,7 +1222,7 @@ struct rhash_lock_head **rht_bucket_nested_insert(struct rhashtable *ht, unsigned int size = tbl->size >> tbl->nest; union nested_table *ntbl; - ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + ntbl = nested_table_top(tbl); hash >>= tbl->nest; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); From 4e2905adac9f0fdc25154ac83719a986c2367a14 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Fri, 5 Jun 2020 15:51:03 -0500 Subject: [PATCH 405/574] net: dp83869: Reset return variable if PHY strap is read When the PHY's strap register is read to determine if lane swapping is needed the phy_read_mmd returns the value back into the ret variable. If the call to read the strap fails the failed value is returned. If the call to read the strap is successful then ret is possibly set to a non-zero positive number. Without reseting the ret value to 0 this will cause the parse DT function to return a failure. Fixes: c4566aec6e808 ("net: phy: dp83869: Update port-mirroring to read straps") Signed-off-by: Dan Murphy Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/dp83869.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index df85ae5b79e4..53ed3abc26c9 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -218,10 +218,13 @@ static int dp83869_of_init(struct phy_device *phydev) ret = phy_read_mmd(phydev, DP83869_DEVADDR, DP83869_STRAP_STS1); if (ret < 0) return ret; + if (ret & DP83869_STRAP_MIRROR_ENABLED) dp83869->port_mirroring = DP83869_PORT_MIRRORING_EN; else dp83869->port_mirroring = DP83869_PORT_MIRRORING_DIS; + + ret = 0; } if (of_property_read_u32(of_node, "rx-fifo-depth", From 9d964e1b82d8182184153b70174f445ea616f053 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 6 Jun 2020 23:44:24 -0400 Subject: [PATCH 406/574] fix a braino in "sparc32: fix register window handling in genregs32_[gs]et()" lost npc in PTRACE_SETREGSET, breaking PTRACE_SETREGS as well Fixes: cf51e129b968 "sparc32: fix register window handling in genregs32_[gs]et()" Signed-off-by: Al Viro --- arch/sparc/kernel/ptrace_32.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c index 60f7205ebe40..646dd58169ec 100644 --- a/arch/sparc/kernel/ptrace_32.c +++ b/arch/sparc/kernel/ptrace_32.c @@ -168,12 +168,17 @@ static int genregs32_set(struct task_struct *target, if (ret || !count) return ret; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - ®s->y, + ®s->npc, 34 * sizeof(u32), 35 * sizeof(u32)); if (ret || !count) return ret; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + ®s->y, + 35 * sizeof(u32), 36 * sizeof(u32)); + if (ret || !count) + return ret; return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, - 35 * sizeof(u32), 38 * sizeof(u32)); + 36 * sizeof(u32), 38 * sizeof(u32)); } static int fpregs32_get(struct task_struct *target, From 2dc2f760052da4925482ecdcdc5c94d4a599153c Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Sun, 7 Jun 2020 11:10:27 +0300 Subject: [PATCH 407/574] mlxsw: core: Use different get_trend() callbacks for different thermal zones The driver registers three different types of thermal zones: For the ASIC itself, for port modules and for gearboxes. Currently, all three types use the same get_trend() callback which does not work correctly for the ASIC thermal zone. The callback assumes that the device data is of type 'struct mlxsw_thermal_module', whereas for the ASIC thermal zone 'struct mlxsw_thermal' is passed as device data. Fix this by using one get_trend() callback for the ASIC thermal zone and another for the other two types. Fixes: 6f73862fabd9 ("mlxsw: core: Add the hottest thermal zone detection") Signed-off-by: Vadim Pasternak Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/core_thermal.c | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c index ce0a6837daa3..05f8d5a92862 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c @@ -391,8 +391,7 @@ static int mlxsw_thermal_set_trip_hyst(struct thermal_zone_device *tzdev, static int mlxsw_thermal_trend_get(struct thermal_zone_device *tzdev, int trip, enum thermal_trend *trend) { - struct mlxsw_thermal_module *tz = tzdev->devdata; - struct mlxsw_thermal *thermal = tz->parent; + struct mlxsw_thermal *thermal = tzdev->devdata; if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS) return -EINVAL; @@ -593,6 +592,22 @@ mlxsw_thermal_module_trip_hyst_set(struct thermal_zone_device *tzdev, int trip, return 0; } +static int mlxsw_thermal_module_trend_get(struct thermal_zone_device *tzdev, + int trip, enum thermal_trend *trend) +{ + struct mlxsw_thermal_module *tz = tzdev->devdata; + struct mlxsw_thermal *thermal = tz->parent; + + if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS) + return -EINVAL; + + if (tzdev == thermal->tz_highest_dev) + return 1; + + *trend = THERMAL_TREND_STABLE; + return 0; +} + static struct thermal_zone_device_ops mlxsw_thermal_module_ops = { .bind = mlxsw_thermal_module_bind, .unbind = mlxsw_thermal_module_unbind, @@ -604,7 +619,7 @@ static struct thermal_zone_device_ops mlxsw_thermal_module_ops = { .set_trip_temp = mlxsw_thermal_module_trip_temp_set, .get_trip_hyst = mlxsw_thermal_module_trip_hyst_get, .set_trip_hyst = mlxsw_thermal_module_trip_hyst_set, - .get_trend = mlxsw_thermal_trend_get, + .get_trend = mlxsw_thermal_module_trend_get, }; static int mlxsw_thermal_gearbox_temp_get(struct thermal_zone_device *tzdev, @@ -643,7 +658,7 @@ static struct thermal_zone_device_ops mlxsw_thermal_gearbox_ops = { .set_trip_temp = mlxsw_thermal_module_trip_temp_set, .get_trip_hyst = mlxsw_thermal_module_trip_hyst_get, .set_trip_hyst = mlxsw_thermal_module_trip_hyst_set, - .get_trend = mlxsw_thermal_trend_get, + .get_trend = mlxsw_thermal_module_trend_get, }; static int mlxsw_thermal_get_max_state(struct thermal_cooling_device *cdev, From 4d3da2d8d91f66988a829a18a0ce59945e8ae4fb Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 7 Jun 2020 15:02:58 +0200 Subject: [PATCH 408/574] net: dsa: lantiq_gswip: fix and improve the unsupported interface error While trying to use the lantiq_gswip driver on one of my boards I made a mistake when specifying the phy-mode (because the out-of-tree driver wants phy-mode "gmii" or "mii" for the internal PHYs). In this case the following error is printed multiple times: Unsupported interface: 3 While it gives at least a hint at what may be wrong it is not very user friendly. Print the human readable phy-mode and also which port is configured incorrectly (this hardware supports ports 0..6) to improve the cases where someone made a mistake. Fixes: 14fceff4771e51 ("net: dsa: Add Lantiq / Intel DSA driver for vrx200") Signed-off-by: Martin Blumenstingl Acked-by: Hauke Mehrtens Signed-off-by: David S. Miller --- drivers/net/dsa/lantiq_gswip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index cf6fa8fede33..521ebc072903 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -1452,7 +1452,8 @@ static void gswip_phylink_validate(struct dsa_switch *ds, int port, unsupported: bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); - dev_err(ds->dev, "Unsupported interface: %d\n", state->interface); + dev_err(ds->dev, "Unsupported interface '%s' for port %d\n", + phy_modes(state->interface), port); return; } From 5fc475b749c72e0b3f3991b33a90d302f52ae746 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Jun 2020 13:40:45 -0700 Subject: [PATCH 409/574] vfs: do not do group lookup when not necessary Rasmus Villemoes points out that the 'in_group_p()' tests can be a noticeable expense, and often completely unnecessary. A common situation is that the 'group' bits are the same as the 'other' bits wrt the permissions we want to test. So rewrite 'acl_permission_check()' to not bother checking for group ownership when the permission check doesn't care. For example, if we're asking for read permissions, and both 'group' and 'other' allow reading, there's really no reason to check if we're part of the group or not: either way, we'll allow it. Rasmus says: "On a bog-standard Ubuntu 20.04 install, a workload consisting of compiling lots of userspace programs (i.e., calling lots of short-lived programs that all need to get their shared libs mapped in, and the compilers poking around looking for system headers - lots of /usr/lib, /usr/bin, /usr/include/ accesses) puts in_group_p around 0.1% according to perf top. System-installed files are almost always 0755 (directories and binaries) or 0644, so in most cases, we can avoid the binary search and the cost of pulling the cred->groups array and in_group_p() .text into the cpu cache" Reported-by: Rasmus Villemoes Signed-off-by: Linus Torvalds --- fs/namei.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d81f73ff1a8b..e74a7849e9b5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -288,37 +288,51 @@ static int check_acl(struct inode *inode, int mask) } /* - * This does the basic permission checking + * This does the basic UNIX permission checking. + * + * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit, + * for RCU walking. */ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; - if (likely(uid_eq(current_fsuid(), inode->i_uid))) + /* Are we the owner? If so, ACL's don't matter */ + if (likely(uid_eq(current_fsuid(), inode->i_uid))) { + mask &= 7; mode >>= 6; - else { - if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { - int error = check_acl(inode, mask); - if (error != -EAGAIN) - return error; - } + return (mask & ~mode) ? -EACCES : 0; + } + /* Do we have ACL's? */ + if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { + int error = check_acl(inode, mask); + if (error != -EAGAIN) + return error; + } + + /* Only RWX matters for group/other mode bits */ + mask &= 7; + + /* + * Are the group permissions different from + * the other permissions in the bits we care + * about? Need to check group ownership if so. + */ + if (mask & (mode ^ (mode >> 3))) { if (in_group_p(inode->i_gid)) mode >>= 3; } - /* - * If the DACs are ok we don't need any capability check. - */ - if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) - return 0; - return -EACCES; + /* Bits in 'mode' clear that we require? */ + return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, + * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions From 63d72b93f2262900c8de74ad0f5a58e0d452c9d3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 7 Jun 2020 12:19:06 -0700 Subject: [PATCH 410/574] vfs: clean up posix_acl_permission() logic aroudn MAY_NOT_BLOCK posix_acl_permission() does not care about MAY_NOT_BLOCK, and in fact the permission logic internally must not check that bit (it's only for upper layers to decide whether they can block to do IO to look up the acl information or not). But the way the code was written, it _looked_ like it cared, since the function explicitly did not mask that bit off. But it has exactly two callers: one for when that bit is set, which first clears the bit before calling posix_acl_permission(), and the other call site when that bit was clear. So stop the silly games "saving" the MAY_NOT_BLOCK bit that must not be used for the actual permission test, and that currently is pointlessly cleared by the callers when the function itself should just not care. Signed-off-by: Linus Torvalds --- fs/namei.c | 2 +- fs/posix_acl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index e74a7849e9b5..72d4219c93ac 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -271,7 +271,7 @@ static int check_acl(struct inode *inode, int mask) /* no ->get_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; - return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); + return posix_acl_permission(inode, acl, mask); } acl = get_acl(inode, ACL_TYPE_ACCESS); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 249672bf54fe..95882b3f5f62 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -350,7 +350,7 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; - want &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; + want &= MAY_READ | MAY_WRITE | MAY_EXEC; FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { From 92fb1db26eefc11554820f11ce8e92007da2fbf4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 7 Jun 2020 21:40:04 -0700 Subject: [PATCH 411/574] mm/page_idle.c: skip offline pages 'Idle page tracking' users can pass random pfn that might be mapped to an offline page. To avoid accessing such pages, this commit modifies the 'page_idle_get_page()' to use 'pfn_to_online_page()' instead of 'pfn_valid()' and 'pfn_to_page()' combination, so that the pfn mapped to an offline page can be skipped. Reported-by: David Hildenbrand Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Link: http://lkml.kernel.org/r/20200605092502.18018-2-sjpark@amazon.com Signed-off-by: Linus Torvalds --- mm/page_idle.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/page_idle.c b/mm/page_idle.c index 295512465065..057c61df12db 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -30,13 +31,9 @@ */ static struct page *page_idle_get_page(unsigned long pfn) { - struct page *page; + struct page *page = pfn_to_online_page(pfn); pg_data_t *pgdat; - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); if (!page || !PageLRU(page) || !get_page_unless_zero(page)) return NULL; From 4b78e2013a374f379457ceba7dc860d9fc28156b Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sun, 7 Jun 2020 21:40:07 -0700 Subject: [PATCH 412/574] ipc/msg: add missing annotation for freeque() Sparse reports a warning at freeque() warning: context imbalance in freeque() - unexpected unlock The root cause is the missing annotation at freeque() Add the missing __releases(RCU) annotation Add the missing __releases(&msq->q_perm) annotation Signed-off-by: Jules Irenge Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Boqun Feng Cc: Lu Shuaibing Cc: Nathan Chancellor Cc: Manfred Spraul Cc: Davidlohr Bueso Link: http://lkml.kernel.org/r/20200403160505.2832-2-jbi.octave@gmail.com Signed-off-by: Linus Torvalds --- ipc/msg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ipc/msg.c b/ipc/msg.c index caca67368cb5..acd1bc7af55a 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -268,6 +268,8 @@ static void expunge_all(struct msg_queue *msq, int res, * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) + __releases(RCU) + __releases(&msq->q_perm) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); From e1eb26fa62d04ec0955432be1aa8722a97cb52e7 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Sun, 7 Jun 2020 21:40:10 -0700 Subject: [PATCH 413/574] ipc/namespace.c: use a work queue to free_ipc the reason is to avoid a delay caused by the synchronize_rcu() call in kern_umount() when the mqueue mount is freed. the code: #define _GNU_SOURCE #include #include #include #include int main() { int i; for (i = 0; i < 1000; i++) if (unshare(CLONE_NEWIPC) < 0) error(EXIT_FAILURE, errno, "unshare"); } goes from Command being timed: "./ipc-namespace" User time (seconds): 0.00 System time (seconds): 0.06 Percent of CPU this job got: 0% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:08.05 to Command being timed: "./ipc-namespace" User time (seconds): 0.00 System time (seconds): 0.02 Percent of CPU this job got: 96% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.03 Signed-off-by: Giuseppe Scrivano Signed-off-by: Andrew Morton Reviewed-by: Paul E. McKenney Reviewed-by: Waiman Long Cc: Davidlohr Bueso Cc: Manfred Spraul Link: http://lkml.kernel.org/r/20200225145419.527994-1-gscrivan@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 ++ ipc/namespace.c | 24 ++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c309f43bde45..a06a78c67f19 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -68,6 +68,8 @@ struct ipc_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; + struct llist_node mnt_llist; + struct ns_common ns; } __randomize_layout; diff --git a/ipc/namespace.c b/ipc/namespace.c index fdc3b5f3f53a..24e7b45320f7 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -117,6 +117,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { + /* mq_put_mnt() waits for a grace period as kern_unmount() + * uses synchronize_rcu(). + */ + mq_put_mnt(ns); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); @@ -127,6 +131,21 @@ static void free_ipc_ns(struct ipc_namespace *ns) kfree(ns); } +static LLIST_HEAD(free_ipc_list); +static void free_ipc(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&free_ipc_list); + struct ipc_namespace *n, *t; + + llist_for_each_entry_safe(n, t, node, mnt_llist) + free_ipc_ns(n); +} + +/* + * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. + */ +static DECLARE_WORK(free_ipc_work, free_ipc); + /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put @@ -148,8 +167,9 @@ void put_ipc_ns(struct ipc_namespace *ns) if (refcount_dec_and_lock(&ns->count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); - mq_put_mnt(ns); - free_ipc_ns(ns); + + if (llist_add(&ns->mnt_llist, &free_ipc_list)) + schedule_work(&free_ipc_work); } } From ceabef7dd71720aef58bd182943352c9c307a3de Mon Sep 17 00:00:00 2001 From: Orson Zhai Date: Sun, 7 Jun 2020 21:40:14 -0700 Subject: [PATCH 414/574] dynamic_debug: add an option to enable dynamic debug for modules only Instead of enabling dynamic debug globally with CONFIG_DYNAMIC_DEBUG, CONFIG_DYNAMIC_DEBUG_CORE will only enable core function of dynamic debug. With the DYNAMIC_DEBUG_MODULE defined for any modules, dynamic debug will be tied to them. This is useful for people who only want to enable dynamic debug for kernel modules without worrying about kernel image size and memory consumption is increasing too much. [orson.zhai@unisoc.com: v2] Link: http://lkml.kernel.org/r/1587408228-10861-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Orson Zhai Signed-off-by: Andrew Morton Acked-by: Greg Kroah-Hartman Acked-by: Petr Mladek Cc: Jonathan Corbet Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Baron Cc: Randy Dunlap Link: http://lkml.kernel.org/r/1586521984-5890-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/dynamic-debug-howto.rst | 5 +++++ include/linux/dev_printk.h | 6 ++++-- include/linux/dynamic_debug.h | 2 +- include/linux/net.h | 3 ++- include/linux/netdevice.h | 6 ++++-- include/linux/printk.h | 9 ++++++--- include/rdma/ib_verbs.h | 6 ++++-- lib/Kconfig.debug | 12 ++++++++++++ lib/Makefile | 2 +- lib/dynamic_debug.c | 9 +++++++-- 10 files changed, 46 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5..1012bd9305e9 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. +If you do not want to enable dynamic debug globally (i.e. in some embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic +debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. + If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 5aad06b4ca7b..3028b644b4fb 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -109,7 +109,8 @@ void _dev_info(const struct device *dev, const char *fmt, ...) #define dev_info(dev, fmt, ...) \ _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define dev_dbg(dev, fmt, ...) \ dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__) #elif defined(DEBUG) @@ -181,7 +182,8 @@ do { \ dev_level_ratelimited(dev_notice, dev, fmt, ##__VA_ARGS__) #define dev_info_ratelimited(dev, fmt, ...) \ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define dev_dbg_ratelimited(dev, fmt, ...) \ do { \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 4cf02ecd67de..abcd5fde30eb 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -48,7 +48,7 @@ struct _ddebug { -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); diff --git a/include/linux/net.h b/include/linux/net.h index e10f378194a5..016a9c5faa34 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -264,7 +264,8 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36..5b364a2e0006 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,8 @@ do { \ #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netdev_dbg(__dev, format, args...) \ do { \ dynamic_netdev_dbg(__dev, format, ##args); \ @@ -5012,7 +5013,8 @@ do { \ #define netif_info(priv, type, dev, fmt, args...) \ netif_level(info, priv, type, dev, fmt, ##args) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netif_dbg(priv, type, netdev, format, args...) \ do { \ if (netif_msg_##type(priv)) \ diff --git a/include/linux/printk.h b/include/linux/printk.h index 3cc2f178bf06..fc8f03c54543 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -399,7 +399,8 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include /** @@ -535,7 +536,8 @@ extern int kptr_restrict; #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \ @@ -582,7 +584,8 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, #endif -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 033e7044f29c..ef2f3986c493 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -100,7 +100,8 @@ void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else @@ -133,7 +134,8 @@ do { \ #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9bd4eb7f5ec1..333e878d8af9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -99,6 +99,7 @@ config DYNAMIC_DEBUG default n depends on PRINTK depends on (DEBUG_FS || PROC_FS) + select DYNAMIC_DEBUG_CORE help Compiles debug level messages into the kernel, which would not @@ -165,6 +166,17 @@ config DYNAMIC_DEBUG See Documentation/admin-guide/dynamic-debug-howto.rst for additional information. +config DYNAMIC_DEBUG_CORE + bool "Enable core function of dynamic debug support" + depends on PRINTK + depends on (DEBUG_FS || PROC_FS) + help + Enable core functional support of dynamic debug. It is useful + when you want to tie dynamic debug to your kernel modules with + DYNAMIC_DEBUG_MODULE defined for each of them, especially for + the case of embedded system where the kernel image size is + sensitive for people. + config SYMBOLIC_ERRNAME bool "Support symbolic error names in printf" default y if PRINTK diff --git a/lib/Makefile b/lib/Makefile index 32f19b4d1d2a..315516fa4ef4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -190,7 +190,7 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o +obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o obj-$(CONFIG_NLATTR) += nlattr.o diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8f199f403ab5..321437bbf87d 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1032,8 +1032,13 @@ static int __init dynamic_debug_init(void) int verbose_bytes = 0; if (&__start___verbose == &__stop___verbose) { - pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); - return 1; + if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { + pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); + return 1; + } + pr_info("Ignore empty _ddebug table in a CONFIG_DYNAMIC_DEBUG_CORE build\n"); + ddebug_init_success = 1; + return 0; } iter = __start___verbose; modname = iter->modname; From db38d5c106dfdd7cb7207c83267d82fdf4950b61 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:17 -0700 Subject: [PATCH 415/574] kernel: add panic_on_taint Analogously to the introduction of panic_on_warn, this patch introduces a kernel option named panic_on_taint in order to provide a simple and generic way to stop execution and catch a coredump when the kernel gets tainted by any given flag. This is useful for debugging sessions as it avoids having to rebuild the kernel to explicitly add calls to panic() into the code sites that introduce the taint flags of interest. For instance, if one is interested in proceeding with a post-mortem analysis at the point a given code path is hitting a bad page (i.e. unaccount_page_cache_page(), or slab_bug()), a coredump can be collected by rebooting the kernel with 'panic_on_taint=0x20' amended to the command line. Another, perhaps less frequent, use for this option would be as a means for assuring a security policy case where only a subset of taints, or no single taint (in paranoid mode), is allowed for the running system. The optional switch 'nousertaint' is handy in this particular scenario, as it will avoid userspace induced crashes by writes to sysctl interface /proc/sys/kernel/tainted causing false positive hits for such policies. [akpm@linux-foundation.org: tweak kernel-parameters.txt wording] Suggested-by: Qian Cai Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Dave Young Cc: Baoquan He Cc: Jonathan Corbet Cc: Kees Cook Cc: Randy Dunlap Cc: "Theodore Ts'o" Cc: Adrian Bunk Cc: Greg Kroah-Hartman Cc: Laura Abbott Cc: Jeff Mahoney Cc: Jiri Kosina Cc: Takashi Iwai Link: http://lkml.kernel.org/r/20200515175502.146720-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kdump/kdump.rst | 8 +++++ .../admin-guide/kernel-parameters.txt | 13 +++++++ Documentation/admin-guide/sysctl/kernel.rst | 7 ++++ include/linux/kernel.h | 3 ++ kernel/panic.c | 34 +++++++++++++++++++ kernel/sysctl.c | 11 +++++- 6 files changed, 75 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index ac7e131d2935..2da65fef2a1c 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -521,6 +521,14 @@ will cause a kdump to occur at the panic() call. In cases where a user wants to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 to achieve the same behaviour. +Trigger Kdump on add_taint() +============================ + +The kernel parameter panic_on_taint facilitates a conditional call to panic() +from within add_taint() whenever the value set in this bitmask matches with the +bit flag being set by add_taint(). +This will cause a kdump to occur at the add_taint()->panic() call. + Contact ======= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f3eeecbb3f63..df9b0fe2ed60 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3447,6 +3447,19 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer + panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + Format: [,nousertaint] + Hexadecimal bitmask representing the set of TAINT flags + that will cause the kernel to panic when add_taint() is + called with any of the flags in this set. + The optional switch "nousertaint" can be utilized to + prevent userspace forced crashes by writing to sysctl + /proc/sys/kernel/tainted any flagset matching with the + bitmask set on panic_on_taint. + See Documentation/admin-guide/tainted-kernels.rst for + extra details on the taint flags that users can pick + to compose the bitmask to assign to panic_on_taint. + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1ebf68d01141..3b00b9223157 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1239,6 +1239,13 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. See :doc:`/admin-guide/tainted-kernels` for more information. +Note: + writes to this sysctl interface will fail with ``EINVAL`` if the kernel is + booted with the command line option ``panic_on_taint=,nousertaint`` + and any of the ORed together values being written to ``tainted`` match with + the bitmask declared on panic_on_taint. + See :doc:`/admin-guide/kernel-parameters` for more details on that particular + kernel command line option and its optional ``nousertaint`` switch. threads-max =========== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9b7a8d74a9d6..f7835db7102e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -528,6 +528,8 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern unsigned long panic_on_taint; +extern bool panic_on_taint_nousertaint; extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; @@ -596,6 +598,7 @@ extern enum system_states { #define TAINT_AUX 16 #define TAINT_RANDSTRUCT 17 #define TAINT_FLAGS_COUNT 18 +#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index b69ee9e76cb2..94b5c973770c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -44,6 +44,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +unsigned long panic_on_taint; +bool panic_on_taint_nousertaint = false; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -434,6 +436,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); + + if (tainted_mask & panic_on_taint) { + panic_on_taint = 0; + panic("panic_on_taint set ..."); + } } EXPORT_SYMBOL(add_taint); @@ -686,3 +693,30 @@ static int __init oops_setup(char *s) return 0; } early_param("oops", oops_setup); + +static int __init panic_on_taint_setup(char *s) +{ + char *taint_str; + + if (!s) + return -EINVAL; + + taint_str = strsep(&s, ","); + if (kstrtoul(taint_str, 16, &panic_on_taint)) + return -EINVAL; + + /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ + panic_on_taint &= TAINT_FLAGS_MAX; + + if (!panic_on_taint) + return -EINVAL; + + if (s && !strcmp(s, "nousertaint")) + panic_on_taint_nousertaint = true; + + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", + panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + + return 0; +} +early_param("panic_on_taint", panic_on_taint_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 715774d8c55f..587ed0494f2f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -866,11 +866,20 @@ static int proc_taint(struct ctl_table *table, int write, return err; if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + /* * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) add_taint(i, LOCKDEP_STILL_OK); From 01f39c1c11ee5bf44a1df49e47eb53a86515b9dc Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Jun 2020 21:40:20 -0700 Subject: [PATCH 416/574] xarray.h: correct return code documentation for xa_store_{bh,irq}() __xa_store() and xa_store() document that the functions can fail, and that the return code can be an xa_err() encoded error code. xa_store_bh() and xa_store_irq() do not document that the functions can fail and that they can also return xa_err() encoded error codes. Thus: Update the documentation. Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200430111424.16634-1-manfred@colorfullife.com Signed-off-by: Linus Torvalds --- include/linux/xarray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 14c893433139..b4d70e7568b2 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -576,7 +576,7 @@ void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) @@ -602,7 +602,7 @@ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) From 3db978d480e2843979a2b56f2f7da726f2b295b2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:24 -0700 Subject: [PATCH 417/574] kernel/sysctl: support setting sysctl parameters from kernel command line Patch series "support setting sysctl parameters from kernel command line", v3. This series adds support for something that seems like many people always wanted but nobody added it yet, so here's the ability to set sysctl parameters via kernel command line options in the form of sysctl.vm.something=1 The important part is Patch 1. The second, not so important part is an attempt to clean up legacy one-off parameters that do the same thing as a sysctl. I don't want to remove them completely for compatibility reasons, but with generic sysctl support the idea is to remove the one-off param handlers and treat the parameters as aliases for the sysctl variants. I have identified several parameters that mention sysctl counterparts in Documentation/admin-guide/kernel-parameters.txt but there might be more. The conversion also has varying level of success: - numa_zonelist_order is converted in Patch 2 together with adding the necessary infrastructure. It's easy as it doesn't really do anything but warn on deprecated value these days. - hung_task_panic is converted in Patch 3, but there's a downside that now it only accepts 0 and 1, while previously it was any integer value - nmi_watchdog maps to two sysctls nmi_watchdog and hardlockup_panic, so there's no straighforward conversion possible - traceoff_on_warning is a flag without value and it would be required to handle that somehow in the conversion infractructure, which seems pointless for a single flag This patch (of 5): A recently proposed patch to add vm_swappiness command line parameter in addition to existing sysctl [1] made me wonder why we don't have a general support for passing sysctl parameters via command line. Googling found only somebody else wondering the same [2], but I haven't found any prior discussion with reasons why not to do this. Settings the vm_swappiness issue aside (the underlying issue might be solved in a different way), quick search of kernel-parameters.txt shows there are already some that exist as both sysctl and kernel parameter - hung_task_panic, nmi_watchdog, numa_zonelist_order, traceoff_on_warning. A general mechanism would remove the need to add more of those one-offs and might be handy in situations where configuration by e.g. /etc/sysctl.d/ is impractical. Hence, this patch adds a new parse_args() pass that looks for parameters prefixed by 'sysctl.' and tries to interpret them as writes to the corresponding sys/ files using an temporary in-kernel procfs mount. This mechanism was suggested by Eric W. Biederman [3], as it handles all dynamically registered sysctl tables, even though we don't handle modular sysctls. Errors due to e.g. invalid parameter name or value are reported in the kernel log. The processing is hooked right before the init process is loaded, as some handlers might be more complicated than simple setters and might need some subsystems to be initialized. At the moment the init process can be started and eventually execute a process writing to /proc/sys/ then it should be also fine to do that from the kernel. Sysctls registered later on module load time are not set by this mechanism - it's expected that in such scenarios, setting sysctl values from userspace is practical enough. [1] https://lore.kernel.org/r/BL0PR02MB560167492CA4094C91589930E9FC0@BL0PR02MB5601.namprd02.prod.outlook.com/ [2] https://unix.stackexchange.com/questions/558802/how-to-set-sysctl-using-kernel-command-line-parameter [3] https://lore.kernel.org/r/87bloj2skm.fsf@x220.int.ebiederm.org/ Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Reviewed-by: Masami Hiramatsu Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: "Eric W . Biederman" Cc: "Guilherme G . Piccoli" Cc: Alexey Dobriyan Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Christian Brauner Link: http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Link: http://lkml.kernel.org/r/20200427180433.7029-2-vbabka@suse.cz Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 9 ++ fs/proc/proc_sysctl.c | 107 ++++++++++++++++++ include/linux/sysctl.h | 4 + init/main.c | 2 + 4 files changed, 122 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index df9b0fe2ed60..d02b9d023660 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4969,6 +4969,15 @@ switches= [HW,M68k] + sysctl.*= [KNL] + Set a sysctl parameter, right before loading the init + process, as if the value was written to the respective + /proc/sys/... file. Both '.' and '/' are recognized as + separators. Unrecognized parameters and invalid values + are reported in the kernel log. Sysctls registered + later by a loaded module cannot be set this way. + Example: sysctl.vm.swappiness=40 + sysfs.deprecated=0|1 [KNL] Enable/disable old style sysfs layout for old udev on older distributions. When this option is enabled diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index df2143e05c57..973acf96f37c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1703,3 +1704,109 @@ int __init proc_sys_init(void) return sysctl_init(); } + +/* Set sysctl value passed on kernel command line. */ +static int process_sysctl_arg(char *param, char *val, + const char *unused, void *arg) +{ + char *path; + struct vfsmount **proc_mnt = arg; + struct file_system_type *proc_fs_type; + struct file *file; + int len; + int err; + loff_t pos = 0; + ssize_t wret; + + if (strncmp(param, "sysctl", sizeof("sysctl") - 1)) + return 0; + + param += sizeof("sysctl") - 1; + + if (param[0] != '/' && param[0] != '.') + return 0; + + param++; + + /* + * To set sysctl options, we use a temporary mount of proc, look up the + * respective sys/ file and write to it. To avoid mounting it when no + * options were given, we mount it only when the first sysctl option is + * found. Why not a persistent mount? There are problems with a + * persistent mount of proc in that it forces userspace not to use any + * proc mount options. + */ + if (!*proc_mnt) { + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) { + pr_err("Failed to find procfs to set sysctl from command line\n"); + return 0; + } + *proc_mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(*proc_mnt)) { + pr_err("Failed to mount procfs to set sysctl from command line\n"); + return 0; + } + } + + path = kasprintf(GFP_KERNEL, "sys/%s", param); + if (!path) + panic("%s: Failed to allocate path for %s\n", __func__, param); + strreplace(path, '.', '/'); + + file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + if (IS_ERR(file)) { + err = PTR_ERR(file); + if (err == -ENOENT) + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", + param, val); + else if (err == -EACCES) + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", + param, val); + else + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", + file, param, val); + goto out; + } + len = strlen(val); + wret = kernel_write(file, val, len, &pos); + if (wret < 0) { + err = wret; + if (err == -EINVAL) + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", + param, val); + else + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); + } else if (wret != len) { + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", + wret, len, path, param, val); + } + + err = filp_close(file, NULL); + if (err) + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", + ERR_PTR(err), param, val); +out: + kfree(path); + return 0; +} + +void do_sysctl_args(void) +{ + char *command_line; + struct vfsmount *proc_mnt = NULL; + + command_line = kstrdup(saved_command_line, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate copy of command line\n", __func__); + + parse_args("Setting sysctl args", command_line, + NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); + + if (proc_mnt) + kern_unmount(proc_mnt); + + kfree(command_line); +} diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f2401e45a3c2..50bb7f383a1b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -197,6 +197,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +void do_sysctl_args(void); extern int pwrsw_enabled; extern int unaligned_enabled; @@ -235,6 +236,9 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, { } +static inline void do_sysctl_args(void) +{ +} #endif /* CONFIG_SYSCTL */ int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, diff --git a/init/main.c b/init/main.c index 76df62fc3e2c..b59e09353881 100644 --- a/init/main.c +++ b/init/main.c @@ -1412,6 +1412,8 @@ static int __ref kernel_init(void *unused) rcu_end_inkernel_boot(); + do_sysctl_args(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) From 0a477e1ae21b28267b9bd8599f75c115291b1666 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:27 -0700 Subject: [PATCH 418/574] kernel/sysctl: support handling command line aliases We can now handle sysctl parameters on kernel command line, but historically some parameters introduced their own command line equivalent, which we don't want to remove for compatibility reasons. We can, however, convert them to the generic infrastructure with a table translating the legacy command line parameters to their sysctl names, and removing the one-off param handlers. This patch adds the support and makes the first conversion to demonstrate it, on the (deprecated) numa_zonelist_order parameter. Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-3-vbabka@suse.cz Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 48 ++++++++++++++++++++++++++++++++++++------- mm/page_alloc.c | 9 -------- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 973acf96f37c..124298168f8b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1705,6 +1705,37 @@ int __init proc_sys_init(void) return sysctl_init(); } +struct sysctl_alias { + const char *kernel_param; + const char *sysctl_param; +}; + +/* + * Historically some settings had both sysctl and a command line parameter. + * With the generic sysctl. parameter support, we can handle them at a single + * place and only keep the historical name for compatibility. This is not meant + * to add brand new aliases. When adding existing aliases, consider whether + * the possibly different moment of changing the value (e.g. from early_param + * to the moment do_sysctl_args() is called) is an issue for the specific + * parameter. + */ +static const struct sysctl_alias sysctl_aliases[] = { + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + { } +}; + +static const char *sysctl_find_alias(char *param) +{ + const struct sysctl_alias *alias; + + for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { + if (strcmp(alias->kernel_param, param) == 0) + return alias->sysctl_param; + } + + return NULL; +} + /* Set sysctl value passed on kernel command line. */ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) @@ -1718,15 +1749,18 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; - if (strncmp(param, "sysctl", sizeof("sysctl") - 1)) - return 0; + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { + param += sizeof("sysctl") - 1; - param += sizeof("sysctl") - 1; + if (param[0] != '/' && param[0] != '.') + return 0; - if (param[0] != '/' && param[0] != '.') - return 0; - - param++; + param++; + } else { + param = (char *) sysctl_find_alias(param); + if (!param) + return 0; + } /* * To set sysctl options, we use a temporary mount of proc, look up the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07ae77d97952..727751219003 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5575,15 +5575,6 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -static __init int setup_numa_zonelist_order(char *s) -{ - if (!s) - return 0; - - return __parse_numa_zonelist_order(s); -} -early_param("numa_zonelist_order", setup_numa_zonelist_order); - char numa_zonelist_order[] = "Node"; /* From b467f3ef3c50c4fa8926ca07f7db9a33a645e13a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:31 -0700 Subject: [PATCH 419/574] kernel/hung_task convert hung_task_panic boot parameter to sysctl We can now handle sysctl parameters on kernel command line and have infrastructure to convert legacy command line options that duplicate sysctl to become a sysctl alias. This patch converts the hung_task_panic parameter. Note that the sysctl handler is more strict and allows only 0 and 1, while the legacy parameter allowed any non-zero value. But there is little reason anyone would not be using 1. Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-4-vbabka@suse.cz Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 2 +- fs/proc/proc_sysctl.c | 1 + kernel/hung_task.c | 10 ---------- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d02b9d023660..0c72b5c22416 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1515,7 +1515,7 @@ [KNL] Should the hung task detector generate panics. Format: - A nonzero value instructs the kernel to panic when a + A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value selected by this boot parameter can diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 124298168f8b..15030784566c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1721,6 +1721,7 @@ struct sysctl_alias { */ static const struct sysctl_alias sysctl_aliases[] = { {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"hung_task_panic", "kernel.hung_task_panic" }, { } }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c16cb3..b22b5eeab3cb 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,16 +63,6 @@ static struct task_struct *watchdog_task; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int __init hung_task_panic_setup(char *str) -{ - int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); - - if (rc) - return rc; - return 1; -} -__setup("hung_task_panic=", hung_task_panic_setup); - static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { From 4546cde96f9fbef8f59c645f2e0677b5a390ed0d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:35 -0700 Subject: [PATCH 420/574] tools/testing/selftests/sysctl/sysctl.sh: support CONFIG_TEST_SYSCTL=y The testing script recommends CONFIG_TEST_SYSCTL=y, but actually only works with CONFIG_TEST_SYSCTL=m. Testing of sysctl setting via boot param however requires the test to be built-in, so make sure the test script supports it. Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Acked-by: Luis Chamberlain Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Kees Cook Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-5-vbabka@suse.cz Signed-off-by: Linus Torvalds --- tools/testing/selftests/sysctl/sysctl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 6a970b127c9b..a4262955d30f 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -122,7 +122,7 @@ test_reqs() function load_req_mod() { - if [ ! -d $DIR ]; then + if [ ! -d $SYSCTL ]; then if ! modprobe -q -n $TEST_DRIVER; then echo "$0: module $TEST_DRIVER not found [SKIP]" exit $ksft_skip From 4f2f682d89d83fb6194562321d875253282d8496 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:38 -0700 Subject: [PATCH 421/574] lib/test_sysctl: support testing of sysctl. boot parameter Testing is done by a new parameter debug.test_sysctl.boot_int which defaults to 0 and it's expected that the tester passes a boot parameter that sets it to 1. The test checks if it's set to 1. To distinguish true failure from parameter not being set, the test checks /proc/cmdline for the expected parameter, and whether test_sysctl is built-in and not a module. [vbabka@suse.cz: skip the new test if boot_int sysctl is not present] Link: http://lkml.kernel.org/r/305af605-1e60-cf84-fada-6ce1ca37c102@suse.cz Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Kees Cook Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-6-vbabka@suse.cz Signed-off-by: Linus Torvalds --- lib/test_sysctl.c | 13 ++++++++ tools/testing/selftests/sysctl/sysctl.sh | 42 ++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 566dad3f4196..84eaae22d3a6 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -44,6 +44,8 @@ struct test_sysctl_data { int int_0002; int int_0003[4]; + int boot_int; + unsigned int uint_0001; char string_0001[65]; @@ -61,6 +63,8 @@ static struct test_sysctl_data test_data = { .int_0003[2] = 2, .int_0003[3] = 3, + .boot_int = 0, + .uint_0001 = 314, .string_0001 = "(none)", @@ -91,6 +95,15 @@ static struct ctl_table test_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "boot_int", + .data = &test_data.boot_int, + .maxlen = sizeof(test_data.boot_int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "uint_0001", .data = &test_data.uint_0001, diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index a4262955d30f..0aa8a86140e9 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -39,6 +39,7 @@ ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002" ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" +ALL_TESTS="$ALL_TESTS 0007:1:1:boot_int" test_modprobe() { @@ -752,6 +753,46 @@ sysctl_test_0006() run_bitmaptest } +sysctl_test_0007() +{ + TARGET="${SYSCTL}/boot_int" + if [ ! -f $TARGET ]; then + echo "Skipping test for $TARGET as it is not present ..." + return $ksft_skip + fi + + if [ -d $DIR ]; then + echo "Boot param test only possible sysctl_test is built-in, not module:" + cat $TEST_DIR/config >&2 + return $ksft_skip + fi + + echo -n "Testing if $TARGET is set to 1 ..." + ORIG=$(cat "${TARGET}") + + if [ x$ORIG = "x1" ]; then + echo "ok" + return 0 + fi + echo "FAIL" + echo "Checking if /proc/cmdline contains setting of the expected parameter ..." + if [ ! -f /proc/cmdline ]; then + echo "/proc/cmdline does not exist, test inconclusive" + return 0 + fi + + FOUND=$(grep -c "sysctl[./]debug[./]test_sysctl[./]boot_int=1" /proc/cmdline) + if [ $FOUND = "1" ]; then + echo "Kernel param found but $TARGET is not 1, TEST FAILED" + rc=1 + test_rc + fi + + echo "Skipping test, expected kernel parameter missing." + echo "To perform this test, make sure kernel is booted with parameter: sysctl.debug.test_sysctl.boot_int=1" + return $ksft_skip +} + list_tests() { echo "Test ID list:" @@ -766,6 +807,7 @@ list_tests() echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()" + echo "0007 x $(get_test_count 0007) - tests setting sysctl from kernel boot param" } usage() From f117955a2255721a6a0e9cecf6cad3a6eb43cbc3 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:42 -0700 Subject: [PATCH 422/574] kernel/watchdog.c: convert {soft/hard}lockup boot parameters to sysctl aliases After a recent change introduced by Vlastimil's series [0], kernel is able now to handle sysctl parameters on kernel command line; also, the series introduced a simple infrastructure to convert legacy boot parameters (that duplicate sysctls) into sysctl aliases. This patch converts the watchdog parameters softlockup_panic and {hard,soft}lockup_all_cpu_backtrace to use the new alias infrastructure. It fixes the documentation too, since the alias only accepts values 0 or 1, not the full range of integers. We also took the opportunity here to improve the documentation of the previously converted hung_task_panic (see the patch series [0]) and put the alias table in alphabetical order. [0] http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Link: http://lkml.kernel.org/r/20200507214624.21911-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 10 ++--- fs/proc/proc_sysctl.c | 7 +++- kernel/watchdog.c | 37 +++++-------------- 3 files changed, 19 insertions(+), 35 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0c72b5c22416..ed9597e37415 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1445,7 +1445,7 @@ hardlockup_all_cpu_backtrace= [KNL] Should the hard-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on @@ -1513,7 +1513,7 @@ hung_task_panic= [KNL] Should the hung task detector generate panics. - Format: + Format: 0 | 1 A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled @@ -4665,9 +4665,9 @@ softlockup_panic= [KNL] Should the soft-lockup detector generate panics. - Format: + Format: 0 | 1 - A nonzero value instructs the soft-lockup detector + A value of 1 instructs the soft-lockup detector to panic the machine when a soft-lockup occurs. It is also controlled by the kernel.softlockup_panic sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the @@ -4676,7 +4676,7 @@ softlockup_all_cpu_backtrace= [KNL] Should the soft-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 15030784566c..5b405f32971d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1720,8 +1720,11 @@ struct sysctl_alias { * parameter. */ static const struct sysctl_alias sysctl_aliases[] = { - {"numa_zonelist_order", "vm.numa_zonelist_order" }, - {"hung_task_panic", "kernel.hung_task_panic" }, + {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, + {"hung_task_panic", "kernel.hung_task_panic" }, + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, + {"softlockup_panic", "kernel.softlockup_panic" }, { } }; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 53ff2c81b084..5abb5b22ad13 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -50,6 +50,11 @@ struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR + +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; +# endif /* CONFIG_SMP */ + /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -82,16 +87,6 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -# ifdef CONFIG_SMP -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; - -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -# endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* @@ -163,6 +158,10 @@ static void lockup_detector_update_enable(void) #define SOFTLOCKUP_RESET ULONG_MAX +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#endif + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -178,13 +177,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; @@ -206,17 +198,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; - -static int __init softlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#endif - static void __lockup_detector_cleanup(void); /* From 0ec9dc9bcba0a62b0844e54c1caf6b8b0bf6b5b4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:45 -0700 Subject: [PATCH 423/574] kernel/hung_task.c: introduce sysctl to print all traces when a hung task is detected Commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks before panic") introduced a change in that we started to show all CPUs backtraces when a hung task is detected _and_ the sysctl/kernel parameter "hung_task_panic" is set. The idea is good, because usually when observing deadlocks (that may lead to hung tasks), the culprit is another task holding a lock and not necessarily the task detected as hung. The problem with this approach is that dumping backtraces is a slightly expensive task, specially printing that on console (and specially in many CPU machines, as servers commonly found nowadays). So, users that plan to collect a kdump to investigate the hung tasks and narrow down the deadlock definitely don't need the CPUs backtrace on dmesg/console, which will delay the panic and pollute the log (crash tool would easily grab all CPUs traces with 'bt -a' command). Also, there's the reciprocal scenario: some users may be interested in seeing the CPUs backtraces but not have the system panic when a hung task is detected. The current approach hence is almost as embedding a policy in the kernel, by forcing the CPUs backtraces' dump (only) on hung_task_panic. This patch decouples the panic event on hung task from the CPUs backtraces dump, by creating (and documenting) a new sysctl called "hung_task_all_cpu_backtrace", analog to the approach taken on soft/hard lockups, that have both a panic and an "all_cpu_backtrace" sysctl to allow individual control. The new mechanism for dumping the CPUs backtraces on hung task detection respects "hung_task_warnings" by not dumping the traces in case there's no warnings left. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Tetsuo Handa Link: http://lkml.kernel.org/r/20200327223646.20779-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 14 ++++++++++++++ include/linux/sched/sysctl.h | 7 +++++++ kernel/hung_task.c | 20 ++++++++++++++++++-- kernel/sysctl.c | 11 +++++++++++ 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 3b00b9223157..861820d27c19 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -335,6 +335,20 @@ Path for the hotplug policy agent. Default value is "``/sbin/hotplug``". +hung_task_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when a hung task is detected. This file shows up if +CONFIG_DETECT_HUNG_TASK and CONFIG_SMP are enabled. + +0: Won't show all CPUs backtraces when a hung task is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +a hung task is detected. + + hung_task_panic =============== diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7b4d3a49b6c5..660ac49f2b53 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,6 +7,13 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK + +#ifdef CONFIG_SMP +extern unsigned int sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b22b5eeab3cb..ce76f490126c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; +static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in a hung task event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: @@ -127,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; + + if (sysctl_hung_task_all_cpu_backtrace) + hung_task_show_all_bt = true; } touch_nmi_watchdog(); @@ -191,10 +203,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); - if (hung_task_call_panic) { + + if (hung_task_show_all_bt) { + hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); } + + if (hung_task_call_panic) + panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 587ed0494f2f..34c1278951b9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2437,6 +2437,17 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_DETECT_HUNG_TASK +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, From 60c958d8df9cfc40b745d6cd583cfbfa7525ead6 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:48 -0700 Subject: [PATCH 424/574] panic: add sysctl to dump all CPUs backtraces on oops event Usually when the kernel reaches an oops condition, it's a point of no return; in case not enough debug information is available in the kernel splat, one of the last resorts would be to collect a kernel crash dump and analyze it. The problem with this approach is that in order to collect the dump, a panic is required (to kexec-load the crash kernel). When in an environment of multiple virtual machines, users may prefer to try living with the oops, at least until being able to properly shutdown their VMs / finish their important tasks. This patch implements a way to collect a bit more debug details when an oops event is reached, by printing all the CPUs backtraces through the usage of NMIs (on architectures that support that). The sysctl added (and documented) here was called "oops_all_cpu_backtrace", and when set will (as the name suggests) dump all CPUs backtraces. Far from ideal, this may be the last option though for users that for some reason cannot panic on oops. Most of times oopses are clear enough to indicate the kernel portion that must be investigated, but in virtual environments it's possible to observe hypervisor/KVM issues that could lead to oopses shown in other guests CPUs (like virtual APIC crashes). This patch hence aims to help debug such complex issues without resorting to kdump. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Luis Chamberlain Cc: Iurii Zaikin Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Randy Dunlap Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200327224116.21030-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 16 ++++++++++++++++ include/linux/kernel.h | 6 ++++++ kernel/panic.c | 11 +++++++++++ kernel/sysctl.c | 11 +++++++++++ 4 files changed, 44 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 861820d27c19..83acf5025488 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -646,6 +646,22 @@ rate for each task. scanned for a given scan. +oops_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when an oops event occurs. It should be used as a last +resort in case a panic cannot be triggered (to protect VMs running, for +example) or kdump can't be collected. This file shows up if CONFIG_SMP +is enabled. + +0: Won't show all CPUs backtraces when an oops is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +an oops event is detected. + + osrelease, ostype & version =========================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f7835db7102e..82d91547d122 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -520,6 +520,12 @@ static inline u32 int_sqrt64(u64 x) } #endif +#ifdef CONFIG_SMP +extern unsigned int sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; diff --git a/kernel/panic.c b/kernel/panic.c index 94b5c973770c..85568bbfb12b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,14 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in an oops event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; @@ -522,6 +530,9 @@ void oops_enter(void) /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); + + if (sysctl_oops_all_cpu_backtrace) + trigger_all_cpu_backtrace(); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34c1278951b9..f69d581d39c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2150,6 +2150,17 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_SMP + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, From e77132e75845470065768e2205727e6be52cb7f4 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:51 -0700 Subject: [PATCH 425/574] kernel/sysctl.c: ignore out-of-range taint bits introduced via kernel.tainted Users with SYS_ADMIN capability can add arbitrary taint flags to the running kernel by writing to /proc/sys/kernel/tainted or issuing the command 'sysctl -w kernel.tainted=...'. This interface, however, is open for any integer value and this might cause an invalid set of flags being committed to the tainted_mask bitset. This patch introduces a simple way for proc_taint() to ignore any eventual invalid bit coming from the user input before committing those bits to the kernel tainted_mask. Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: "Theodore Ts'o" Link: http://lkml.kernel.org/r/20200512223946.888020-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f69d581d39c3..db1ce7af2563 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -880,10 +880,9 @@ static int proc_taint(struct ctl_table *table, int write, * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { - if ((tmptaint >> i) & 1) + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) add_taint(i, LOCKDEP_STILL_OK); - } } return err; From dadbb612f6e50bbf9101c2f5d82690ce9ea4d66b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sun, 7 Jun 2020 21:40:55 -0700 Subject: [PATCH 426/574] mm/gup.c: convert to use get_user_{page|pages}_fast_only() API __get_user_pages_fast() renamed to get_user_pages_fast_only() to align with pin_user_pages_fast_only(). As part of this we will get rid of write parameter. Instead caller will pass FOLL_WRITE to get_user_pages_fast_only(). This will not change any existing functionality of the API. All the callers are changed to pass FOLL_WRITE. Also introduce get_user_page_fast_only(), and use it in a few places that hard-code nr_pages to 1. Updated the documentation of the API. Signed-off-by: Souptick Joarder Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Paul Mackerras [arch/powerpc/kvm] Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Stephen Rothwell Cc: Mike Rapoport Cc: Aneesh Kumar K.V Cc: Michal Suchanek Link: http://lkml.kernel.org/r/1590396812-31277-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/perf/callchain_64.c | 4 +--- include/linux/mm.h | 10 +++++++-- kernel/events/core.c | 4 ++-- mm/gup.c | 29 ++++++++++++++------------ virt/kvm/kvm_main.c | 8 +++---- 7 files changed, 32 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 18aed9775a3c..ddfc4c90ebb6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -581,7 +581,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * We always ask for write permission since the common case * is that the page is writable. */ - if (__get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 02219e28b1e4..a47fa8b4d0f0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -795,7 +795,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, * is that the page is writable. */ hva = gfn_to_hva_memslot(memslot, gfn); - if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) { upgrade_write = true; } else { unsigned long pfn; diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c index b63086b663ef..9cc1a129737e 100644 --- a/arch/powerpc/perf/callchain_64.c +++ b/arch/powerpc/perf/callchain_64.c @@ -30,11 +30,9 @@ int read_user_stack_slow(void __user *ptr, void *buf, int nb) unsigned long addr = (unsigned long) ptr; unsigned long offset; struct page *page; - int nrpages; void *kaddr; - nrpages = __get_user_pages_fast(addr, 1, 1, &page); - if (nrpages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &page)) { kaddr = page_address(page); /* align address to page boundary */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 86adc71a972f..22010c4175f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1824,10 +1824,16 @@ extern int mprotect_fixup(struct vm_area_struct *vma, /* * doesn't attempt to fault and will return short. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); int pin_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); + +static inline bool get_user_page_fast_only(unsigned long addr, + unsigned int gup_flags, struct page **pagep) +{ + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; +} /* * per-process(per-mm_struct) statistics. */ diff --git a/kernel/events/core.c b/kernel/events/core.c index fcfadecd3a08..63d66bbebbd5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6934,12 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. - * Try IRQ-safe __get_user_pages_fast first. + * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (get_user_page_fast_only(virt, 0, &p)) phys_addr = page_to_phys(p) + virt % PAGE_SIZE; pagefault_enable(); } diff --git a/mm/gup.c b/mm/gup.c index e19ff770eb4c..9d1bf3ec8e39 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2294,7 +2294,7 @@ pte_unmap: * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a - * __get_user_pages_fast implementation that can pin pages. Thus it's still + * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, @@ -2699,7 +2699,7 @@ static inline void gup_pgd_range(unsigned long addr, unsigned long end, #ifndef gup_fast_permitted /* - * Check if it's allowed to use __get_user_pages_fast() for the range, or + * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) @@ -2811,8 +2811,14 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } - -/* +/** + * get_user_pages_fast_only() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the @@ -2825,8 +2831,8 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { int nr_pinned; /* @@ -2836,10 +2842,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; - - if (write) - gup_flags |= FOLL_WRITE; + gup_flags |= FOLL_GET | FOLL_FAST_ONLY; nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); @@ -2855,7 +2858,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr_pinned; } -EXPORT_SYMBOL_GPL(__get_user_pages_fast); +EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory @@ -2926,8 +2929,8 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, EXPORT_SYMBOL_GPL(pin_user_pages_fast); /* - * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the - * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior + * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. * * The API rules are the same, too: no negative values may be returned. */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fa1e38e1659..7b0da1c28e51 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1740,7 +1740,6 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) { struct page *page[1]; - int npages; /* * Fast pin a writable pfn only if it is a write fault request @@ -1750,8 +1749,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, if (!(write_fault || writable)) return false; - npages = __get_user_pages_fast(addr, 1, 1, page); - if (npages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { *pfn = page_to_pfn(page[0]); if (writable) @@ -1791,7 +1789,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (unlikely(!write_fault) && writable) { struct page *wpage; - if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { *writable = true; put_page(page); page = wpage; @@ -2003,7 +2001,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, if (entry < nr_pages) return 0; - return __get_user_pages_fast(addr, nr_pages, 1, pages); + return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); From a8f80f53fb5c367e1a160adfb3b092a788faeb29 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:40:59 -0700 Subject: [PATCH 427/574] mm/gup: update pin_user_pages.rst for "case 3" (mmu notifiers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update case 3 so that it covers the use of mmu notifiers, for hardware that does, or does not have replayable page faults. Also, elaborate case 4 slightly, as it was quite cryptic. Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/20200527194953.11130-1-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- Documentation/core-api/pin_user_pages.rst | 29 +++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 2e939ff10b86..4675b04e8829 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -148,23 +148,28 @@ NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's because DAX pages do not have a separate page cache, and so "pinning" implies locking down file system blocks, which is not (yet) supported in that way. -CASE 3: Hardware with page faulting support -------------------------------------------- -Here, a well-written driver doesn't normally need to pin pages at all. However, -if the driver does choose to do so, it can register MMU notifiers for the range, -and will be called back upon invalidation. Either way (avoiding page pinning, or -using MMU notifiers to unpin upon request), there is proper synchronization with -both filesystem and mm (page_mkclean(), munmap(), etc). +CASE 3: MMU notifier registration, with or without page faulting hardware +------------------------------------------------------------------------- +Device drivers can pin pages via get_user_pages*(), and register for mmu +notifier callbacks for the memory range. Then, upon receiving a notifier +"invalidate range" callback , stop the device from using the range, and unpin +the pages. There may be other possible schemes, such as for example explicitly +synchronizing against pending IO, that accomplish approximately the same thing. -Therefore, neither flag needs to be set. +Or, if the hardware supports replayable page faults, then the device driver can +avoid pinning entirely (this is ideal), as follows: register for mmu notifier +callbacks as above, but instead of stopping the device and unpinning in the +callback, simply remove the range from the device's page tables. -In this case, ideally, neither get_user_pages() nor pin_user_pages() should be -called. Instead, the software should be written so that it does not pin pages. -This allows mm and filesystems to operate more efficiently and reliably. +Either way, as long as the driver unpins the pages upon mmu notifier callback, +then there is proper synchronization with both filesystem and mm +(page_mkclean(), munmap(), etc). Therefore, neither flag needs to be set. CASE 4: Pinning for struct page manipulation only ------------------------------------------------- -Here, normal GUP calls are sufficient, so neither flag needs to be set. +If only struct page data (as opposed to the actual memory contents that a page +is tracking) is affected, then normal GUP calls are sufficient, and neither flag +needs to be set. page_maybe_dma_pinned(): the whole point of pinning =================================================== From 420c2091b65a95b1daea4c36d27df4ac5f38033d Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:02 -0700 Subject: [PATCH 428/574] mm/gup: introduce pin_user_pages_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/gup: introduce pin_user_pages_locked(), use it in frame_vector.c", v2. This adds yet one more pin_user_pages*() variant, and uses that to convert mm/frame_vector.c. With this, along with maybe 20 or 30 other recent patches in various trees, we are close to having the relevant gup call sites converted--with the notable exception of the bio/block layer. This patch (of 2): Introduce pin_user_pages_locked(), which is nearly identical to get_user_pages_locked() except that it sets FOLL_PIN and rejects FOLL_GET. As with other pairs of get_user_pages*() and pin_user_pages() API calls, it's prudent to assert that FOLL_PIN is *not* set in the get_user_pages*() call, so add that as part of this. [jhubbard@nvidia.com: v2] Link: http://lkml.kernel.org/r/20200531234131.770697-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Pankaj Gupta Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Souptick Joarder Link: http://lkml.kernel.org/r/20200531234131.770697-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200527223243.884385-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200527223243.884385-2-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/gup.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 22010c4175f2..9d6042178ca7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1706,6 +1706,8 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, diff --git a/mm/gup.c b/mm/gup.c index 9d1bf3ec8e39..5ce0a99dd8f3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2035,6 +2035,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, @@ -3058,3 +3064,32 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); + +/* + * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). + * Behavior is the same, except that this one sets FOLL_PIN and rejects + * FOLL_GET. + */ +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) +{ + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(pin_user_pages_locked); From 55a650c35fea7cfdf989647f37960dfaf76da737 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:05 -0700 Subject: [PATCH 429/574] mm/gup: frame_vector: convert get_user_pages() --> pin_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code was using get_user_pages*(), and all of the callers so far were in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Pankaj Gupta Cc: Souptick Joarder Link: http://lkml.kernel.org/r/20200527223243.884385-3-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- mm/frame_vector.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c431ca81dad5..4107dbca0056 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -72,7 +72,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(start, nr_frames, + ret = pin_user_pages_locked(start, nr_frames, gup_flags, (struct page **)(vec->ptrs), &locked); goto out; } @@ -122,7 +122,6 @@ EXPORT_SYMBOL(get_vaddr_frames); */ void put_vaddr_frames(struct frame_vector *vec) { - int i; struct page **pages; if (!vec->got_ref) @@ -135,8 +134,8 @@ void put_vaddr_frames(struct frame_vector *vec) */ if (WARN_ON(IS_ERR(pages))) goto out; - for (i = 0; i < vec->nr_frames; i++) - put_page(pages[i]); + + unpin_user_pages(pages, vec->nr_frames); vec->got_ref = false; out: vec->nr_frames = 0; From 6a005645edd6de2b535783c96e66e08cccc5e067 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:08 -0700 Subject: [PATCH 430/574] mm/gup: documentation fix for pin_user_pages*() APIs All of the pin_user_pages*() API calls will cause pages to be dma-pinned. As such, they are all suitable for either DMA, RDMA, and/or Direct IO. The documentation should say so, but it was instead saying that three of the API calls were only suitable for Direct IO. This was discovered when a reviewer wondered why an API call that specifically recommended against Case 2 (DMA/RDMA) was being used in a DMA situation [1]. Fix this by simply deleting those claims. The gup.c comments already refer to the more extensive Documentation/core-api/pin_user_pages.rst, which does have the correct guidance. So let's just write it once, there. [1] https://lore.kernel.org/r/20200529074658.GM30374@kadam Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Pankaj Gupta Acked-by: Souptick Joarder Cc: Dan Carpenter Cc: Jan Kara Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200529084515.46259-1-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- mm/gup.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 5ce0a99dd8f3..8bd090b36d1d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2918,9 +2918,6 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -2994,9 +2991,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -3030,9 +3024,6 @@ EXPORT_SYMBOL(pin_user_pages_remote); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, From eaf4d22a9ea419bc4c85ad8f825a331bcc1ae340 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:11 -0700 Subject: [PATCH 431/574] docs: mm/gup: pin_user_pages.rst: add a "case 5" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "vhost, docs: convert to pin_user_pages(), new "case 5"" It recently became clear to me that there are some get_user_pages*() callers that don't fit neatly into any of the four cases that are so far listed in pin_user_pages.rst. vhost.c is one of those. Add a Case 5 to the documentation, and refer to that when converting vhost.c. Thanks to Jan Kara for helping me (again) in understanding the interaction between get_user_pages() and page writeback [1]. This is based on today's mmotm, which has a nearby patch to pin_user_pages.rst that rewords cases 3 and 4. Note that I have only compile-tested the vhost.c patch, although that does also include cross-compiling for a few other arches. Any run-time testing would be greatly appreciated. [1] https://lore.kernel.org/r/20200529070343.GL14550@quack2.suse.cz This patch (of 2): There are four cases listed in pin_user_pages.rst. These are intended to help developers figure out whether to use get_user_pages*(), or pin_user_pages*(). However, the four cases do not cover all the situations. For example, drivers/vhost/vhost.c has a "pin, write to page, set page dirty, unpin" case. Add a fifth case, to help explain that there is a general pattern that requires pin_user_pages*() API calls. [jhubbard@nvidia.com: v2] Link: http://lkml.kernel.org/r/20200601052633.853874-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Cc: Vlastimil Babka Cc: Jan Kara Cc: Jérôme Glisse Cc: Dave Chinner Cc: Jonathan Corbet Cc: Souptick Joarder Cc: "Michael S . Tsirkin" Cc: Jason Wang Link: http://lkml.kernel.org/r/20200529234309.484480-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200529234309.484480-2-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- Documentation/core-api/pin_user_pages.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 4675b04e8829..6068266dd303 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -171,6 +171,24 @@ If only struct page data (as opposed to the actual memory contents that a page is tracking) is affected, then normal GUP calls are sufficient, and neither flag needs to be set. +CASE 5: Pinning in order to write to the data within the page +------------------------------------------------------------- +Even though neither DMA nor Direct IO is involved, just a simple case of "pin, +write to a page's data, unpin" can cause a problem. Case 5 may be considered a +superset of Case 1, plus Case 2, plus anything that invokes that pattern. In +other words, if the code is neither Case 1 nor Case 2, it may still require +FOLL_PIN, for patterns like this: + +Correct (uses FOLL_PIN calls): + pin_user_pages() + write to the data within the pages + unpin_user_pages() + +INCORRECT (uses FOLL_GET calls): + get_user_pages() + write to the data within the pages + put_page() + page_maybe_dma_pinned(): the whole point of pinning =================================================== From 690623e1b49603bdd3df4532b7911a2ce26fb4c4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:15 -0700 Subject: [PATCH 432/574] vhost: convert get_user_pages() --> pin_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code was using get_user_pages*(), in approximately a "Case 5" scenario (accessing the data within a page), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Acked-by: Michael S. Tsirkin Acked-by: Pankaj Gupta Cc: Jason Wang Cc: Dave Chinner Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Souptick Joarder Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200529234309.484480-3-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- drivers/vhost/vhost.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 21a59b598ed8..596132a96cd5 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1762,15 +1762,14 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); + r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); base = kmap_atomic(page); set_bit(bit, base); kunmap_atomic(base); - set_page_dirty_lock(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, true); return 0; } From ce450ebf6179acf6e90dcc090e90face215faec4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:25 -0700 Subject: [PATCH 433/574] arm: fix the flush_icache_range arguments in set_fiq_handler Patch series "sort out the flush_icache_range mess", v2. flush_icache_range is mostly used for kernel address, except for the following cases: - the nommu brk and mmap implementations - the read_code helper that is only used for binfmt_flat, binfmt_elf_fdpic, and binfmt_aout including the broken ia32 compat version - binfmt_flat itself none of which really are used by a typical MMU enabled kernel, as a.out can only be build for alpha and m68k to start with. But strangely enough commit ae92ef8a4424 ("PATCH] flush icache in correct context") added a "set_fs(KERNEL_DS)" around the flush_icache_range call in the module loader, because apparently m68k assumed user pointers. This series first cleans up the cacheflush implementations, largely by switching as much as possible to the asm-generic version after a few preparations, then moves the misnamed current flush_icache_user_range to a new name, to finally introduce a real flush_icache_user_range to be used for the above use cases to flush the instruction cache for a userspace address range. The last patch then drops the set_fs in the module code and moves it into the m68k implementation. This patch (of 29): The arguments passed look bogus, try to fix them to something that seems to make sense. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Roman Zippel Cc: Jessica Yu Cc: Michal Simek Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Anton Ivanov Cc: Arnaldo Carvalho de Melo Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: Daniel Borkmann Cc: Dan Williams Cc: Dave Jiang Cc: "David S. Miller" Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ira Weiny Cc: Ivan Kokshaysky Cc: Jeff Dike Cc: Jiri Olsa Cc: Jonas Bonn Cc: Keith Busch Cc: Mark Rutland Cc: Mark Salter Cc: Martin KaFai Lau Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Nick Piggin Cc: Palmer Dabbelt Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vishal Verma Cc: Will Deacon Cc: Yonghong Song Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200515143646.3857579-1-hch@lst.de Link: http://lkml.kernel.org/r/20200515143646.3857579-2-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/kernel/fiq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c index cd1234c103fc..98ca3e3fa847 100644 --- a/arch/arm/kernel/fiq.c +++ b/arch/arm/kernel/fiq.c @@ -98,8 +98,8 @@ void set_fiq_handler(void *start, unsigned int length) memcpy(base + offset, start, length); if (!cache_is_vipt_nonaliasing()) - flush_icache_range((unsigned long)base + offset, offset + - length); + flush_icache_range((unsigned long)base + offset, + (unsigned long)base + offset + length); flush_icache_range(0xffff0000 + offset, 0xffff0000 + offset + length); } From e7c1fa11b0b007e5b74c39be38d507d06aa19e8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:29 -0700 Subject: [PATCH 434/574] nds32: unexport flush_icache_page flush_icache_page is only used by mm/memory.c. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Greentime Hu Cc: Vincent Chen Link: http://lkml.kernel.org/r/20200515143646.3857579-3-hch@lst.de Signed-off-by: Linus Torvalds --- arch/nds32/mm/cacheflush.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index 254703653b6f..8f168b33065f 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -35,7 +35,6 @@ void flush_icache_page(struct vm_area_struct *vma, struct page *page) kunmap_atomic((void *)kaddr); local_irq_restore(flags); } -EXPORT_SYMBOL(flush_icache_page); void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) From e292e7403e3fe048abec1a5fd83b3181748dcb1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:32 -0700 Subject: [PATCH 435/574] powerpc: unexport flush_icache_user_range flush_icache_user_range is only used by copy_to_user_page, which is only used by core VM code. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Link: http://lkml.kernel.org/r/20200515143646.3857579-4-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mem.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 5f7fe13211e9..3f1a70c6781b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -586,7 +586,6 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, flush_icache_range(maddr, maddr + len); kunmap(page); } -EXPORT_SYMBOL(flush_icache_user_range); /* * System memory should not be in /proc/iomem but various tools expect it From 7c95fda549bdd6308dc833e13263227a7238c50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:36 -0700 Subject: [PATCH 436/574] unicore32: remove flush_cache_user_range flush_cache_user_range is an ARMism not used by any generic or unicore32 specific code. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Guan Xuetao Link: http://lkml.kernel.org/r/20200515143646.3857579-5-hch@lst.de Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/cacheflush.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h index dc8c0b41538f..9393ca4047e9 100644 --- a/arch/unicore32/include/asm/cacheflush.h +++ b/arch/unicore32/include/asm/cacheflush.h @@ -132,14 +132,6 @@ extern void flush_cache_page(struct vm_area_struct *vma, #define flush_cache_dup_mm(mm) flush_cache_mm(mm) -/* - * flush_cache_user_range is used when we want to ensure that the - * Harvard caches are synchronised for the user space address range. - * This is used for the UniCore private sys_cacheflush system call. - */ -#define flush_cache_user_range(vma, start, end) \ - __cpuc_coherent_user_range((start) & PAGE_MASK, PAGE_ALIGN(end)) - /* * Perform necessary cache operations to ensure that data previously * stored within this range of addresses can be executed by the CPU. From 92a73bd29ad0a31cfa9241e86a557252ac77b217 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:39 -0700 Subject: [PATCH 437/574] asm-generic: fix the inclusion guards for cacheflush.h cacheflush.h uses a somewhat to generic include guard name that clashes with various arch files. Use a more specific one. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-6-hch@lst.de Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index cac7404b2bdd..906277492ec5 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_CACHEFLUSH_H -#define __ASM_CACHEFLUSH_H +#ifndef _ASM_GENERIC_CACHEFLUSH_H +#define _ASM_GENERIC_CACHEFLUSH_H /* Keep includes the same across arches. */ #include @@ -109,4 +109,4 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) memcpy(dst, src, len) #endif -#endif /* __ASM_CACHEFLUSH_H */ +#endif /* _ASM_GENERIC_CACHEFLUSH_H */ From e0cf615d725cb3b69f0bdf1d8afdd4d4c31b4fd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:42 -0700 Subject: [PATCH 438/574] asm-generic: don't include in cacheflush.h This seems to lead to some crazy include loops when using asm-generic/cacheflush.h on more architectures, so leave it to the arch header for now. [hch@lst.de: fix warning] Link: http://lkml.kernel.org/r/20200520173520.GA11199@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Will Deacon Cc: Nick Piggin Cc: Peter Zijlstra Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Keith Busch Cc: Ira Weiny Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-7-hch@lst.de Signed-off-by: Linus Torvalds --- arch/um/include/asm/tlb.h | 2 ++ arch/x86/include/asm/cacheflush.h | 2 ++ drivers/media/platform/omap3isp/ispvideo.c | 2 +- drivers/nvdimm/pmem.c | 3 ++- include/asm-generic/cacheflush.h | 3 --- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 70ee60383900..ff9c62828962 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -2,6 +2,8 @@ #ifndef __UM_TLB_H #define __UM_TLB_H +#include + #include #include #include diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 63feaf2a5f93..b192d917a6d0 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_CACHEFLUSH_H #define _ASM_X86_CACHEFLUSH_H +#include + /* Caches aren't brain-dead on the intel. */ #include #include diff --git a/drivers/media/platform/omap3isp/ispvideo.c b/drivers/media/platform/omap3isp/ispvideo.c index 6f769c527fae..10c214bd0903 100644 --- a/drivers/media/platform/omap3isp/ispvideo.c +++ b/drivers/media/platform/omap3isp/ispvideo.c @@ -10,7 +10,6 @@ * Sakari Ailus */ -#include #include #include #include @@ -19,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 97f948f8f4e6..d1ecd6da11a2 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -7,7 +7,6 @@ * Copyright (c) 2015, Boaz Harrosh . */ -#include #include #include #include @@ -25,6 +24,8 @@ #include #include #include +#include +#include #include "pmem.h" #include "pfn.h" #include "nd.h" diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 906277492ec5..bf9bb83e9fc8 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,9 +2,6 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H -/* Keep includes the same across arches. */ -#include - #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 /* From 76b3b58fac350c45e44a3868946ff5e454e8d367 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:45 -0700 Subject: [PATCH 439/574] asm-generic: improve the flush_dcache_page stub There is a magic ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE cpp symbol that guards non-stub availability of flush_dcache_pagge. Use that to check if flush_dcache_pagg is implemented. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-8-hch@lst.de Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index bf9bb83e9fc8..bbbb4d4ef651 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,8 +2,6 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 - /* * The cache doesn't need to be flushed when TLB entries change when * the cache is mapped to physical memory, not virtual memory @@ -42,12 +40,14 @@ static inline void flush_cache_page(struct vm_area_struct *vma, } #endif -#ifndef flush_dcache_page +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE static inline void flush_dcache_page(struct page *page) { } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #endif + #ifndef flush_dcache_mmap_lock static inline void flush_dcache_mmap_lock(struct address_space *mapping) { From 43c74ca337ce623a5f0cb93e2903f4a88d3e98a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:48 -0700 Subject: [PATCH 440/574] alpha: use asm-generic/cacheflush.h Alpha needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Link: http://lkml.kernel.org/r/20200515143646.3857579-9-hch@lst.de Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/cacheflush.h | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 89128489cb59..636d7ca0d05f 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -4,19 +4,6 @@ #include -/* Caches aren't brain-dead on the Alpha. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - /* Note that the following two definitions are _highly_ dependent on the contexts in which they are used in the kernel. I personally think it is criminal how loosely defined these macros are. */ @@ -59,20 +46,17 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, mm->context[smp_processor_id()] = 0; } } -#else +#define flush_icache_user_range flush_icache_user_range +#else /* CONFIG_SMP */ extern void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#endif +#define flush_icache_user_range flush_icache_user_range +#endif /* CONFIG_SMP */ /* This is used only in __do_fault and do_swap_page. */ #define flush_icache_page(vma, page) \ - flush_icache_user_range((vma), (page), 0, 0) + flush_icache_user_range((vma), (page), 0, 0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ -} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* _ALPHA_CACHEFLUSH_H */ From a7ba121215faad444f94db5e89eb5d7dfc58ecb2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:51 -0700 Subject: [PATCH 441/574] arm64: use asm-generic/cacheflush.h ARM64 needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Catalin Marinas Cc: Will Deacon Link: http://lkml.kernel.org/r/20200515143646.3857579-10-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/cacheflush.h | 46 ++++------------------------- 1 file changed, 5 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index ce50c1f1f1ea..9384fd8fc13c 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -94,20 +94,7 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) kick_all_cpus_sync(); } - -static inline void flush_cache_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long user_addr, unsigned long pfn) -{ -} - -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} +#define flush_icache_range flush_icache_range /* * Cache maintenance functions used by the DMA API. No to be used directly. @@ -123,12 +110,7 @@ extern void __dma_flush_area(const void *, size_t); */ extern void copy_to_user_page(struct vm_area_struct *, struct page *, unsigned long, void *, const void *, unsigned long); -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - } while (0) - -#define flush_cache_dup_mm(mm) flush_cache_mm(mm) +#define copy_to_user_page copy_to_user_page /* * flush_dcache_page is used when the kernel has written to the page @@ -154,29 +136,11 @@ static __always_inline void __flush_icache_all(void) dsb(ish); } -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -/* - * We don't appear to need to do anything here. In fact, if we did, we'd - * duplicate cache flushing elsewhere performed by flush_dcache_page(). - */ -#define flush_icache_page(vma,page) do { } while (0) - -/* - * Not required on AArch64 (PIPT or VIPT non-aliasing D-cache). - */ -static inline void flush_cache_vmap(unsigned long start, unsigned long end) -{ -} - -static inline void flush_cache_vunmap(unsigned long start, unsigned long end) -{ -} - int set_memory_valid(unsigned long addr, int numpages, int enable); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); -#endif +#include + +#endif /* __ASM_CACHEFLUSH_H */ From 2d49d89c73fe9b76b02799a71e768b312ad65039 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:55 -0700 Subject: [PATCH 442/574] c6x: use asm-generic/cacheflush.h C6x needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Mark Salter Cc: Aurelien Jacquiot Link: http://lkml.kernel.org/r/20200515143646.3857579-11-hch@lst.de Signed-off-by: Linus Torvalds --- arch/c6x/include/asm/cacheflush.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/arch/c6x/include/asm/cacheflush.h b/arch/c6x/include/asm/cacheflush.h index 4540b40475e6..10922d528de6 100644 --- a/arch/c6x/include/asm/cacheflush.h +++ b/arch/c6x/include/asm/cacheflush.h @@ -16,21 +16,6 @@ #include #include -/* - * virtually-indexed cache management (our cache is physically indexed) - */ -#define flush_cache_all() do {} while (0) -#define flush_cache_mm(mm) do {} while (0) -#define flush_cache_dup_mm(mm) do {} while (0) -#define flush_cache_range(mm, start, end) do {} while (0) -#define flush_cache_page(vma, vmaddr, pfn) do {} while (0) -#define flush_cache_vmap(start, end) do {} while (0) -#define flush_cache_vunmap(start, end) do {} while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do {} while (0) -#define flush_dcache_mmap_lock(mapping) do {} while (0) -#define flush_dcache_mmap_unlock(mapping) do {} while (0) - /* * physically-indexed cache management */ @@ -49,14 +34,12 @@ do { \ (unsigned long) page_address(page) + PAGE_SIZE)); \ } while (0) - #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ flush_icache_range((unsigned) (dst), (unsigned) (dst) + (len)); \ } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* _ASM_C6X_CACHEFLUSH_H */ From af23eea561961108b22422750f07025d2fd91240 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:58 -0700 Subject: [PATCH 443/574] hexagon: use asm-generic/cacheflush.h Hexagon needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Brian Cain Link: http://lkml.kernel.org/r/20200515143646.3857579-12-hch@lst.de Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/cacheflush.h | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/arch/hexagon/include/asm/cacheflush.h b/arch/hexagon/include/asm/cacheflush.h index fb447de45d54..6eff0730e6ef 100644 --- a/arch/hexagon/include/asm/cacheflush.h +++ b/arch/hexagon/include/asm/cacheflush.h @@ -25,29 +25,17 @@ #define LINESIZE 32 #define LINEBITS 5 -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - /* * Flush Dcache range through current map. */ extern void flush_dcache_range(unsigned long start, unsigned long end); +#define flush_dcache_range flush_dcache_range /* * Flush Icache range through current map. */ extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_range flush_icache_range /* * Memory-management related flushes are there to ensure in non-physically @@ -78,6 +66,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len); +#define copy_to_user_page copy_to_user_page #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ memcpy(dst, src, len) @@ -85,4 +74,6 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, extern void hexagon_inv_dcache_range(unsigned long start, unsigned long end); extern void hexagon_clean_dcache_range(unsigned long start, unsigned long end); +#include + #endif From 57b94ff59796bfbf7debebb2293b66d96f726a12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:01 -0700 Subject: [PATCH 444/574] ia64: use asm-generic/cacheflush.h IA64 needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Tony Luck Cc: Fenghua Yu Link: http://lkml.kernel.org/r/20200515143646.3857579-13-hch@lst.de Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/cacheflush.h | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h index 6d3478f8abc8..a8f1c86ac242 100644 --- a/arch/ia64/include/asm/cacheflush.h +++ b/arch/ia64/include/asm/cacheflush.h @@ -12,44 +12,22 @@ #include -/* - * Cache flushing routines. This is the kind of stuff that can be very expensive, so try - * to avoid them whenever possible. - */ - -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_icache_page(vma,page) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #define flush_dcache_page(page) \ do { \ clear_bit(PG_arch_1, &(page)->flags); \ } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -extern void flush_icache_range (unsigned long start, unsigned long end); +extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_range flush_icache_range extern void clflush_cache_range(void *addr, int size); - #define flush_icache_user_range(vma, page, user_addr, len) \ do { \ unsigned long _addr = (unsigned long) page_address(page) + ((user_addr) & ~PAGE_MASK); \ flush_icache_range(_addr, _addr + (len)); \ } while (0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ -} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* _ASM_IA64_CACHEFLUSH_H */ From 03518c82b4b33592fed428d38925ce59df91b1c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:04 -0700 Subject: [PATCH 445/574] microblaze: use asm-generic/cacheflush.h Microblaze needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Michal Simek Link: http://lkml.kernel.org/r/20200515143646.3857579-14-hch@lst.de Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/cacheflush.h | 29 ++---------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/arch/microblaze/include/asm/cacheflush.h b/arch/microblaze/include/asm/cacheflush.h index 11f56c85056b..39f8fb6768d8 100644 --- a/arch/microblaze/include/asm/cacheflush.h +++ b/arch/microblaze/include/asm/cacheflush.h @@ -57,9 +57,6 @@ void microblaze_cache_init(void); #define invalidate_icache() mbc->iin(); #define invalidate_icache_range(start, end) mbc->iinr(start, end); -#define flush_icache_user_range(vma, pg, adr, len) flush_icache(); -#define flush_icache_page(vma, pg) do { } while (0) - #define enable_dcache() mbc->de(); #define disable_dcache() mbc->dd(); /* FIXME for LL-temac driver */ @@ -77,27 +74,9 @@ do { \ flush_dcache_range((unsigned) (addr), (unsigned) (addr) + PAGE_SIZE); \ } while (0); -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) -#define flush_cache_mm(mm) do { } while (0) - #define flush_cache_page(vma, vmaddr, pfn) \ flush_dcache_range(pfn << PAGE_SHIFT, (pfn << PAGE_SHIFT) + PAGE_SIZE); -/* MS: kgdb code use this macro, wrong len with FLASH */ -#if 0 -#define flush_cache_range(vma, start, len) { \ - flush_icache_range((unsigned) (start), (unsigned) (start) + (len)); \ - flush_dcache_range((unsigned) (start), (unsigned) (start) + (len)); \ -} -#endif - -#define flush_cache_range(vma, start, len) do { } while (0) - static inline void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len) @@ -109,12 +88,8 @@ static inline void copy_to_user_page(struct vm_area_struct *vma, flush_dcache_range(addr, addr + PAGE_SIZE); } } +#define copy_to_user_page copy_to_user_page -static inline void copy_from_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, void *src, int len) -{ - memcpy(dst, src, len); -} +#include #endif /* _ASM_MICROBLAZE_CACHEFLUSH_H */ From 9e730ffac148127bc0cbc45097fe23d7e29a8130 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:06 -0700 Subject: [PATCH 446/574] m68knommu: use asm-generic/cacheflush.h m68knommu needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Greg Ungerer Cc: Greg Ungerer Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200515143646.3857579-15-hch@lst.de Signed-off-by: Linus Torvalds --- arch/m68k/include/asm/cacheflush_no.h | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/m68k/include/asm/cacheflush_no.h b/arch/m68k/include/asm/cacheflush_no.h index 11e9a9dcbfb2..2731f07e7be8 100644 --- a/arch/m68k/include/asm/cacheflush_no.h +++ b/arch/m68k/include/asm/cacheflush_no.h @@ -9,25 +9,8 @@ #include #define flush_cache_all() __flush_cache_all() -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr) do { } while (0) #define flush_dcache_range(start, len) __flush_dcache_all() -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_range(start, len) __flush_icache_all() -#define flush_icache_page(vma,pg) do { } while (0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) void mcf_cache_push(void); @@ -98,4 +81,6 @@ static inline void cache_clear(unsigned long paddr, int len) __clear_cache_all(); } +#include + #endif /* _M68KNOMMU_CACHEFLUSH_H */ From e050945123065b1804446cf68cdd8dc2a6a2b9f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:09 -0700 Subject: [PATCH 447/574] openrisc: use asm-generic/cacheflush.h OpenRISC needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Link: http://lkml.kernel.org/r/20200515143646.3857579-16-hch@lst.de Signed-off-by: Linus Torvalds --- arch/openrisc/include/asm/cacheflush.h | 31 +++++--------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 79d5d7753fe4..74d1fce4e883 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -62,31 +62,12 @@ static inline void flush_dcache_page(struct page *page) clear_bit(PG_dc_clean, &page->flags); } -/* - * Other interfaces are not required since we do not have virtually - * indexed or tagged caches. So we can use the default here. - */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) +#define flush_icache_user_range(vma, page, addr, len) \ +do { \ + if (vma->vm_flags & VM_EXEC) \ + sync_icache_dcache(page); \ +} while (0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - if (vma->vm_flags & VM_EXEC) \ - sync_icache_dcache(page); \ - } while (0) - -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* __ASM_CACHEFLUSH_H */ From 5019f76019e5780bad60248638891d13fd905e6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:12 -0700 Subject: [PATCH 448/574] powerpc: use asm-generic/cacheflush.h Power needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Also remove the pointless __KERNEL__ ifdef while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Link: http://lkml.kernel.org/r/20200515143646.3857579-17-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/cacheflush.h | 42 +++++++-------------------- 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index e92191b390f3..e682c8e10e90 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -4,23 +4,9 @@ #ifndef _ASM_POWERPC_CACHEFLUSH_H #define _ASM_POWERPC_CACHEFLUSH_H -#ifdef __KERNEL__ - #include #include -/* - * No cache flushing is required when address mappings are changed, - * because the caches on PowerPCs are physically addressed. - */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_icache_page(vma, page) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - #ifdef CONFIG_PPC_BOOK3S_64 /* * Book3s has no ptesync after setting a pte, so without this ptesync it's @@ -33,20 +19,20 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) { asm volatile("ptesync" ::: "memory"); } -#else -static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } -#endif +#define flush_cache_vmap flush_cache_vmap +#endif /* CONFIG_PPC_BOOK3S_64 */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) void flush_icache_range(unsigned long start, unsigned long stop); -extern void flush_icache_user_range(struct vm_area_struct *vma, - struct page *page, unsigned long addr, - int len); -extern void flush_dcache_icache_page(struct page *page); +#define flush_icache_range flush_icache_range + +void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, + unsigned long addr, int len); +#define flush_icache_user_range flush_icache_user_range + +void flush_dcache_icache_page(struct page *page); void __flush_dcache_icache(void *page); /** @@ -111,14 +97,6 @@ static inline void invalidate_dcache_range(unsigned long start, mb(); /* sync */ } -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ - } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) - -#endif /* __KERNEL__ */ +#include #endif /* _ASM_POWERPC_CACHEFLUSH_H */ From 396eb69c6e3914c4aa7b2bdeaa7476cc48265df6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:15 -0700 Subject: [PATCH 449/574] riscv: use asm-generic/cacheflush.h RISC-V needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Also remove the pointless __KERNEL__ ifdef while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Link: http://lkml.kernel.org/r/20200515143646.3857579-18-hch@lst.de Signed-off-by: Linus Torvalds --- arch/riscv/include/asm/cacheflush.h | 62 ++--------------------------- 1 file changed, 3 insertions(+), 59 deletions(-) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index c8677c75f82c..a167b4fbdf00 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -8,65 +8,6 @@ #include -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 - -/* - * The cache doesn't need to be flushed when TLB entries change when - * the cache is mapped to physical memory, not virtual memory - */ -static inline void flush_cache_all(void) -{ -} - -static inline void flush_cache_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_dup_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, - unsigned long end) -{ -} - -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long vmaddr, - unsigned long pfn) -{ -} - -static inline void flush_dcache_mmap_lock(struct address_space *mapping) -{ -} - -static inline void flush_dcache_mmap_unlock(struct address_space *mapping) -{ -} - -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) -{ -} - -static inline void flush_cache_vmap(unsigned long start, unsigned long end) -{ -} - -static inline void flush_cache_vunmap(unsigned long start, unsigned long end) -{ -} - -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ - } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) - static inline void local_flush_icache_all(void) { asm volatile ("fence.i" ::: "memory"); @@ -79,6 +20,7 @@ static inline void flush_dcache_page(struct page *page) if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 /* * RISC-V doesn't have an instruction to flush parts of the instruction cache, @@ -105,4 +47,6 @@ void flush_icache_mm(struct mm_struct *mm, bool local); #define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL #define SYS_RISCV_FLUSH_ICACHE_ALL (SYS_RISCV_FLUSH_ICACHE_LOCAL) +#include + #endif /* _ASM_RISCV_CACHEFLUSH_H */ From 97f52c1536f041fd9f0b62667864930c92ac93cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:19 -0700 Subject: [PATCH 450/574] arm,sparc,unicore32: remove flush_icache_user_range flush_icache_user_range is only used by , so remove it from the architectures that implement it, but don't use . Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Russell King Cc: "David S. Miller" Cc: Guan Xuetao Link: http://lkml.kernel.org/r/20200515143646.3857579-19-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/include/asm/cacheflush.h | 3 --- arch/sparc/include/asm/cacheflush_32.h | 2 -- arch/sparc/include/asm/cacheflush_64.h | 1 - arch/unicore32/include/asm/cacheflush.h | 3 --- 4 files changed, 9 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 7114b9aa46b8..c78e14fcfb5d 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -318,9 +318,6 @@ extern void flush_kernel_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) -#define flush_icache_user_range(vma,page,addr,len) \ - flush_dcache_page(page) - /* * We don't appear to need to do anything here. In fact, if we did, we'd * duplicate cache flushing elsewhere performed by flush_dcache_page(). diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index fb66094a2c30..41c6d734a474 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -17,8 +17,6 @@ #define flush_icache_range(start, end) do { } while (0) #define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) - #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ flush_cache_page(vma, vaddr, page_to_pfn(page));\ diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index e7517434d1fa..b9341836597e 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -49,7 +49,6 @@ void __flush_dcache_range(unsigned long start, unsigned long end); void flush_dcache_page(struct page *page); #define flush_icache_page(vma, pg) do { } while(0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) void flush_ptrace_access(struct vm_area_struct *, struct page *, unsigned long uaddr, void *kaddr, diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h index 9393ca4047e9..ff0be92ebc32 100644 --- a/arch/unicore32/include/asm/cacheflush.h +++ b/arch/unicore32/include/asm/cacheflush.h @@ -162,9 +162,6 @@ extern void flush_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_user_range(vma, page, addr, len) \ - flush_dcache_page(page) - /* * We don't appear to need to do anything here. In fact, if we did, we'd * duplicate cache flushing elsewhere performed by flush_dcache_page(). From 885f7f8e30468705926b3de53d32925bd7a51a03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:22 -0700 Subject: [PATCH 451/574] mm: rename flush_icache_user_range to flush_icache_user_page The function currently known as flush_icache_user_range only operates on a single page. Rename it to flush_icache_user_page as we'll need the name flush_icache_user_range for something else soon. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Geert Uytterhoeven Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Vincent Chen Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20200515143646.3857579-20-hch@lst.de Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/cacheflush.h | 10 +++++----- arch/alpha/kernel/smp.c | 2 +- arch/ia64/include/asm/cacheflush.h | 2 +- arch/m68k/include/asm/cacheflush_mm.h | 4 ++-- arch/m68k/mm/cache.c | 2 +- arch/nds32/include/asm/cacheflush.h | 4 ++-- arch/nds32/mm/cacheflush.c | 2 +- arch/openrisc/include/asm/cacheflush.h | 2 +- arch/powerpc/include/asm/cacheflush.h | 4 ++-- arch/powerpc/mm/mem.c | 2 +- arch/riscv/include/asm/cacheflush.h | 3 ++- include/asm-generic/cacheflush.h | 6 +++--- kernel/events/uprobes.c | 2 +- 13 files changed, 23 insertions(+), 22 deletions(-) diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 636d7ca0d05f..9945ff483eaf 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -35,7 +35,7 @@ extern void smp_imb(void); extern void __load_new_mm_context(struct mm_struct *); static inline void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { if (vma->vm_flags & VM_EXEC) { @@ -46,16 +46,16 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, mm->context[smp_processor_id()] = 0; } } -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page #else /* CONFIG_SMP */ -extern void flush_icache_user_range(struct vm_area_struct *vma, +extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page #endif /* CONFIG_SMP */ /* This is used only in __do_fault and do_swap_page. */ #define flush_icache_page(vma, page) \ - flush_icache_user_range((vma), (page), 0, 0) + flush_icache_user_page((vma), (page), 0, 0) #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 5f90df30be20..52995bf413fe 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -740,7 +740,7 @@ ipi_flush_icache_page(void *x) } void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { struct mm_struct *mm = vma->vm_mm; diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h index a8f1c86ac242..708c0fa5d975 100644 --- a/arch/ia64/include/asm/cacheflush.h +++ b/arch/ia64/include/asm/cacheflush.h @@ -22,7 +22,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_range flush_icache_range extern void clflush_cache_range(void *addr, int size); -#define flush_icache_user_range(vma, page, user_addr, len) \ +#define flush_icache_user_page(vma, page, user_addr, len) \ do { \ unsigned long _addr = (unsigned long) page_address(page) + ((user_addr) & ~PAGE_MASK); \ flush_icache_range(_addr, _addr + (len)); \ diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 1e2544ecaf88..95376bf84faa 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -254,7 +254,7 @@ static inline void __flush_page_to_ram(void *vaddr) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_page(vma, page) __flush_page_to_ram(page_address(page)) -extern void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); extern void flush_icache_range(unsigned long address, unsigned long endaddr); @@ -264,7 +264,7 @@ static inline void copy_to_user_page(struct vm_area_struct *vma, { flush_cache_page(vma, vaddr, page_to_pfn(page)); memcpy(dst, src, len); - flush_icache_user_range(vma, page, vaddr, len); + flush_icache_user_page(vma, page, vaddr, len); } static inline void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index 079e64898e6a..99057cd5ff7f 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -106,7 +106,7 @@ void flush_icache_range(unsigned long address, unsigned long endaddr) } EXPORT_SYMBOL(flush_icache_range); -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { if (CPU_IS_COLDFIRE) { diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index caddded56e77..7d6824f7c0e8 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -44,9 +44,9 @@ void invalidate_kernel_vmap_range(void *addr, int size); #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages) #else -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page #include #endif diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index 8f168b33065f..6eb98a7ad27d 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -36,7 +36,7 @@ void flush_icache_page(struct vm_area_struct *vma, struct page *page) local_irq_restore(flags); } -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { unsigned long kaddr; diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 74d1fce4e883..eeac40d4a854 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -62,7 +62,7 @@ static inline void flush_dcache_page(struct page *page) clear_bit(PG_dc_clean, &page->flags); } -#define flush_icache_user_range(vma, page, addr, len) \ +#define flush_icache_user_page(vma, page, addr, len) \ do { \ if (vma->vm_flags & VM_EXEC) \ sync_icache_dcache(page); \ diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index e682c8e10e90..de600b915a3c 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -28,9 +28,9 @@ extern void flush_dcache_page(struct page *page); void flush_icache_range(unsigned long start, unsigned long stop); #define flush_icache_range flush_icache_range -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page void flush_dcache_icache_page(struct page *page); void __flush_dcache_icache(void *page); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3f1a70c6781b..e2d6a6236aa7 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -577,7 +577,7 @@ void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, flush_dcache_page(pg); } -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { unsigned long maddr; diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index a167b4fbdf00..23ff70350992 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -27,7 +27,8 @@ static inline void flush_dcache_page(struct page *page) * so instead we just flush the whole thing. */ #define flush_icache_range(start, end) flush_icache_all() -#define flush_icache_user_range(vma, pg, addr, len) flush_icache_mm(vma->vm_mm, 0) +#define flush_icache_user_page(vma, pg, addr, len) \ + flush_icache_mm(vma->vm_mm, 0) #ifndef CONFIG_SMP diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index bbbb4d4ef651..2c9686fefb71 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -73,8 +73,8 @@ static inline void flush_icache_page(struct vm_area_struct *vma, } #endif -#ifndef flush_icache_user_range -static inline void flush_icache_user_range(struct vm_area_struct *vma, +#ifndef flush_icache_user_page +static inline void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { @@ -97,7 +97,7 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ + flush_icache_user_page(vma, page, vaddr, len); \ } while (0) #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eddc8db96027..e51ec844c87c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1668,7 +1668,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, copy_to_page(page, vaddr, src, len); /* - * We probably need flush_icache_user_range() but it needs vma. + * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. From 1268c33382facefd9c389d4c425e8f906f279c7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:26 -0700 Subject: [PATCH 452/574] asm-generic: add a flush_icache_user_range stub Define flush_icache_user_range to flush_icache_range unless the architecture provides its own implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-21-hch@lst.de Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 2c9686fefb71..907fa5d16494 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -66,6 +66,10 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) } #endif +#ifndef flush_icache_user_range +#define flush_icache_user_range flush_icache_range +#endif + #ifndef flush_icache_page static inline void flush_icache_page(struct vm_area_struct *vma, struct page *page) From 952ec41c44a4e9802f2c991912c117d2347832af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:28 -0700 Subject: [PATCH 453/574] sh: implement flush_icache_user_range The SuperH implementation of flush_icache_range seems to be able to cope with user addresses. Just define flush_icache_user_range to flush_icache_range. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Yoshinori Sato Cc: Rich Felker Link: http://lkml.kernel.org/r/20200515143646.3857579-22-hch@lst.de Signed-off-by: Linus Torvalds --- arch/sh/include/asm/cacheflush.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index b932e42ef028..fe7400079b97 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -46,6 +46,7 @@ extern void flush_cache_range(struct vm_area_struct *vma, #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_user_range flush_icache_range extern void flush_icache_page(struct vm_area_struct *vma, struct page *page); extern void flush_cache_sigtramp(unsigned long address); From 70cd3444c17d80d227c9c6ff2287f10b7114e479 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:32 -0700 Subject: [PATCH 454/574] xtensa: implement flush_icache_user_range The Xtensa implementation of flush_icache_range seems to be able to cope with user addresses. Just define flush_icache_user_range to flush_icache_range. [jcmvbkbc@gmail.com: fix flush_icache_user_range in noMMU configs] Link: http://lkml.kernel.org/r/20200525221556.4270-1-jcmvbkbc@gmail.com Signed-off-by: Christoph Hellwig Signed-off-by: Max Filippov Signed-off-by: Andrew Morton Cc: Chris Zankel Cc: Max Filippov Link: http://lkml.kernel.org/r/20200515143646.3857579-23-hch@lst.de Signed-off-by: Linus Torvalds --- arch/xtensa/include/asm/cacheflush.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index a0d50be5a8cb..cf907e5bf2f2 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -145,6 +145,8 @@ void local_flush_cache_page(struct vm_area_struct *vma, #endif +#define flush_icache_user_range flush_icache_range + /* Ensure consistency between data and instruction cache. */ #define local_flush_icache_range(start, end) \ do { \ From fca7f8e6fdeb716420a348acf6ca9bc7e609f5c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:34 -0700 Subject: [PATCH 455/574] arm: rename flush_cache_user_range to flush_icache_user_range flush_icache_user_range will be the name for a generic primitive. Move the arm name so that arm already has an implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Russell King Link: http://lkml.kernel.org/r/20200515143646.3857579-24-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/include/asm/cacheflush.h | 4 ++-- arch/arm/kernel/traps.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index c78e14fcfb5d..2e24e765e6d3 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -258,11 +258,11 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define flush_cache_dup_mm(mm) flush_cache_mm(mm) /* - * flush_cache_user_range is used when we want to ensure that the + * flush_icache_user_range is used when we want to ensure that the * Harvard caches are synchronised for the user space address range. * This is used for the ARM private sys_cacheflush system call. */ -#define flush_cache_user_range(s,e) __cpuc_coherent_user_range(s,e) +#define flush_icache_user_range(s,e) __cpuc_coherent_user_range(s,e) /* * Perform necessary cache operations to ensure that data previously diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1e70e7227f0f..316a7687f813 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -566,7 +566,7 @@ __do_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - ret = flush_cache_user_range(start, start + chunk); + ret = flush_icache_user_range(start, start + chunk); if (ret) return ret; From a1e81f9654eef650d3ee35c94a8cab00b5cd379c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:37 -0700 Subject: [PATCH 456/574] m68k: implement flush_icache_user_range Rename the current flush_icache_range to flush_icache_user_range as per commit ae92ef8a4424 ("PATCH] flush icache in correct context") there seems to be an assumption that it operates on user addresses. Add a flush_icache_range around it that for now is a no-op. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Geert Uytterhoeven Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200515143646.3857579-25-hch@lst.de Signed-off-by: Linus Torvalds --- arch/m68k/include/asm/cacheflush_mm.h | 2 ++ arch/m68k/mm/cache.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 95376bf84faa..1ac55e7b47f0 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -257,6 +257,8 @@ static inline void __flush_page_to_ram(void *vaddr) extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); extern void flush_icache_range(unsigned long address, unsigned long endaddr); +extern void flush_icache_user_range(unsigned long address, + unsigned long endaddr); static inline void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index 99057cd5ff7f..7915be3a0971 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -73,7 +73,7 @@ static unsigned long virt_to_phys_slow(unsigned long vaddr) /* Push n pages at kernel virtual address and clear the icache */ /* RZ: use cpush %bc instead of cpush %dc, cinv %ic */ -void flush_icache_range(unsigned long address, unsigned long endaddr) +void flush_icache_user_range(unsigned long address, unsigned long endaddr) { if (CPU_IS_COLDFIRE) { unsigned long start, end; @@ -104,6 +104,11 @@ void flush_icache_range(unsigned long address, unsigned long endaddr) : "di" (FLUSH_I)); } } + +void flush_icache_range(unsigned long address, unsigned long endaddr) +{ + flush_icache_user_range(address, endaddr); +} EXPORT_SYMBOL(flush_icache_range); void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, From 48304f7994d74ef8fddde3ff27ce41aa61453d39 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:40 -0700 Subject: [PATCH 457/574] exec: only build read_code when needed Only build read_code when binary formats that use it are built into the kernel. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200515143646.3857579-26-hch@lst.de Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/exec.c b/fs/exec.c index 93ff1c4c7ebb..88260ec0ac34 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1053,6 +1053,8 @@ out: } EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); +#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ + defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); @@ -1061,6 +1063,7 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) return res; } EXPORT_SYMBOL(read_code); +#endif /* * Maps the mm_struct mm into the current task struct. From bce2b68b891af3dd02fdc1603786c1c28db8bfad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:43 -0700 Subject: [PATCH 458/574] exec: use flush_icache_user_range in read_code read_code operates on user addresses. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200515143646.3857579-27-hch@lst.de Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 88260ec0ac34..02d0c5d19be5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1059,7 +1059,7 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) - flush_icache_range(addr, addr + len); + flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); From 79ef1e1fffebcfcb2c93463ca8d0f4a03eceb8f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:46 -0700 Subject: [PATCH 459/574] binfmt_flat: use flush_icache_user_range load_flat_file works on user addresses. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Greg Ungerer Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200515143646.3857579-28-hch@lst.de Signed-off-by: Linus Torvalds --- fs/binfmt_flat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 9b82bc111d0a..87ce229c63cf 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -854,7 +854,7 @@ static int load_flat_file(struct linux_binprm *bprm, #endif /* CONFIG_BINFMT_FLAT_OLD */ } - flush_icache_range(start_code, end_code); + flush_icache_user_range(start_code, end_code); /* zero the BSS, BRK and stack areas */ if (clear_user((void __user *)(datapos + data_len), bss_len + From a75a2df68f87c4924c614204a85b29d82d5c7dc4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:49 -0700 Subject: [PATCH 460/574] nommu: use flush_icache_user_range in brk and mmap These obviously operate on user addresses. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Greg Ungerer Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200515143646.3857579-29-hch@lst.de Signed-off-by: Linus Torvalds --- mm/nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index dfae55f41901..062dc1c90d21 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -433,7 +433,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ - flush_icache_range(mm->brk, brk); + flush_icache_user_range(mm->brk, brk); return mm->brk = brk; } @@ -1277,7 +1277,7 @@ share: /* we flush the region from the icache only when the first executable * mapping of it is made */ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { - flush_icache_range(region->vm_start, region->vm_end); + flush_icache_user_range(region->vm_start, region->vm_end); region->vm_icache_flushed = true; } From 490741ab1beb9c9f6580e30c7cd91b17a7560369 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:52 -0700 Subject: [PATCH 461/574] module: move the set_fs hack for flush_icache_range to m68k flush_icache_range generally operates on kernel addresses, but for some reason m68k needed a set_fs override. Move that into the m68k code insted of keeping it in the module loader. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Jessica Yu Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Link: http://lkml.kernel.org/r/20200515143646.3857579-30-hch@lst.de Signed-off-by: Linus Torvalds --- arch/m68k/mm/cache.c | 4 ++++ kernel/module.c | 8 -------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index 7915be3a0971..5ecb3310e874 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -107,7 +107,11 @@ void flush_icache_user_range(unsigned long address, unsigned long endaddr) void flush_icache_range(unsigned long address, unsigned long endaddr) { + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); flush_icache_user_range(address, endaddr); + set_fs(old_fs); } EXPORT_SYMBOL(flush_icache_range); diff --git a/kernel/module.c b/kernel/module.c index ef400c389f49..e8a198588f26 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3344,12 +3344,6 @@ static int check_module_license_and_versions(struct module *mod) static void flush_module_icache(const struct module *mod) { - mm_segment_t old_fs; - - /* flush the icache in correct context */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* * Flush the instruction cache, since we've played with text. * Do it before processing of module parameters, so the module @@ -3361,8 +3355,6 @@ static void flush_module_icache(const struct module *mod) + mod->init_layout.size); flush_icache_range((unsigned long)mod->core_layout.base, (unsigned long)mod->core_layout.base + mod->core_layout.size); - - set_fs(old_fs); } int __weak module_frob_arch_sections(Elf_Ehdr *hdr, From db33ec371be8e45956e8cebb5b0fe641f008430b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 7 Jun 2020 21:42:55 -0700 Subject: [PATCH 462/574] doc: cgroup: update note about conditions when oom killer is invoked Starting from v4.19 commit 29ef680ae7c2 ("memcg, oom: move out_of_memory back to the charge path") cgroup oom killer is no longer invoked only from page faults. Now it implements the same semantics as global OOM killer: allocation context invokes OOM killer and keeps retrying until success. [akpm@linux-foundation.org: fixes per Randy] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Randy Dunlap Link: http://lkml.kernel.org/r/158894738928.208854.5244393925922074518.stgit@buzz Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c2a4b652bd1a..ce3e05e41724 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1170,6 +1170,13 @@ PAGE_SIZE multiple when read back. Under certain circumstances, the usage may go over the limit temporarily. + In default configuration regular 0-order allocations always + succeed unless OOM killer chooses current task as a victim. + + Some kinds of allocations don't invoke the OOM killer. + Caller could retry them differently, return into userspace + as -ENOMEM or silently ignore in cases like disk readahead. + This is the ultimate protection mechanism. As long as the high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. @@ -1226,17 +1233,9 @@ PAGE_SIZE multiple when read back. The number of time the cgroup's memory usage was reached the limit and allocation was about to fail. - Depending on context result could be invocation of OOM - killer and retrying allocation or failing allocation. - - Failed allocation in its turn could be returned into - userspace as -ENOMEM or silently ignored in cases like - disk readahead. For now OOM in memory cgroup kills - tasks iff shortage has happened inside page fault. - This event is not raised if the OOM killer is not considered as an option, e.g. for failed high-order - allocations. + allocations or if caller asked to not retry attempts. oom_kill The number of processes belonging to this cgroup From 197298a64983e2beaf1a87413daff3044b4f3821 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 17 Mar 2020 22:34:33 +0100 Subject: [PATCH 463/574] exfat: Simplify exfat_utf8_d_cmp() for code points above U+FFFF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If two Unicode code points represented in UTF-16 are different then also their UTF-32 representation must be different. Therefore conversion from UTF-32 to UTF-16 is not needed. Signed-off-by: Pali Rohár Signed-off-by: Namjae Jeon --- fs/exfat/namei.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index a2659a8a68a1..731da41cabbf 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -185,14 +185,9 @@ static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len, if (u_a <= 0xFFFF && u_b <= 0xFFFF) { if (exfat_toupper(sb, u_a) != exfat_toupper(sb, u_b)) return 1; - } else if (u_a > 0xFFFF && u_b > 0xFFFF) { - if (exfat_low_surrogate(u_a) != - exfat_low_surrogate(u_b) || - exfat_high_surrogate(u_a) != - exfat_high_surrogate(u_b)) - return 1; } else { - return 1; + if (u_a != u_b) + return 1; } } From d1727d55c0327efdba2a5bad7801d509b721fef3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 24 Apr 2020 13:31:12 +0900 Subject: [PATCH 464/574] exfat: Use a more common logging style Remove the direct use of KERN_ in functions by creating separate exfat_ macros. Miscellanea: o Remove several unnecessary terminating newlines in formats o Realign arguments and fit to 80 columns where appropriate Signed-off-by: Joe Perches Signed-off-by: Namjae Jeon --- fs/exfat/balloc.c | 8 +++----- fs/exfat/dir.c | 9 ++++----- fs/exfat/exfat_fs.h | 7 +++++++ fs/exfat/fatent.c | 13 +++++-------- fs/exfat/misc.c | 4 ++-- fs/exfat/namei.c | 26 ++++++++++--------------- fs/exfat/nls.c | 20 ++++++++----------- fs/exfat/super.c | 47 ++++++++++++++++++++------------------------- 8 files changed, 60 insertions(+), 74 deletions(-) diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 6774a5a6ded8..4055eb00ea9b 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -58,9 +58,8 @@ static int exfat_allocate_bitmap(struct super_block *sb, need_map_size = ((EXFAT_DATA_CLUSTER_COUNT(sbi) - 1) / BITS_PER_BYTE) + 1; if (need_map_size != map_size) { - exfat_msg(sb, KERN_ERR, - "bogus allocation bitmap size(need : %u, cur : %lld)", - need_map_size, map_size); + exfat_err(sb, "bogus allocation bitmap size(need : %u, cur : %lld)", + need_map_size, map_size); /* * Only allowed when bogus allocation * bitmap size is large @@ -192,8 +191,7 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu) (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); if (ret_discard == -EOPNOTSUPP) { - exfat_msg(sb, KERN_ERR, - "discard not supported by device, disabling"); + exfat_err(sb, "discard not supported by device, disabling"); opts->discard = 0; } } diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 4b91afb0f051..53ae965da7ec 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -720,9 +720,8 @@ static int exfat_dir_readahead(struct super_block *sb, sector_t sec) return 0; if (sec < sbi->data_start_sector) { - exfat_msg(sb, KERN_ERR, - "requested sector is invalid(sect:%llu, root:%llu)", - (unsigned long long)sec, sbi->data_start_sector); + exfat_err(sb, "requested sector is invalid(sect:%llu, root:%llu)", + (unsigned long long)sec, sbi->data_start_sector); return -EIO; } @@ -750,7 +749,7 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb, sector_t sec; if (p_dir->dir == DIR_DELETED) { - exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry\n"); + exfat_err(sb, "abnormal access to deleted dentry"); return NULL; } @@ -853,7 +852,7 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, struct buffer_head *bh; if (p_dir->dir == DIR_DELETED) { - exfat_msg(sb, KERN_ERR, "access to deleted dentry\n"); + exfat_err(sb, "access to deleted dentry"); return NULL; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index d67fb8a6f770..1ebfb9085f1f 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -505,6 +505,13 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) fmt, ## args) void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...) __printf(3, 4) __cold; +#define exfat_err(sb, fmt, ...) \ + exfat_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) +#define exfat_warn(sb, fmt, ...) \ + exfat_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) +#define exfat_info(sb, fmt, ...) \ + exfat_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__) + void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_ms); void exfat_truncate_atime(struct timespec64 *ts); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index a855b1769a96..267e5e09eb13 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -170,8 +170,7 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) /* check cluster validation */ if (p_chain->dir < 2 && p_chain->dir >= sbi->num_clusters) { - exfat_msg(sb, KERN_ERR, "invalid start cluster (%u)", - p_chain->dir); + exfat_err(sb, "invalid start cluster (%u)", p_chain->dir); return -EIO; } @@ -305,8 +304,7 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) return 0; release_bhs: - exfat_msg(sb, KERN_ERR, "failed zeroed sect %llu\n", - (unsigned long long)blknr); + exfat_err(sb, "failed zeroed sect %llu\n", (unsigned long long)blknr); for (i = 0; i < n; i++) bforget(bhs[i]); return err; @@ -337,9 +335,8 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, /* find new cluster */ if (hint_clu == EXFAT_EOF_CLUSTER) { if (sbi->clu_srch_ptr < EXFAT_FIRST_CLUSTER) { - exfat_msg(sb, KERN_ERR, - "sbi->clu_srch_ptr is invalid (%u)\n", - sbi->clu_srch_ptr); + exfat_err(sb, "sbi->clu_srch_ptr is invalid (%u)\n", + sbi->clu_srch_ptr); sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; } @@ -350,7 +347,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, /* check cluster validation */ if (hint_clu < EXFAT_FIRST_CLUSTER && hint_clu >= sbi->num_clusters) { - exfat_msg(sb, KERN_ERR, "hint_cluster is invalid (%u)\n", + exfat_err(sb, "hint_cluster is invalid (%u)", hint_clu); hint_clu = EXFAT_FIRST_CLUSTER; if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index ebd2cbe3cbc1..ce5e8a1b0726 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -32,7 +32,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - exfat_msg(sb, KERN_ERR, "error, %pV\n", &vaf); + exfat_err(sb, "error, %pV", &vaf); va_end(args); } @@ -41,7 +41,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) sb->s_id); } else if (opts->errors == EXFAT_ERRORS_RO && !sb_rdonly(sb)) { sb->s_flags |= SB_RDONLY; - exfat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); + exfat_err(sb, "Filesystem has been set read-only"); } } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 731da41cabbf..585b47b2db3d 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -773,8 +773,8 @@ static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry, if (d_unhashed(alias)) { WARN_ON(alias->d_name.hash_len != dentry->d_name.hash_len); - exfat_msg(sb, KERN_INFO, - "rehashed a dentry(%p) in read lookup", alias); + exfat_info(sb, "rehashed a dentry(%p) in read lookup", + alias); d_drop(dentry); d_rehash(alias); } else if (!S_ISDIR(i_mode)) { @@ -819,7 +819,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) exfat_chain_dup(&cdir, &ei->dir); entry = ei->entry; if (ei->dir.dir == DIR_DELETED) { - exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry"); + exfat_err(sb, "abnormal access to deleted dentry"); err = -ENOENT; goto unlock; } @@ -974,7 +974,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) entry = ei->entry; if (ei->dir.dir == DIR_DELETED) { - exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry"); + exfat_err(sb, "abnormal access to deleted dentry"); err = -ENOENT; goto unlock; } @@ -986,9 +986,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) err = exfat_check_dir_empty(sb, &clu_to_free); if (err) { if (err == -EIO) - exfat_msg(sb, KERN_ERR, - "failed to exfat_check_dir_empty : err(%d)", - err); + exfat_err(sb, "failed to exfat_check_dir_empty : err(%d)", + err); goto unlock; } @@ -1009,9 +1008,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries); if (err) { - exfat_msg(sb, KERN_ERR, - "failed to exfat_remove_entries : err(%d)", - err); + exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err); goto unlock; } ei->dir.dir = DIR_DELETED; @@ -1240,8 +1237,7 @@ static int __exfat_rename(struct inode *old_parent_inode, return -EINVAL; if (ei->dir.dir == DIR_DELETED) { - exfat_msg(sb, KERN_ERR, - "abnormal access to deleted source dentry"); + exfat_err(sb, "abnormal access to deleted source dentry"); return -ENOENT; } @@ -1263,8 +1259,7 @@ static int __exfat_rename(struct inode *old_parent_inode, new_ei = EXFAT_I(new_inode); if (new_ei->dir.dir == DIR_DELETED) { - exfat_msg(sb, KERN_ERR, - "abnormal access to deleted target dentry"); + exfat_err(sb, "abnormal access to deleted target dentry"); goto out; } @@ -1426,8 +1421,7 @@ static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(new_inode->i_mode)) drop_nlink(new_inode); } else { - exfat_msg(sb, KERN_WARNING, - "abnormal access to an inode dropped"); + exfat_warn(sb, "abnormal access to an inode dropped"); WARN_ON(new_inode->i_nlink == 0); } new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime = diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 6d1c3ae130ff..2178786b708b 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -503,16 +503,14 @@ static int exfat_utf8_to_utf16(struct super_block *sb, unilen = utf8s_to_utf16s(p_cstring, len, UTF16_HOST_ENDIAN, (wchar_t *)uniname, MAX_NAME_LENGTH + 2); if (unilen < 0) { - exfat_msg(sb, KERN_ERR, - "failed to %s (err : %d) nls len : %d", - __func__, unilen, len); + exfat_err(sb, "failed to %s (err : %d) nls len : %d", + __func__, unilen, len); return unilen; } if (unilen > MAX_NAME_LENGTH) { - exfat_msg(sb, KERN_ERR, - "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d", - __func__, len, unilen, MAX_NAME_LENGTH); + exfat_err(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d", + __func__, len, unilen, MAX_NAME_LENGTH); return -ENAMETOOLONG; } @@ -687,9 +685,8 @@ static int exfat_load_upcase_table(struct super_block *sb, bh = sb_bread(sb, sector); if (!bh) { - exfat_msg(sb, KERN_ERR, - "failed to read sector(0x%llx)\n", - (unsigned long long)sector); + exfat_err(sb, "failed to read sector(0x%llx)\n", + (unsigned long long)sector); ret = -EIO; goto free_table; } @@ -722,9 +719,8 @@ static int exfat_load_upcase_table(struct super_block *sb, if (index >= 0xFFFF && utbl_checksum == checksum) return 0; - exfat_msg(sb, KERN_ERR, - "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)\n", - index, checksum, utbl_checksum); + exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)", + index, checksum, utbl_checksum); ret = -EINVAL; free_table: exfat_free_upcase_table(sbi); diff --git a/fs/exfat/super.c b/fs/exfat/super.c index a846ff555656..f9aa1e5dc238 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -376,15 +376,13 @@ static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb) if (!is_power_of_2(logical_sect) || logical_sect < 512 || logical_sect > 4096) { - exfat_msg(sb, KERN_ERR, "bogus logical sector size %u", - logical_sect); + exfat_err(sb, "bogus logical sector size %u", logical_sect); return NULL; } if (logical_sect < sb->s_blocksize) { - exfat_msg(sb, KERN_ERR, - "logical sector size too small for device (logical sector size = %u)", - logical_sect); + exfat_err(sb, "logical sector size too small for device (logical sector size = %u)", + logical_sect); return NULL; } @@ -393,15 +391,14 @@ static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb) sbi->pbr_bh = NULL; if (!sb_set_blocksize(sb, logical_sect)) { - exfat_msg(sb, KERN_ERR, - "unable to set blocksize %u", logical_sect); + exfat_err(sb, "unable to set blocksize %u", + logical_sect); return NULL; } sbi->pbr_bh = sb_bread(sb, 0); if (!sbi->pbr_bh) { - exfat_msg(sb, KERN_ERR, - "unable to read boot sector (logical sector size = %lu)", - sb->s_blocksize); + exfat_err(sb, "unable to read boot sector (logical sector size = %lu)", + sb->s_blocksize); return NULL; } @@ -424,7 +421,7 @@ static int __exfat_fill_super(struct super_block *sb) /* read boot sector */ sbi->pbr_bh = sb_bread(sb, 0); if (!sbi->pbr_bh) { - exfat_msg(sb, KERN_ERR, "unable to read boot sector"); + exfat_err(sb, "unable to read boot sector"); return -EIO; } @@ -433,7 +430,7 @@ static int __exfat_fill_super(struct super_block *sb) /* check the validity of PBR */ if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) { - exfat_msg(sb, KERN_ERR, "invalid boot record signature"); + exfat_err(sb, "invalid boot record signature"); ret = -EINVAL; goto free_bh; } @@ -458,7 +455,7 @@ static int __exfat_fill_super(struct super_block *sb) p_bpb = (struct pbr64 *)p_pbr; if (!p_bpb->bsx.num_fats) { - exfat_msg(sb, KERN_ERR, "bogus number of FAT structure"); + exfat_err(sb, "bogus number of FAT structure"); ret = -EINVAL; goto free_bh; } @@ -488,8 +485,7 @@ static int __exfat_fill_super(struct super_block *sb) if (le16_to_cpu(p_bpb->bsx.vol_flags) & VOL_DIRTY) { sbi->vol_flag |= VOL_DIRTY; - exfat_msg(sb, KERN_WARNING, - "Volume was not properly unmounted. Some data may be corrupt. Please run fsck."); + exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck."); } /* exFAT file size is limited by a disk volume size */ @@ -498,19 +494,19 @@ static int __exfat_fill_super(struct super_block *sb) ret = exfat_create_upcase_table(sb); if (ret) { - exfat_msg(sb, KERN_ERR, "failed to load upcase table"); + exfat_err(sb, "failed to load upcase table"); goto free_bh; } ret = exfat_load_bitmap(sb); if (ret) { - exfat_msg(sb, KERN_ERR, "failed to load alloc-bitmap"); + exfat_err(sb, "failed to load alloc-bitmap"); goto free_upcase_table; } ret = exfat_count_used_clusters(sb, &sbi->used_clusters); if (ret) { - exfat_msg(sb, KERN_ERR, "failed to scan clusters"); + exfat_err(sb, "failed to scan clusters"); goto free_alloc_bitmap; } @@ -539,8 +535,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) struct request_queue *q = bdev_get_queue(sb->s_bdev); if (!blk_queue_discard(q)) { - exfat_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but the device does not support discard"); + exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard"); opts->discard = 0; } } @@ -555,7 +550,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) err = __exfat_fill_super(sb); if (err) { - exfat_msg(sb, KERN_ERR, "failed to recognize exfat type"); + exfat_err(sb, "failed to recognize exfat type"); goto check_nls_io; } @@ -567,8 +562,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) else { sbi->nls_io = load_nls(sbi->options.iocharset); if (!sbi->nls_io) { - exfat_msg(sb, KERN_ERR, "IO charset %s not found", - sbi->options.iocharset); + exfat_err(sb, "IO charset %s not found", + sbi->options.iocharset); err = -EINVAL; goto free_table; } @@ -581,7 +576,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) root_inode = new_inode(sb); if (!root_inode) { - exfat_msg(sb, KERN_ERR, "failed to allocate root inode."); + exfat_err(sb, "failed to allocate root inode"); err = -ENOMEM; goto free_table; } @@ -590,7 +585,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) inode_set_iversion(root_inode, 1); err = exfat_read_root(root_inode); if (err) { - exfat_msg(sb, KERN_ERR, "failed to initialize root inode."); + exfat_err(sb, "failed to initialize root inode"); goto put_inode; } @@ -599,7 +594,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_root = d_make_root(root_inode); if (!sb->s_root) { - exfat_msg(sb, KERN_ERR, "failed to get the root dentry"); + exfat_err(sb, "failed to get the root dentry"); err = -ENOMEM; goto put_inode; } From 31f5acc0aaa319a34308442413ef0b878e9d2c93 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 6 Apr 2020 11:13:43 +0200 Subject: [PATCH 465/574] exfat: Improve wording of EXFAT_DEFAULT_IOCHARSET config option - Use consistent capitalization for "exFAT". - Fix grammar, - Split long sentence. Signed-off-by: Geert Uytterhoeven Signed-off-by: Namjae Jeon --- fs/exfat/Kconfig | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig index 2d3636dc5b8c..5a65071b5ecf 100644 --- a/fs/exfat/Kconfig +++ b/fs/exfat/Kconfig @@ -16,6 +16,7 @@ config EXFAT_DEFAULT_IOCHARSET depends on EXFAT_FS help Set this to the default input/output character set to use for - converting between the encoding is used for user visible filename and - UTF-16 character that exfat filesystem use, and can be overridden with - the "iocharset" mount option for exFAT filesystems. + converting between the encoding that is used for user visible + filenames and the UTF-16 character encoding that the exFAT + filesystem uses. This can be overridden with the "iocharset" mount + option for the exFAT filesystems. From dddf7da3985ee144b46ee287e81e47b9ee8bb980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 17 Mar 2020 23:25:52 +0100 Subject: [PATCH 466/574] exfat: Simplify exfat_utf8_d_hash() for code points above U+FFFF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function partial_name_hash() takes long type value into which can be stored one Unicode code point. Therefore conversion from UTF-32 to UTF-16 is not needed. Signed-off-by: Pali Rohár Signed-off-by: Namjae Jeon --- fs/exfat/namei.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 585b47b2db3d..fa926b9c883a 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -147,16 +147,10 @@ static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr) return charlen; /* - * Convert to UTF-16: code points above U+FFFF are encoded as - * surrogate pairs. * exfat_toupper() works only for code points up to the U+FFFF. */ - if (u > 0xFFFF) { - hash = partial_name_hash(exfat_high_surrogate(u), hash); - hash = partial_name_hash(exfat_low_surrogate(u), hash); - } else { - hash = partial_name_hash(exfat_toupper(sb, u), hash); - } + hash = partial_name_hash(u <= 0xFFFF ? exfat_toupper(sb, u) : u, + hash); } qstr->hash = end_name_hash(hash); From 6778337a7a4e51ec9fa4a76846d34e8a66ca6418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Tue, 17 Mar 2020 23:25:54 +0100 Subject: [PATCH 467/574] exfat: Remove unused functions exfat_high_surrogate() and exfat_low_surrogate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After applying previous two patches, these functions are not used anymore. Signed-off-by: Pali Rohár Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 -- fs/exfat/nls.c | 13 ------------- 2 files changed, 15 deletions(-) diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 1ebfb9085f1f..3862df6af738 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -492,8 +492,6 @@ int exfat_nls_to_utf16(struct super_block *sb, struct exfat_uni_name *uniname, int *p_lossy); int exfat_create_upcase_table(struct super_block *sb); void exfat_free_upcase_table(struct exfat_sb_info *sbi); -unsigned short exfat_high_surrogate(unicode_t u); -unsigned short exfat_low_surrogate(unicode_t u); /* exfat/misc.c */ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 2178786b708b..1ebda90cbdd7 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -535,22 +535,9 @@ static int exfat_utf8_to_utf16(struct super_block *sb, return unilen; } -#define PLANE_SIZE 0x00010000 #define SURROGATE_MASK 0xfffff800 #define SURROGATE_PAIR 0x0000d800 #define SURROGATE_LOW 0x00000400 -#define SURROGATE_BITS 0x000003ff - -unsigned short exfat_high_surrogate(unicode_t u) -{ - return ((u - PLANE_SIZE) >> 10) + SURROGATE_PAIR; -} - -unsigned short exfat_low_surrogate(unicode_t u) -{ - return ((u - PLANE_SIZE) & SURROGATE_BITS) | SURROGATE_PAIR | - SURROGATE_LOW; -} static int __exfat_utf16_to_nls(struct super_block *sb, struct exfat_uni_name *p_uniname, unsigned char *p_cstring, From cdc06129a6cea0e4863fa2b34a0c132c6eb7278b Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 16 Apr 2020 13:34:26 +0900 Subject: [PATCH 468/574] exfat: remove the assignment of 0 to bool variable There is no need to init 'sync' in exfat_set_vol_flags(). This also fixes the following coccicheck warning: fs/exfat/super.c:104:6-10: WARNING: Assignment of 0/1 to bool variable Signed-off-by: Jason Yan Signed-off-by: Namjae Jeon --- fs/exfat/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index f9aa1e5dc238..c1f47f4071a8 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -102,7 +102,7 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) { struct exfat_sb_info *sbi = EXFAT_SB(sb); struct pbr64 *bpb = (struct pbr64 *)sbi->pbr_bh->b_data; - bool sync = 0; + bool sync; /* flags are not changed */ if (sbi->vol_flag == new_flag) From ed0f84d30ba65f44bed2739572c7ab0fdeed4004 Mon Sep 17 00:00:00 2001 From: Tetsuhiro Kohada Date: Wed, 22 Apr 2020 08:30:56 +0900 Subject: [PATCH 469/574] exfat: replace 'time_ms' with 'time_cs' Replace time_ms with time_cs in the file directory entry structure and related functions. The unit of create_time_ms/modify_time_ms in File Directory Entry are not 'milli-second', but 'centi-second'. The exfat specification uses the term '10ms', but instead use 'cs' as in msdos_fs.h. Signed-off-by: Tetsuhiro Kohada Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 8 ++++---- fs/exfat/exfat_fs.h | 4 ++-- fs/exfat/exfat_raw.h | 4 ++-- fs/exfat/file.c | 2 +- fs/exfat/inode.c | 4 ++-- fs/exfat/misc.c | 18 +++++++++--------- fs/exfat/namei.c | 4 ++-- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 53ae965da7ec..b5a237c33d50 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -137,12 +137,12 @@ static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry) ep->dentry.file.create_tz, ep->dentry.file.create_time, ep->dentry.file.create_date, - ep->dentry.file.create_time_ms); + ep->dentry.file.create_time_cs); exfat_get_entry_time(sbi, &dir_entry->mtime, ep->dentry.file.modify_tz, ep->dentry.file.modify_time, ep->dentry.file.modify_date, - ep->dentry.file.modify_time_ms); + ep->dentry.file.modify_time_cs); exfat_get_entry_time(sbi, &dir_entry->atime, ep->dentry.file.access_tz, ep->dentry.file.access_time, @@ -461,12 +461,12 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, &ep->dentry.file.create_tz, &ep->dentry.file.create_time, &ep->dentry.file.create_date, - &ep->dentry.file.create_time_ms); + &ep->dentry.file.create_time_cs); exfat_set_entry_time(sbi, &ts, &ep->dentry.file.modify_tz, &ep->dentry.file.modify_time, &ep->dentry.file.modify_date, - &ep->dentry.file.modify_time_ms); + &ep->dentry.file.modify_time_cs); exfat_set_entry_time(sbi, &ts, &ep->dentry.file.access_tz, &ep->dentry.file.access_time, diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 3862df6af738..294aa7792bc3 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -511,10 +511,10 @@ void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...) exfat_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__) void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, - u8 tz, __le16 time, __le16 date, u8 time_ms); + u8 tz, __le16 time, __le16 date, u8 time_cs); void exfat_truncate_atime(struct timespec64 *ts); void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, - u8 *tz, __le16 *time, __le16 *date, u8 *time_ms); + u8 *tz, __le16 *time, __le16 *date, u8 *time_cs); unsigned short exfat_calc_chksum_2byte(void *data, int len, unsigned short chksum, int type); void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync); diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 2a841010e649..8d6c64a7546d 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -136,8 +136,8 @@ struct exfat_dentry { __le16 modify_date; __le16 access_time; __le16 access_date; - __u8 create_time_ms; - __u8 modify_time_ms; + __u8 create_time_cs; + __u8 modify_time_cs; __u8 create_tz; __u8 modify_tz; __u8 access_tz; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index c9db8eb0cfc3..84f3d31a3a55 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -165,7 +165,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) &ep->dentry.file.modify_tz, &ep->dentry.file.modify_time, &ep->dentry.file.modify_date, - &ep->dentry.file.modify_time_ms); + &ep->dentry.file.modify_time_cs); ep->dentry.file.attr = cpu_to_le16(ei->attr); /* File size should be zero if there is no cluster allocated */ diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 06887492f54b..3f367d081cd6 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -56,12 +56,12 @@ static int __exfat_write_inode(struct inode *inode, int sync) &ep->dentry.file.create_tz, &ep->dentry.file.create_time, &ep->dentry.file.create_date, - &ep->dentry.file.create_time_ms); + &ep->dentry.file.create_time_cs); exfat_set_entry_time(sbi, &inode->i_mtime, &ep->dentry.file.modify_tz, &ep->dentry.file.modify_time, &ep->dentry.file.modify_date, - &ep->dentry.file.modify_time_ms); + &ep->dentry.file.modify_time_cs); exfat_set_entry_time(sbi, &inode->i_atime, &ep->dentry.file.access_tz, &ep->dentry.file.access_time, diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index ce5e8a1b0726..ab7f88b1f6d3 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -75,7 +75,7 @@ static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off) /* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, - u8 tz, __le16 time, __le16 date, u8 time_ms) + u8 tz, __le16 time, __le16 date, u8 time_cs) { u16 t = le16_to_cpu(time); u16 d = le16_to_cpu(date); @@ -84,10 +84,10 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, t >> 11, (t >> 5) & 0x003F, (t & 0x001F) << 1); - /* time_ms field represent 0 ~ 199(1990 ms) */ - if (time_ms) { - ts->tv_sec += time_ms / 100; - ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC; + /* time_cs field represent 0 ~ 199cs(1990 ms) */ + if (time_cs) { + ts->tv_sec += time_cs / 100; + ts->tv_nsec = (time_cs % 100) * 10 * NSEC_PER_MSEC; } else ts->tv_nsec = 0; @@ -101,7 +101,7 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, /* Convert linear UNIX date to a EXFAT time/date pair. */ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, - u8 *tz, __le16 *time, __le16 *date, u8 *time_ms) + u8 *tz, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; u16 t, d; @@ -113,9 +113,9 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, *time = cpu_to_le16(t); *date = cpu_to_le16(d); - /* time_ms field represent 0 ~ 199(1990 ms) */ - if (time_ms) - *time_ms = (tm.tm_sec & 1) * 100 + + /* time_cs field represent 0 ~ 199cs(1990 ms) */ + if (time_cs) + *time_cs = (tm.tm_sec & 1) * 100 + ts->tv_nsec / (10 * NSEC_PER_MSEC); /* diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index fa926b9c883a..48f4df883f3b 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -689,12 +689,12 @@ static int exfat_find(struct inode *dir, struct qstr *qname, ep->dentry.file.create_tz, ep->dentry.file.create_time, ep->dentry.file.create_date, - ep->dentry.file.create_time_ms); + ep->dentry.file.create_time_cs); exfat_get_entry_time(sbi, &info->mtime, ep->dentry.file.modify_tz, ep->dentry.file.modify_time, ep->dentry.file.modify_date, - ep->dentry.file.modify_time_ms); + ep->dentry.file.modify_time_cs); exfat_get_entry_time(sbi, &info->atime, ep->dentry.file.access_tz, ep->dentry.file.access_time, From 943af1fdacfebe4ff13430655abc460a5e1072f7 Mon Sep 17 00:00:00 2001 From: Tetsuhiro Kohada Date: Wed, 20 May 2020 16:56:41 +0900 Subject: [PATCH 470/574] exfat: optimize dir-cache Optimize directory access based on exfat_entry_set_cache. - Hold bh instead of copied d-entry. - Modify bh->data directly instead of the copied d-entry. - Write back the retained bh instead of rescanning the d-entry-set. And - Remove unused cache related definitions. Signed-off-by: Tetsuhiro Kohada Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 197 +++++++++++++++++--------------------------- fs/exfat/exfat_fs.h | 27 +++--- fs/exfat/file.c | 15 ++-- fs/exfat/inode.c | 51 +++++------- fs/exfat/namei.c | 14 ++-- 5 files changed, 123 insertions(+), 181 deletions(-) diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index b5a237c33d50..2902d285bf20 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -32,35 +32,30 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned short *uniname) { int i; - struct exfat_dentry *ep; struct exfat_entry_set_cache *es; - es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES, &ep); + es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES); if (!es) return; - if (es->num_entries < 3) - goto free_es; - - ep += 2; - /* * First entry : file entry * Second entry : stream-extension entry * Third entry : first file-name entry * So, the index of first file-name dentry should start from 2. */ - for (i = 2; i < es->num_entries; i++, ep++) { + for (i = 2; i < es->num_entries; i++) { + struct exfat_dentry *ep = exfat_get_dentry_cached(es, i); + /* end of name entry */ if (exfat_get_entry_type(ep) != TYPE_EXTEND) - goto free_es; + break; exfat_extract_uni_name(ep, uniname); uniname += EXFAT_FILE_NAME_LEN; } -free_es: - kfree(es); + exfat_free_dentry_set(es, false); } /* read a directory entry from the opened directory */ @@ -590,62 +585,33 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, return 0; } -int exfat_update_dir_chksum_with_entry_set(struct super_block *sb, - struct exfat_entry_set_cache *es, int sync) +void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es) { - struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct buffer_head *bh; - sector_t sec = es->sector; - unsigned int off = es->offset; - int chksum_type = CS_DIR_ENTRY, i, num_entries = es->num_entries; - unsigned int buf_off = (off - es->offset); - unsigned int remaining_byte_in_sector, copy_entries, clu; + int chksum_type = CS_DIR_ENTRY, i; unsigned short chksum = 0; + struct exfat_dentry *ep; - for (i = 0; i < num_entries; i++) { - chksum = exfat_calc_chksum_2byte(&es->entries[i], DENTRY_SIZE, - chksum, chksum_type); + for (i = 0; i < es->num_entries; i++) { + ep = exfat_get_dentry_cached(es, i); + chksum = exfat_calc_chksum_2byte(ep, DENTRY_SIZE, chksum, + chksum_type); chksum_type = CS_DEFAULT; } + ep = exfat_get_dentry_cached(es, 0); + ep->dentry.file.checksum = cpu_to_le16(chksum); + es->modified = true; +} - es->entries[0].dentry.file.checksum = cpu_to_le16(chksum); +void exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync) +{ + int i; - while (num_entries) { - /* write per sector base */ - remaining_byte_in_sector = (1 << sb->s_blocksize_bits) - off; - copy_entries = min_t(int, - EXFAT_B_TO_DEN(remaining_byte_in_sector), - num_entries); - bh = sb_bread(sb, sec); - if (!bh) - goto err_out; - memcpy(bh->b_data + off, - (unsigned char *)&es->entries[0] + buf_off, - EXFAT_DEN_TO_B(copy_entries)); - exfat_update_bh(sb, bh, sync); - brelse(bh); - num_entries -= copy_entries; - - if (num_entries) { - /* get next sector */ - if (exfat_is_last_sector_in_cluster(sbi, sec)) { - clu = exfat_sector_to_cluster(sbi, sec); - if (es->alloc_flag == ALLOC_NO_FAT_CHAIN) - clu++; - else if (exfat_get_next_cluster(sb, &clu)) - goto err_out; - sec = exfat_cluster_to_sector(sbi, clu); - } else { - sec++; - } - off = 0; - buf_off += EXFAT_DEN_TO_B(copy_entries); - } + for (i = 0; i < es->num_bh; i++) { + if (es->modified) + exfat_update_bh(es->sb, es->bh[i], sync); + brelse(es->bh[i]); } - - return 0; -err_out: - return -EIO; + kfree(es); } static int exfat_walk_fat_chain(struct super_block *sb, @@ -820,34 +786,40 @@ static bool exfat_validate_entry(unsigned int type, } } +struct exfat_dentry *exfat_get_dentry_cached( + struct exfat_entry_set_cache *es, int num) +{ + int off = es->start_off + num * DENTRY_SIZE; + struct buffer_head *bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)]; + char *p = bh->b_data + EXFAT_BLK_OFFSET(off, es->sb); + + return (struct exfat_dentry *)p; +} + /* * Returns a set of dentries for a file or dir. * - * Note that this is a copy (dump) of dentries so that user should - * call write_entry_set() to apply changes made in this entry set - * to the real device. + * Note It provides a direct pointer to bh->data via exfat_get_dentry_cached(). + * User should call exfat_get_dentry_set() after setting 'modified' to apply + * changes made in this entry set to the real device. * * in: * sb+p_dir+entry: indicates a file/dir * type: specifies how many dentries should be included. - * out: - * file_ep: will point the first dentry(= file dentry) on success * return: * pointer of entry set on success, * NULL on failure. */ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, - struct exfat_chain *p_dir, int entry, unsigned int type, - struct exfat_dentry **file_ep) + struct exfat_chain *p_dir, int entry, unsigned int type) { - int ret; + int ret, i, num_bh; unsigned int off, byte_offset, clu = 0; - unsigned int entry_type; sector_t sec; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_entry_set_cache *es; - struct exfat_dentry *ep, *pos; - unsigned char num_entries; + struct exfat_dentry *ep; + int num_entries; enum exfat_validate_dentry_mode mode = ES_MODE_STARTED; struct buffer_head *bh; @@ -861,11 +833,18 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, if (ret) return NULL; + es = kzalloc(sizeof(*es), GFP_KERNEL); + if (!es) + return NULL; + es->sb = sb; + es->modified = false; + /* byte offset in cluster */ byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi); /* byte offset in sector */ off = EXFAT_BLK_OFFSET(byte_offset, sb); + es->start_off = off; /* sector offset in cluster */ sec = EXFAT_B_TO_BLK(byte_offset, sb); @@ -873,72 +852,46 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, bh = sb_bread(sb, sec); if (!bh) - return NULL; + goto free_es; + es->bh[es->num_bh++] = bh; - ep = (struct exfat_dentry *)(bh->b_data + off); - entry_type = exfat_get_entry_type(ep); - - if (entry_type != TYPE_FILE && entry_type != TYPE_DIR) - goto release_bh; + ep = exfat_get_dentry_cached(es, 0); + if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) + goto free_es; num_entries = type == ES_ALL_ENTRIES ? ep->dentry.file.num_ext + 1 : type; - es = kmalloc(struct_size(es, entries, num_entries), GFP_KERNEL); - if (!es) - goto release_bh; - es->num_entries = num_entries; - es->sector = sec; - es->offset = off; - es->alloc_flag = p_dir->flags; - pos = &es->entries[0]; - - while (num_entries) { - if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) - goto free_es; - - /* copy dentry */ - memcpy(pos, ep, sizeof(struct exfat_dentry)); - - if (--num_entries == 0) - break; - - if (((off + DENTRY_SIZE) & (sb->s_blocksize - 1)) < - (off & (sb->s_blocksize - 1))) { - /* get the next sector */ - if (exfat_is_last_sector_in_cluster(sbi, sec)) { - if (es->alloc_flag == ALLOC_NO_FAT_CHAIN) - clu++; - else if (exfat_get_next_cluster(sb, &clu)) - goto free_es; - sec = exfat_cluster_to_sector(sbi, clu); - } else { - sec++; - } - - brelse(bh); - bh = sb_bread(sb, sec); - if (!bh) + num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb); + for (i = 1; i < num_bh; i++) { + /* get the next sector */ + if (exfat_is_last_sector_in_cluster(sbi, sec)) { + if (p_dir->flags == ALLOC_NO_FAT_CHAIN) + clu++; + else if (exfat_get_next_cluster(sb, &clu)) goto free_es; - off = 0; - ep = (struct exfat_dentry *)bh->b_data; + sec = exfat_cluster_to_sector(sbi, clu); } else { - ep++; - off += DENTRY_SIZE; + sec++; } - pos++; + + bh = sb_bread(sb, sec); + if (!bh) + goto free_es; + es->bh[es->num_bh++] = bh; } - if (file_ep) - *file_ep = &es->entries[0]; - brelse(bh); + /* validiate cached dentries */ + for (i = 1; i < num_entries; i++) { + ep = exfat_get_dentry_cached(es, i); + if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) + goto free_es; + } return es; free_es: - kfree(es); -release_bh: - brelse(bh); + exfat_free_dentry_set(es, false); return NULL; } diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 294aa7792bc3..c84ae9e60508 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -71,10 +71,8 @@ enum { #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ #define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE) -#define FAT_CACHE_SIZE 128 -#define FAT_CACHE_HASH_SIZE 64 -#define BUF_CACHE_SIZE 256 -#define BUF_CACHE_HASH_SIZE 64 +/* Enough size to hold 256 dentry (even 512 Byte sector) */ +#define DIR_CACHE_SIZE (256*sizeof(struct exfat_dentry)/512+1) #define EXFAT_HINT_NONE -1 #define EXFAT_MIN_SUBDIR 2 @@ -170,14 +168,12 @@ struct exfat_hint { }; struct exfat_entry_set_cache { - /* sector number that contains file_entry */ - sector_t sector; - /* byte offset in the sector */ - unsigned int offset; - /* flag in stream entry. 01 for cluster chain, 03 for contig. */ - int alloc_flag; + struct super_block *sb; + bool modified; + unsigned int start_off; + int num_bh; + struct buffer_head *bh[DIR_CACHE_SIZE]; unsigned int num_entries; - struct exfat_dentry entries[]; }; struct exfat_dir_entry { @@ -451,8 +447,7 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, int entry, int order, int num_entries); int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, int entry); -int exfat_update_dir_chksum_with_entry_set(struct super_block *sb, - struct exfat_entry_set_cache *es, int sync); +void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es); int exfat_calc_num_entries(struct exfat_uni_name *p_uniname); int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, @@ -463,9 +458,11 @@ int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh, sector_t *sector); +struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, + int num); struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, - struct exfat_chain *p_dir, int entry, unsigned int type, - struct exfat_dentry **file_ep); + struct exfat_chain *p_dir, int entry, unsigned int type); +void exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); /* inode.c */ diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 84f3d31a3a55..8e3f0eef45d7 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -96,11 +96,9 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) unsigned int num_clusters_new, num_clusters_phys; unsigned int last_clu = EXFAT_FREE_CLUSTER; struct exfat_chain clu; - struct exfat_dentry *ep, *ep2; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); - struct exfat_entry_set_cache *es = NULL; int evict = (ei->dir.dir == DIR_DELETED) ? 1 : 0; /* check if the given file ID is opened */ @@ -153,12 +151,15 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) /* update the directory entry */ if (!evict) { struct timespec64 ts; + struct exfat_dentry *ep, *ep2; + struct exfat_entry_set_cache *es; es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, - ES_ALL_ENTRIES, &ep); + ES_ALL_ENTRIES); if (!es) return -EIO; - ep2 = ep + 1; + ep = exfat_get_dentry_cached(es, 0); + ep2 = exfat_get_dentry_cached(es, 1); ts = current_time(inode); exfat_set_entry_time(sbi, &ts, @@ -185,10 +186,8 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; } - if (exfat_update_dir_chksum_with_entry_set(sb, es, - inode_needs_sync(inode))) - return -EIO; - kfree(es); + exfat_update_dir_chksum_with_entry_set(es); + exfat_free_dentry_set(es, inode_needs_sync(inode)); } /* cut off from the FAT chain */ diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 3f367d081cd6..ef7cf7a6d187 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -19,7 +19,6 @@ static int __exfat_write_inode(struct inode *inode, int sync) { - int ret = -EIO; unsigned long long on_disk_size; struct exfat_dentry *ep, *ep2; struct exfat_entry_set_cache *es = NULL; @@ -43,11 +42,11 @@ static int __exfat_write_inode(struct inode *inode, int sync) exfat_set_vol_flags(sb, VOL_DIRTY); /* get the directory entry of given file or directory */ - es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES, - &ep); + es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES); if (!es) return -EIO; - ep2 = ep + 1; + ep = exfat_get_dentry_cached(es, 0); + ep2 = exfat_get_dentry_cached(es, 1); ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); @@ -77,9 +76,9 @@ static int __exfat_write_inode(struct inode *inode, int sync) ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); ep2->dentry.stream.size = ep2->dentry.stream.valid_size; - ret = exfat_update_dir_chksum_with_entry_set(sb, es, sync); - kfree(es); - return ret; + exfat_update_dir_chksum_with_entry_set(es); + exfat_free_dentry_set(es, sync); + return 0; } int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -110,8 +109,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, int ret, modified = false; unsigned int last_clu; struct exfat_chain new_clu; - struct exfat_dentry *ep; - struct exfat_entry_set_cache *es = NULL; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); @@ -222,34 +219,28 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, num_clusters += num_to_be_allocated; *clu = new_clu.dir; - if (ei->dir.dir != DIR_DELETED) { + if (ei->dir.dir != DIR_DELETED && modified) { + struct exfat_dentry *ep; + struct exfat_entry_set_cache *es; + es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, - ES_ALL_ENTRIES, &ep); + ES_ALL_ENTRIES); if (!es) return -EIO; /* get stream entry */ - ep++; + ep = exfat_get_dentry_cached(es, 1); /* update directory entry */ - if (modified) { - if (ep->dentry.stream.flags != ei->flags) - ep->dentry.stream.flags = ei->flags; + ep->dentry.stream.flags = ei->flags; + ep->dentry.stream.start_clu = + cpu_to_le32(ei->start_clu); + ep->dentry.stream.valid_size = + cpu_to_le64(i_size_read(inode)); + ep->dentry.stream.size = + ep->dentry.stream.valid_size; - if (le32_to_cpu(ep->dentry.stream.start_clu) != - ei->start_clu) - ep->dentry.stream.start_clu = - cpu_to_le32(ei->start_clu); - - ep->dentry.stream.valid_size = - cpu_to_le64(i_size_read(inode)); - ep->dentry.stream.size = - ep->dentry.stream.valid_size; - } - - if (exfat_update_dir_chksum_with_entry_set(sb, es, - inode_needs_sync(inode))) - return -EIO; - kfree(es); + exfat_update_dir_chksum_with_entry_set(es); + exfat_free_dentry_set(es, inode_needs_sync(inode)); } /* end of if != DIR_DELETED */ diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 48f4df883f3b..5b0f35329d63 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -600,8 +600,6 @@ static int exfat_find(struct inode *dir, struct qstr *qname, int ret, dentry, num_entries, count; struct exfat_chain cdir; struct exfat_uni_name uni_name; - struct exfat_dentry *ep, *ep2; - struct exfat_entry_set_cache *es = NULL; struct super_block *sb = dir->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(dir); @@ -660,10 +658,14 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->num_subdirs = count; } else { - es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES, &ep); + struct exfat_dentry *ep, *ep2; + struct exfat_entry_set_cache *es; + + es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES); if (!es) return -EIO; - ep2 = ep + 1; + ep = exfat_get_dentry_cached(es, 0); + ep2 = exfat_get_dentry_cached(es, 1); info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); @@ -681,7 +683,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, exfat_fs_error(sb, "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)", i_size_read(dir), ei->dir.dir, ei->entry); - kfree(es); + exfat_free_dentry_set(es, false); return -EIO; } @@ -700,7 +702,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, ep->dentry.file.access_time, ep->dentry.file.access_date, 0); - kfree(es); + exfat_free_dentry_set(es, false); if (info->type == TYPE_DIR) { exfat_chain_set(&cdir, info->start_clu, From 181a9e8009a8a8bdb19c2e24eeeae7d8e77c8c47 Mon Sep 17 00:00:00 2001 From: Tetsuhiro Kohada Date: Fri, 29 May 2020 19:14:56 +0900 Subject: [PATCH 471/574] exfat: redefine PBR as boot_sector Aggregate PBR related definitions and redefine as "boot_sector" to comply with the exFAT specification. And, rename variable names including 'pbr'. Signed-off-by: Tetsuhiro Kohada Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 2 +- fs/exfat/exfat_raw.h | 79 +++++++++++++++-------------------------- fs/exfat/super.c | 84 ++++++++++++++++++++++---------------------- 3 files changed, 72 insertions(+), 93 deletions(-) diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index c84ae9e60508..911f58b93f3d 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -227,7 +227,7 @@ struct exfat_sb_info { unsigned int root_dir; /* root dir cluster */ unsigned int dentries_per_clu; /* num of dentries per cluster */ unsigned int vol_flag; /* volume dirty flag */ - struct buffer_head *pbr_bh; /* buffer_head of PBR sector */ + struct buffer_head *boot_bh; /* buffer_head of BOOT sector */ unsigned int map_clu; /* allocation bitmap start cluster */ unsigned int map_sectors; /* num of allocation bitmap sectors */ diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 8d6c64a7546d..07f74190df44 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -8,7 +8,8 @@ #include -#define PBR_SIGNATURE 0xAA55 +#define BOOT_SIGNATURE 0xAA55 +#define EXBOOT_SIGNATURE 0xAA550000 #define EXFAT_MAX_FILE_LEN 255 @@ -55,7 +56,7 @@ /* checksum types */ #define CS_DIR_ENTRY 0 -#define CS_PBR_SECTOR 1 +#define CS_BOOT_SECTOR 1 #define CS_DEFAULT 2 /* file attributes */ @@ -69,57 +70,35 @@ #define ATTR_RWMASK (ATTR_HIDDEN | ATTR_SYSTEM | ATTR_VOLUME | \ ATTR_SUBDIR | ATTR_ARCHIVE) -#define PBR64_JUMP_BOOT_LEN 3 -#define PBR64_OEM_NAME_LEN 8 -#define PBR64_RESERVED_LEN 53 +#define BOOTSEC_JUMP_BOOT_LEN 3 +#define BOOTSEC_FS_NAME_LEN 8 +#define BOOTSEC_OLDBPB_LEN 53 #define EXFAT_FILE_NAME_LEN 15 -/* EXFAT BIOS parameter block (64 bytes) */ -struct bpb64 { - __u8 jmp_boot[PBR64_JUMP_BOOT_LEN]; - __u8 oem_name[PBR64_OEM_NAME_LEN]; - __u8 res_zero[PBR64_RESERVED_LEN]; -} __packed; - -/* EXFAT EXTEND BIOS parameter block (56 bytes) */ -struct bsx64 { - __le64 vol_offset; - __le64 vol_length; - __le32 fat_offset; - __le32 fat_length; - __le32 clu_offset; - __le32 clu_count; - __le32 root_cluster; - __le32 vol_serial; - __u8 fs_version[2]; - __le16 vol_flags; - __u8 sect_size_bits; - __u8 sect_per_clus_bits; - __u8 num_fats; - __u8 phy_drv_no; - __u8 perc_in_use; - __u8 reserved2[7]; -} __packed; - -/* EXFAT PBR[BPB+BSX] (120 bytes) */ -struct pbr64 { - struct bpb64 bpb; - struct bsx64 bsx; -} __packed; - -/* Common PBR[Partition Boot Record] (512 bytes) */ -struct pbr { - union { - __u8 raw[64]; - struct bpb64 f64; - } bpb; - union { - __u8 raw[56]; - struct bsx64 f64; - } bsx; - __u8 boot_code[390]; - __le16 signature; +/* EXFAT: Main and Backup Boot Sector (512 bytes) */ +struct boot_sector { + __u8 jmp_boot[BOOTSEC_JUMP_BOOT_LEN]; + __u8 fs_name[BOOTSEC_FS_NAME_LEN]; + __u8 must_be_zero[BOOTSEC_OLDBPB_LEN]; + __le64 partition_offset; + __le64 vol_length; + __le32 fat_offset; + __le32 fat_length; + __le32 clu_offset; + __le32 clu_count; + __le32 root_cluster; + __le32 vol_serial; + __u8 fs_revision[2]; + __le16 vol_flags; + __u8 sect_size_bits; + __u8 sect_per_clus_bits; + __u8 num_fats; + __u8 drv_sel; + __u8 percent_in_use; + __u8 reserved[7]; + __u8 boot_code[390]; + __le16 signature; } __packed; struct exfat_dentry { diff --git a/fs/exfat/super.c b/fs/exfat/super.c index c1f47f4071a8..e60d28e73ff0 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -49,7 +49,7 @@ static void exfat_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); exfat_set_vol_flags(sb, VOL_CLEAN); exfat_free_bitmap(sbi); - brelse(sbi->pbr_bh); + brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); call_rcu(&sbi->rcu, exfat_delayed_free); @@ -101,7 +101,7 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) { struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct pbr64 *bpb = (struct pbr64 *)sbi->pbr_bh->b_data; + struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data; bool sync; /* flags are not changed */ @@ -116,18 +116,18 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) if (sb_rdonly(sb)) return 0; - bpb->bsx.vol_flags = cpu_to_le16(new_flag); + p_boot->vol_flags = cpu_to_le16(new_flag); - if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh)) + if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->boot_bh)) sync = true; else sync = false; - set_buffer_uptodate(sbi->pbr_bh); - mark_buffer_dirty(sbi->pbr_bh); + set_buffer_uptodate(sbi->boot_bh); + mark_buffer_dirty(sbi->boot_bh); if (sync) - sync_dirty_buffer(sbi->pbr_bh); + sync_dirty_buffer(sbi->boot_bh); return 0; } @@ -366,13 +366,14 @@ static int exfat_read_root(struct inode *inode) return 0; } -static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb) +static struct boot_sector *exfat_read_boot_with_logical_sector( + struct super_block *sb) { struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct pbr *p_pbr = (struct pbr *) (sbi->pbr_bh)->b_data; + struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data; unsigned short logical_sect = 0; - logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits; + logical_sect = 1 << p_boot->sect_size_bits; if (!is_power_of_2(logical_sect) || logical_sect < 512 || logical_sect > 4096) { @@ -387,49 +388,48 @@ static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb) } if (logical_sect > sb->s_blocksize) { - brelse(sbi->pbr_bh); - sbi->pbr_bh = NULL; + brelse(sbi->boot_bh); + sbi->boot_bh = NULL; if (!sb_set_blocksize(sb, logical_sect)) { exfat_err(sb, "unable to set blocksize %u", logical_sect); return NULL; } - sbi->pbr_bh = sb_bread(sb, 0); - if (!sbi->pbr_bh) { + sbi->boot_bh = sb_bread(sb, 0); + if (!sbi->boot_bh) { exfat_err(sb, "unable to read boot sector (logical sector size = %lu)", sb->s_blocksize); return NULL; } - p_pbr = (struct pbr *)sbi->pbr_bh->b_data; + p_boot = (struct boot_sector *)sbi->boot_bh->b_data; } - return p_pbr; + return p_boot; } /* mount the file system volume */ static int __exfat_fill_super(struct super_block *sb) { int ret; - struct pbr *p_pbr; - struct pbr64 *p_bpb; + struct boot_sector *p_boot; struct exfat_sb_info *sbi = EXFAT_SB(sb); /* set block size to read super block */ sb_min_blocksize(sb, 512); /* read boot sector */ - sbi->pbr_bh = sb_bread(sb, 0); - if (!sbi->pbr_bh) { + sbi->boot_bh = sb_bread(sb, 0); + if (!sbi->boot_bh) { exfat_err(sb, "unable to read boot sector"); return -EIO; } /* PRB is read */ - p_pbr = (struct pbr *)sbi->pbr_bh->b_data; + p_boot = (struct boot_sector *)sbi->boot_bh->b_data; - /* check the validity of PBR */ - if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) { + /* check the validity of BOOT */ + if (le16_to_cpu((p_boot->signature)) != BOOT_SIGNATURE) { exfat_err(sb, "invalid boot record signature"); ret = -EINVAL; goto free_bh; @@ -437,8 +437,8 @@ static int __exfat_fill_super(struct super_block *sb) /* check logical sector size */ - p_pbr = exfat_read_pbr_with_logical_sector(sb); - if (!p_pbr) { + p_boot = exfat_read_boot_with_logical_sector(sb); + if (!p_boot) { ret = -EIO; goto free_bh; } @@ -447,43 +447,43 @@ static int __exfat_fill_super(struct super_block *sb) * res_zero field must be filled with zero to prevent mounting * from FAT volume. */ - if (memchr_inv(p_pbr->bpb.f64.res_zero, 0, - sizeof(p_pbr->bpb.f64.res_zero))) { + if (memchr_inv(p_boot->must_be_zero, 0, + sizeof(p_boot->must_be_zero))) { ret = -EINVAL; goto free_bh; } - p_bpb = (struct pbr64 *)p_pbr; - if (!p_bpb->bsx.num_fats) { + p_boot = (struct boot_sector *)p_boot; + if (!p_boot->num_fats) { exfat_err(sb, "bogus number of FAT structure"); ret = -EINVAL; goto free_bh; } - sbi->sect_per_clus = 1 << p_bpb->bsx.sect_per_clus_bits; - sbi->sect_per_clus_bits = p_bpb->bsx.sect_per_clus_bits; + sbi->sect_per_clus = 1 << p_boot->sect_per_clus_bits; + sbi->sect_per_clus_bits = p_boot->sect_per_clus_bits; sbi->cluster_size_bits = sbi->sect_per_clus_bits + sb->s_blocksize_bits; sbi->cluster_size = 1 << sbi->cluster_size_bits; - sbi->num_FAT_sectors = le32_to_cpu(p_bpb->bsx.fat_length); - sbi->FAT1_start_sector = le32_to_cpu(p_bpb->bsx.fat_offset); - sbi->FAT2_start_sector = p_bpb->bsx.num_fats == 1 ? + sbi->num_FAT_sectors = le32_to_cpu(p_boot->fat_length); + sbi->FAT1_start_sector = le32_to_cpu(p_boot->fat_offset); + sbi->FAT2_start_sector = p_boot->num_fats == 1 ? sbi->FAT1_start_sector : sbi->FAT1_start_sector + sbi->num_FAT_sectors; - sbi->data_start_sector = le32_to_cpu(p_bpb->bsx.clu_offset); - sbi->num_sectors = le64_to_cpu(p_bpb->bsx.vol_length); + sbi->data_start_sector = le32_to_cpu(p_boot->clu_offset); + sbi->num_sectors = le64_to_cpu(p_boot->vol_length); /* because the cluster index starts with 2 */ - sbi->num_clusters = le32_to_cpu(p_bpb->bsx.clu_count) + + sbi->num_clusters = le32_to_cpu(p_boot->clu_count) + EXFAT_RESERVED_CLUSTERS; - sbi->root_dir = le32_to_cpu(p_bpb->bsx.root_cluster); + sbi->root_dir = le32_to_cpu(p_boot->root_cluster); sbi->dentries_per_clu = 1 << (sbi->cluster_size_bits - DENTRY_SIZE_BITS); - sbi->vol_flag = le16_to_cpu(p_bpb->bsx.vol_flags); + sbi->vol_flag = le16_to_cpu(p_boot->vol_flags); sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED; - if (le16_to_cpu(p_bpb->bsx.vol_flags) & VOL_DIRTY) { + if (le16_to_cpu(p_boot->vol_flags) & VOL_DIRTY) { sbi->vol_flag |= VOL_DIRTY; exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck."); } @@ -517,7 +517,7 @@ free_alloc_bitmap: free_upcase_table: exfat_free_upcase_table(sbi); free_bh: - brelse(sbi->pbr_bh); + brelse(sbi->boot_bh); return ret; } @@ -608,7 +608,7 @@ put_inode: free_table: exfat_free_upcase_table(sbi); exfat_free_bitmap(sbi); - brelse(sbi->pbr_bh); + brelse(sbi->boot_bh); check_nls_io: unload_nls(sbi->nls_io); From 33404a159828a97fefada0d8f4cf910873b8e9f1 Mon Sep 17 00:00:00 2001 From: Tetsuhiro Kohada Date: Fri, 29 May 2020 19:14:57 +0900 Subject: [PATCH 472/574] exfat: separate the boot sector analysis Separate the boot sector analysis to read_boot_sector(). And add a check for the fs_name field. Furthermore, add a strict consistency check, because overlapping areas can cause serious corruption. Signed-off-by: Tetsuhiro Kohada Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_raw.h | 2 + fs/exfat/super.c | 97 ++++++++++++++++++++++++-------------------- 2 files changed, 56 insertions(+), 43 deletions(-) diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 07f74190df44..350ce59cc324 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -10,11 +10,13 @@ #define BOOT_SIGNATURE 0xAA55 #define EXBOOT_SIGNATURE 0xAA550000 +#define STR_EXFAT "EXFAT " /* size should be 8 */ #define EXFAT_MAX_FILE_LEN 255 #define VOL_CLEAN 0x0000 #define VOL_DIRTY 0x0002 +#define ERR_MEDIUM 0x0004 #define EXFAT_EOF_CLUSTER 0xFFFFFFFFu #define EXFAT_BAD_CLUSTER 0xFFFFFFF7u diff --git a/fs/exfat/super.c b/fs/exfat/super.c index e60d28e73ff0..6a1330be5a9a 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -366,25 +366,20 @@ static int exfat_read_root(struct inode *inode) return 0; } -static struct boot_sector *exfat_read_boot_with_logical_sector( - struct super_block *sb) +static int exfat_calibrate_blocksize(struct super_block *sb, int logical_sect) { struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data; - unsigned short logical_sect = 0; - - logical_sect = 1 << p_boot->sect_size_bits; if (!is_power_of_2(logical_sect) || logical_sect < 512 || logical_sect > 4096) { exfat_err(sb, "bogus logical sector size %u", logical_sect); - return NULL; + return -EIO; } if (logical_sect < sb->s_blocksize) { exfat_err(sb, "logical sector size too small for device (logical sector size = %u)", logical_sect); - return NULL; + return -EIO; } if (logical_sect > sb->s_blocksize) { @@ -394,24 +389,20 @@ static struct boot_sector *exfat_read_boot_with_logical_sector( if (!sb_set_blocksize(sb, logical_sect)) { exfat_err(sb, "unable to set blocksize %u", logical_sect); - return NULL; + return -EIO; } sbi->boot_bh = sb_bread(sb, 0); if (!sbi->boot_bh) { exfat_err(sb, "unable to read boot sector (logical sector size = %lu)", sb->s_blocksize); - return NULL; + return -EIO; } - - p_boot = (struct boot_sector *)sbi->boot_bh->b_data; } - return p_boot; + return 0; } -/* mount the file system volume */ -static int __exfat_fill_super(struct super_block *sb) +static int exfat_read_boot_sector(struct super_block *sb) { - int ret; struct boot_sector *p_boot; struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -424,51 +415,41 @@ static int __exfat_fill_super(struct super_block *sb) exfat_err(sb, "unable to read boot sector"); return -EIO; } - - /* PRB is read */ p_boot = (struct boot_sector *)sbi->boot_bh->b_data; /* check the validity of BOOT */ if (le16_to_cpu((p_boot->signature)) != BOOT_SIGNATURE) { exfat_err(sb, "invalid boot record signature"); - ret = -EINVAL; - goto free_bh; + return -EINVAL; } - - /* check logical sector size */ - p_boot = exfat_read_boot_with_logical_sector(sb); - if (!p_boot) { - ret = -EIO; - goto free_bh; + if (memcmp(p_boot->fs_name, STR_EXFAT, BOOTSEC_FS_NAME_LEN)) { + exfat_err(sb, "invalid fs_name"); /* fs_name may unprintable */ + return -EINVAL; } /* - * res_zero field must be filled with zero to prevent mounting + * must_be_zero field must be filled with zero to prevent mounting * from FAT volume. */ - if (memchr_inv(p_boot->must_be_zero, 0, - sizeof(p_boot->must_be_zero))) { - ret = -EINVAL; - goto free_bh; - } + if (memchr_inv(p_boot->must_be_zero, 0, sizeof(p_boot->must_be_zero))) + return -EINVAL; - p_boot = (struct boot_sector *)p_boot; - if (!p_boot->num_fats) { + if (p_boot->num_fats != 1 && p_boot->num_fats != 2) { exfat_err(sb, "bogus number of FAT structure"); - ret = -EINVAL; - goto free_bh; + return -EINVAL; } sbi->sect_per_clus = 1 << p_boot->sect_per_clus_bits; sbi->sect_per_clus_bits = p_boot->sect_per_clus_bits; - sbi->cluster_size_bits = sbi->sect_per_clus_bits + sb->s_blocksize_bits; + sbi->cluster_size_bits = p_boot->sect_per_clus_bits + + p_boot->sect_size_bits; sbi->cluster_size = 1 << sbi->cluster_size_bits; sbi->num_FAT_sectors = le32_to_cpu(p_boot->fat_length); sbi->FAT1_start_sector = le32_to_cpu(p_boot->fat_offset); - sbi->FAT2_start_sector = p_boot->num_fats == 1 ? - sbi->FAT1_start_sector : - sbi->FAT1_start_sector + sbi->num_FAT_sectors; + sbi->FAT2_start_sector = le32_to_cpu(p_boot->fat_offset); + if (p_boot->num_fats == 2) + sbi->FAT2_start_sector += sbi->num_FAT_sectors; sbi->data_start_sector = le32_to_cpu(p_boot->clu_offset); sbi->num_sectors = le64_to_cpu(p_boot->vol_length); /* because the cluster index starts with 2 */ @@ -483,15 +464,45 @@ static int __exfat_fill_super(struct super_block *sb) sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED; - if (le16_to_cpu(p_boot->vol_flags) & VOL_DIRTY) { - sbi->vol_flag |= VOL_DIRTY; - exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck."); + /* check consistencies */ + if (sbi->num_FAT_sectors << p_boot->sect_size_bits < + sbi->num_clusters * 4) { + exfat_err(sb, "bogus fat length"); + return -EINVAL; } + if (sbi->data_start_sector < + sbi->FAT1_start_sector + sbi->num_FAT_sectors * p_boot->num_fats) { + exfat_err(sb, "bogus data start sector"); + return -EINVAL; + } + if (sbi->vol_flag & VOL_DIRTY) + exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck."); + if (sbi->vol_flag & ERR_MEDIUM) + exfat_warn(sb, "Medium has reported failures. Some data may be lost."); /* exFAT file size is limited by a disk volume size */ sb->s_maxbytes = (u64)(sbi->num_clusters - EXFAT_RESERVED_CLUSTERS) << sbi->cluster_size_bits; + /* check logical sector size */ + if (exfat_calibrate_blocksize(sb, 1 << p_boot->sect_size_bits)) + return -EIO; + + return 0; +} + +/* mount the file system volume */ +static int __exfat_fill_super(struct super_block *sb) +{ + int ret; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + ret = exfat_read_boot_sector(sb); + if (ret) { + exfat_err(sb, "failed to read boot sector"); + goto free_bh; + } + ret = exfat_create_upcase_table(sb); if (ret) { exfat_err(sb, "failed to load upcase table"); From 476189c0ef3b658de3f6b89fd0fdeb6dc451b564 Mon Sep 17 00:00:00 2001 From: Tetsuhiro Kohada Date: Sun, 31 May 2020 18:30:17 +0900 Subject: [PATCH 473/574] exfat: add boot region verification Add Boot-Regions verification specified in exFAT specification. Note that the checksum type is strongly related to the raw structure, so the'u32 'type is used to clarify the number of bits. Signed-off-by: Tetsuhiro Kohada Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/exfat_fs.h | 1 + fs/exfat/misc.c | 14 +++++++++++++ fs/exfat/super.c | 50 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 911f58b93f3d..8c2a70bfaa10 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -514,6 +514,7 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 *tz, __le16 *time, __le16 *date, u8 *time_cs); unsigned short exfat_calc_chksum_2byte(void *data, int len, unsigned short chksum, int type); +u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type); void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync); void exfat_chain_set(struct exfat_chain *ec, unsigned int dir, unsigned int size, unsigned char flags); diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index ab7f88b1f6d3..b82d2dd5bd7c 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -151,6 +151,20 @@ unsigned short exfat_calc_chksum_2byte(void *data, int len, return chksum; } +u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type) +{ + int i; + u8 *c = (u8 *)data; + + for (i = 0; i < len; i++, c++) { + if (unlikely(type == CS_BOOT_SECTOR && + (i == 106 || i == 107 || i == 112))) + continue; + chksum = ((chksum << 31) | (chksum >> 1)) + *c; + } + return chksum; +} + void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync) { set_bit(EXFAT_SB_DIRTY, &EXFAT_SB(sb)->s_state); diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 6a1330be5a9a..405717e4e3ea 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -491,6 +491,50 @@ static int exfat_read_boot_sector(struct super_block *sb) return 0; } +static int exfat_verify_boot_region(struct super_block *sb) +{ + struct buffer_head *bh = NULL; + u32 chksum = 0; + __le32 *p_sig, *p_chksum; + int sn, i; + + /* read boot sector sub-regions */ + for (sn = 0; sn < 11; sn++) { + bh = sb_bread(sb, sn); + if (!bh) + return -EIO; + + if (sn != 0 && sn <= 8) { + /* extended boot sector sub-regions */ + p_sig = (__le32 *)&bh->b_data[sb->s_blocksize - 4]; + if (le32_to_cpu(*p_sig) != EXBOOT_SIGNATURE) + exfat_warn(sb, "Invalid exboot-signature(sector = %d): 0x%08x", + sn, le32_to_cpu(*p_sig)); + } + + chksum = exfat_calc_chksum32(bh->b_data, sb->s_blocksize, + chksum, sn ? CS_DEFAULT : CS_BOOT_SECTOR); + brelse(bh); + } + + /* boot checksum sub-regions */ + bh = sb_bread(sb, sn); + if (!bh) + return -EIO; + + for (i = 0; i < sb->s_blocksize; i += sizeof(u32)) { + p_chksum = (__le32 *)&bh->b_data[i]; + if (le32_to_cpu(*p_chksum) != chksum) { + exfat_err(sb, "Invalid boot checksum (boot checksum : 0x%08x, checksum : 0x%08x)", + le32_to_cpu(*p_chksum), chksum); + brelse(bh); + return -EINVAL; + } + } + brelse(bh); + return 0; +} + /* mount the file system volume */ static int __exfat_fill_super(struct super_block *sb) { @@ -503,6 +547,12 @@ static int __exfat_fill_super(struct super_block *sb) goto free_bh; } + ret = exfat_verify_boot_region(sb); + if (ret) { + exfat_err(sb, "invalid boot region"); + goto free_bh; + } + ret = exfat_create_upcase_table(sb); if (ret) { exfat_err(sb, "failed to load upcase table"); From 5875bf287d95314c58add01184f361cc5aa38429 Mon Sep 17 00:00:00 2001 From: Tetsuhiro Kohada Date: Fri, 29 May 2020 19:14:59 +0900 Subject: [PATCH 474/574] exfat: standardize checksum calculation To clarify that it is a 16-bit checksum, the parts related to the 16-bit checksum are renamed and change type to u16. Furthermore, replace checksum calculation in exfat_load_upcase_table() with exfat_calc_checksum32(). Signed-off-by: Tetsuhiro Kohada Reviewed-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/dir.c | 12 ++++++------ fs/exfat/exfat_fs.h | 5 ++--- fs/exfat/misc.c | 10 ++++------ fs/exfat/nls.c | 19 +++++++------------ 4 files changed, 19 insertions(+), 27 deletions(-) diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 2902d285bf20..de43534aa299 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -491,7 +491,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, int ret = 0; int i, num_entries; sector_t sector; - unsigned short chksum; + u16 chksum; struct exfat_dentry *ep, *fep; struct buffer_head *fbh, *bh; @@ -500,7 +500,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, return -EIO; num_entries = fep->dentry.file.num_ext + 1; - chksum = exfat_calc_chksum_2byte(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY); + chksum = exfat_calc_chksum16(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY); for (i = 1; i < num_entries; i++) { ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, NULL); @@ -508,7 +508,7 @@ int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, ret = -EIO; goto release_fbh; } - chksum = exfat_calc_chksum_2byte(ep, DENTRY_SIZE, chksum, + chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum, CS_DEFAULT); brelse(bh); } @@ -593,8 +593,8 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es) for (i = 0; i < es->num_entries; i++) { ep = exfat_get_dentry_cached(es, i); - chksum = exfat_calc_chksum_2byte(ep, DENTRY_SIZE, chksum, - chksum_type); + chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum, + chksum_type); chksum_type = CS_DEFAULT; } ep = exfat_get_dentry_cached(es, 0); @@ -1000,7 +1000,7 @@ rewind: } if (entry_type == TYPE_STREAM) { - unsigned short name_hash; + u16 name_hash; if (step != DIRENT_STEP_STRM) { step = DIRENT_STEP_FILE; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 8c2a70bfaa10..595f3117f492 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -137,7 +137,7 @@ struct exfat_dentry_namebuf { struct exfat_uni_name { /* +3 for null and for converting */ unsigned short name[MAX_NAME_LENGTH + 3]; - unsigned short name_hash; + u16 name_hash; unsigned char name_len; }; @@ -512,8 +512,7 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, void exfat_truncate_atime(struct timespec64 *ts); void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 *tz, __le16 *time, __le16 *date, u8 *time_cs); -unsigned short exfat_calc_chksum_2byte(void *data, int len, - unsigned short chksum, int type); +u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type); u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type); void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync); void exfat_chain_set(struct exfat_chain *ec, unsigned int dir, diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index b82d2dd5bd7c..17d41f3d3709 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -136,17 +136,15 @@ void exfat_truncate_atime(struct timespec64 *ts) ts->tv_nsec = 0; } -unsigned short exfat_calc_chksum_2byte(void *data, int len, - unsigned short chksum, int type) +u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type) { int i; - unsigned char *c = (unsigned char *)data; + u8 *c = (u8 *)data; for (i = 0; i < len; i++, c++) { - if (((i == 2) || (i == 3)) && (type == CS_DIR_ENTRY)) + if (unlikely(type == CS_DIR_ENTRY && (i == 2 || i == 3))) continue; - chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) + - (unsigned short)*c; + chksum = ((chksum << 15) | (chksum >> 1)) + *c; } return chksum; } diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 1ebda90cbdd7..19321773dd07 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -527,7 +527,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb, *uniname = '\0'; p_uniname->name_len = unilen; - p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0, + p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0, CS_DEFAULT); if (p_lossy) @@ -623,7 +623,7 @@ static int exfat_nls_to_ucs2(struct super_block *sb, *uniname = '\0'; p_uniname->name_len = unilen; - p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0, + p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0, CS_DEFAULT); if (p_lossy) @@ -655,7 +655,8 @@ static int exfat_load_upcase_table(struct super_block *sb, { struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned int sect_size = sb->s_blocksize; - unsigned int i, index = 0, checksum = 0; + unsigned int i, index = 0; + u32 chksum = 0; int ret; unsigned char skip = false; unsigned short *upcase_table; @@ -681,13 +682,6 @@ static int exfat_load_upcase_table(struct super_block *sb, for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) { unsigned short uni = get_unaligned_le16(bh->b_data + i); - checksum = ((checksum & 1) ? 0x80000000 : 0) + - (checksum >> 1) + - *(((unsigned char *)bh->b_data) + i); - checksum = ((checksum & 1) ? 0x80000000 : 0) + - (checksum >> 1) + - *(((unsigned char *)bh->b_data) + (i + 1)); - if (skip) { index += uni; skip = false; @@ -701,13 +695,14 @@ static int exfat_load_upcase_table(struct super_block *sb, } } brelse(bh); + chksum = exfat_calc_chksum32(bh->b_data, i, chksum, CS_DEFAULT); } - if (index >= 0xFFFF && utbl_checksum == checksum) + if (index >= 0xFFFF && utbl_checksum == chksum) return 0; exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)", - index, checksum, utbl_checksum); + index, chksum, utbl_checksum); ret = -EINVAL; free_table: exfat_free_upcase_table(sbi); From f78059805fb9fc5c343e89f39cf11856d047dd60 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 1 Jun 2020 09:43:49 +0900 Subject: [PATCH 475/574] exfat: remove unnecessary reassignment of p_uniname->name_len kbuild test robot reported : fs/exfat/nls.c:531:22: warning: Variable 'p_uniname->name_len' is reassigned a value before the old one has been used. The reassignment of p_uniname->name_len is not needed and remove it. Reported-by: kbuild test robot Signed-off-by: Namjae Jeon --- fs/exfat/nls.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 19321773dd07..c1ec05695497 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -514,8 +514,6 @@ static int exfat_utf8_to_utf16(struct super_block *sb, return -ENAMETOOLONG; } - p_uniname->name_len = unilen & 0xFF; - for (i = 0; i < unilen; i++) { if (*uniname < 0x0020 || exfat_wstrchr(bad_uni_chars, *uniname)) From f341a7d8dcc4e3d01544d7bc145633f062ef6249 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 3 Jun 2020 09:48:36 +0900 Subject: [PATCH 476/574] exfat: fix memory leak in exfat_parse_param() butt3rflyh4ck reported memory leak found by syzkaller. A param->string held by exfat_mount_options. BUG: memory leak unreferenced object 0xffff88801972e090 (size 8): comm "syz-executor.2", pid 16298, jiffies 4295172466 (age 14.060s) hex dump (first 8 bytes): 6b 6f 69 38 2d 75 00 00 koi8-u.. backtrace: [<000000005bfe35d6>] kstrdup+0x36/0x70 mm/util.c:60 [<0000000018ed3277>] exfat_parse_param+0x160/0x5e0 fs/exfat/super.c:276 [<000000007680462b>] vfs_parse_fs_param+0x2b4/0x610 fs/fs_context.c:147 [<0000000097c027f2>] vfs_parse_fs_string+0xe6/0x150 fs/fs_context.c:191 [<00000000371bf78f>] generic_parse_monolithic+0x16f/0x1f0 fs/fs_context.c:231 [<000000005ce5eb1b>] do_new_mount fs/namespace.c:2812 [inline] [<000000005ce5eb1b>] do_mount+0x12bb/0x1b30 fs/namespace.c:3141 [<00000000b642040c>] __do_sys_mount fs/namespace.c:3350 [inline] [<00000000b642040c>] __se_sys_mount fs/namespace.c:3327 [inline] [<00000000b642040c>] __x64_sys_mount+0x18f/0x230 fs/namespace.c:3327 [<000000003b024e98>] do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 [<00000000ce2b698c>] entry_SYSCALL_64_after_hwframe+0x49/0xb3 exfat_free() should call exfat_free_iocharset(), to prevent a leak in case we fail after parsing iocharset= but before calling get_tree_bdev(). Additionally, there's no point copying param->string in exfat_parse_param() - just steal it, leaving NULL in param->string. That's independent from the leak or fix thereof - it's simply avoiding an extra copy. Fixes: 719c1e182916 ("exfat: add super block operations") Cc: stable@vger.kernel.org # v5.7 Reported-by: butt3rflyh4ck Signed-off-by: Al Viro Signed-off-by: Namjae Jeon --- fs/exfat/super.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 405717e4e3ea..e650e65536f8 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -273,9 +273,8 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_charset: exfat_free_iocharset(sbi); - opts->iocharset = kstrdup(param->string, GFP_KERNEL); - if (!opts->iocharset) - return -ENOMEM; + opts->iocharset = param->string; + param->string = NULL; break; case Opt_errors: opts->errors = result.uint_32; @@ -686,7 +685,12 @@ static int exfat_get_tree(struct fs_context *fc) static void exfat_free(struct fs_context *fc) { - kfree(fc->s_fs_info); + struct exfat_sb_info *sbi = fc->s_fs_info; + + if (sbi) { + exfat_free_iocharset(sbi); + kfree(sbi); + } } static const struct fs_context_operations exfat_context_ops = { From 29bbb14bfc80dd760b07d2be0a27e610562982e3 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 4 Jun 2020 08:05:31 +0900 Subject: [PATCH 477/574] exfat: fix incorrect update of stream entry in __exfat_truncate() At truncate, there is a problem of incorrect updating in the file entry pointer instead of stream entry. This will cause the problem of overwriting the time field of the file entry to new_size. Fix it to update stream entry. Fixes: 98d917047e8b ("exfat: add file operations") Cc: stable@vger.kernel.org # v5.7 Signed-off-by: Namjae Jeon --- fs/exfat/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 8e3f0eef45d7..fce03f318787 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -171,11 +171,11 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) /* File size should be zero if there is no cluster allocated */ if (ei->start_clu == EXFAT_EOF_CLUSTER) { - ep->dentry.stream.valid_size = 0; - ep->dentry.stream.size = 0; + ep2->dentry.stream.valid_size = 0; + ep2->dentry.stream.size = 0; } else { - ep->dentry.stream.valid_size = cpu_to_le64(new_size); - ep->dentry.stream.size = ep->dentry.stream.valid_size; + ep2->dentry.stream.valid_size = cpu_to_le64(new_size); + ep2->dentry.stream.size = ep->dentry.stream.valid_size; } if (new_size == 0) { From a949824f01f3b39f737d77aed6cba47aced09319 Mon Sep 17 00:00:00 2001 From: "hyeongseok.kim" Date: Thu, 4 Jun 2020 13:54:28 +0900 Subject: [PATCH 478/574] exfat: fix range validation error in alloc and free cluster There is check error in range condition that can never be entered even with invalid input. Replace incorrent checking code with already existing valid checker. Signed-off-by: hyeongseok.kim Acked-by: Sungjong Seo Signed-off-by: Namjae Jeon --- fs/exfat/fatent.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 267e5e09eb13..4e5c5c9c0f2d 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -169,7 +169,7 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) return 0; /* check cluster validation */ - if (p_chain->dir < 2 && p_chain->dir >= sbi->num_clusters) { + if (!is_valid_cluster(sbi, p_chain->dir)) { exfat_err(sb, "invalid start cluster (%u)", p_chain->dir); return -EIO; } @@ -346,7 +346,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, } /* check cluster validation */ - if (hint_clu < EXFAT_FIRST_CLUSTER && hint_clu >= sbi->num_clusters) { + if (!is_valid_cluster(sbi, hint_clu)) { exfat_err(sb, "hint_cluster is invalid (%u)", hint_clu); hint_clu = EXFAT_FIRST_CLUSTER; From fc961522ddbdf00254dd03b677627139cc1f68bc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Jun 2020 17:16:29 +0300 Subject: [PATCH 479/574] exfat: Fix potential use after free in exfat_load_upcase_table() This code calls brelse(bh) and then dereferences "bh" on the next line resulting in a possible use after free. The brelse() should just be moved down a line. Fixes: b676fdbcf4c8 ("exfat: standardize checksum calculation") Signed-off-by: Dan Carpenter Signed-off-by: Namjae Jeon --- fs/exfat/nls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index c1ec05695497..57b5a7a4d1f7 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -692,8 +692,8 @@ static int exfat_load_upcase_table(struct super_block *sb, index++; } } - brelse(bh); chksum = exfat_calc_chksum32(bh->b_data, i, chksum, CS_DEFAULT); + brelse(bh); } if (index >= 0xFFFF && utbl_checksum == chksum) From 2062a4e8ae9f486847652927aaf88e21ab8d195d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:29:56 -0700 Subject: [PATCH 480/574] kallsyms/printk: add loglvl to print_ip_sym() Patch series "Add log level to show_stack()", v3. Add log level argument to show_stack(). Done in three stages: 1. Introducing show_stack_loglvl() for every architecture 2. Migrating old users with an explicit log level 3. Renaming show_stack_loglvl() into show_stack() Justification: - It's a design mistake to move a business-logic decision into platform realization detail. - I have currently two patches sets that would benefit from this work: Removing console_loglevel jumps in sysrq driver [1] Hung task warning before panic [2] - suggested by Tetsuo (but he probably didn't realise what it would involve). - While doing (1), (2) the backtraces were adjusted to headers and other messages for each situation - so there won't be a situation when the backtrace is printed, but the headers are missing because they have lesser log level (or the reverse). - As the result in (2) plays with console_loglevel for kdb are removed. The least important for upstream, but maybe still worth to note that every company I've worked in so far had an off-list patch to print backtrace with the needed log level (but only for the architecture they cared about). If you have other ideas how you will benefit from show_stack() with a log level - please, reply to this cover letter. See also discussion on v1: https://lore.kernel.org/linux-riscv/20191106083538.z5nlpuf64cigxigh@pathway.suse.cz/ This patch (of 50): print_ip_sym() needs to have a log level parameter to comply with other parts being printed. Otherwise, half of the expected backtrace would be printed and other may be missing with some logging level. The following callee(s) are using now the adjusted log level: - microblaze/unwind: the same level as headers & userspace unwind. Note that pr_debug()'s there are for debugging the unwinder itself. - nds32/traps: symbol addresses are printed with the same log level as backtrace headers. - lockdep: ip for locking issues is printed with the same log level as other part of the warning. - sched: ip where preemption was disabled is printed as error like the rest part of the message. - ftrace: bug reports are now consistent in the log level being used. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Acked-by: Steven Rostedt (VMware) Cc: Albert Ou Cc: Ben Segall Cc: Dietmar Eggemann Cc: Greentime Hu Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: James Hogan Cc: Juri Lelli Cc: Mel Gorman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Burton Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vincent Chen Cc: Vincent Guittot Cc: Will Deacon Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Dmitry Safonov Cc: Jiri Slaby Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Aurelien Jacquiot Cc: Mark Salter Cc: Guo Ren Cc: Yoshinori Sato Cc: Brian Cain Cc: Fenghua Yu Cc: Tony Luck Cc: Geert Uytterhoeven Cc: Ley Foon Tan Cc: Jonas Bonn Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Paul Mackerras Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Rich Felker Cc: "David S. Miller" Cc: Anton Ivanov Cc: Jeff Dike Cc: Richard Weinberger Cc: Guan Xuetao Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Chris Zankel Cc: Max Filippov Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Rafael J. Wysocki" Cc: Daniel Thompson Cc: Douglas Anderson Cc: Jason Wessel Link: http://lkml.kernel.org/r/20200418201944.482088-2-dima@arista.com Signed-off-by: Linus Torvalds --- arch/microblaze/kernel/unwind.c | 2 +- arch/mips/kernel/traps.c | 4 ++-- arch/nds32/kernel/traps.c | 4 ++-- arch/riscv/kernel/stacktrace.c | 2 +- include/linux/kallsyms.h | 4 ++-- kernel/locking/lockdep.c | 4 ++-- kernel/sched/core.c | 6 ++---- kernel/trace/ftrace.c | 8 ++++---- tools/include/linux/kallsyms.h | 2 +- 9 files changed, 17 insertions(+), 19 deletions(-) diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 34c270cb11fc..4241cdd28ee7 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -254,7 +254,7 @@ static void microblaze_unwind_inner(struct task_struct *task, task->comm); break; } else - print_ip_sym(pc); + print_ip_sym(KERN_INFO, pc); } /* Stop when we reach anything not part of the kernel */ diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 22f805a73921..210fea63de75 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -125,7 +125,7 @@ static void show_raw_backtrace(unsigned long reg29) break; } if (__kernel_text_address(addr)) - print_ip_sym(addr); + print_ip_sym(KERN_DEFAULT, addr); } printk("\n"); } @@ -155,7 +155,7 @@ static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) } printk("Call Trace:\n"); do { - print_ip_sym(pc); + print_ip_sym(KERN_DEFAULT, pc); pc = unwind_stack(task, &sp, pc, &ra); } while (pc); pr_cont("\n"); diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index f4d386b52622..40625760a125 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -108,7 +108,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg) if (__kernel_text_address(ret_addr)) { ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(ret_addr); + print_ip_sym(KERN_EMERG, ret_addr); } if (--cnt < 0) break; @@ -124,7 +124,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg) ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(ret_addr); + print_ip_sym(KERN_EMERG, ret_addr); } if (--cnt < 0) break; diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 837b9b38f825..9f1ac258482f 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -99,7 +99,7 @@ void notrace walk_stackframe(struct task_struct *task, static bool print_trace_address(unsigned long pc, void *arg) { - print_ip_sym(pc); + print_ip_sym(KERN_DEFAULT, pc); return false; } diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 657a83b943f0..98338dc6b5d2 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -165,9 +165,9 @@ static inline int kallsyms_show_value(void) #endif /*CONFIG_KALLSYMS*/ -static inline void print_ip_sym(unsigned long ip) +static inline void print_ip_sym(const char *loglvl, unsigned long ip) { - printk("[<%px>] %pS\n", (void *) ip, (void *) ip); + printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4c057dd8e93b..38cce34d03dc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4424,7 +4424,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no more locks to release!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -5075,7 +5075,7 @@ static void print_lock_contention_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no locks held!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8298b2c240ce..c06da3c3e317 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3922,8 +3922,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } if (panic_on_warn) panic("scheduling while atomic\n"); @@ -6871,8 +6870,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b5765aeea698..7d0ebd104706 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2020,12 +2020,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EFAULT: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; case -EINVAL: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); if (ftrace_expected) { @@ -2036,12 +2036,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EPERM: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; default: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); } print_bug_type(); if (rec) { diff --git a/tools/include/linux/kallsyms.h b/tools/include/linux/kallsyms.h index 89ca6fe257cc..efb6c3f5f2a9 100644 --- a/tools/include/linux/kallsyms.h +++ b/tools/include/linux/kallsyms.h @@ -20,7 +20,7 @@ static inline const char *kallsyms_lookup(unsigned long addr, #include #include -static inline void print_ip_sym(unsigned long ip) +static inline void print_ip_sym(const char *loglvl, unsigned long ip) { char **name; From 8c49a909872ca757a62ccbfca08784c224d20fdc Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:01 -0700 Subject: [PATCH 481/574] alpha: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Richard Henderson Link: http://lkml.kernel.org/r/20200418201944.482088-3-dima@arista.com Signed-off-by: Linus Torvalds --- arch/alpha/kernel/traps.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index f6b9664ac504..2402f1777f54 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -121,10 +121,10 @@ dik_show_code(unsigned int *pc) } static void -dik_show_trace(unsigned long *sp) +dik_show_trace(unsigned long *sp, const char *loglvl) { long i = 0; - printk("Trace:\n"); + printk("%sTrace:\n", loglvl); while (0x1ff8 & (unsigned long) sp) { extern char _stext[], _etext[]; unsigned long tmp = *sp; @@ -133,24 +133,25 @@ dik_show_trace(unsigned long *sp) continue; if (tmp >= (unsigned long) &_etext) continue; - printk("[<%lx>] %pSR\n", tmp, (void *)tmp); + printk("%s[<%lx>] %pSR\n", loglvl, tmp, (void *)tmp); if (i > 40) { - printk(" ..."); + printk("%s ...", loglvl); break; } } - printk("\n"); + printk("%s\n", loglvl); } static int kstack_depth_to_print = 24; -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { unsigned long *stack; int i; /* - * debugging aid: "show_stack(NULL);" prints the + * debugging aid: "show_stack(NULL, NULL, KERN_EMERG);" prints the * back trace for this cpu. */ if(sp==NULL) @@ -163,14 +164,19 @@ void show_stack(struct task_struct *task, unsigned long *sp) if ((i % 4) == 0) { if (i) pr_cont("\n"); - printk(" "); + printk("%s ", loglvl); } else { pr_cont(" "); } pr_cont("%016lx", *stack++); } pr_cont("\n"); - dik_show_trace(sp); + dik_show_trace(sp, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } void @@ -184,7 +190,7 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); dik_show_regs(regs, r9_15); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - dik_show_trace((unsigned long *)(regs+1)); + dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); dik_show_code((unsigned int *)regs->pc); if (test_and_set_thread_flag (TIF_DIE_IF_KERNEL)) { @@ -625,7 +631,7 @@ got_exception: printk("gp = %016lx sp = %p\n", regs->gp, regs+1); dik_show_code((unsigned int *)pc); - dik_show_trace((unsigned long *)(regs+1)); + dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); if (test_and_set_thread_flag (TIF_DIE_IF_KERNEL)) { printk("die_if_kernel recursion detected.\n"); From 8ca4d19932a5649930db29202b285d8e4e96a98a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:04 -0700 Subject: [PATCH 482/574] arc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). As a good side-effect header "Stack Trace:" is now printed with the same log level as the rest of backtrace. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Vineet Gupta Link: http://lkml.kernel.org/r/20200418201944.482088-4-dima@arista.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/bug.h | 3 ++- arch/arc/kernel/stacktrace.c | 21 +++++++++++++++------ arch/arc/kernel/troubleshoot.c | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h index 0be19fd1a412..4c453ba96c51 100644 --- a/arch/arc/include/asm/bug.h +++ b/arch/arc/include/asm/bug.h @@ -13,7 +13,8 @@ struct task_struct; void show_regs(struct pt_regs *regs); -void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs); +void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, + const char *loglvl); void show_kernel_fault_diag(const char *str, struct pt_regs *regs, unsigned long address); void die(const char *str, struct pt_regs *regs, unsigned long address); diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 1e440bbfa876..24f9cd8a12c9 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -158,9 +158,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, /* Call-back which plugs into unwinding core to dump the stack in * case of panic/OOPs/BUG etc */ -static int __print_sym(unsigned int address, void *unused) +static int __print_sym(unsigned int address, void *arg) { - printk(" %pS\n", (void *)address); + const char *loglvl = arg; + + printk("%s %pS\n", loglvl, (void *)address); return 0; } @@ -217,17 +219,24 @@ static int __get_first_nonsched(unsigned int address, void *unused) *------------------------------------------------------------------------- */ -noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs) +noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, + const char *loglvl) { - pr_info("\nStack Trace:\n"); - arc_unwind_core(tsk, regs, __print_sym, NULL); + printk("%s\nStack Trace:\n", loglvl); + arc_unwind_core(tsk, regs, __print_sym, (void *)loglvl); } EXPORT_SYMBOL(show_stacktrace); /* Expected by sched Code */ +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) +{ + show_stacktrace(tsk, NULL, loglvl); +} + void show_stack(struct task_struct *tsk, unsigned long *sp) { - show_stacktrace(tsk, NULL); + show_stack_loglvl(tsk, sp, KERN_DEFAULT); } /* Another API expected by schedular, shows up in "ps" as Wait Channel diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 3393558876a9..3044c8347b74 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -240,5 +240,5 @@ void show_kernel_fault_diag(const char *str, struct pt_regs *regs, /* Show stack trace if this Fatality happened in kernel mode */ if (!user_mode(regs)) - show_stacktrace(current, regs); + show_stacktrace(current, regs, KERN_DEFAULT); } From 5489ab50c22771ddcad014968141b0d104d650a3 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:07 -0700 Subject: [PATCH 483/574] arm/asm: add loglvl to c_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to c_backtrace() as a preparation for introducing show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-5-dima@arista.com Signed-off-by: Linus Torvalds --- arch/arm/include/asm/bug.h | 3 ++- arch/arm/include/asm/traps.h | 3 ++- arch/arm/kernel/traps.c | 9 +++++---- arch/arm/kernel/unwind.c | 2 +- arch/arm/lib/backtrace-clang.S | 9 +++++++-- arch/arm/lib/backtrace.S | 14 ++++++++++---- 6 files changed, 27 insertions(+), 13 deletions(-) diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h index deef4d0cb3b5..673c7dd75ab9 100644 --- a/arch/arm/include/asm/bug.h +++ b/arch/arm/include/asm/bug.h @@ -82,7 +82,8 @@ void hook_ifault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, int code, const char *name); -extern asmlinkage void c_backtrace(unsigned long fp, int pmode); +extern asmlinkage void c_backtrace(unsigned long fp, int pmode, + const char *loglvl); struct mm_struct; void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr); diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index 172b08ff3760..987fefb0a4db 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -29,7 +29,8 @@ static inline int __in_irqentry_text(unsigned long ptr) } extern void __init early_trap_init(void *); -extern void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame); +extern void dump_backtrace_entry(unsigned long where, unsigned long from, + unsigned long frame, const char *loglvl); extern void ptrace_break(struct pt_regs *regs); extern void *vectors_page; diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 316a7687f813..7bba15fdce9f 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -62,7 +62,8 @@ __setup("user_debug=", user_debug_setup); static void dump_mem(const char *, const char *, unsigned long, unsigned long); -void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame) +void dump_backtrace_entry(unsigned long where, unsigned long from, + unsigned long frame, const char *loglvl) { unsigned long end = frame + 4 + sizeof(struct pt_regs); @@ -76,7 +77,7 @@ void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long dump_mem("", "Exception stack", frame + 4, end); } -void dump_backtrace_stm(u32 *stack, u32 instruction) +void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) { char str[80], *p; unsigned int x; @@ -238,7 +239,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) pr_cont("\n"); if (ok) - c_backtrace(fp, mode); + c_backtrace(fp, mode, NULL); } #endif @@ -666,7 +667,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) dump_instr("", regs); if (user_mode(regs)) { __show_regs(regs); - c_backtrace(frame_pointer(regs), processor_mode(regs)); + c_backtrace(frame_pointer(regs), processor_mode(regs), NULL); } } #endif diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 11a964fd66f4..343cc27b36c4 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -493,7 +493,7 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) urc = unwind_frame(&frame); if (urc < 0) break; - dump_backtrace_entry(where, frame.pc, frame.sp - 4); + dump_backtrace_entry(where, frame.pc, frame.sp - 4, NULL); } } diff --git a/arch/arm/lib/backtrace-clang.S b/arch/arm/lib/backtrace-clang.S index 2ff375144b55..6174c45f53a5 100644 --- a/arch/arm/lib/backtrace-clang.S +++ b/arch/arm/lib/backtrace-clang.S @@ -17,6 +17,7 @@ #define sv_pc r6 #define mask r7 #define sv_lr r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -99,6 +100,7 @@ ENDPROC(c_backtrace) @ to ensure 8 byte alignment movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? moveq mask, #0xfc000003 movne mask, #0 @ mask for 32-bit @@ -167,6 +169,7 @@ finished_setup: mov r1, sv_lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry /* @@ -183,6 +186,7 @@ finished_setup: ldr r0, [frame] @ locals are stored in @ the preceding frame subeq r0, r0, #4 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers /* @@ -196,7 +200,8 @@ finished_setup: bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame bl printk no_frame: ldmfd sp!, {r4 - r9, fp, pc} ENDPROC(c_backtrace) @@ -209,7 +214,7 @@ ENDPROC(c_backtrace) .long 1005b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Lopcode: .word 0xe92d4800 >> 11 @ stmfd sp!, {... fp, lr} .word 0x0b000000 @ bl if these bits are set diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S index 582925238d65..872f658638d9 100644 --- a/arch/arm/lib/backtrace.S +++ b/arch/arm/lib/backtrace.S @@ -18,6 +18,7 @@ #define sv_pc r6 #define mask r7 #define offset r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -25,9 +26,10 @@ ENTRY(c_backtrace) ret lr ENDPROC(c_backtrace) #else - stmfd sp!, {r4 - r8, lr} @ Save an extra register so we have a location... + stmfd sp!, {r4 - r9, lr} @ Save an extra register so we have a location... movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? ARM( moveq mask, #0xfc000003 ) @@ -73,6 +75,7 @@ for_each_frame: tst frame, mask @ Check for address exceptions ldr r1, [frame, #-4] @ get saved lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry ldr r1, [sv_pc, #-4] @ if stmfd sp!, {args} exists, @@ -80,12 +83,14 @@ for_each_frame: tst frame, mask @ Check for address exceptions teq r3, r1, lsr #11 ldreq r0, [frame, #-8] @ get sp subeq r0, r0, #4 @ point at the last arg + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers 1004: ldr r1, [sv_pc, #0] @ if stmfd sp!, {..., fp, ip, lr, pc} ldr r3, .Ldsi @ instruction exists, teq r3, r1, lsr #11 subeq r0, frame, #16 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers teq sv_fp, #0 @ zero saved fp means @@ -96,9 +101,10 @@ for_each_frame: tst frame, mask @ Check for address exceptions bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame bl printk -no_frame: ldmfd sp!, {r4 - r8, pc} +no_frame: ldmfd sp!, {r4 - r9, pc} ENDPROC(c_backtrace) .pushsection __ex_table,"a" @@ -109,7 +115,7 @@ ENDPROC(c_backtrace) .long 1004b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Ldsi: .word 0xe92dd800 >> 11 @ stmfd sp!, {... fp, ip, lr, pc} .word 0xe92d0000 >> 11 @ stmfd sp!, {} From e8d7b7353216586f64f5bbcc024951e3cab10a60 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:10 -0700 Subject: [PATCH 484/574] arm: add loglvl to unwind_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to unwind_backtrace() as a preparation for introducing show_stack_loglvl(). As a good side-effect arm_syscall() is now printing errors with the same log level as the backtrace. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-6-dima@arista.com Signed-off-by: Linus Torvalds --- arch/arm/include/asm/unwind.h | 3 ++- arch/arm/kernel/traps.c | 6 +++--- arch/arm/kernel/unwind.c | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/unwind.h b/arch/arm/include/asm/unwind.h index 6e282c33126b..0f8a3439902d 100644 --- a/arch/arm/include/asm/unwind.h +++ b/arch/arm/include/asm/unwind.h @@ -36,7 +36,8 @@ extern struct unwind_table *unwind_table_add(unsigned long start, unsigned long text_addr, unsigned long text_size); extern void unwind_table_del(struct unwind_table *tab); -extern void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk); +extern void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl); #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 7bba15fdce9f..aef52a401ad5 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -204,7 +204,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) #ifdef CONFIG_ARM_UNWIND static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { - unwind_backtrace(regs, tsk); + unwind_backtrace(regs, tsk, KERN_DEFAULT); } #else static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) @@ -664,10 +664,10 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) if (user_debug & UDBG_SYSCALL) { pr_err("[%d] %s: arm syscall %d\n", task_pid_nr(current), current->comm, no); - dump_instr("", regs); + dump_instr(KERN_ERR, regs); if (user_mode(regs)) { __show_regs(regs); - c_backtrace(frame_pointer(regs), processor_mode(regs), NULL); + c_backtrace(frame_pointer(regs), processor_mode(regs), KERN_ERR); } } #endif diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 343cc27b36c4..d2bd0df2318d 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -455,7 +455,8 @@ int unwind_frame(struct stackframe *frame) return URC_OK; } -void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) +void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { struct stackframe frame; @@ -493,7 +494,7 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) urc = unwind_frame(&frame); if (urc < 0) break; - dump_backtrace_entry(where, frame.pc, frame.sp - 4, NULL); + dump_backtrace_entry(where, frame.pc, frame.sp - 4, loglvl); } } From ee65ca01c62cd89cea6192017806b55c34f1cfda Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:13 -0700 Subject: [PATCH 485/574] arm: add loglvl to dump_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to dump_backtrace() as a preparation for introducing show_stack_loglvl(). As a good side-effect __die() now prints not only "Stack:" header with KERN_EMERG, but the backtrace itself. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-7-dima@arista.com Signed-off-by: Linus Torvalds --- arch/arm/kernel/traps.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index aef52a401ad5..014fb5ad3a8b 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -202,17 +202,19 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) } #ifdef CONFIG_ARM_UNWIND -static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { - unwind_backtrace(regs, tsk, KERN_DEFAULT); + unwind_backtrace(regs, tsk, loglvl); } #else -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { unsigned int fp, mode; int ok = 1; - printk("Backtrace: "); + printk("%sBacktrace: ", loglvl); if (!tsk) tsk = current; @@ -239,13 +241,13 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) pr_cont("\n"); if (ok) - c_backtrace(fp, mode, NULL); + c_backtrace(fp, mode, loglvl); } #endif void show_stack(struct task_struct *tsk, unsigned long *sp) { - dump_backtrace(NULL, tsk); + dump_backtrace(NULL, tsk, KERN_DEFAULT); barrier(); } @@ -289,7 +291,7 @@ static int __die(const char *str, int err, struct pt_regs *regs) if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - dump_backtrace(regs, tsk); + dump_backtrace(regs, tsk, KERN_EMERG); dump_instr(KERN_EMERG, regs); } From 34135eacae6497d9c7bfca450f6c33e5245c92d2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:16 -0700 Subject: [PATCH 486/574] arm: wire up dump_backtrace_{entry,stm} Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Now that c_backtrace() always emits correct loglvl, use it for printing. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-8-dima@arista.com Signed-off-by: Linus Torvalds --- arch/arm/kernel/traps.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 014fb5ad3a8b..0aa55dc025a1 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -68,13 +68,15 @@ void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long end = frame + 4 + sizeof(struct pt_regs); #ifdef CONFIG_KALLSYMS - printk("[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", where, (void *)where, from, (void *)from); + printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", + loglvl, where, (void *)where, from, (void *)from); #else - printk("Function entered at [<%08lx>] from [<%08lx>]\n", where, from); + printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n", + loglvl, where, from); #endif if (in_entry_text(from) && end <= ALIGN(frame, THREAD_SIZE)) - dump_mem("", "Exception stack", frame + 4, end); + dump_mem(loglvl, "Exception stack", frame + 4, end); } void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) @@ -89,12 +91,12 @@ void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) if (++x == 6) { x = 0; p = str; - printk("%s\n", str); + printk("%s%s\n", loglvl, str); } } } if (p != str) - printk("%s\n", str); + printk("%s%s\n", loglvl, str); } #ifndef CONFIG_ARM_UNWIND From a4502d04c7dd075846bb70ebfde3843f9df86f98 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:19 -0700 Subject: [PATCH 487/574] arm: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-9-dima@arista.com Signed-off-by: Linus Torvalds --- arch/arm/kernel/traps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 0aa55dc025a1..2ae2c23799ad 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -247,10 +247,16 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, } #endif +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) +{ + dump_backtrace(NULL, tsk, loglvl); + barrier(); +} + void show_stack(struct task_struct *tsk, unsigned long *sp) { - dump_backtrace(NULL, tsk, KERN_DEFAULT); - barrier(); + show_stack_loglvl(tsk, sp, KERN_DEFAULT); } #ifdef CONFIG_PREEMPT From c76898373f9b71586edaf150190c493ae9ed3e77 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:23 -0700 Subject: [PATCH 488/574] arm64: add loglvl to dump_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to dump_backtrace() as a preparation for introducing show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Catalin Marinas Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-10-dima@arista.com Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/stacktrace.h | 3 ++- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/traps.c | 15 ++++++++------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 5017b531a415..fc7613023c19 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -64,7 +64,8 @@ struct stackframe { extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); -extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); +extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index eade7807e819..6089638c7d43 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -306,7 +306,7 @@ void __show_regs(struct pt_regs *regs) void show_regs(struct pt_regs * regs) { __show_regs(regs); - dump_backtrace(regs, NULL); + dump_backtrace(regs, NULL, KERN_DEFAULT); } static void tls_thread_flush(void) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index d332590f5978..f602aca64baa 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -53,9 +53,9 @@ static const char *handler[]= { int show_unhandled_signals = 0; -static void dump_backtrace_entry(unsigned long where) +static void dump_backtrace_entry(unsigned long where, const char *loglvl) { - printk(" %pS\n", (void *)where); + printk("%s %pS\n", loglvl, (void *)where); } static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) @@ -83,7 +83,8 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) printk("%sCode: %s\n", lvl, str); } -void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { struct stackframe frame; int skip = 0; @@ -115,11 +116,11 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) thread_saved_pc(tsk)); } - printk("Call trace:\n"); + printk("%sCall trace:\n", loglvl); do { /* skip until specified stack frame */ if (!skip) { - dump_backtrace_entry(frame.pc); + dump_backtrace_entry(frame.pc, loglvl); } else if (frame.fp == regs->regs[29]) { skip = 0; /* @@ -129,7 +130,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) * at which an exception has taken place, use regs->pc * instead. */ - dump_backtrace_entry(regs->pc); + dump_backtrace_entry(regs->pc, loglvl); } } while (!unwind_frame(tsk, &frame)); @@ -138,7 +139,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) void show_stack(struct task_struct *tsk, unsigned long *sp) { - dump_backtrace(NULL, tsk); + dump_backtrace(NULL, tsk, KERN_DEFAULT); barrier(); } From c0fe096a8aba4b00c2a928c22b7f6a3ce3e9ca97 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:26 -0700 Subject: [PATCH 489/574] arm64: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Catalin Marinas Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-11-dima@arista.com Signed-off-by: Linus Torvalds --- arch/arm64/kernel/traps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f602aca64baa..3621868b2fcc 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -137,10 +137,16 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, put_task_stack(tsk); } +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) +{ + dump_backtrace(NULL, tsk, loglvl); + barrier(); +} + void show_stack(struct task_struct *tsk, unsigned long *sp) { - dump_backtrace(NULL, tsk, KERN_DEFAULT); - barrier(); + show_stack_loglvl(tsk, sp, KERN_DEFAULT); } #ifdef CONFIG_PREEMPT From a1eea2efdcaa69275294ea2432114369c14b2b8f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:29 -0700 Subject: [PATCH 490/574] c6x: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Aurelien Jacquiot Cc: Mark Salter Link: http://lkml.kernel.org/r/20200418201944.482088-12-dima@arista.com Signed-off-by: Linus Torvalds --- arch/c6x/kernel/traps.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index ec61034fdf56..4afbf48f1ce0 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -344,12 +344,13 @@ asmlinkage int process_exception(struct pt_regs *regs) static int kstack_depth_to_print = 48; -static void show_trace(unsigned long *stack, unsigned long *endstack) +static void show_trace(unsigned long *stack, unsigned long *endstack, + const char *loglvl) { unsigned long addr; int i; - pr_debug("Call trace:"); + printk("%sCall trace:", loglvl); i = 0; while (stack + 1 <= endstack) { addr = *stack++; @@ -364,16 +365,17 @@ static void show_trace(unsigned long *stack, unsigned long *endstack) if (__kernel_text_address(addr)) { #ifndef CONFIG_KALLSYMS if (i % 5 == 0) - pr_debug("\n "); + printk("%s\n ", loglvl); #endif - pr_debug(" [<%08lx>] %pS\n", addr, (void *)addr); + printk("%s [<%08lx>] %pS\n", loglvl, addr, (void *)addr); i++; } } - pr_debug("\n"); + printk("%s\n", loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *p, *endstack; int i; @@ -398,7 +400,12 @@ void show_stack(struct task_struct *task, unsigned long *stack) pr_cont(" %08lx", *p++); } pr_cont("\n"); - show_trace(stack, endstack); + show_trace(stack, endstack, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_DEBUG); } int is_valid_bugaddr(unsigned long addr) From aeeb59d692c1bafecde3dfc47bfd0958fc96dfad Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:32 -0700 Subject: [PATCH 491/574] csky: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Guo Ren Link: http://lkml.kernel.org/r/20200418201944.482088-13-dima@arista.com Signed-off-by: Linus Torvalds --- arch/csky/kernel/stacktrace.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c index 92809e1da723..ca135f13cc13 100644 --- a/arch/csky/kernel/stacktrace.c +++ b/arch/csky/kernel/stacktrace.c @@ -91,14 +91,21 @@ static void notrace walk_stackframe(struct task_struct *task, static bool print_trace_address(unsigned long pc, void *arg) { - print_ip_sym(pc); + print_ip_sym((const char *)arg, pc); return false; } +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) +{ + pr_cont("Call Trace:\n"); + walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); +} + void show_stack(struct task_struct *task, unsigned long *sp) { pr_cont("Call Trace:\n"); - walk_stackframe(task, NULL, print_trace_address, NULL); + walk_stackframe(task, NULL, print_trace_address, KERN_INFO); } static bool save_wchan(unsigned long pc, void *arg) From 0b2ad0c7ae0fb37dee4e8ae36b887c1d4db1f898 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:35 -0700 Subject: [PATCH 492/574] h8300: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200418201944.482088-14-dima@arista.com Signed-off-by: Linus Torvalds --- arch/h8300/kernel/traps.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index e47a9e0dc278..6362446563d6 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -115,7 +115,8 @@ void die(const char *str, struct pt_regs *fp, unsigned long err) static int kstack_depth_to_print = 24; -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack_loglvl(struct task_struct *task, unsigned long *esp, + const char *loglvl) { unsigned long *stack, addr; int i; @@ -125,17 +126,17 @@ void show_stack(struct task_struct *task, unsigned long *esp) stack = esp; - pr_info("Stack from %08lx:", (unsigned long)stack); + printk("%sStack from %08lx:", loglvl, (unsigned long)stack); for (i = 0; i < kstack_depth_to_print; i++) { if (((unsigned long)stack & (THREAD_SIZE - 1)) >= THREAD_SIZE-4) break; if (i % 8 == 0) - pr_info(" "); + printk("%s ", loglvl); pr_cont(" %08lx", *stack++); } - pr_info("\nCall Trace:\n"); + printk("%s\nCall Trace:\n", loglvl); i = 0; stack = esp; while (((unsigned long)stack & (THREAD_SIZE - 1)) < THREAD_SIZE-4) { @@ -150,10 +151,15 @@ void show_stack(struct task_struct *task, unsigned long *esp) */ if (check_kernel_text(addr)) { if (i % 4 == 0) - pr_info(" "); + printk("%s ", loglvl); pr_cont(" [<%08lx>]", addr); i++; } } - pr_info("\n"); + printk("%s\n", loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *esp) +{ + show_stack_loglvl(task, esp, KERN_INFO); } From d1e9086dd99b2501a692e9dbccdb211ac6e09d14 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:38 -0700 Subject: [PATCH 493/574] hexagon: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). As a good side-effect die() now prints the stacktrace with KERN_EMERG aligned with other messages. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Acked-by: Brian Cain Link: http://lkml.kernel.org/r/20200418201944.482088-15-dima@arista.com Signed-off-by: Linus Torvalds --- arch/hexagon/kernel/traps.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index 69c623b14ddd..a8a3a210d781 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -79,7 +79,7 @@ static const char *ex_name(int ex) } static void do_show_stack(struct task_struct *task, unsigned long *fp, - unsigned long ip) + unsigned long ip, const char *loglvl) { int kstack_depth_to_print = 24; unsigned long offset, size; @@ -93,9 +93,8 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, if (task == NULL) task = current; - printk(KERN_INFO "CPU#%d, %s/%d, Call Trace:\n", - raw_smp_processor_id(), task->comm, - task_pid_nr(task)); + printk("%sCPU#%d, %s/%d, Call Trace:\n", loglvl, raw_smp_processor_id(), + task->comm, task_pid_nr(task)); if (fp == NULL) { if (task == current) { @@ -108,7 +107,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } if ((((unsigned long) fp) & 0x3) || ((unsigned long) fp < 0x1000)) { - printk(KERN_INFO "-- Corrupt frame pointer %p\n", fp); + printk("%s-- Corrupt frame pointer %p\n", loglvl, fp); return; } @@ -125,8 +124,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, name = kallsyms_lookup(ip, &size, &offset, &modname, tmpstr); - printk(KERN_INFO "[%p] 0x%lx: %s + 0x%lx", fp, ip, name, - offset); + printk("%s[%p] 0x%lx: %s + 0x%lx", loglvl, fp, ip, name, offset); if (((unsigned long) fp < low) || (high < (unsigned long) fp)) printk(KERN_CONT " (FP out of bounds!)"); if (modname) @@ -136,8 +134,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, newfp = (unsigned long *) *fp; if (((unsigned long) newfp) & 0x3) { - printk(KERN_INFO "-- Corrupt frame pointer %p\n", - newfp); + printk("%s-- Corrupt frame pointer %p\n", loglvl, newfp); break; } @@ -147,7 +144,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, + 8); if (regs->syscall_nr != -1) { - printk(KERN_INFO "-- trap0 -- syscall_nr: %ld", + printk("%s-- trap0 -- syscall_nr: %ld", loglvl, regs->syscall_nr); printk(KERN_CONT " psp: %lx elr: %lx\n", pt_psp(regs), pt_elr(regs)); @@ -155,7 +152,7 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } else { /* really want to see more ... */ kstack_depth_to_print += 6; - printk(KERN_INFO "-- %s (0x%lx) badva: %lx\n", + printk("%s-- %s (0x%lx) badva: %lx\n", loglvl, ex_name(pt_cause(regs)), pt_cause(regs), pt_badva(regs)); } @@ -178,10 +175,16 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } } -void show_stack(struct task_struct *task, unsigned long *fp) +void show_stack_loglvl(struct task_struct *task, unsigned long *fp, + const char *loglvl) { /* Saved link reg is one word above FP */ - do_show_stack(task, fp, 0); + do_show_stack(task, fp, 0, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *fp) +{ + show_stack_loglvl(task, fp, 0, KERN_INFO); } int die(const char *str, struct pt_regs *regs, long err) @@ -207,7 +210,7 @@ int die(const char *str, struct pt_regs *regs, long err) print_modules(); show_regs(regs); - do_show_stack(current, ®s->r30, pt_elr(regs)); + do_show_stack(current, ®s->r30, pt_elr(regs), KERN_EMERG); bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); From c261ad6ee80e337fb01a9cee91241f195c80f3d1 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:41 -0700 Subject: [PATCH 494/574] ia64: pass log level as arg into ia64_do_show_stack() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to ia64_do_show_stack() as a preparation to introduce show_stack_loglvl(). Also, make ia64_do_show_stack() static as it's not used outside. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Fenghua Yu Cc: Tony Luck Link: http://lkml.kernel.org/r/20200418201944.482088-16-dima@arista.com Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/ptrace.h | 1 - arch/ia64/kernel/process.c | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index 7ff574d56429..b3aa46090101 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -114,7 +114,6 @@ static inline long regs_return_value(struct pt_regs *regs) struct task_struct; /* forward decl */ struct unw_frame_info; /* forward decl */ - extern void ia64_do_show_stack (struct unw_frame_info *, void *); extern unsigned long ia64_get_user_rbs_end (struct task_struct *, struct pt_regs *, unsigned long *); extern long ia64_peek (struct task_struct *, struct switch_stack *, unsigned long, diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 10cb9382ab76..332c6dfe7333 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -64,12 +64,13 @@ EXPORT_SYMBOL(boot_option_idle_override); void (*pm_power_off) (void); EXPORT_SYMBOL(pm_power_off); -void +static void ia64_do_show_stack (struct unw_frame_info *info, void *arg) { unsigned long ip, sp, bsp; + const char *loglvl = arg; - printk("\nCall Trace:\n"); + printk("%s\nCall Trace:\n", loglvl); do { unw_get_ip(info, &ip); if (ip == 0) @@ -77,9 +78,9 @@ ia64_do_show_stack (struct unw_frame_info *info, void *arg) unw_get_sp(info, &sp); unw_get_bsp(info, &bsp); - printk(" [<%016lx>] %pS\n" + printk("%s [<%016lx>] %pS\n" " sp=%016lx bsp=%016lx\n", - ip, (void *)ip, sp, bsp); + loglvl, ip, (void *)ip, sp, bsp); } while (unw_unwind(info) >= 0); } @@ -87,12 +88,12 @@ void show_stack (struct task_struct *task, unsigned long *sp) { if (!task) - unw_init_running(ia64_do_show_stack, NULL); + unw_init_running(ia64_do_show_stack, (void *)KERN_DEFAULT); else { struct unw_frame_info info; unw_init_from_blocked_task(&info, task); - ia64_do_show_stack(&info, NULL); + ia64_do_show_stack(&info, (void *)KERN_DEFAULT); } } From ffdac29e40545cfb21d347c67daeb72541792ec8 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:44 -0700 Subject: [PATCH 495/574] ia64: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Fenghua Yu Cc: Tony Luck Link: http://lkml.kernel.org/r/20200418201944.482088-17-dima@arista.com Signed-off-by: Linus Torvalds --- arch/ia64/kernel/process.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 332c6dfe7333..913d9a01cbf9 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -85,18 +85,25 @@ ia64_do_show_stack (struct unw_frame_info *info, void *arg) } void -show_stack (struct task_struct *task, unsigned long *sp) +show_stack_loglvl (struct task_struct *task, unsigned long *sp, + const char *loglvl) { if (!task) - unw_init_running(ia64_do_show_stack, (void *)KERN_DEFAULT); + unw_init_running(ia64_do_show_stack, (void *)loglvl); else { struct unw_frame_info info; unw_init_from_blocked_task(&info, task); - ia64_do_show_stack(&info, (void *)KERN_DEFAULT); + ia64_do_show_stack(&info, (void *)loglvl); } } +void +show_stack (struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); +} + void show_regs (struct pt_regs *regs) { From ce23c47a5632612dfaa80bd8f861256a419c0abf Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:47 -0700 Subject: [PATCH 496/574] m68k: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200418201944.482088-18-dima@arista.com Signed-off-by: Linus Torvalds --- arch/m68k/kernel/traps.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 344f93d36a9a..ffcc5ec4fac3 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -811,13 +811,13 @@ asmlinkage void buserr_c(struct frame *fp) static int kstack_depth_to_print = 48; -void show_trace(unsigned long *stack) +static void show_trace(unsigned long *stack, const char *loglvl) { unsigned long *endstack; unsigned long addr; int i; - pr_info("Call Trace:"); + printk("%sCall Trace:", loglvl); addr = (unsigned long)stack + THREAD_SIZE - 1; endstack = (unsigned long *)(addr & -THREAD_SIZE); i = 0; @@ -935,7 +935,8 @@ void show_registers(struct pt_regs *regs) pr_cont("\n"); } -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *p; unsigned long *endstack; @@ -949,7 +950,7 @@ void show_stack(struct task_struct *task, unsigned long *stack) } endstack = (unsigned long *)(((unsigned long)stack + THREAD_SIZE - 1) & -THREAD_SIZE); - pr_info("Stack from %08lx:", (unsigned long)stack); + printk("%sStack from %08lx:", loglvl, (unsigned long)stack); p = stack; for (i = 0; i < kstack_depth_to_print; i++) { if (p + 1 > endstack) @@ -959,7 +960,12 @@ void show_stack(struct task_struct *task, unsigned long *stack) pr_cont(" %08lx", *p++); } pr_cont("\n"); - show_trace(stack); + show_trace(stack, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_INFO); } /* From 77530a5277bcab0433d0f68af11e04a5c4d79039 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:50 -0700 Subject: [PATCH 497/574] microblaze: add loglvl to microblaze_unwind_inner() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to microblaze_unwind_inner() as a preparation for introducing show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Michal Simek Link: http://lkml.kernel.org/r/20200418201944.482088-19-dima@arista.com Signed-off-by: Linus Torvalds --- arch/microblaze/kernel/unwind.c | 38 ++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 4241cdd28ee7..804bf0c99d8b 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -154,7 +154,8 @@ static int lookup_prev_stack_frame(unsigned long fp, unsigned long pc, static void microblaze_unwind_inner(struct task_struct *task, unsigned long pc, unsigned long fp, unsigned long leaf_return, - struct stack_trace *trace); + struct stack_trace *trace, + const char *loglvl); /** * unwind_trap - Unwind through a system trap, that stored previous state @@ -162,16 +163,18 @@ static void microblaze_unwind_inner(struct task_struct *task, */ #ifdef CONFIG_MMU static inline void unwind_trap(struct task_struct *task, unsigned long pc, - unsigned long fp, struct stack_trace *trace) + unsigned long fp, struct stack_trace *trace, + const char *loglvl) { /* To be implemented */ } #else static inline void unwind_trap(struct task_struct *task, unsigned long pc, - unsigned long fp, struct stack_trace *trace) + unsigned long fp, struct stack_trace *trace, + const char *loglvl) { const struct pt_regs *regs = (const struct pt_regs *) fp; - microblaze_unwind_inner(task, regs->pc, regs->r1, regs->r15, trace); + microblaze_unwind_inner(task, regs->pc, regs->r1, regs->r15, trace, loglvl); } #endif @@ -184,11 +187,13 @@ static inline void unwind_trap(struct task_struct *task, unsigned long pc, * the caller's return address. * @trace : Where to store stack backtrace (PC values). * NULL == print backtrace to kernel log + * @loglvl : Used for printk log level if (trace == NULL). */ static void microblaze_unwind_inner(struct task_struct *task, unsigned long pc, unsigned long fp, unsigned long leaf_return, - struct stack_trace *trace) + struct stack_trace *trace, + const char *loglvl) { int ofs = 0; @@ -214,11 +219,11 @@ static void microblaze_unwind_inner(struct task_struct *task, const struct pt_regs *regs = (const struct pt_regs *) fp; #endif - pr_info("HW EXCEPTION\n"); + printk("%sHW EXCEPTION\n", loglvl); #ifndef CONFIG_MMU microblaze_unwind_inner(task, regs->r17 - 4, fp + EX_HANDLER_STACK_SIZ, - regs->r15, trace); + regs->r15, trace, loglvl); #endif return; } @@ -228,8 +233,8 @@ static void microblaze_unwind_inner(struct task_struct *task, if ((return_to >= handler->start_addr) && (return_to <= handler->end_addr)) { if (!trace) - pr_info("%s\n", handler->trap_name); - unwind_trap(task, pc, fp, trace); + printk("%s%s\n", loglvl, handler->trap_name); + unwind_trap(task, pc, fp, trace, loglvl); return; } } @@ -248,13 +253,13 @@ static void microblaze_unwind_inner(struct task_struct *task, } else { /* Have we reached userland? */ if (unlikely(pc == task_pt_regs(task)->pc)) { - pr_info("[<%p>] PID %lu [%s]\n", - (void *) pc, + printk("%s[<%p>] PID %lu [%s]\n", + loglvl, (void *) pc, (unsigned long) task->pid, task->comm); break; } else - print_ip_sym(KERN_INFO, pc); + print_ip_sym(loglvl, pc); } /* Stop when we reach anything not part of the kernel */ @@ -285,11 +290,13 @@ static void microblaze_unwind_inner(struct task_struct *task, */ void microblaze_unwind(struct task_struct *task, struct stack_trace *trace) { + const char *loglvl = KERN_INFO; + if (task) { if (task == current) { const struct pt_regs *regs = task_pt_regs(task); microblaze_unwind_inner(task, regs->pc, regs->r1, - regs->r15, trace); + regs->r15, trace, loglvl); } else { struct thread_info *thread_info = (struct thread_info *)(task->stack); @@ -299,7 +306,8 @@ void microblaze_unwind(struct task_struct *task, struct stack_trace *trace) microblaze_unwind_inner(task, (unsigned long) &_switch_to, cpu_context->r1, - cpu_context->r15, trace); + cpu_context->r15, + trace, loglvl); } } else { unsigned long pc, fp; @@ -314,7 +322,7 @@ void microblaze_unwind(struct task_struct *task, struct stack_trace *trace) ); /* Since we are not a leaf function, use leaf_return = 0 */ - microblaze_unwind_inner(current, pc, fp, 0, trace); + microblaze_unwind_inner(current, pc, fp, 0, trace, loglvl); } } From 14b0dd870f6f3b28fad1235b70d1a692db1d6a2f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:53 -0700 Subject: [PATCH 498/574] microblaze: add loglvl to microblaze_unwind() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level parameter to microblaze_unwind() as a preparation to add show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Michal Simek Link: http://lkml.kernel.org/r/20200418201944.482088-20-dima@arista.com Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/unwind.h | 3 ++- arch/microblaze/kernel/stacktrace.c | 4 ++-- arch/microblaze/kernel/traps.c | 2 +- arch/microblaze/kernel/unwind.c | 6 +++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/microblaze/include/asm/unwind.h b/arch/microblaze/include/asm/unwind.h index c327d673622a..3db81777a887 100644 --- a/arch/microblaze/include/asm/unwind.h +++ b/arch/microblaze/include/asm/unwind.h @@ -20,7 +20,8 @@ extern struct trap_handler_info microblaze_trap_handlers; extern const char _hw_exception_handler; extern const char ex_handler_unhandled; -void microblaze_unwind(struct task_struct *task, struct stack_trace *trace); +void microblaze_unwind(struct task_struct *task, struct stack_trace *trace, + const char *loglvl); #endif /* __MICROBLAZE_UNWIND_H */ diff --git a/arch/microblaze/kernel/stacktrace.c b/arch/microblaze/kernel/stacktrace.c index b4debe283a79..b266c4d6ed9d 100644 --- a/arch/microblaze/kernel/stacktrace.c +++ b/arch/microblaze/kernel/stacktrace.c @@ -20,12 +20,12 @@ void save_stack_trace(struct stack_trace *trace) { /* Exclude our helper functions from the trace*/ trace->skip += 2; - microblaze_unwind(NULL, trace); + microblaze_unwind(NULL, trace, ""); } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - microblaze_unwind(tsk, trace); + microblaze_unwind(tsk, trace, ""); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index 45bbba9d919f..be726ee120fb 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -68,7 +68,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 32, 4, (void *)fp, words_to_show << 2, 0); pr_info("\n\nCall Trace:\n"); - microblaze_unwind(task, NULL); + microblaze_unwind(task, NULL, KERN_INFO); pr_info("\n"); if (!task) diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 804bf0c99d8b..778a761af0a7 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -287,11 +287,11 @@ static void microblaze_unwind_inner(struct task_struct *task, * @task : Task whose stack we are to unwind (NULL == current) * @trace : Where to store stack backtrace (PC values). * NULL == print backtrace to kernel log + * @loglvl : Used for printk log level if (trace == NULL). */ -void microblaze_unwind(struct task_struct *task, struct stack_trace *trace) +void microblaze_unwind(struct task_struct *task, struct stack_trace *trace, + const char *loglvl) { - const char *loglvl = KERN_INFO; - if (task) { if (task == current) { const struct pt_regs *regs = task_pt_regs(task); From 35f3968b499c6dd026a828933b57ebdb11e74cff Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:56 -0700 Subject: [PATCH 499/574] microblaze: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Michal Simek Link: http://lkml.kernel.org/r/20200418201944.482088-21-dima@arista.com Signed-off-by: Linus Torvalds --- arch/microblaze/kernel/traps.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index be726ee120fb..149ae534937e 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -31,7 +31,8 @@ static int __init kstack_setup(char *s) } __setup("kstack=", kstack_setup); -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { unsigned long words_to_show; u32 fp = (u32) sp; @@ -50,7 +51,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (kstack_depth_to_print && (words_to_show > kstack_depth_to_print)) words_to_show = kstack_depth_to_print; - pr_info("Kernel Stack:\n"); + printk("%sKernel Stack:\n", loglvl); /* * Make the first line an 'odd' size if necessary to get @@ -65,14 +66,19 @@ void show_stack(struct task_struct *task, unsigned long *sp) words_to_show -= line1_words; } } - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 32, 4, (void *)fp, + print_hex_dump(loglvl, "", DUMP_PREFIX_ADDRESS, 32, 4, (void *)fp, words_to_show << 2, 0); - pr_info("\n\nCall Trace:\n"); - microblaze_unwind(task, NULL, KERN_INFO); - pr_info("\n"); + printk("%s\n\nCall Trace:\n", loglvl); + microblaze_unwind(task, NULL, loglvl); + printk("%s\n", loglvl); if (!task) task = current; debug_show_held_locks(task); } + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_INFO); +} From 96f0458a96892dd2f1589a7517724d541e1c4520 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:30:59 -0700 Subject: [PATCH 500/574] mips: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: James Hogan Cc: Paul Burton Cc: Ralf Baechle Link: http://lkml.kernel.org/r/20200418201944.482088-22-dima@arista.com Signed-off-by: Linus Torvalds --- arch/mips/kernel/traps.c | 41 +++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 210fea63de75..e49040739b61 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -108,26 +108,26 @@ void (*board_bind_eic_interrupt)(int irq, int regset); void (*board_ebase_setup)(void); void(*board_cache_error_setup)(void); -static void show_raw_backtrace(unsigned long reg29) +static void show_raw_backtrace(unsigned long reg29, const char *loglvl) { unsigned long *sp = (unsigned long *)(reg29 & ~3); unsigned long addr; - printk("Call Trace:"); + printk("%sCall Trace:", loglvl); #ifdef CONFIG_KALLSYMS - printk("\n"); + printk("%s\n", loglvl); #endif while (!kstack_end(sp)) { unsigned long __user *p = (unsigned long __user *)(unsigned long)sp++; if (__get_user(addr, p)) { - printk(" (Bad stack address)"); + printk("%s (Bad stack address)", loglvl); break; } if (__kernel_text_address(addr)) - print_ip_sym(KERN_DEFAULT, addr); + print_ip_sym(loglvl, addr); } - printk("\n"); + printk("%s\n", loglvl); } #ifdef CONFIG_KALLSYMS @@ -140,7 +140,8 @@ static int __init set_raw_show_trace(char *str) __setup("raw_show_trace", set_raw_show_trace); #endif -static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) +static void show_backtrace(struct task_struct *task, const struct pt_regs *regs, + const char *loglvl) { unsigned long sp = regs->regs[29]; unsigned long ra = regs->regs[31]; @@ -150,12 +151,12 @@ static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) task = current; if (raw_show_trace || user_mode(regs) || !__kernel_text_address(pc)) { - show_raw_backtrace(sp); + show_raw_backtrace(sp, loglvl); return; } - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); do { - print_ip_sym(KERN_DEFAULT, pc); + print_ip_sym(loglvl, pc); pc = unwind_stack(task, &sp, pc, &ra); } while (pc); pr_cont("\n"); @@ -166,19 +167,19 @@ static void show_backtrace(struct task_struct *task, const struct pt_regs *regs) * with at least a bit of error checking ... */ static void show_stacktrace(struct task_struct *task, - const struct pt_regs *regs) + const struct pt_regs *regs, const char *loglvl) { const int field = 2 * sizeof(unsigned long); long stackdata; int i; unsigned long __user *sp = (unsigned long __user *)regs->regs[29]; - printk("Stack :"); + printk("%sStack :", loglvl); i = 0; while ((unsigned long) sp & (PAGE_SIZE - 1)) { if (i && ((i % (64 / field)) == 0)) { pr_cont("\n"); - printk(" "); + printk("%s ", loglvl); } if (i > 39) { pr_cont(" ..."); @@ -194,10 +195,11 @@ static void show_stacktrace(struct task_struct *task, i++; } pr_cont("\n"); - show_backtrace(task, regs); + show_backtrace(task, regs, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { struct pt_regs regs; mm_segment_t old_fs = get_fs(); @@ -221,10 +223,15 @@ void show_stack(struct task_struct *task, unsigned long *sp) * the stack in the kernel (not user) address space. */ set_fs(KERNEL_DS); - show_stacktrace(task, ®s); + show_stacktrace(task, ®s, loglvl); set_fs(old_fs); } +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT) +} + static void show_code(unsigned int __user *pc) { long i; @@ -373,7 +380,7 @@ void show_registers(struct pt_regs *regs) if (!user_mode(regs)) /* Necessary for getting the correct stack content */ set_fs(KERNEL_DS); - show_stacktrace(current, regs); + show_stacktrace(current, regs, KERN_DEFAULT); show_code((unsigned int __user *) regs->cp0_epc); printk("\n"); set_fs(old_fs); From 18a4753f90175a6acfefd4cf3da1bcb163e12216 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:02 -0700 Subject: [PATCH 501/574] nds32: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Greentime Hu Cc: Vincent Chen Link: http://lkml.kernel.org/r/20200418201944.482088-23-dima@arista.com Signed-off-by: Linus Torvalds --- arch/nds32/kernel/traps.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index 40625760a125..90f12582c218 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -97,18 +97,19 @@ static void dump_instr(struct pt_regs *regs) } #define LOOP_TIMES (100) -static void __dump(struct task_struct *tsk, unsigned long *base_reg) +static void __dump(struct task_struct *tsk, unsigned long *base_reg, + const char *loglvl) { unsigned long ret_addr; int cnt = LOOP_TIMES, graph = 0; - pr_emerg("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); if (!IS_ENABLED(CONFIG_FRAME_POINTER)) { while (!kstack_end(base_reg)) { ret_addr = *base_reg++; if (__kernel_text_address(ret_addr)) { ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(KERN_EMERG, ret_addr); + print_ip_sym(loglvl, ret_addr); } if (--cnt < 0) break; @@ -124,17 +125,18 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg) ret_addr = ftrace_graph_ret_addr( tsk, &graph, ret_addr, NULL); - print_ip_sym(KERN_EMERG, ret_addr); + print_ip_sym(loglvl, ret_addr); } if (--cnt < 0) break; base_reg = (unsigned long *)next_fp; } } - pr_emerg("\n"); + printk("%s\n", loglvl); } -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) { unsigned long *base_reg; @@ -151,10 +153,15 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) else __asm__ __volatile__("\tori\t%0, $fp, #0\n":"=r"(base_reg)); } - __dump(tsk, base_reg); + __dump(tsk, base_reg, loglvl); barrier(); } +void show_stack(struct task_struct *tsk, unsigned long *sp) +{ + show_stack_loglvl(tsk, sp, KERN_EMERG); +} + DEFINE_SPINLOCK(die_lock); /* From 351dd61c3821fe9b6702205d3a894004a9674295 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:05 -0700 Subject: [PATCH 502/574] nios2: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Ley Foon Tan Link: http://lkml.kernel.org/r/20200418201944.482088-24-dima@arista.com Signed-off-by: Linus Torvalds --- arch/nios2/kernel/traps.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 486db793923c..08071caa9b36 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -52,12 +52,14 @@ void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr) } /* - * The show_stack is an external API which we do not use ourselves. + * The show_stack(), show_stack_loglvl() are external API + * which we do not use ourselves. */ int kstack_depth_to_print = 48; -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *endstack, addr; int i; @@ -72,16 +74,16 @@ void show_stack(struct task_struct *task, unsigned long *stack) addr = (unsigned long) stack; endstack = (unsigned long *) PAGE_ALIGN(addr); - pr_emerg("Stack from %08lx:", (unsigned long)stack); + printk("%sStack from %08lx:", loglvl, (unsigned long)stack); for (i = 0; i < kstack_depth_to_print; i++) { if (stack + 1 > endstack) break; if (i % 8 == 0) - pr_emerg("\n "); - pr_emerg(" %08lx", *stack++); + printk("%s\n ", loglvl); + printk("%s %08lx", loglvl, *stack++); } - pr_emerg("\nCall Trace:"); + printk("%s\nCall Trace:", loglvl); i = 0; while (stack + 1 <= endstack) { addr = *stack++; @@ -97,11 +99,16 @@ void show_stack(struct task_struct *task, unsigned long *stack) (addr <= (unsigned long) _etext))) { if (i % 4 == 0) pr_emerg("\n "); - pr_emerg(" [<%08lx>]", addr); + printk("%s [<%08lx>]", loglvl, addr); i++; } } - pr_emerg("\n"); + printk("%s\n", loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_EMERG); } void __init trap_init(void) From 0633032f083a53cd33f1171cca38c8ba835d1eba Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:08 -0700 Subject: [PATCH 503/574] openrisc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Jonas Bonn Cc: Stafford Horne Cc: Stefan Kristiansson Link: http://lkml.kernel.org/r/20200418201944.482088-25-dima@arista.com Signed-off-by: Linus Torvalds --- arch/openrisc/kernel/traps.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index c11aa2e17ce0..3b7978a22d68 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -41,18 +41,26 @@ unsigned long __user *lwa_addr; void print_trace(void *data, unsigned long addr, int reliable) { - pr_emerg("[<%p>] %s%pS\n", (void *) addr, reliable ? "" : "? ", + const char *loglvl = data; + + printk("%s[<%p>] %s%pS\n", loglvl, (void *) addr, reliable ? "" : "? ", (void *) addr); } /* displays a short stack trace */ -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack_loglvl(struct task_struct *task, unsigned long *esp, + const char *loglvl) { if (esp == NULL) esp = (unsigned long *)&esp; - pr_emerg("Call trace:\n"); - unwind_stack(NULL, esp, print_trace); + printk("%sCall trace:\n", loglvl); + unwind_stack((void *)loglvl, esp, print_trace); +} + +void show_stack(struct task_struct *task, unsigned long *esp) +{ + show_stack_loglvl(task, esp, KERN_EMERG); } void show_registers(struct pt_regs *regs) From 3481d31bf7473ced5a39fbfd2786b141798b6764 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:11 -0700 Subject: [PATCH 504/574] parisc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Helge Deller Cc: "James E.J. Bottomley" Link: http://lkml.kernel.org/r/20200418201944.482088-26-dima@arista.com Signed-off-by: Linus Torvalds --- arch/parisc/kernel/traps.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 82fc01189488..c2411de3730f 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -49,7 +49,7 @@ #include "../math-emu/math-emu.h" /* for handle_fpe() */ static void parisc_show_stack(struct task_struct *task, - struct pt_regs *regs); + struct pt_regs *regs, const char *loglvl); static int printbinary(char *buf, unsigned long x, int nbits) { @@ -155,7 +155,7 @@ void show_regs(struct pt_regs *regs) printk("%s IAOQ[1]: %pS\n", level, (void *) regs->iaoq[1]); printk("%s RP(r2): %pS\n", level, (void *) regs->gr[2]); - parisc_show_stack(current, regs); + parisc_show_stack(current, regs, KERN_DEFAULT); } } @@ -170,37 +170,43 @@ static DEFINE_RATELIMIT_STATE(_hppa_rs, } -static void do_show_stack(struct unwind_frame_info *info) +static void do_show_stack(struct unwind_frame_info *info, const char *loglvl) { int i = 1; - printk(KERN_CRIT "Backtrace:\n"); + printk("%sBacktrace:\n", loglvl); while (i <= MAX_UNWIND_ENTRIES) { if (unwind_once(info) < 0 || info->ip == 0) break; if (__kernel_text_address(info->ip)) { - printk(KERN_CRIT " [<" RFMT ">] %pS\n", - info->ip, (void *) info->ip); + printk("%s [<" RFMT ">] %pS\n", + loglvl, info->ip, (void *) info->ip); i++; } } - printk(KERN_CRIT "\n"); + printk("%s\n", loglvl); } static void parisc_show_stack(struct task_struct *task, - struct pt_regs *regs) + struct pt_regs *regs, const char *loglvl) { struct unwind_frame_info info; unwind_frame_init_task(&info, task, regs); - do_show_stack(&info); + do_show_stack(&info, loglvl); +} + +void show_stack_loglvl(struct task_struct *t, unsigned long *sp, + const char *loglvl) +{ + parisc_show_stack(t, NULL, loglvl); } void show_stack(struct task_struct *t, unsigned long *sp) { - parisc_show_stack(t, NULL); + show_stack_loglvl(t, sp, KERN_CRIT) } int is_valid_bugaddr(unsigned long iaoq) @@ -446,7 +452,7 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o /* show_stack(NULL, (unsigned long *)regs->gr[30]); */ struct unwind_frame_info info; unwind_frame_init(&info, current, regs); - do_show_stack(&info); + do_show_stack(&info, KERN_CRIT); } printk("\n"); From b9677a8cf60995acdd1b36ff59e6b437154bff9e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:14 -0700 Subject: [PATCH 505/574] powerpc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Acked-by: Michael Ellerman (powerpc) Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Link: http://lkml.kernel.org/r/20200418201944.482088-27-dima@arista.com Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/process.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 048d64c4e115..a456b4454b3f 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2063,7 +2063,8 @@ unsigned long get_wchan(struct task_struct *p) static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack(struct task_struct *tsk, unsigned long *stack) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *stack, + const char *loglvl) { unsigned long sp, ip, lr, newsp; int count = 0; @@ -2088,7 +2089,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) } lr = 0; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); do { if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) break; @@ -2097,7 +2098,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) newsp = stack[0]; ip = stack[STACK_FRAME_LR_SAVE]; if (!firstframe || ip != lr) { - printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); + printk("%s["REG"] ["REG"] %pS", + loglvl, sp, ip, (void *)ip); #ifdef CONFIG_FUNCTION_GRAPH_TRACER ret_addr = ftrace_graph_ret_addr(current, &ftrace_idx, ip, stack); @@ -2119,8 +2121,9 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) struct pt_regs *regs = (struct pt_regs *) (sp + STACK_FRAME_OVERHEAD); lr = regs->link; - printk("--- interrupt: %lx at %pS\n LR = %pS\n", - regs->trap, (void *)regs->nip, (void *)lr); + printk("%s--- interrupt: %lx at %pS\n LR = %pS\n", + loglvl, regs->trap, + (void *)regs->nip, (void *)lr); firstframe = 1; } @@ -2130,6 +2133,11 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) put_task_stack(tsk); } +void show_stack(struct task_struct *tsk, unsigned long *stack) +{ + show_stack_loglvl(tsk, stack, KERN_DEFAULT); +} + #ifdef CONFIG_PPC64 /* Called with hard IRQs off */ void notrace __ppc64_runlatch_on(void) From 0b3d43657489711bf927998fde82b5cc575d9400 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:17 -0700 Subject: [PATCH 506/574] riscv: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Albert Ou Cc: Palmer Dabbelt Cc: Paul Walmsley Link: http://lkml.kernel.org/r/20200418201944.482088-28-dima@arista.com Signed-off-by: Linus Torvalds --- arch/riscv/kernel/stacktrace.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 9f1ac258482f..aaa64bf007f8 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -99,17 +99,24 @@ void notrace walk_stackframe(struct task_struct *task, static bool print_trace_address(unsigned long pc, void *arg) { - print_ip_sym(KERN_DEFAULT, pc); + const char *loglvl = arg; + + print_ip_sym(loglvl, pc); return false; } +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) +{ + pr_cont("Call Trace:\n"); + walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); +} + void show_stack(struct task_struct *task, unsigned long *sp) { - pr_cont("Call Trace:\n"); - walk_stackframe(task, NULL, print_trace_address, NULL); + show_stack_loglvl(task, sp, KERN_DEFAULT); } - static bool save_wchan(unsigned long pc, void *arg) { if (!in_sched_functions(pc)) { From 8539c1288ddc23eef0fe71e1bcecc04b3fc6a9ee Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:20 -0700 Subject: [PATCH 507/574] s390: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Link: http://lkml.kernel.org/r/20200418201944.482088-29-dima@arista.com Signed-off-by: Linus Torvalds --- arch/s390/kernel/dumpstack.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 2c122d8bab93..887a054919fc 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -126,18 +126,24 @@ unknown: return -EINVAL; } -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { struct unwind_state state; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); unwind_for_each_frame(&state, task, NULL, (unsigned long) stack) - printk(state.reliable ? " [<%016lx>] %pSR \n" : - "([<%016lx>] %pSR)\n", - state.ip, (void *) state.ip); + printk(state.reliable ? "%s [<%016lx>] %pSR \n" : + "%s([<%016lx>] %pSR)\n", + loglvl, state.ip, (void *) state.ip); debug_show_held_locks(task ? : current); } +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_DEFAULT); +} + static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); From ebf0a36a32b25fe6ba8a1b6bd3135432ebd9aa5c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:24 -0700 Subject: [PATCH 508/574] sh: add loglvl to dump_mem() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to dump_mem() as a preparation to introduce show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Rich Felker Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200418201944.482088-30-dima@arista.com Signed-off-by: Linus Torvalds --- arch/sh/include/asm/kdebug.h | 3 ++- arch/sh/kernel/dumpstack.c | 17 +++++++++-------- arch/sh/kernel/traps.c | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/sh/include/asm/kdebug.h b/arch/sh/include/asm/kdebug.h index 5212f5fcd752..de8693fabb1d 100644 --- a/arch/sh/include/asm/kdebug.h +++ b/arch/sh/include/asm/kdebug.h @@ -13,6 +13,7 @@ enum die_val { /* arch/sh/kernel/dumpstack.c */ extern void printk_address(unsigned long address, int reliable); -extern void dump_mem(const char *str, unsigned long bottom, unsigned long top); +extern void dump_mem(const char *str, const char *loglvl, + unsigned long bottom, unsigned long top); #endif /* __ASM_SH_KDEBUG_H */ diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 9f1c9c11d62d..6784b914fba0 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -16,30 +16,31 @@ #include #include -void dump_mem(const char *str, unsigned long bottom, unsigned long top) +void dump_mem(const char *str, const char *loglvl, + unsigned long bottom, unsigned long top) { unsigned long p; int i; - printk("%s(0x%08lx to 0x%08lx)\n", str, bottom, top); + printk("%s%s(0x%08lx to 0x%08lx)\n", loglvl, str, bottom, top); for (p = bottom & ~31; p < top; ) { - printk("%04lx: ", p & 0xffff); + printk("%s%04lx: ", loglvl, p & 0xffff); for (i = 0; i < 8; i++, p += 4) { unsigned int val; if (p < bottom || p >= top) - printk(" "); + printk("%s ", loglvl); else { if (__get_user(val, (unsigned int __user *)p)) { - printk("\n"); + printk("%s\n", loglvl); return; } - printk("%08x ", val); + printk("%s%08x ", loglvl, val); } } - printk("\n"); + printk("%s\n", loglvl); } } @@ -156,7 +157,7 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) sp = (unsigned long *)tsk->thread.sp; stack = (unsigned long)sp; - dump_mem("Stack: ", stack, THREAD_SIZE + + dump_mem("Stack: ", KERN_DEFAULT, stack, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); show_trace(tsk, sp, NULL); } diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 2130381c9d57..a33025451fcd 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -38,8 +38,8 @@ void die(const char *str, struct pt_regs *regs, long err) task_pid_nr(current), task_stack_page(current) + 1); if (!user_mode(regs) || in_interrupt()) - dump_mem("Stack: ", regs->regs[15], THREAD_SIZE + - (unsigned long)task_stack_page(current)); + dump_mem("Stack: ", KERN_DEFAULT, regs->regs[15], + THREAD_SIZE + (unsigned long)task_stack_page(current)); notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV); From 8b92f34877225c8eb85e3ab7f1177fc248ba26d0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:27 -0700 Subject: [PATCH 509/574] sh: remove needless printk() Currently `data' is always an empty line "". No need for additional printk() call. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Rich Felker Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200418201944.482088-31-dima@arista.com Signed-off-by: Linus Torvalds --- arch/sh/kernel/dumpstack.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 6784b914fba0..2c1a78e5776b 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -118,7 +118,6 @@ static int print_trace_stack(void *data, char *name) */ static void print_trace_address(void *data, unsigned long addr, int reliable) { - printk("%s", (char *)data); printk_address(addr, reliable); } From 2deebe4d56d638269a4a728086d64de5734b460a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:30 -0700 Subject: [PATCH 510/574] sh: add loglvl to printk_address() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to printk_address() as a preparation to introduce show_stack_loglvl(). As a good side-effect show_fault_oops() now prints the address with KERN_EMREG as the rest of output, making sure there won't be situation where "PC: " is printed without actual address. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Rich Felker Link: http://lkml.kernel.org/r/20200418201944.482088-32-dima@arista.com Signed-off-by: Linus Torvalds --- arch/sh/include/asm/kdebug.h | 3 ++- arch/sh/kernel/dumpstack.c | 6 +++--- arch/sh/mm/fault.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/sh/include/asm/kdebug.h b/arch/sh/include/asm/kdebug.h index de8693fabb1d..960545306afa 100644 --- a/arch/sh/include/asm/kdebug.h +++ b/arch/sh/include/asm/kdebug.h @@ -12,7 +12,8 @@ enum die_val { }; /* arch/sh/kernel/dumpstack.c */ -extern void printk_address(unsigned long address, int reliable); +extern void printk_address(unsigned long address, int reliable, + const char *loglvl); extern void dump_mem(const char *str, const char *loglvl, unsigned long bottom, unsigned long top); diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 2c1a78e5776b..959064b90055 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -44,9 +44,9 @@ void dump_mem(const char *str, const char *loglvl, } } -void printk_address(unsigned long address, int reliable) +void printk_address(unsigned long address, int reliable, const char *loglvl) { - printk(" [<%p>] %s%pS\n", (void *) address, + printk("%s [<%p>] %s%pS\n", loglvl, (void *) address, reliable ? "" : "? ", (void *) address); } @@ -118,7 +118,7 @@ static int print_trace_stack(void *data, char *name) */ static void print_trace_address(void *data, unsigned long addr, int reliable) { - printk_address(addr, reliable); + printk_address(addr, reliable, (char *)data); } static const struct stacktrace_ops print_trace_ops = { diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 7260a1a7fdca..67d0e739897c 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -214,7 +214,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long address) : "paging request", address); pr_alert("PC:"); - printk_address(regs->pc, 1); + printk_address(regs->pc, 1, KERN_ALERT); show_pte(NULL, address); } From 539e786cc37ee5cb6e051ef5eb72b7a7c03022cf Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:33 -0700 Subject: [PATCH 511/574] sh: add loglvl to show_trace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level parameter to show_trace() as a preparation to introduce show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Rich Felker Link: http://lkml.kernel.org/r/20200418201944.482088-33-dima@arista.com Signed-off-by: Linus Torvalds --- arch/sh/include/asm/processor_32.h | 2 +- arch/sh/kernel/dumpstack.c | 10 +++++----- arch/sh/kernel/process_32.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 0e0ecc0132e3..d44409413418 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -171,7 +171,7 @@ static __inline__ void enable_fpu(void) #define thread_saved_pc(tsk) (tsk->thread.pc) void show_trace(struct task_struct *tsk, unsigned long *sp, - struct pt_regs *regs); + struct pt_regs *regs, const char *loglvl); #ifdef CONFIG_DUMP_CODE void show_code(struct pt_regs *regs); diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index 959064b90055..d488a47a1f0f 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -127,16 +127,16 @@ static const struct stacktrace_ops print_trace_ops = { }; void show_trace(struct task_struct *tsk, unsigned long *sp, - struct pt_regs *regs) + struct pt_regs *regs, const char *loglvl) { if (regs && user_mode(regs)) return; - printk("\nCall trace:\n"); + printk("%s\nCall trace:\n", loglvl); - unwind_stack(tsk, regs, sp, &print_trace_ops, ""); + unwind_stack(tsk, regs, sp, &print_trace_ops, (void *)loglvl); - printk("\n"); + printk("%s\n", loglvl); if (!tsk) tsk = current; @@ -158,5 +158,5 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) stack = (unsigned long)sp; dump_mem("Stack: ", KERN_DEFAULT, stack, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - show_trace(tsk, sp, NULL); + show_trace(tsk, sp, NULL, KERN_DEFAULT); } diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index a094633874c3..456cc8d171f7 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -59,7 +59,7 @@ void show_regs(struct pt_regs * regs) printk("MACH: %08lx MACL: %08lx GBR : %08lx PR : %08lx\n", regs->mach, regs->macl, regs->gbr, regs->pr); - show_trace(NULL, (unsigned long *)regs->regs[15], regs); + show_trace(NULL, (unsigned long *)regs->regs[15], regs, KERN_DEFAULT); show_code(regs); } From e6e371c4f653db7581072a0a20b32a11a751ebda Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:36 -0700 Subject: [PATCH 512/574] sh: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Rich Felker Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200418201944.482088-34-dima@arista.com Signed-off-by: Linus Torvalds --- arch/sh/kernel/dumpstack.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index d488a47a1f0f..cc51e9d74667 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -144,7 +144,8 @@ void show_trace(struct task_struct *tsk, unsigned long *sp, debug_show_held_locks(tsk); } -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) { unsigned long stack; @@ -156,7 +157,12 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) sp = (unsigned long *)tsk->thread.sp; stack = (unsigned long)sp; - dump_mem("Stack: ", KERN_DEFAULT, stack, THREAD_SIZE + + dump_mem("Stack: ", loglvl, stack, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - show_trace(tsk, sp, NULL, KERN_DEFAULT); + show_trace(tsk, sp, NULL, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } From 13c6371ae576840bcacc0fc8407d7b11572eb7fb Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:39 -0700 Subject: [PATCH 513/574] sparc: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Acked-by: David S. Miller Link: http://lkml.kernel.org/r/20200418201944.482088-35-dima@arista.com Signed-off-by: Linus Torvalds --- arch/sparc/kernel/process_32.c | 17 ++++++++++++----- arch/sparc/kernel/traps_64.c | 15 +++++++++++---- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 26cca65e9246..0b07de5618e5 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -145,10 +145,12 @@ void show_regs(struct pt_regs *r) } /* - * The show_stack is an external API which we do not use ourselves. + * The show_stack(), show_stack_loglvl() are external APIs which + * we do not use ourselves. * The oops is printed in die_if_kernel. */ -void show_stack(struct task_struct *tsk, unsigned long *_ksp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, + const char *loglvl) { unsigned long pc, fp; unsigned long task_base; @@ -170,11 +172,16 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) break; rw = (struct reg_window32 *) fp; pc = rw->ins[7]; - printk("[%08lx : ", pc); - printk("%pS ] ", (void *) pc); + printk("%s[%08lx : ", loglvl, pc); + printk("%s%pS ] ", loglvl, (void *) pc); fp = rw->ins[6]; } while (++count < 16); - printk("\n"); + printk("%s\n", loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } /* diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 27778b65a965..8715bc93bd9d 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -2452,7 +2453,8 @@ static void user_instruction_dump(unsigned int __user *pc) printk("\n"); } -void show_stack(struct task_struct *tsk, unsigned long *_ksp) +void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, + const char *loglvl) { unsigned long fp, ksp; struct thread_info *tp; @@ -2476,7 +2478,7 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) fp = ksp + STACK_BIAS; - printk("Call Trace:\n"); + printk("%sCall Trace:\n", loglvl); do { struct sparc_stackf *sf; struct pt_regs *regs; @@ -2497,14 +2499,14 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) fp = (unsigned long)sf->fp + STACK_BIAS; } - printk(" [%016lx] %pS\n", pc, (void *) pc); + print_ip_sym(loglvl, pc); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if ((pc + 8UL) == (unsigned long) &return_to_handler) { struct ftrace_ret_stack *ret_stack; ret_stack = ftrace_graph_get_ret_stack(tsk, graph); if (ret_stack) { pc = ret_stack->ret; - printk(" [%016lx] %pS\n", pc, (void *) pc); + print_ip_sym(loglvl, pc); graph++; } } @@ -2512,6 +2514,11 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) } while (++count < 16); } +void show_stack(struct task_struct *tsk, unsigned long *_ksp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); +} + static inline struct reg_window *kernel_stack_up(struct reg_window *rw) { unsigned long fp = rw->ins[6]; From 3dd923f39a03dede001afe0edcc08613d5f403e5 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:42 -0700 Subject: [PATCH 514/574] um/sysrq: remove needless variable sp `sp' is a needless excercise here. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Anton Ivanov Cc: Jeff Dike Cc: Richard Weinberger Link: http://lkml.kernel.org/r/20200418201944.482088-36-dima@arista.com Signed-off-by: Linus Torvalds --- arch/um/kernel/sysrq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index c71b5ef7ea8c..c831a1c2eb94 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -27,7 +27,6 @@ static const struct stacktrace_ops stackops = { void show_stack(struct task_struct *task, unsigned long *stack) { - unsigned long *sp = stack; struct pt_regs *segv_regs = current->thread.segv_regs; int i; @@ -38,10 +37,9 @@ void show_stack(struct task_struct *task, unsigned long *stack) } if (!stack) - sp = get_stack_pointer(task, segv_regs); + stack = get_stack_pointer(task, segv_regs); pr_info("Stack:\n"); - stack = sp; for (i = 0; i < 3 * STACKSLOTS_PER_LINE; i++) { if (kstack_end(stack)) break; From 1ad87824f4cf16a7f381e1f94943a96bb7a99062 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:45 -0700 Subject: [PATCH 515/574] um: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Anton Ivanov Cc: Jeff Dike Cc: Richard Weinberger Link: http://lkml.kernel.org/r/20200418201944.482088-37-dima@arista.com Signed-off-by: Linus Torvalds --- arch/um/kernel/sysrq.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index c831a1c2eb94..1b54b6431499 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -17,7 +17,9 @@ static void _print_addr(void *data, unsigned long address, int reliable) { - pr_info(" [<%08lx>] %s%pS\n", address, reliable ? "" : "? ", + const char *loglvl = data; + + printk("%s [<%08lx>] %s%pS\n", loglvl, address, reliable ? "" : "? ", (void *)address); } @@ -25,7 +27,8 @@ static const struct stacktrace_ops stackops = { .address = _print_addr }; -void show_stack(struct task_struct *task, unsigned long *stack) +void show_stack_loglvl(struct task_struct *task, unsigned long *stack, + const char *loglvl) { struct pt_regs *segv_regs = current->thread.segv_regs; int i; @@ -39,17 +42,22 @@ void show_stack(struct task_struct *task, unsigned long *stack) if (!stack) stack = get_stack_pointer(task, segv_regs); - pr_info("Stack:\n"); + printk("%sStack:\n", loglvl); for (i = 0; i < 3 * STACKSLOTS_PER_LINE; i++) { if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - pr_cont("\n"); + printk("%s\n", loglvl); pr_cont(" %08lx", *stack++); } - pr_cont("\n"); + printk("%s\n", loglvl); - pr_info("Call Trace:\n"); - dump_trace(current, &stackops, NULL); - pr_info("\n"); + printk("%sCall Trace:\n", loglvl); + dump_trace(current, &stackops, (void *)loglvl); + printk("%s\n", loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *stack) +{ + show_stack_loglvl(task, stack, KERN_INFO); } From ee1e99009e914e4150774f5a717c179545c7699e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:48 -0700 Subject: [PATCH 516/574] unicore32: remove unused pmode argument in c_backtrace() The pmode parameter isn't used in assembly - remove it. Second argument will be reused for printk() log level. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Guan Xuetao Link: http://lkml.kernel.org/r/20200418201944.482088-38-dima@arista.com Signed-off-by: Linus Torvalds --- arch/unicore32/kernel/setup.h | 2 +- arch/unicore32/kernel/traps.c | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h index e40d3603c7e7..03e70e37f472 100644 --- a/arch/unicore32/kernel/setup.h +++ b/arch/unicore32/kernel/setup.h @@ -29,7 +29,7 @@ extern void kernel_thread_helper(void); extern void __init early_signal_init(void); extern asmlinkage void __backtrace(void); -extern asmlinkage void c_backtrace(unsigned long fp, int pmode); +extern asmlinkage void c_backtrace(unsigned long fp); extern void __show_regs(struct pt_regs *); diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index e24f67283864..3682a4c5d927 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -137,7 +137,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { - unsigned int fp, mode; + unsigned int fp; int ok = 1; printk(KERN_DEFAULT "Backtrace: "); @@ -145,16 +145,12 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!tsk) tsk = current; - if (regs) { + if (regs) fp = regs->UCreg_fp; - mode = processor_mode(regs); - } else if (tsk != current) { + else if (tsk != current) fp = thread_saved_fp(tsk); - mode = 0x10; - } else { + else asm("mov %0, fp" : "=r" (fp) : : "cc"); - mode = 0x10; - } if (!fp) { printk("no frame pointer"); @@ -167,7 +163,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) printk("\n"); if (ok) - c_backtrace(fp, mode); + c_backtrace(fp); } void show_stack(struct task_struct *tsk, unsigned long *sp) From de985dd50158fedde5f2916c1bcf949a0acf5cd0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:51 -0700 Subject: [PATCH 517/574] unicore32: add loglvl to c_backtrace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level parameter to c_backtrace() as a preparation for introducing show_stack_loglvl() [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Guan Xuetao Link: http://lkml.kernel.org/r/20200418201944.482088-39-dima@arista.com Signed-off-by: Linus Torvalds --- arch/unicore32/kernel/setup.h | 2 +- arch/unicore32/kernel/traps.c | 2 +- arch/unicore32/lib/backtrace.S | 24 ++++++++++++++++-------- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h index 03e70e37f472..967352323185 100644 --- a/arch/unicore32/kernel/setup.h +++ b/arch/unicore32/kernel/setup.h @@ -29,7 +29,7 @@ extern void kernel_thread_helper(void); extern void __init early_signal_init(void); extern asmlinkage void __backtrace(void); -extern asmlinkage void c_backtrace(unsigned long fp); +extern asmlinkage void c_backtrace(unsigned long fp, const char *loglvl); extern void __show_regs(struct pt_regs *); diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 3682a4c5d927..2b7d73456865 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -163,7 +163,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) printk("\n"); if (ok) - c_backtrace(fp); + c_backtrace(fp, KERN_DEFAULT); } void show_stack(struct task_struct *tsk, unsigned long *sp) diff --git a/arch/unicore32/lib/backtrace.S b/arch/unicore32/lib/backtrace.S index f303671e2a4e..6221944b81f3 100644 --- a/arch/unicore32/lib/backtrace.S +++ b/arch/unicore32/lib/backtrace.S @@ -16,6 +16,7 @@ #define sv_fp v5 #define sv_pc v6 #define offset v8 +#define loglvl v9 ENTRY(__backtrace) mov r0, fp @@ -27,10 +28,11 @@ ENTRY(c_backtrace) ENDPROC(__backtrace) ENDPROC(c_backtrace) #else - stm.w (v4 - v8, lr), [sp-] @ Save an extra register + stm.w (v4 - v10, lr), [sp-] @ Save an extra register @ so we have a location... mov.a frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r1 1: stm.w (pc), [sp-] @ calculate offset of PC stored ldw.w r0, [sp]+, #4 @ by stmfd for this CPU @@ -95,9 +97,10 @@ for_each_frame: bua for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame b.l printk -no_frame: ldm.w (v4 - v8, pc), [sp]+ +no_frame: ldm.w (v4 - v10, pc), [sp]+ ENDPROC(__backtrace) ENDPROC(c_backtrace) @@ -128,8 +131,11 @@ ENDPROC(c_backtrace) add v7, v7, #1 cxor.a v7, #6 cmoveq v7, #1 - cmoveq r1, #'\n' - cmovne r1, #' ' + bne 201f + adr r0, .Lcr + mov r1, loglvl + b.l printk +201: ldw.w r3, [stack]+, #-4 mov r2, reg csub.a r2, #8 @@ -141,18 +147,20 @@ ENDPROC(c_backtrace) add r2, r2, #0x10 @ so r2 need add 16 201: adr r0, .Lfp + mov r1, loglvl b.l printk 2: sub.a reg, reg, #1 bns 1b cxor.a v7, #0 beq 201f adr r0, .Lcr + mov r1, loglvl b.l printk 201: ldm.w (instr, reg, stack, v7, pc), [sp]+ -.Lfp: .asciz "%cr%d:%08x" -.Lcr: .asciz "\n" -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lfp: .asciz "%sr%d:%08x " +.Lcr: .asciz "%s\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Ldsi: .word 0x92eec000 >> 14 @ stm.w sp, (... fp, ip, lr, pc) .word 0x92e10000 >> 14 @ stm.w sp, () From 5c0884694f7fd1efc846f720f10d190bf708e461 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:54 -0700 Subject: [PATCH 518/574] unicore32: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). As a nice side-effect - print backtrace in __die() with the same log level as the rest of function. [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Guan Xuetao Link: http://lkml.kernel.org/r/20200418201944.482088-40-dima@arista.com Signed-off-by: Linus Torvalds --- arch/unicore32/kernel/traps.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 2b7d73456865..8b1335997f50 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -135,12 +135,13 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) set_fs(fs); } -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { unsigned int fp; int ok = 1; - printk(KERN_DEFAULT "Backtrace: "); + printk("%sBacktrace: ", loglvl); if (!tsk) tsk = current; @@ -153,23 +154,29 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) asm("mov %0, fp" : "=r" (fp) : : "cc"); if (!fp) { - printk("no frame pointer"); + printk("%sno frame pointer", loglvl); ok = 0; } else if (verify_stack(fp)) { - printk("invalid frame pointer 0x%08x", fp); + printk("%sinvalid frame pointer 0x%08x", loglvl, fp); ok = 0; } else if (fp < (unsigned long)end_of_stack(tsk)) - printk("frame pointer underflow"); - printk("\n"); + printk("%sframe pointer underflow", loglvl); + printk("%s\n", loglvl); if (ok) - c_backtrace(fp, KERN_DEFAULT); + c_backtrace(fp, loglvl); +} + +void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, + const char *loglvl) +{ + dump_backtrace(NULL, tsk, loglvl); + barrier(); } void show_stack(struct task_struct *tsk, unsigned long *sp) { - dump_backtrace(NULL, tsk); - barrier(); + show_stack_loglvl(tsk, sp, KERN_DEFAULT) } static int __die(const char *str, int err, struct thread_info *thread, @@ -196,7 +203,7 @@ static int __die(const char *str, int err, struct thread_info *thread, if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->UCreg_sp, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - dump_backtrace(regs, tsk); + dump_backtrace(regs, tsk, KERN_EMERG); dump_instr(KERN_EMERG, regs); } From d46b3df78ad4b4c178f1035a35463cbc0ce768b2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:57 -0700 Subject: [PATCH 519/574] x86: add missing const qualifiers for log_lvl Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Keep log_lvl const show_trace_log_lvl() and printk_stack_address() as the new generic show_stack_loglvl() wants to have a proper const qualifier. And gcc rightfully produces warnings in case it's not keept: arch/x86/kernel/dumpstack.c: In function `show_stack': arch/x86/kernel/dumpstack.c:294:37: warning: passing argument 4 of `show_trace_log_lv ' discards `const' qualifier from pointer target type [-Wdiscarded-qualifiers] 294 | show_trace_log_lvl(task, NULL, sp, loglvl); | ^~~~~~ arch/x86/kernel/dumpstack.c:163:32: note: expected `char *' but argument is of type `const char *' 163 | unsigned long *stack, char *log_lvl) | ~~~~~~^~~~~~~ [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200418201944.482088-41-dima@arista.com Signed-off-by: Linus Torvalds --- arch/x86/include/asm/stacktrace.h | 2 +- arch/x86/kernel/dumpstack.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 14db05086bbf..5ae5a68e469d 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -87,7 +87,7 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl); + unsigned long *stack, const char *log_lvl); /* The form of the top of the frame on the stack */ struct stack_frame { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae64ec7f752f..b94bc31a1757 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -65,7 +65,7 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info) } static void printk_stack_address(unsigned long address, int reliable, - char *log_lvl) + const char *log_lvl) { touch_nmi_watchdog(); printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); @@ -160,7 +160,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) + unsigned long *stack, const char *log_lvl) { struct unwind_state state; struct stack_info stack_info = {0}; From a832ff02244e36da0bf4bb3a1aec0ce9a23b0bad Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:00 -0700 Subject: [PATCH 520/574] x86: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200418201944.482088-42-dima@arista.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/dumpstack.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b94bc31a1757..4396f2cfad19 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,8 @@ next: } } -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { task = task ? : current; @@ -290,7 +291,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); + show_trace_log_lvl(task, NULL, sp, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) From 47fb70294976cb1ea110f4fc01ae1bc2f450933a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:04 -0700 Subject: [PATCH 521/574] xtensa: add loglvl to show_trace() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Add log level argument to show_trace() as a preparation for introducing show_stack_loglvl(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u [rppt@kernel.org: build fix] Link: http://lkml.kernel.org/r/20200511194534.GA1018386@kernel.org Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Chris Zankel Cc: Max Filippov Cc: Mike Rapoport Link: http://lkml.kernel.org/r/20200418201944.482088-43-dima@arista.com Signed-off-by: Linus Torvalds --- arch/xtensa/kernel/traps.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 0976e27b8d5d..5013b62a9943 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -479,18 +479,22 @@ void show_regs(struct pt_regs * regs) static int show_trace_cb(struct stackframe *frame, void *data) { + const char *loglvl = data; + if (kernel_text_address(frame->pc)) - pr_cont(" [<%08lx>] %pB\n", frame->pc, (void *)frame->pc); + printk("%s [<%08lx>] %pB\n", + loglvl, frame->pc, (void *)frame->pc); return 0; } -void show_trace(struct task_struct *task, unsigned long *sp) +static void show_trace(struct task_struct *task, unsigned long *sp, + const char *loglvl) { if (!sp) sp = stack_pointer(task); - pr_info("Call Trace:\n"); - walk_stackframe(sp, show_trace_cb, NULL); + printk("%sCall Trace:\n", loglvl); + walk_stackframe(sp, show_trace_cb, (void *)loglvl); } #define STACK_DUMP_ENTRY_SIZE 4 @@ -511,7 +515,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) print_hex_dump(KERN_INFO, " ", DUMP_PREFIX_NONE, STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, sp, len, false); - show_trace(task, sp); + show_trace(task, sp, KERN_INFO); } DEFINE_SPINLOCK(die_lock); From 20da1e8bb06d0e3ccd172de3e193c987e88013cb Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:07 -0700 Subject: [PATCH 522/574] xtensa: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Chris Zankel Cc: Max Filippov Link: http://lkml.kernel.org/r/20200418201944.482088-44-dima@arista.com Signed-off-by: Linus Torvalds --- arch/xtensa/kernel/traps.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 5013b62a9943..1013acc2e03e 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -501,7 +501,8 @@ static void show_trace(struct task_struct *task, unsigned long *sp, #define STACK_DUMP_LINE_SIZE 32 static size_t kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { size_t len; @@ -511,11 +512,16 @@ void show_stack(struct task_struct *task, unsigned long *sp) len = min((-(size_t)sp) & (THREAD_SIZE - STACK_DUMP_ENTRY_SIZE), kstack_depth_to_print * STACK_DUMP_ENTRY_SIZE); - pr_info("Stack:\n"); - print_hex_dump(KERN_INFO, " ", DUMP_PREFIX_NONE, + printk("%sStack:\n", loglvl); + print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE, STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, sp, len, false); - show_trace(task, sp, KERN_INFO); + show_trace(task, sp, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_INFO); } DEFINE_SPINLOCK(die_lock); From ab34b46d1a74e98996e67a7da7e5d683ecfd9f57 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:10 -0700 Subject: [PATCH 523/574] sysrq: use show_stack_loglvl() Show the stack trace on a CPU with the same log level as "CPU%d" header. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: Jiri Slaby Link: http://lkml.kernel.org/r/20200418201944.482088-45-dima@arista.com Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 2 +- include/linux/sched/debug.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 477cdc1e9cf3..7bd935379dec 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -235,7 +235,7 @@ static void showacpu(void *dummy) raw_spin_lock_irqsave(&show_lock, flags); pr_info("CPU%d:\n", smp_processor_id()); - show_stack(NULL, NULL); + show_stack_loglvl(NULL, NULL, KERN_INFO); raw_spin_unlock_irqrestore(&show_lock, flags); } diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index 95fb9e025247..373e4e3faf2a 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -31,6 +31,8 @@ extern void show_regs(struct pt_regs *); * trace (or NULL if the entire call-chain of the task should be shown). */ extern void show_stack(struct task_struct *task, unsigned long *sp); +extern void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl); extern void sched_show_task(struct task_struct *p); From 9ed5b01a36a0e40a7450b8a5caf82e0552c41bb3 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:13 -0700 Subject: [PATCH 524/574] x86/amd_gart: print stacktrace for a leak with KERN_ERR It's under CONFIG_IOMMU_LEAK option which is enabled by debug config. Likely the backtrace is worth to be seen - so aligning with log level of error message in iommu_full(). Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200418201944.482088-46-dima@arista.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_gart_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 16133819415c..9d2c076be37a 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -159,7 +159,7 @@ static void dump_leak(void) return; dump = 1; - show_stack(NULL, NULL); + show_stack_loglvl(NULL, NULL, KERN_ERR); debug_dma_dump_mappings(NULL); } #endif From 3f0543780e09d6f475f043f6ce0824106e610fd4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:16 -0700 Subject: [PATCH 525/574] power: use show_stack_loglvl() Aligning with other watchdog messages just before panic - use KERN_EMERG. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Acked-by: Rafael J. Wysocki Cc: Greg Kroah-Hartman Cc: Len Brown Cc: Pavel Machek Link: http://lkml.kernel.org/r/20200418201944.482088-47-dima@arista.com Signed-off-by: Linus Torvalds --- drivers/base/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index bb98b813554f..be5af89bbff1 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -519,7 +519,7 @@ static void dpm_watchdog_handler(struct timer_list *t) struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); - show_stack(wd->tsk, NULL); + show_stack_loglvl(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } From 77819daf247aad16beaeb537ae77d1d6d0697ca2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:19 -0700 Subject: [PATCH 526/574] kdb: don't play with console_loglevel Print the stack trace with KERN_EMERG - it should be always visible. Playing with console_loglevel is a bad idea as there may be more messages printed than wanted. Also the stack trace might be not printed at all if printk() was deferred and console_loglevel was raised back before the trace got flushed. Unfortunately, after rebasing on commit 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master"), kdb_show_stack() uses now kdb_dump_stack_on_cpu(), which for now won't be converted as it uses dump_stack() instead of show_stack(). Convert for now the branch that uses show_stack() and remove console_loglevel exercise from that case. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Reviewed-by: Douglas Anderson Acked-by: Daniel Thompson Cc: Jason Wessel Link: http://lkml.kernel.org/r/20200418201944.482088-48-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_bt.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 3de0cc780c16..43f5dcd2b9ac 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,17 +21,18 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { - int old_lvl = console_loglevel; - - console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - if (!addr && kdb_task_has_cpu(p)) - kdb_dump_stack_on_cpu(kdb_process_cpu(p)); - else - show_stack(p, addr); + if (!addr && kdb_task_has_cpu(p)) { + int old_lvl = console_loglevel; + + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; + kdb_dump_stack_on_cpu(kdb_process_cpu(p)); + console_loglevel = old_lvl; + } else { + show_stack_loglvl(p, addr, KERN_EMERG); + } - console_loglevel = old_lvl; kdb_trap_printk--; } From 8ba09b1dc131ff9bb530967c593e298c600f72c0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:23 -0700 Subject: [PATCH 527/574] sched: print stack trace with KERN_INFO Aligning with other messages printed in sched_show_task() - use KERN_INFO to print the backtrace. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Ben Segall Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20200418201944.482088-49-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c06da3c3e317..c68a6e7b306f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack(p, NULL); + show_stack_loglvl(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); From fe1993a001094a596576334c56e7a10e4cd69e6c Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:26 -0700 Subject: [PATCH 528/574] kernel: use show_stack_loglvl() Align the last users of show_stack() by KERN_DEFAULT as the surrounding headers/messages. Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Link: http://lkml.kernel.org/r/20200418201944.482088-50-dima@arista.com Signed-off-by: Linus Torvalds --- kernel/locking/rtmutex-debug.c | 2 +- lib/dump_stack.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index fd4fe1f5b458..5e63d6e8a223 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack(task, NULL); + show_stack_loglvl(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 33ffbf308853..5595e8962cf6 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -74,7 +74,7 @@ void show_regs_print_info(const char *log_lvl) static void __dump_stack(void) { dump_stack_print_info(KERN_DEFAULT); - show_stack(NULL, NULL); + show_stack_loglvl(NULL, NULL, KERN_DEFAULT); } /** From 9cb8f069deeed708bf19486d5893e297dc467ae0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:29 -0700 Subject: [PATCH 529/574] kernel: rename show_stack_loglvl() => show_stack() Now the last users of show_stack() got converted to use an explicit log level, show_stack_loglvl() can drop it's redundant suffix and become once again well known show_stack(). Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200418201944.482088-51-dima@arista.com Signed-off-by: Linus Torvalds --- arch/alpha/kernel/traps.c | 8 +------- arch/arc/kernel/stacktrace.c | 8 +------- arch/arm/kernel/traps.c | 8 +------- arch/arm64/kernel/traps.c | 8 +------- arch/c6x/kernel/traps.c | 7 +------ arch/csky/kernel/ptrace.c | 4 ++-- arch/csky/kernel/stacktrace.c | 9 +-------- arch/h8300/kernel/traps.c | 8 +------- arch/hexagon/kernel/traps.c | 8 +------- arch/ia64/kernel/mca.c | 2 +- arch/ia64/kernel/process.c | 11 ++--------- arch/m68k/kernel/traps.c | 11 +++-------- arch/microblaze/kernel/traps.c | 8 +------- arch/mips/kernel/traps.c | 8 +------- arch/nds32/kernel/traps.c | 8 +------- arch/nios2/kernel/traps.c | 12 +++--------- arch/openrisc/kernel/traps.c | 10 ++-------- arch/parisc/kernel/traps.c | 8 +------- arch/powerpc/kernel/process.c | 11 +++-------- arch/powerpc/kernel/stacktrace.c | 2 +- arch/riscv/kernel/stacktrace.c | 8 +------- arch/s390/kernel/dumpstack.c | 9 ++------- arch/sh/kernel/dumpstack.c | 8 +------- arch/sparc/kernel/process_32.c | 11 ++--------- arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/traps_64.c | 8 +------- arch/um/drivers/mconsole_kern.c | 2 +- arch/um/kernel/sysrq.c | 7 +------ arch/unicore32/kernel/traps.c | 7 +------ arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/dumpstack.c | 7 +------ arch/xtensa/kernel/traps.c | 10 ++-------- drivers/base/power/main.c | 2 +- drivers/tty/sysrq.c | 2 +- include/linux/sched/debug.h | 5 ++--- kernel/debug/kdb/kdb_bt.c | 2 +- kernel/locking/rtmutex-debug.c | 2 +- kernel/sched/core.c | 2 +- lib/dump_stack.c | 2 +- 39 files changed, 52 insertions(+), 205 deletions(-) diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 2402f1777f54..8383ccfaccdc 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -144,8 +144,7 @@ dik_show_trace(unsigned long *sp, const char *loglvl) static int kstack_depth_to_print = 24; -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { unsigned long *stack; int i; @@ -174,11 +173,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, dik_show_trace(sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) { diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 24f9cd8a12c9..feba91c9d969 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -228,17 +228,11 @@ noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, EXPORT_SYMBOL(show_stacktrace); /* Expected by sched Code */ -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { show_stacktrace(tsk, NULL, loglvl); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - /* Another API expected by schedular, shows up in "ps" as Wait Channel * Of course just returning schedule( ) would be pointless so unwind until * the function is not in schedular code diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 2ae2c23799ad..65a3b1e75480 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -247,18 +247,12 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, } #endif -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 3621868b2fcc..3fd9e2731e0d 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -137,18 +137,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, put_task_stack(tsk); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT); -} - #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) diff --git a/arch/c6x/kernel/traps.c b/arch/c6x/kernel/traps.c index 4afbf48f1ce0..2b9121c755be 100644 --- a/arch/c6x/kernel/traps.c +++ b/arch/c6x/kernel/traps.c @@ -374,7 +374,7 @@ static void show_trace(unsigned long *stack, unsigned long *endstack, printk("%s\n", loglvl); } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { unsigned long *p, *endstack; @@ -403,11 +403,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, show_trace(stack, endstack, loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_DEBUG); -} - int is_valid_bugaddr(unsigned long addr) { return __kernel_text_address(addr); diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c index 5a82230bddf9..bbd801f86eb5 100644 --- a/arch/csky/kernel/ptrace.c +++ b/arch/csky/kernel/ptrace.c @@ -344,7 +344,7 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs) trace_sys_exit(regs, syscall_get_return_value(current, regs)); } -extern void show_stack(struct task_struct *task, unsigned long *stack); +extern void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl); void show_regs(struct pt_regs *fp) { unsigned long *sp; @@ -420,6 +420,6 @@ void show_regs(struct pt_regs *fp) } pr_cont("\n"); - show_stack(NULL, (unsigned long *)fp->regs[4]); + show_stack(NULL, (unsigned long *)fp->regs[4], KERN_INFO); return; } diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c index ca135f13cc13..16ae20a0af34 100644 --- a/arch/csky/kernel/stacktrace.c +++ b/arch/csky/kernel/stacktrace.c @@ -95,19 +95,12 @@ static bool print_trace_address(unsigned long pc, void *arg) return false; } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { pr_cont("Call Trace:\n"); walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - pr_cont("Call Trace:\n"); - walk_stackframe(task, NULL, print_trace_address, KERN_INFO); -} - static bool save_wchan(unsigned long pc, void *arg) { if (!in_sched_functions(pc)) { diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index 6362446563d6..5d8b969cd8f3 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -115,8 +115,7 @@ void die(const char *str, struct pt_regs *fp, unsigned long err) static int kstack_depth_to_print = 24; -void show_stack_loglvl(struct task_struct *task, unsigned long *esp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *esp, const char *loglvl) { unsigned long *stack, addr; int i; @@ -158,8 +157,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *esp, } printk("%s\n", loglvl); } - -void show_stack(struct task_struct *task, unsigned long *esp) -{ - show_stack_loglvl(task, esp, KERN_INFO); -} diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index a8a3a210d781..904134b37232 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -175,18 +175,12 @@ static void do_show_stack(struct task_struct *task, unsigned long *fp, } } -void show_stack_loglvl(struct task_struct *task, unsigned long *fp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *fp, const char *loglvl) { /* Saved link reg is one word above FP */ do_show_stack(task, fp, 0, loglvl); } -void show_stack(struct task_struct *task, unsigned long *fp) -{ - show_stack_loglvl(task, fp, 0, KERN_INFO); -} - int die(const char *str, struct pt_regs *regs, long err) { static struct { diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6fb54dfa1350..2703f7795672 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1631,7 +1631,7 @@ default_monarch_init_process(struct notifier_block *self, unsigned long val, voi if (read_trylock(&tasklist_lock)) { do_each_thread (g, t) { printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); - show_stack(t, NULL); + show_stack(t, NULL, KERN_DEFAULT); } while_each_thread (g, t); read_unlock(&tasklist_lock); } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 913d9a01cbf9..96dfb9e4b16f 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -85,8 +85,7 @@ ia64_do_show_stack (struct unw_frame_info *info, void *arg) } void -show_stack_loglvl (struct task_struct *task, unsigned long *sp, - const char *loglvl) +show_stack (struct task_struct *task, unsigned long *sp, const char *loglvl) { if (!task) unw_init_running(ia64_do_show_stack, (void *)loglvl); @@ -98,12 +97,6 @@ show_stack_loglvl (struct task_struct *task, unsigned long *sp, } } -void -show_stack (struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void show_regs (struct pt_regs *regs) { @@ -158,7 +151,7 @@ show_regs (struct pt_regs *regs) ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); } } else - show_stack(NULL, NULL); + show_stack(NULL, NULL, KERN_DEFAULT); } /* local support for deprecated console_print */ diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index ffcc5ec4fac3..df6fc782754f 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -916,7 +916,7 @@ void show_registers(struct pt_regs *regs) default: pr_cont("\n"); } - show_stack(NULL, (unsigned long *)addr); + show_stack(NULL, (unsigned long *)addr, KERN_INFO); pr_info("Code:"); set_fs(KERNEL_DS); @@ -935,8 +935,8 @@ void show_registers(struct pt_regs *regs) pr_cont("\n"); } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *p; unsigned long *endstack; @@ -963,11 +963,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, show_trace(stack, loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_INFO); -} - /* * The vector number returned in the frame pointer may also contain * the "fs" (Fault Status) bits on ColdFire. These are in the bottom diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c index 149ae534937e..94b6fe93147d 100644 --- a/arch/microblaze/kernel/traps.c +++ b/arch/microblaze/kernel/traps.c @@ -31,8 +31,7 @@ static int __init kstack_setup(char *s) } __setup("kstack=", kstack_setup); -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { unsigned long words_to_show; u32 fp = (u32) sp; @@ -77,8 +76,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, debug_show_held_locks(task); } - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_INFO); -} diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index e49040739b61..320797bd03f6 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -198,8 +198,7 @@ static void show_stacktrace(struct task_struct *task, show_backtrace(task, regs, loglvl); } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { struct pt_regs regs; mm_segment_t old_fs = get_fs(); @@ -227,11 +226,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, set_fs(old_fs); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT) -} - static void show_code(unsigned int __user *pc) { long i; diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index 90f12582c218..6a9772ba7392 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -135,8 +135,7 @@ static void __dump(struct task_struct *tsk, unsigned long *base_reg, printk("%s\n", loglvl); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { unsigned long *base_reg; @@ -157,11 +156,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_EMERG); -} - DEFINE_SPINLOCK(die_lock); /* diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 08071caa9b36..b172da4eb1a9 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -52,14 +52,13 @@ void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr) } /* - * The show_stack(), show_stack_loglvl() are external API - * which we do not use ourselves. + * The show_stack() is external API which we do not use ourselves. */ int kstack_depth_to_print = 48; -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *stack, + const char *loglvl) { unsigned long *endstack, addr; int i; @@ -106,11 +105,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, printk("%s\n", loglvl); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_EMERG); -} - void __init trap_init(void) { /* Nothing to do here */ diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 3b7978a22d68..3022b0ad142c 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -48,8 +48,7 @@ void print_trace(void *data, unsigned long addr, int reliable) } /* displays a short stack trace */ -void show_stack_loglvl(struct task_struct *task, unsigned long *esp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *esp, const char *loglvl) { if (esp == NULL) esp = (unsigned long *)&esp; @@ -58,11 +57,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *esp, unwind_stack((void *)loglvl, esp, print_trace); } -void show_stack(struct task_struct *task, unsigned long *esp) -{ - show_stack_loglvl(task, esp, KERN_EMERG); -} - void show_registers(struct pt_regs *regs) { int i; @@ -104,7 +98,7 @@ void show_registers(struct pt_regs *regs) if (in_kernel) { printk("\nStack: "); - show_stack(NULL, (unsigned long *)esp); + show_stack(NULL, (unsigned long *)esp, KERN_EMERG); printk("\nCode: "); if (regs->pc < PAGE_OFFSET) diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index c2411de3730f..0a89899f154a 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -198,17 +198,11 @@ static void parisc_show_stack(struct task_struct *task, do_show_stack(&info, loglvl); } -void show_stack_loglvl(struct task_struct *t, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *t, unsigned long *sp, const char *loglvl) { parisc_show_stack(t, NULL, loglvl); } -void show_stack(struct task_struct *t, unsigned long *sp) -{ - show_stack_loglvl(t, sp, KERN_CRIT) -} - int is_valid_bugaddr(unsigned long iaoq) { return 1; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a456b4454b3f..99d619f81cb5 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1456,7 +1456,7 @@ void show_regs(struct pt_regs * regs) printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); #endif - show_stack(current, (unsigned long *) regs->gpr[1]); + show_stack(current, (unsigned long *) regs->gpr[1], KERN_DEFAULT); if (!user_mode(regs)) show_instructions(regs); } @@ -2063,8 +2063,8 @@ unsigned long get_wchan(struct task_struct *p) static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack_loglvl(struct task_struct *tsk, unsigned long *stack, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *stack, + const char *loglvl) { unsigned long sp, ip, lr, newsp; int count = 0; @@ -2133,11 +2133,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *stack, put_task_stack(tsk); } -void show_stack(struct task_struct *tsk, unsigned long *stack) -{ - show_stack_loglvl(tsk, stack, KERN_DEFAULT); -} - #ifdef CONFIG_PPC64 /* Called with hard IRQs off */ void notrace __ppc64_runlatch_on(void) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index c477b8585a29..b6440657ef92 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -260,7 +260,7 @@ static void raise_backtrace_ipi(cpumask_t *mask) pr_cont(" current pointer corrupt? (%px)\n", p->__current); pr_warn("Back trace of paca->saved_r1 (0x%016llx) (possibly stale):\n", p->saved_r1); - show_stack(p->__current, (unsigned long *)p->saved_r1); + show_stack(p->__current, (unsigned long *)p->saved_r1, KERN_WARNING); } } diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index aaa64bf007f8..595342910c3f 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -105,18 +105,12 @@ static bool print_trace_address(unsigned long pc, void *arg) return false; } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { pr_cont("Call Trace:\n"); walk_stackframe(task, NULL, print_trace_address, (void *)loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - static bool save_wchan(unsigned long pc, void *arg) { if (!in_sched_functions(pc)) { diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 887a054919fc..0dc4b258b98d 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -126,7 +126,7 @@ unknown: return -EINVAL; } -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { struct unwind_state state; @@ -139,11 +139,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, debug_show_held_locks(task ? : current); } -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_DEFAULT); -} - static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); @@ -181,7 +176,7 @@ void show_regs(struct pt_regs *regs) show_registers(regs); /* Show stack backtrace if pt_regs is from kernel mode */ if (!user_mode(regs)) - show_stack(NULL, (unsigned long *) regs->gprs[15]); + show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT); show_last_breaking_event(regs); } diff --git a/arch/sh/kernel/dumpstack.c b/arch/sh/kernel/dumpstack.c index cc51e9d74667..a13c045804ed 100644 --- a/arch/sh/kernel/dumpstack.c +++ b/arch/sh/kernel/dumpstack.c @@ -144,8 +144,7 @@ void show_trace(struct task_struct *tsk, unsigned long *sp, debug_show_held_locks(tsk); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { unsigned long stack; @@ -161,8 +160,3 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, (unsigned long)task_stack_page(tsk)); show_trace(tsk, sp, NULL, loglvl); } - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 0b07de5618e5..65c0d5207b0c 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -145,12 +145,10 @@ void show_regs(struct pt_regs *r) } /* - * The show_stack(), show_stack_loglvl() are external APIs which - * we do not use ourselves. + * The show_stack() is external API which we do not use ourselves. * The oops is printed in die_if_kernel. */ -void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *_ksp, const char *loglvl) { unsigned long pc, fp; unsigned long task_base; @@ -179,11 +177,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, printk("%s\n", loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - /* * Free current thread data structures etc.. */ diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 423011e60982..5a4d9c8f8903 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -195,7 +195,7 @@ void show_regs(struct pt_regs *regs) regs->u_regs[15]); printk("RPC: <%pS>\n", (void *) regs->u_regs[15]); show_regwindow(regs); - show_stack(current, (unsigned long *) regs->u_regs[UREG_FP]); + show_stack(current, (unsigned long *)regs->u_regs[UREG_FP], KERN_DEFAULT); } union global_cpu_snapshot global_cpu_snapshot[NR_CPUS]; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 8715bc93bd9d..96d92f107551 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2453,8 +2453,7 @@ static void user_instruction_dump(unsigned int __user *pc) printk("\n"); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, - const char *loglvl) +void show_stack(struct task_struct *tsk, unsigned long *_ksp, const char *loglvl) { unsigned long fp, ksp; struct thread_info *tp; @@ -2514,11 +2513,6 @@ void show_stack_loglvl(struct task_struct *tsk, unsigned long *_ksp, } while (++count < 16); } -void show_stack(struct task_struct *tsk, unsigned long *_ksp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - static inline struct reg_window *kernel_stack_up(struct reg_window *rw) { unsigned long fp = rw->ins[6]; diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 30575bd92975..a2e680f7d39f 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -648,7 +648,7 @@ static void stack_proc(void *arg) { struct task_struct *task = arg; - show_stack(task, NULL); + show_stack(task, NULL, KERN_INFO); } /* diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 1b54b6431499..acbc879d2773 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -27,7 +27,7 @@ static const struct stacktrace_ops stackops = { .address = _print_addr }; -void show_stack_loglvl(struct task_struct *task, unsigned long *stack, +void show_stack(struct task_struct *task, unsigned long *stack, const char *loglvl) { struct pt_regs *segv_regs = current->thread.segv_regs; @@ -56,8 +56,3 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *stack, dump_trace(current, &stackops, (void *)loglvl); printk("%s\n", loglvl); } - -void show_stack(struct task_struct *task, unsigned long *stack) -{ - show_stack_loglvl(task, stack, KERN_INFO); -} diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 8b1335997f50..a3ac01df1a2e 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -167,18 +167,13 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, c_backtrace(fp, loglvl); } -void show_stack_loglvl(struct task_struct *tsk, unsigned long *sp, +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { dump_backtrace(NULL, tsk, loglvl); barrier(); } -void show_stack(struct task_struct *tsk, unsigned long *sp) -{ - show_stack_loglvl(tsk, sp, KERN_DEFAULT) -} - static int __die(const char *str, int err, struct thread_info *thread, struct pt_regs *regs) { diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 9d2c076be37a..5f816861f5d2 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -159,7 +159,7 @@ static void dump_leak(void) return; dump = 1; - show_stack_loglvl(NULL, NULL, KERN_ERR); + show_stack(NULL, NULL, KERN_ERR); debug_dma_dump_mappings(NULL); } #endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4396f2cfad19..456511b2284e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,7 @@ next: } } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { task = task ? : current; @@ -294,11 +294,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, show_trace_log_lvl(task, NULL, sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void show_stack_regs(struct pt_regs *regs) { show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 1013acc2e03e..e880460741d2 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -501,8 +501,7 @@ static void show_trace(struct task_struct *task, unsigned long *sp, #define STACK_DUMP_LINE_SIZE 32 static size_t kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { size_t len; @@ -519,11 +518,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, show_trace(task, sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_INFO); -} - DEFINE_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * regs, long err) @@ -540,7 +534,7 @@ void die(const char * str, struct pt_regs * regs, long err) pr_info("%s: sig: %ld [#%d]%s\n", str, err, ++die_counter, pr); show_regs(regs); if (!user_mode(regs)) - show_stack(NULL, (unsigned long*)regs->areg[1]); + show_stack(NULL, (unsigned long *)regs->areg[1], KERN_INFO); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irq(&die_lock); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index be5af89bbff1..9dd85bea4026 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -519,7 +519,7 @@ static void dpm_watchdog_handler(struct timer_list *t) struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); - show_stack_loglvl(wd->tsk, NULL, KERN_EMERG); + show_stack(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 7bd935379dec..7c95afa905a0 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -235,7 +235,7 @@ static void showacpu(void *dummy) raw_spin_lock_irqsave(&show_lock, flags); pr_info("CPU%d:\n", smp_processor_id()); - show_stack_loglvl(NULL, NULL, KERN_INFO); + show_stack(NULL, NULL, KERN_INFO); raw_spin_unlock_irqrestore(&show_lock, flags); } diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index 373e4e3faf2a..00c45a0e6abe 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -30,9 +30,8 @@ extern void show_regs(struct pt_regs *); * task), SP is the stack pointer of the first frame that should be shown in the back * trace (or NULL if the entire call-chain of the task should be shown). */ -extern void show_stack(struct task_struct *task, unsigned long *sp); -extern void show_stack_loglvl(struct task_struct *task, unsigned long *sp, - const char *loglvl); +extern void show_stack(struct task_struct *task, unsigned long *sp, + const char *loglvl); extern void sched_show_task(struct task_struct *p); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 43f5dcd2b9ac..18e03aba2cfc 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -30,7 +30,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) kdb_dump_stack_on_cpu(kdb_process_cpu(p)); console_loglevel = old_lvl; } else { - show_stack_loglvl(p, addr, KERN_EMERG); + show_stack(p, addr, KERN_EMERG); } kdb_trap_printk--; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 5e63d6e8a223..36e69100e8e0 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack_loglvl(task, NULL, KERN_DEFAULT); + show_stack(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c68a6e7b306f..8f360326861e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6025,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack_loglvl(p, NULL, KERN_INFO); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 5595e8962cf6..a00ee6eedc7c 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -74,7 +74,7 @@ void show_regs_print_info(const char *log_lvl) static void __dump_stack(void) { dump_stack_print_info(KERN_DEFAULT); - show_stack_loglvl(NULL, NULL, KERN_DEFAULT); + show_stack(NULL, NULL, KERN_DEFAULT); } /** From e31cf2f4ca422ac9b14ecc4a1295b8977a20f812 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:33 -0700 Subject: [PATCH 530/574] mm: don't include asm/pgtable.h if linux/mm.h is already included Patch series "mm: consolidate definitions of page table accessors", v2. The low level page table accessors (pXY_index(), pXY_offset()) are duplicated across all architectures and sometimes more than once. For instance, we have 31 definition of pgd_offset() for 25 supported architectures. Most of these definitions are actually identical and typically it boils down to, e.g. static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } These definitions can be shared among 90% of the arches provided XYZ_SHIFT, PTRS_PER_XYZ and xyz_page_vaddr() are defined. For architectures that really need a custom version there is always possibility to override the generic version with the usual ifdefs magic. These patches introduce include/linux/pgtable.h that replaces include/asm-generic/pgtable.h and add the definitions of the page table accessors to the new header. This patch (of 12): The linux/mm.h header includes to allow inlining of the functions involving page table manipulations, e.g. pte_alloc() and pmd_alloc(). So, there is no point to explicitly include in the files that include . The include statements in such cases are remove with a simple loop: for f in $(git grep -l "include ") ; do sed -i -e '/include / d' $f done Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200514170327.31389-2-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/boot/bootp.c | 1 - arch/alpha/boot/bootpz.c | 1 - arch/alpha/boot/main.c | 1 - arch/alpha/include/asm/io.h | 1 - arch/alpha/kernel/process.c | 1 - arch/alpha/kernel/ptrace.c | 1 - arch/alpha/kernel/setup.c | 1 - arch/alpha/kernel/smp.c | 1 - arch/alpha/kernel/sys_alcor.c | 1 - arch/alpha/kernel/sys_cabriolet.c | 1 - arch/alpha/kernel/sys_dp264.c | 1 - arch/alpha/kernel/sys_eb64p.c | 1 - arch/alpha/kernel/sys_eiger.c | 1 - arch/alpha/kernel/sys_jensen.c | 1 - arch/alpha/kernel/sys_marvel.c | 1 - arch/alpha/kernel/sys_miata.c | 1 - arch/alpha/kernel/sys_mikasa.c | 1 - arch/alpha/kernel/sys_nautilus.c | 1 - arch/alpha/kernel/sys_noritake.c | 1 - arch/alpha/kernel/sys_rawhide.c | 1 - arch/alpha/kernel/sys_ruffian.c | 1 - arch/alpha/kernel/sys_rx164.c | 1 - arch/alpha/kernel/sys_sable.c | 1 - arch/alpha/kernel/sys_sio.c | 1 - arch/alpha/kernel/sys_sx164.c | 1 - arch/alpha/kernel/sys_takara.c | 1 - arch/alpha/kernel/sys_titan.c | 1 - arch/alpha/kernel/sys_wildfire.c | 1 - arch/alpha/mm/init.c | 1 - arch/arm/kernel/machine_kexec.c | 1 - arch/arm/kernel/module.c | 1 - arch/arm/kernel/ptrace.c | 1 - arch/arm/kernel/smp.c | 1 - arch/arm/mach-ebsa110/core.c | 1 - arch/arm/mach-footbridge/common.c | 1 - arch/arm/mach-imx/mm-imx21.c | 1 - arch/arm/mach-imx/mm-imx27.c | 1 - arch/arm/mach-imx/mm-imx3.c | 1 - arch/arm/mach-iop32x/i2c.c | 1 - arch/arm/mach-iop32x/iq31244.c | 1 - arch/arm/mach-iop32x/iq80321.c | 1 - arch/arm/mach-iop32x/n2100.c | 1 - arch/arm/mach-ixp4xx/common.c | 1 - arch/arm/mach-sa1100/assabet.c | 1 - arch/arm/mm/copypage-v4mc.c | 1 - arch/arm/mm/copypage-v6.c | 1 - arch/arm/mm/copypage-xscale.c | 1 - arch/arm/mm/dump.c | 1 - arch/arm/mm/fault-armv.c | 1 - arch/arm/mm/fault.c | 1 - arch/arm/mm/pageattr.c | 1 - arch/arm64/kernel/hibernate.c | 1 - arch/arm64/kernel/ptrace.c | 1 - arch/arm64/kernel/smp.c | 1 - arch/arm64/mm/dump.c | 1 - arch/arm64/mm/fault.c | 1 - arch/arm64/mm/kasan_init.c | 1 - arch/arm64/mm/pageattr.c | 1 - arch/csky/kernel/module.c | 1 - arch/csky/kernel/ptrace.c | 1 - arch/csky/mm/init.c | 1 - arch/csky/mm/tlb.c | 1 - arch/h8300/kernel/process.c | 1 - arch/h8300/kernel/setup.c | 1 - arch/h8300/kernel/signal.c | 1 - arch/h8300/mm/fault.c | 1 - arch/h8300/mm/init.c | 1 - arch/h8300/mm/memory.c | 1 - arch/hexagon/mm/vm_fault.c | 1 - arch/ia64/kernel/efi.c | 1 - arch/ia64/kernel/ptrace.c | 1 - arch/ia64/kernel/smp.c | 1 - arch/ia64/kernel/smpboot.c | 1 - arch/ia64/mm/contig.c | 1 - arch/ia64/mm/fault.c | 1 - arch/m68k/68000/timers.c | 1 - arch/m68k/amiga/config.c | 1 - arch/m68k/apollo/config.c | 1 - arch/m68k/atari/atasound.c | 1 - arch/m68k/atari/stram.c | 1 - arch/m68k/bvme6000/config.c | 1 - arch/m68k/kernel/process.c | 1 - arch/m68k/kernel/ptrace.c | 1 - arch/m68k/kernel/setup_no.c | 1 - arch/m68k/kernel/signal.c | 1 - arch/m68k/kernel/uboot.c | 1 - arch/m68k/mac/config.c | 1 - arch/m68k/mm/mcfmmu.c | 1 - arch/m68k/mm/sun3kmap.c | 1 - arch/m68k/mm/sun3mmu.c | 1 - arch/m68k/mvme147/config.c | 1 - arch/m68k/mvme16x/config.c | 1 - arch/m68k/q40/config.c | 1 - arch/m68k/sun3/config.c | 1 - arch/m68k/sun3/dvma.c | 1 - arch/m68k/sun3/mmu_emu.c | 1 - arch/m68k/sun3/sun3dvma.c | 1 - arch/m68k/sun3x/dvma.c | 1 - arch/m68k/sun3x/prom.c | 1 - arch/microblaze/kernel/signal.c | 1 - arch/microblaze/mm/fault.c | 1 - arch/mips/fw/arc/memory.c | 1 - arch/mips/include/asm/mach-generic/floppy.h | 1 - arch/mips/include/asm/mach-jazz/floppy.h | 1 - arch/mips/jazz/jazzdma.c | 1 - arch/mips/kernel/module.c | 1 - arch/mips/kernel/process.c | 1 - arch/mips/kernel/ptrace.c | 1 - arch/mips/kernel/ptrace32.c | 1 - arch/mips/kernel/smp-bmips.c | 1 - arch/mips/kernel/traps.c | 1 - arch/mips/kvm/tlb.c | 1 - arch/mips/lib/dump_tlb.c | 1 - arch/mips/lib/r3k_dump_tlb.c | 1 - arch/mips/mm/c-octeon.c | 1 - arch/mips/mm/c-r3k.c | 1 - arch/mips/mm/c-r4k.c | 1 - arch/mips/mm/c-tx39.c | 1 - arch/mips/mm/init.c | 1 - arch/mips/mm/page.c | 1 - arch/mips/mm/pgtable-32.c | 1 - arch/mips/mm/pgtable-64.c | 1 - arch/mips/mm/sc-ip22.c | 1 - arch/mips/mm/sc-mips.c | 1 - arch/mips/mm/sc-r5k.c | 1 - arch/mips/mm/tlb-r3k.c | 1 - arch/mips/mm/tlb-r4k.c | 1 - arch/mips/sgi-ip27/ip27-init.c | 1 - arch/mips/sgi-ip27/ip27-timer.c | 1 - arch/mips/sgi-ip32/ip32-memory.c | 1 - arch/nds32/mm/fault.c | 1 - arch/nds32/mm/proc.c | 1 - arch/nios2/kernel/module.c | 1 - arch/nios2/mm/init.c | 1 - arch/nios2/mm/pgtable.c | 1 - arch/nios2/mm/tlb.c | 1 - arch/openrisc/include/asm/tlbflush.h | 1 - arch/openrisc/kernel/asm-offsets.c | 1 - arch/openrisc/kernel/process.c | 1 - arch/openrisc/kernel/ptrace.c | 1 - arch/openrisc/kernel/setup.c | 1 - arch/openrisc/kernel/traps.c | 1 - arch/openrisc/mm/init.c | 1 - arch/openrisc/mm/tlb.c | 1 - arch/parisc/include/asm/mmu_context.h | 1 - arch/parisc/kernel/module.c | 1 - arch/parisc/kernel/ptrace.c | 1 - arch/parisc/kernel/smp.c | 1 - arch/parisc/mm/init.c | 1 - arch/powerpc/include/asm/io.h | 1 - arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kernel/process.c | 1 - arch/powerpc/kernel/signal_32.c | 1 - arch/powerpc/kernel/signal_64.c | 1 - arch/powerpc/kernel/traps.c | 1 - arch/powerpc/kernel/vdso.c | 1 - arch/powerpc/lib/code-patching.c | 1 - arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/hash_pgtable.c | 1 - arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/radix_pgtable.c | 1 - arch/powerpc/mm/fault.c | 1 - arch/powerpc/mm/hugetlbpage.c | 1 - arch/powerpc/mm/init_32.c | 1 - arch/powerpc/mm/init_64.c | 1 - arch/powerpc/mm/mem.c | 1 - arch/powerpc/mm/nohash/40x.c | 1 - arch/powerpc/mm/nohash/fsl_booke.c | 1 - arch/powerpc/mm/pgtable_32.c | 1 - arch/powerpc/mm/pgtable_64.c | 1 - arch/powerpc/mm/ptdump/hashpagetable.c | 1 - arch/powerpc/mm/ptdump/ptdump.c | 1 - arch/powerpc/perf/callchain.c | 1 - arch/powerpc/perf/callchain_32.c | 1 - arch/powerpc/perf/callchain_64.c | 1 - arch/powerpc/platforms/8xx/cpm1.c | 1 - arch/powerpc/platforms/8xx/micropatch.c | 1 - arch/powerpc/platforms/cell/setup.c | 1 - arch/powerpc/platforms/chrp/setup.c | 1 - arch/powerpc/platforms/maple/setup.c | 1 - arch/powerpc/platforms/maple/time.c | 1 - arch/powerpc/platforms/powermac/setup.c | 1 - arch/powerpc/platforms/powermac/time.c | 1 - arch/powerpc/platforms/pseries/setup.c | 1 - arch/powerpc/sysdev/cpm2.c | 1 - arch/powerpc/xmon/xmon.c | 1 - arch/riscv/kernel/setup.c | 1 - arch/riscv/mm/init.c | 1 - arch/s390/include/asm/tlbflush.h | 1 - arch/s390/kernel/machine_kexec.c | 1 - arch/s390/kernel/ptrace.c | 1 - arch/s390/kernel/vdso.c | 1 - arch/s390/mm/dump_pagetables.c | 1 - arch/s390/mm/fault.c | 1 - arch/s390/mm/init.c | 1 - arch/s390/mm/pageattr.c | 1 - arch/s390/mm/pgtable.c | 1 - arch/s390/mm/vmem.c | 1 - arch/sh/kernel/machine_kexec.c | 1 - arch/sh/kernel/ptrace_32.c | 1 - arch/sh/kernel/signal_32.c | 1 - arch/sh/mm/cache-sh3.c | 1 - arch/sh/mm/cache-sh4.c | 1 - arch/sh/mm/cache-sh7705.c | 1 - arch/sh/mm/nommu.c | 1 - arch/sparc/kernel/leon_smp.c | 1 - arch/sparc/kernel/process_32.c | 1 - arch/sparc/kernel/process_64.c | 1 - arch/sparc/kernel/ptrace_32.c | 1 - arch/sparc/kernel/ptrace_64.c | 1 - arch/sparc/kernel/setup_32.c | 1 - arch/sparc/kernel/setup_64.c | 1 - arch/sparc/kernel/signal32.c | 1 - arch/sparc/kernel/signal_32.c | 1 - arch/sparc/kernel/signal_64.c | 1 - arch/sparc/kernel/smp_32.c | 1 - arch/sparc/kernel/smp_64.c | 1 - arch/sparc/kernel/traps_64.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/fault_64.c | 1 - arch/sparc/mm/hugetlbpage.c | 1 - arch/sparc/mm/init_32.c | 1 - arch/sparc/mm/init_64.c | 1 - arch/sparc/mm/io-unit.c | 1 - arch/sparc/mm/iommu.c | 1 - arch/sparc/mm/tlb.c | 1 - arch/um/kernel/process.c | 1 - arch/um/kernel/skas/mmu.c | 1 - arch/um/kernel/skas/uaccess.c | 1 - arch/um/kernel/tlb.c | 1 - arch/um/kernel/trap.c | 1 - arch/um/kernel/um_arch.c | 1 - arch/unicore32/kernel/module.c | 1 - arch/unicore32/mm/fault.c | 1 - arch/x86/include/asm/iomap.h | 1 - arch/x86/include/asm/xen/page.h | 1 - arch/x86/kernel/alternative.c | 1 - arch/x86/kernel/amd_gart_64.c | 1 - arch/x86/kernel/doublefault_32.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/machine_kexec_64.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/tboot.c | 1 - arch/x86/mm/dump_pagetables.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/kasan_init_64.c | 1 - arch/x86/mm/pat/cpa-test.c | 1 - arch/x86/mm/pat/memtype.c | 1 - arch/x86/mm/pgtable.c | 1 - arch/x86/mm/pgtable_32.c | 1 - arch/x86/mm/pti.c | 1 - arch/x86/platform/efi/efi_64.c | 1 - arch/x86/xen/enlighten_pv.c | 1 - arch/x86/xen/grant-table.c | 1 - arch/xtensa/kernel/process.c | 1 - arch/xtensa/kernel/ptrace.c | 1 - arch/xtensa/kernel/setup.c | 1 - drivers/char/agp/frontend.c | 1 - drivers/char/agp/generic.c | 1 - drivers/char/bsr.c | 1 - drivers/char/mspec.c | 1 - drivers/gpu/drm/i915/i915_mm.c | 1 - drivers/infiniband/sw/rdmavt/mmap.c | 1 - drivers/infiniband/sw/rxe/rxe_mmap.c | 1 - drivers/media/platform/davinci/vpbe_display.c | 1 - drivers/media/v4l2-core/v4l2-common.c | 1 - drivers/misc/sgi-gru/grufault.c | 1 - drivers/net/ethernet/sun/sunhme.c | 1 - drivers/sbus/char/flash.c | 1 - drivers/sbus/char/uctrl.c | 1 - drivers/scsi/a2091.c | 1 - drivers/scsi/a3000.c | 1 - drivers/scsi/gvp11.c | 1 - drivers/scsi/lasi700.c | 1 - drivers/scsi/mvme147.c | 1 - drivers/scsi/sni_53c710.c | 1 - drivers/video/console/newport_con.c | 1 - drivers/video/fbdev/acornfb.c | 1 - drivers/video/fbdev/atafb.c | 1 - drivers/video/fbdev/cirrusfb.c | 1 - drivers/video/fbdev/cyber2000fb.c | 1 - drivers/video/fbdev/fb-puv3.c | 1 - drivers/video/fbdev/hitfb.c | 1 - drivers/video/fbdev/neofb.c | 1 - drivers/video/fbdev/q40fb.c | 1 - drivers/video/fbdev/savage/savagefb_driver.c | 1 - drivers/xen/balloon.c | 1 - drivers/xen/grant-table.c | 1 - drivers/xen/privcmd.c | 1 - drivers/xen/xenbus/xenbus_probe.c | 1 - drivers/xen/xenbus/xenbus_probe_backend.c | 1 - drivers/xen/xenbus/xenbus_probe_frontend.c | 1 - fs/proc/array.c | 1 - fs/proc/meminfo.c | 1 - fs/proc/nommu.c | 1 - fs/proc/vmcore.c | 1 - include/linux/dax.h | 1 - init/init_task.c | 1 - kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/power/snapshot.c | 1 - lib/ioremap.c | 1 - mm/debug_vm_pgtable.c | 1 - mm/gup.c | 1 - mm/hugetlb.c | 1 - mm/memory.c | 1 - mm/page_io.c | 1 - mm/shmem.c | 1 - mm/sparse-vmemmap.c | 1 - mm/sparse.c | 1 - mm/swap_state.c | 1 - mm/swapfile.c | 1 - mm/vmacache.c | 1 - sound/core/sgbuf.c | 1 - virt/kvm/kvm_main.c | 1 - 319 files changed, 319 deletions(-) diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index 95c0359f4858..00266e6e1b71 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index 99b8d7dc344b..43af71835adf 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index 8f5ed8610970..e5347a080008 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -14,7 +14,6 @@ #include #include -#include #include diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index d1ed5a8133c5..13bea465f1c0 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 48b81d015d8a..b45f0b0d6511 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index cb8d599e72d6..8c43212ae38e 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -19,7 +19,6 @@ #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 6fa802c495b4..f5c42a8fcf9c 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -55,7 +55,6 @@ static struct notifier_block alpha_panic_block = { }; #include -#include #include #include #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 52995bf413fe..631cc17410d1 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -36,7 +36,6 @@ #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c index ce5430056f65..e063b3857b3d 100644 --- a/arch/alpha/kernel/sys_alcor.c +++ b/arch/alpha/kernel/sys_alcor.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c index 0aa6a27d0e2f..47459b73cdb7 100644 --- a/arch/alpha/kernel/sys_cabriolet.c +++ b/arch/alpha/kernel/sys_cabriolet.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index d33508621820..9fb445d7dca5 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c index 1cdfe55fb987..3c43fd347526 100644 --- a/arch/alpha/kernel/sys_eb64p.c +++ b/arch/alpha/kernel/sys_eb64p.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c index 016f79251141..bf99dcfd40c4 100644 --- a/arch/alpha/kernel/sys_eiger.c +++ b/arch/alpha/kernel/sys_eiger.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index d0d44f543d77..0a2ab6cb18db 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 533899a4a1a1..83d6c53d6d4d 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_miata.c b/arch/alpha/kernel/sys_miata.c index 702292af2225..e1bee8f84c58 100644 --- a/arch/alpha/kernel/sys_miata.c +++ b/arch/alpha/kernel/sys_miata.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c index 3af4f94113e1..7690dfd57cb6 100644 --- a/arch/alpha/kernel/sys_mikasa.c +++ b/arch/alpha/kernel/sys_mikasa.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 32850e45834b..53adf43dcd44 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c index b106f327f765..47f3ce4f719a 100644 --- a/arch/alpha/kernel/sys_noritake.c +++ b/arch/alpha/kernel/sys_noritake.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c index b76f65d0e8b5..b5846ffdadce 100644 --- a/arch/alpha/kernel/sys_rawhide.c +++ b/arch/alpha/kernel/sys_rawhide.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c index d33074011960..4b1c8d85c4f0 100644 --- a/arch/alpha/kernel/sys_ruffian.c +++ b/arch/alpha/kernel/sys_ruffian.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c index 4d85eaeb44aa..94046f9aea08 100644 --- a/arch/alpha/kernel/sys_rx164.c +++ b/arch/alpha/kernel/sys_rx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index 3cf0d32da5d8..930005b2f630 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sio.c b/arch/alpha/kernel/sys_sio.c index a6bdc1da47ad..7c420d8dac53 100644 --- a/arch/alpha/kernel/sys_sio.c +++ b/arch/alpha/kernel/sys_sio.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_sx164.c b/arch/alpha/kernel/sys_sx164.c index 17cc203176c8..dd9de84b630c 100644 --- a/arch/alpha/kernel/sys_sx164.c +++ b/arch/alpha/kernel/sys_sx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c index e230c6864088..9e2adb69bc74 100644 --- a/arch/alpha/kernel/sys_takara.c +++ b/arch/alpha/kernel/sys_takara.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index c8390d8de140..b1f3b4fcf99b 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c index 2191bde161fd..2c54d707142a 100644 --- a/arch/alpha/kernel/sys_wildfire.c +++ b/arch/alpha/kernel/sys_wildfire.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 667cd21393b5..3c42b3147fd6 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 76300f3813e8..974b6c64d3e6 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index af0a8500a24e..e15444b25ca0 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 4cc6a7eff635..d0f7c8896c96 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -25,7 +25,6 @@ #include #include -#include #include #define CREATE_TRACE_POINTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 46e1be9e57a8..9a6432557871 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 575b2e2b6759..5960e3dfd2bf 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c index 015f75d1c98d..eee095f0e2f6 100644 --- a/arch/arm/mach-footbridge/common.c +++ b/arch/arm/mach-footbridge/common.c @@ -14,7 +14,6 @@ #include #include