Merge commit c288d9cd71 ("Merge tag 'for-5.14/io_uring-2021-06-30' of git://git.kernel.dk/linux-block") into android-mainline

Another small step en route to v5.14-rc1

Change-Id: I24899ab78da7d367574ed69ceaa82ab0837d9556
Signed-off-by: Lee Jones <lee.jones@linaro.org>
This commit is contained in:
Lee Jones
2021-07-12 09:47:50 +01:00
3009 changed files with 139205 additions and 42385 deletions

View File

@@ -197,8 +197,24 @@ Description:
Drivers may emit a CHANGE uevent when a password is set or unset
so that userspace may check it again.
On Dell systems, if Admin password is set, then all BIOS attributes
On Dell and Lenovo systems, if Admin password is set, then all BIOS attributes
require password validation.
On Lenovo systems if you change the Admin password the new password is not active until
the next boot.
Lenovo specific class extensions
------------------------------
On Lenovo systems the following additional settings are available:
lenovo_encoding:
The encoding method that is used. This can be either "ascii"
or "scancode". Default is set to "ascii"
lenovo_kbdlang:
The keyboard language method that is used. This is generally a
two char code (e.g. "us", "fr", "gr") and may vary per platform.
Default is set to "us"
What: /sys/class/firmware-attributes/*/attributes/pending_reboot
Date: February 2021

View File

@@ -0,0 +1,78 @@
What: /sys/devices/platform/soc@X/XXXXXXX.ipa/
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
The /sys/devices/platform/soc@X/XXXXXXX.ipa/ directory
contains read-only attributes exposing information about
an IPA device. The X values could vary, but are typically
"soc@0/1e40000.ipa".
What: .../XXXXXXX.ipa/version
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
The .../XXXXXXX.ipa/version file contains the IPA hardware
version, as a period-separated set of two or three integers
(e.g., "3.5.1" or "4.2").
What: .../XXXXXXX.ipa/feature/
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
The .../XXXXXXX.ipa/feature/ directory contains a set of
attributes describing features implemented by the IPA
hardware.
What: .../XXXXXXX.ipa/feature/rx_offload
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
The .../XXXXXXX.ipa/feature/rx_offload file contains a
string indicating the type of receive checksum offload
that is supported by the hardware. The possible values
are "MAPv4" or "MAPv5".
What: .../XXXXXXX.ipa/feature/tx_offload
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
The .../XXXXXXX.ipa/feature/tx_offload file contains a
string indicating the type of transmit checksum offload
that is supported by the hardware. The possible values
are "MAPv4" or "MAPv5".
What: .../XXXXXXX.ipa/modem/
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
The .../XXXXXXX.ipa/modem/ directory contains a set of
attributes describing properties of the modem execution
environment reachable by the IPA hardware.
What: .../XXXXXXX.ipa/modem/rx_endpoint_id
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
The .../XXXXXXX.ipa/modem/rx_endpoint_id file contains
the AP endpoint ID that receives packets originating from
the modem execution environment. The "rx" is from the
perspective of the AP; this endpoint is considered an "IPA
producer". An endpoint ID is a small unsigned integer.
What: .../XXXXXXX.ipa/modem/tx_endpoint_id
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
The .../XXXXXXX.ipa/modem/tx_endpoint_id file contains
the AP endpoint ID used to transmit packets destined for
the modem execution environment. The "tx" is from the
perspective of the AP; this endpoint is considered an "IPA
consumer". An endpoint ID is a small unsigned integer.

View File

@@ -0,0 +1,55 @@
What: /sys/bus/wmi/devices/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_supported_type
Date: Apr 2021
KernelVersion: 5.13
Contact: "<perry.yuan@dell.com>"
Description:
Display which Dell hardware level privacy devices are supported.
“Dell Privacy” is a set of HW, FW, and SW features to enhance
Dell's commitment to platform privacy for MIC, Camera, and
ePrivacy screens.
The supported hardware privacy devices are:
Attributes:
Microphone Mute:
Identifies that the local microphone can be muted by hardware, so that no
application is able to capture system mic sound
Camera Shutter:
Identifies camera shutter controlled by hardware, which is a micromechanical
shutter assembly that is built onto the camera module to block capturing images
from outside the laptop
supported:
The privacy device is supported by this system
unsupported:
The privacy device is not supported on this system
For example to check which privacy devices are supported:
# cat /sys/bus/wmi/drivers/dell-privacy/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_supported_type
[Microphone Mute] [supported]
[Camera Shutter] [supported]
[ePrivacy Screen] [unsupported]
What: /sys/bus/wmi/devices/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_current_state
Date: Apr 2021
KernelVersion: 5.13
Contact: "<perry.yuan@dell.com>"
Description:
Allow user space to check current Dell privacy device state.
Describes the Device State class exposed by BIOS which can be
consumed by various applications interested in knowing the Privacy
feature capabilities
Attributes:
muted:
Identifies that the privacy device is turned off and cannot send a stream to OS applications
unmuted:
Identifies that the privacy device is turned on, so the audio or camera driver can
pass the stream from the mic and camera module to OS applications
For example to check all supported current privacy device states:
# cat /sys/bus/wmi/drivers/dell-privacy/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_current_state
[Microphone] [unmuted]
[Camera Shutter] [unmuted]

View File

@@ -211,27 +211,40 @@ over a rather long period of time, but improvements are always welcome!
of the system, especially to real-time workloads running on
the rest of the system.
7. As of v4.20, a given kernel implements only one RCU flavor,
which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y.
If the updater uses call_rcu() or synchronize_rcu(),
then the corresponding readers may use rcu_read_lock() and
rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(),
or any pair of primitives that disables and re-enables preemption,
for example, rcu_read_lock_sched() and rcu_read_unlock_sched().
If the updater uses synchronize_srcu() or call_srcu(),
then the corresponding readers must use srcu_read_lock() and
srcu_read_unlock(), and with the same srcu_struct. The rules for
the expedited primitives are the same as for their non-expedited
counterparts. Mixing things up will result in confusion and
broken kernels, and has even resulted in an exploitable security
issue.
7. As of v4.20, a given kernel implements only one RCU flavor, which
is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y.
If the updater uses call_rcu() or synchronize_rcu(), then
the corresponding readers may use: (1) rcu_read_lock() and
rcu_read_unlock(), (2) any pair of primitives that disables
and re-enables softirq, for example, rcu_read_lock_bh() and
rcu_read_unlock_bh(), or (3) any pair of primitives that disables
and re-enables preemption, for example, rcu_read_lock_sched() and
rcu_read_unlock_sched(). If the updater uses synchronize_srcu()
or call_srcu(), then the corresponding readers must use
srcu_read_lock() and srcu_read_unlock(), and with the same
srcu_struct. The rules for the expedited RCU grace-period-wait
primitives are the same as for their non-expedited counterparts.
One exception to this rule: rcu_read_lock() and rcu_read_unlock()
may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
in cases where local bottom halves are already known to be
disabled, for example, in irq or softirq context. Commenting
such cases is a must, of course! And the jury is still out on
whether the increased speed is worth it.
If the updater uses call_rcu_tasks() or synchronize_rcu_tasks(),
then the readers must refrain from executing voluntary
context switches, that is, from blocking. If the updater uses
call_rcu_tasks_trace() or synchronize_rcu_tasks_trace(), then
the corresponding readers must use rcu_read_lock_trace() and
rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
or synchronize_rcu_tasks_rude(), then the corresponding readers
must use anything that disables interrupts.
Mixing things up will result in confusion and broken kernels, and
has even resulted in an exploitable security issue. Therefore,
when using non-obvious pairs of primitives, commenting is
of course a must. One example of non-obvious pairing is
the XDP feature in networking, which calls BPF programs from
network-driver NAPI (softirq) context. BPF relies heavily on RCU
protection for its data structures, but because the BPF program
invocation happens entirely within a single local_bh_disable()
section in a NAPI poll cycle, this usage is safe. The reason
that this usage is safe is that readers can use anything that
disables BH when updaters use call_rcu() or synchronize_rcu().
8. Although synchronize_rcu() is slower than is call_rcu(), it
usually results in simpler code. So, unless update performance is

View File

@@ -17,29 +17,30 @@ level logical devices like device mapper.
HOWTO
=====
Throttling/Upper Limit policy
-----------------------------
- Enable Block IO controller::
Enable Block IO controller::
CONFIG_BLK_CGROUP=y
- Enable throttling in block layer::
Enable throttling in block layer::
CONFIG_BLK_DEV_THROTTLING=y
- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
- Specify a bandwidth rate on particular device for root group. The format
Specify a bandwidth rate on particular device for root group. The format
for policy is "<major>:<minor> <bytes_per_second>"::
echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
Above will put a limit of 1MB/second on reads happening for root group
This will put a limit of 1MB/second on reads happening for root group
on device having major/minor number 8:16.
- Run dd to read a file and see if rate is throttled to 1MB/s or not::
Run dd to read a file and see if rate is throttled to 1MB/s or not::
# dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
1024+0 records in
@@ -79,85 +80,89 @@ following::
Various user visible config options
===================================
CONFIG_BLK_CGROUP
- Block IO controller.
Block IO controller.
CONFIG_BFQ_CGROUP_DEBUG
- Debug help. Right now some additional stats file show up in cgroup
Debug help. Right now some additional stats files show up in cgroup
if this option is enabled.
CONFIG_BLK_DEV_THROTTLING
- Enable block device throttling support in block layer.
Enable block device throttling support in block layer.
Details of cgroup files
=======================
Proportional weight policy files
--------------------------------
- blkio.weight
- Specifies per cgroup weight. This is default weight of the group
on all the devices until and unless overridden by per device rule.
(See blkio.weight_device).
Currently allowed range of weights is from 10 to 1000.
- blkio.weight_device
- One can specify per cgroup per device rules using this interface.
These rules override the default value of group weight as specified
by blkio.weight.
blkio.bfq.weight
Specifies per cgroup weight. This is default weight of the group
on all the devices until and unless overridden by per device rule
(see `blkio.bfq.weight_device` below).
Currently allowed range of weights is from 1 to 1000. For more details,
see Documentation/block/bfq-iosched.rst.
blkio.bfq.weight_device
Specifies per cgroup per device weights, overriding the default group
weight. For more details, see Documentation/block/bfq-iosched.rst.
Following is the format::
# echo dev_maj:dev_minor weight > blkio.weight_device
# echo dev_maj:dev_minor weight > blkio.bfq.weight_device
Configure weight=300 on /dev/sdb (8:16) in this cgroup::
# echo 8:16 300 > blkio.weight_device
# cat blkio.weight_device
# echo 8:16 300 > blkio.bfq.weight_device
# cat blkio.bfq.weight_device
dev weight
8:16 300
Configure weight=500 on /dev/sda (8:0) in this cgroup::
# echo 8:0 500 > blkio.weight_device
# cat blkio.weight_device
# echo 8:0 500 > blkio.bfq.weight_device
# cat blkio.bfq.weight_device
dev weight
8:0 500
8:16 300
Remove specific weight for /dev/sda in this cgroup::
# echo 8:0 0 > blkio.weight_device
# cat blkio.weight_device
# echo 8:0 0 > blkio.bfq.weight_device
# cat blkio.bfq.weight_device
dev weight
8:16 300
- blkio.time
- disk time allocated to cgroup per device in milliseconds. First
blkio.time
Disk time allocated to cgroup per device in milliseconds. First
two fields specify the major and minor number of the device and
third field specifies the disk time allocated to group in
milliseconds.
- blkio.sectors
- number of sectors transferred to/from disk by the group. First
blkio.sectors
Number of sectors transferred to/from disk by the group. First
two fields specify the major and minor number of the device and
third field specifies the number of sectors transferred by the
group to/from the device.
- blkio.io_service_bytes
- Number of bytes transferred to/from the disk by the group. These
blkio.io_service_bytes
Number of bytes transferred to/from the disk by the group. These
are further divided by the type of operation - read or write, sync
or async. First two fields specify the major and minor number of the
device, third field specifies the operation type and the fourth field
specifies the number of bytes.
- blkio.io_serviced
- Number of IOs (bio) issued to the disk by the group. These
blkio.io_serviced
Number of IOs (bio) issued to the disk by the group. These
are further divided by the type of operation - read or write, sync
or async. First two fields specify the major and minor number of the
device, third field specifies the operation type and the fourth field
specifies the number of IOs.
- blkio.io_service_time
- Total amount of time between request dispatch and request completion
blkio.io_service_time
Total amount of time between request dispatch and request completion
for the IOs done by this cgroup. This is in nanoseconds to make it
meaningful for flash devices too. For devices with queue depth of 1,
this time represents the actual service time. When queue_depth > 1,
@@ -170,8 +175,8 @@ Proportional weight policy files
specifies the operation type and the fourth field specifies the
io_service_time in ns.
- blkio.io_wait_time
- Total amount of time the IOs for this cgroup spent waiting in the
blkio.io_wait_time
Total amount of time the IOs for this cgroup spent waiting in the
scheduler queues for service. This can be greater than the total time
elapsed since it is cumulative io_wait_time for all IOs. It is not a
measure of total time the cgroup spent waiting but rather a measure of
@@ -185,24 +190,24 @@ Proportional weight policy files
minor number of the device, third field specifies the operation type
and the fourth field specifies the io_wait_time in ns.
- blkio.io_merged
- Total number of bios/requests merged into requests belonging to this
blkio.io_merged
Total number of bios/requests merged into requests belonging to this
cgroup. This is further divided by the type of operation - read or
write, sync or async.
- blkio.io_queued
- Total number of requests queued up at any given instant for this
blkio.io_queued
Total number of requests queued up at any given instant for this
cgroup. This is further divided by the type of operation - read or
write, sync or async.
- blkio.avg_queue_size
- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
blkio.avg_queue_size
Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
The average queue size for this cgroup over the entire time of this
cgroup's existence. Queue size samples are taken each time one of the
queues of this cgroup gets a timeslice.
- blkio.group_wait_time
- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
blkio.group_wait_time
Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
This is the amount of time the cgroup had to wait since it became busy
(i.e., went from 0 to 1 request queued) to get a timeslice for one of
its queues. This is different from the io_wait_time which is the
@@ -212,8 +217,8 @@ Proportional weight policy files
will only report the group_wait_time accumulated till the last time it
got a timeslice and will not include the current delta.
- blkio.empty_time
- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
blkio.empty_time
Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
This is the amount of time a cgroup spends without any pending
requests when not being served, i.e., it does not include any time
spent idling for one of the queues of the cgroup. This is in
@@ -221,8 +226,8 @@ Proportional weight policy files
the stat will only report the empty_time accumulated till the last
time it had a pending request and will not include the current delta.
- blkio.idle_time
- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
blkio.idle_time
Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
This is the amount of time spent by the IO scheduler idling for a
given cgroup in anticipation of a better request than the existing ones
from other queues/cgroups. This is in nanoseconds. If this is read
@@ -230,43 +235,43 @@ Proportional weight policy files
idle_time accumulated till the last idle period and will not include
the current delta.
- blkio.dequeue
- Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
blkio.dequeue
Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
gives the statistics about how many times a group was dequeued
from service tree of the device. First two fields specify the major
and minor number of the device and third field specifies the number
of times a group was dequeued from a particular device.
- blkio.*_recursive
- Recursive version of various stats. These files show the
blkio.*_recursive
Recursive version of various stats. These files show the
same information as their non-recursive counterparts but
include stats from all the descendant cgroups.
Throttling/Upper limit policy files
-----------------------------------
- blkio.throttle.read_bps_device
- Specifies upper limit on READ rate from the device. IO rate is
blkio.throttle.read_bps_device
Specifies upper limit on READ rate from the device. IO rate is
specified in bytes per second. Rules are per device. Following is
the format::
echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
- blkio.throttle.write_bps_device
- Specifies upper limit on WRITE rate to the device. IO rate is
blkio.throttle.write_bps_device
Specifies upper limit on WRITE rate to the device. IO rate is
specified in bytes per second. Rules are per device. Following is
the format::
echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
- blkio.throttle.read_iops_device
- Specifies upper limit on READ rate from the device. IO rate is
blkio.throttle.read_iops_device
Specifies upper limit on READ rate from the device. IO rate is
specified in IO per second. Rules are per device. Following is
the format::
echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
- blkio.throttle.write_iops_device
- Specifies upper limit on WRITE rate to the device. IO rate is
blkio.throttle.write_iops_device
Specifies upper limit on WRITE rate to the device. IO rate is
specified in IO per second. Rules are per device. Following is
the format::
@@ -275,15 +280,15 @@ Throttling/Upper limit policy files
Note: If both BW and IOPS rules are specified for a device, then IO is
subjected to both the constraints.
- blkio.throttle.io_serviced
- Number of IOs (bio) issued to the disk by the group. These
blkio.throttle.io_serviced
Number of IOs (bio) issued to the disk by the group. These
are further divided by the type of operation - read or write, sync
or async. First two fields specify the major and minor number of the
device, third field specifies the operation type and the fourth field
specifies the number of IOs.
- blkio.throttle.io_service_bytes
- Number of bytes transferred to/from the disk by the group. These
blkio.throttle.io_service_bytes
Number of bytes transferred to/from the disk by the group. These
are further divided by the type of operation - read or write, sync
or async. First two fields specify the major and minor number of the
device, third field specifies the operation type and the fourth field
@@ -291,6 +296,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
Common files among various policies
-----------------------------------
- blkio.reset_stats
- Writing an int to this file will result in resetting all the stats
blkio.reset_stats
Writing an int to this file will result in resetting all the stats
for that cgroup.

View File

@@ -56,6 +56,7 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgrou
5-3-3. IO Latency
5-3-3-1. How IO Latency Throttling Works
5-3-3-2. IO Latency Interface Files
5-3-4. IO Priority
5-4. PID
5-4-1. PID Interface Files
5-5. Cpuset
@@ -1866,6 +1867,60 @@ IO Latency Interface Files
duration of time between evaluation events. Windows only elapse
with IO activity. Idle periods extend the most recent window.
IO Priority
~~~~~~~~~~~
A single attribute controls the behavior of the I/O priority cgroup policy,
namely the blkio.prio.class attribute. The following values are accepted for
that attribute:
no-change
Do not modify the I/O priority class.
none-to-rt
For requests that do not have an I/O priority class (NONE),
change the I/O priority class into RT. Do not modify
the I/O priority class of other requests.
restrict-to-be
For requests that do not have an I/O priority class or that have I/O
priority class RT, change it into BE. Do not modify the I/O priority
class of requests that have priority class IDLE.
idle
Change the I/O priority class of all requests into IDLE, the lowest
I/O priority class.
The following numerical values are associated with the I/O priority policies:
+----------------+---+
| no-change      | 0 |
+----------------+---+
| none-to-rt     | 1 |
+----------------+---+
| restrict-to-be | 2 |
+----------------+---+
| idle           | 3 |
+----------------+---+
The numerical value that corresponds to each I/O priority class is as follows:
+-------------------------------+---+
| IOPRIO_CLASS_NONE | 0 |
+-------------------------------+---+
| IOPRIO_CLASS_RT (real-time) | 1 |
+-------------------------------+---+
| IOPRIO_CLASS_BE (best effort) | 2 |
+-------------------------------+---+
| IOPRIO_CLASS_IDLE | 3 |
+-------------------------------+---+
The algorithm to set the I/O priority class for a request is as follows:
- Translate the I/O priority class policy into a number.
- Change the request I/O priority class into the maximum of the I/O priority
class policy number and the numerical I/O priority class.
PID
---

View File

@@ -12,7 +12,6 @@ first sector should contain valid superblock from previous invocation.
Constructor parameters:
1. type of the cache device - "p" or "s"
- p - persistent memory
- s - SSD
2. the underlying device that will be cached
@@ -21,7 +20,6 @@ Constructor parameters:
size)
5. the number of optional parameters (the parameters with an argument
count as two)
start_sector n (default: 0)
offset from the start of cache device in 512-byte sectors
high_watermark n (default: 50)
@@ -53,6 +51,27 @@ Constructor parameters:
- some underlying devices perform better with fua, some
with nofua. The user should test it
cleaner
when this option is activated (either in the constructor
arguments or by a message), the cache will not promote
new writes (however, writes to already cached blocks are
promoted, to avoid data corruption due to misordered
writes) and it will gradually writeback any cached
data. The userspace can then monitor the cleaning
process with "dmsetup status". When the number of cached
blocks drops to zero, userspace can unload the
dm-writecache target and replace it with dm-linear or
other targets.
max_age n
specifies the maximum age of a block in milliseconds. If
a block is stored in the cache for too long, it will be
written to the underlying device and cleaned up.
metadata_only
only metadata is promoted to the cache. This option
improves performance for heavier REQ_META workloads.
pause_writeback n (default: 3000)
pause writeback if there was some write I/O redirected to
the origin volume in the last n milliseconds
Status:
1. error indicator - 0 if there was no error, otherwise error number
@@ -77,3 +96,5 @@ Messages:
5. resume the device, so that it will use the linear
target
6. the cache device is now inactive and it can be deleted
cleaner
See above "cleaner" constructor documentation.

View File

@@ -113,7 +113,7 @@
the GPE dispatcher.
This facility can be used to prevent such uncontrolled
GPE floodings.
Format: <byte>
Format: <byte> or <bitmap-list>
acpi_no_auto_serialize [HW,ACPI]
Disable auto-serialization of AML methods
@@ -586,6 +586,28 @@
loops can be debugged more effectively on production
systems.
clocksource.max_cswd_read_retries= [KNL]
Number of clocksource_watchdog() retries due to
external delays before the clock will be marked
unstable. Defaults to three retries, that is,
four attempts to read the clock under test.
clocksource.verify_n_cpus= [KNL]
Limit the number of CPUs checked for clocksources
marked with CLOCK_SOURCE_VERIFY_PERCPU that
are marked unstable due to excessive skew.
A negative value says to check all CPUs, while
zero says not to check any. Values larger than
nr_cpu_ids are silently truncated to nr_cpu_ids.
The actual CPUs are chosen randomly, with
no replacement if the same CPU is chosen twice.
clocksource-wdtest.holdoff= [KNL]
Set the time in seconds that the clocksource
watchdog test waits before commencing its tests.
Defaults to zero when built as a module and to
10 seconds when built into the kernel.
clearcpuid=BITNUM[,BITNUM...] [X86]
Disable CPUID feature X for the kernel. See
arch/x86/include/asm/cpufeatures.h for the valid bit
@@ -3578,6 +3600,12 @@
off: turn off poisoning (default)
on: turn on poisoning
page_reporting.page_reporting_order=
[KNL] Minimal page reporting order
Format: <integer>
Adjust the minimal page reporting order. The page
reporting is disabled when it exceeds (MAX_ORDER-1).
panic= [KNL] Kernel behaviour on panic: delay <timeout>
timeout > 0: seconds before rebooting
timeout = 0: wait forever

View File

@@ -101,17 +101,6 @@ this results in concentration of disk activity in a small time interval which
occurs only once every 10 minutes, or whenever the disk is forced to spin up by
a cache miss. The disk can then be spun down in the periods of inactivity.
If you want to find out which process caused the disk to spin up, you can
gather information by setting the flag /proc/sys/vm/block_dump. When this flag
is set, Linux reports all disk read and write operations that take place, and
all block dirtyings done to files. This makes it possible to debug why a disk
needs to spin up, and to increase battery life even more. The output of
block_dump is written to the kernel output, and it can be retrieved using
"dmesg". When you use block_dump and your kernel logging level also includes
kernel debugging messages, you probably want to turn off klogd, otherwise
the output of block_dump will be logged, causing disk activity that is not
normally there.
Configuration
-------------

View File

@@ -39,7 +39,7 @@ in principle, they should work in any architecture where these
subsystems are present.
A periodic hrtimer runs to generate interrupts and kick the watchdog
task. An NMI perf event is generated every "watchdog_thresh"
job. An NMI perf event is generated every "watchdog_thresh"
(compile-time initialized to 10 and configurable through sysctl of the
same name) seconds to check for hardlockups. If any CPU in the system
does not receive any hrtimer interrupt during that time the
@@ -47,7 +47,7 @@ does not receive any hrtimer interrupt during that time the
generate a kernel warning or call panic, depending on the
configuration.
The watchdog task is a high priority kernel thread that updates a
The watchdog job runs in a stop scheduling thread that updates a
timestamp every time it is scheduled. If that timestamp is not updated
for 2*watchdog_thresh seconds (the softlockup threshold) the
'softlockup detector' (coded inside the hrtimer callback function)

View File

@@ -347,81 +347,8 @@ for tickless systems. It follows the same basic strategy as the ``menu`` `one
<menu-gov_>`_: it always tries to find the deepest idle state suitable for the
given conditions. However, it applies a different approach to that problem.
First, it does not use sleep length correction factors, but instead it attempts
to correlate the observed idle duration values with the available idle states
and use that information to pick up the idle state that is most likely to
"match" the upcoming CPU idle interval. Second, it does not take the tasks
that were running on the given CPU in the past and are waiting on some I/O
operations to complete now at all (there is no guarantee that they will run on
the same CPU when they become runnable again) and the pattern detection code in
it avoids taking timer wakeups into account. It also only uses idle duration
values less than the current time till the closest timer (with the scheduler
tick excluded) for that purpose.
Like in the ``menu`` governor `case <menu-gov_>`_, the first step is to obtain
the *sleep length*, which is the time until the closest timer event with the
assumption that the scheduler tick will be stopped (that also is the upper bound
on the time until the next CPU wakeup). That value is then used to preselect an
idle state on the basis of three metrics maintained for each idle state provided
by the ``CPUIdle`` driver: ``hits``, ``misses`` and ``early_hits``.
The ``hits`` and ``misses`` metrics measure the likelihood that a given idle
state will "match" the observed (post-wakeup) idle duration if it "matches" the
sleep length. They both are subject to decay (after a CPU wakeup) every time
the target residency of the idle state corresponding to them is less than or
equal to the sleep length and the target residency of the next idle state is
greater than the sleep length (that is, when the idle state corresponding to
them "matches" the sleep length). The ``hits`` metric is increased if the
former condition is satisfied and the target residency of the given idle state
is less than or equal to the observed idle duration and the target residency of
the next idle state is greater than the observed idle duration at the same time
(that is, it is increased when the given idle state "matches" both the sleep
length and the observed idle duration). In turn, the ``misses`` metric is
increased when the given idle state "matches" the sleep length only and the
observed idle duration is too short for its target residency.
The ``early_hits`` metric measures the likelihood that a given idle state will
"match" the observed (post-wakeup) idle duration if it does not "match" the
sleep length. It is subject to decay on every CPU wakeup and it is increased
when the idle state corresponding to it "matches" the observed (post-wakeup)
idle duration and the target residency of the next idle state is less than or
equal to the sleep length (i.e. the idle state "matching" the sleep length is
deeper than the given one).
The governor walks the list of idle states provided by the ``CPUIdle`` driver
and finds the last (deepest) one with the target residency less than or equal
to the sleep length. Then, the ``hits`` and ``misses`` metrics of that idle
state are compared with each other and it is preselected if the ``hits`` one is
greater (which means that that idle state is likely to "match" the observed idle
duration after CPU wakeup). If the ``misses`` one is greater, the governor
preselects the shallower idle state with the maximum ``early_hits`` metric
(or if there are multiple shallower idle states with equal ``early_hits``
metric which also is the maximum, the shallowest of them will be preselected).
[If there is a wakeup latency constraint coming from the `PM QoS framework
<cpu-pm-qos_>`_ which is hit before reaching the deepest idle state with the
target residency within the sleep length, the deepest idle state with the exit
latency within the constraint is preselected without consulting the ``hits``,
``misses`` and ``early_hits`` metrics.]
Next, the governor takes several idle duration values observed most recently
into consideration and if at least a half of them are greater than or equal to
the target residency of the preselected idle state, that idle state becomes the
final candidate to ask for. Otherwise, the average of the most recent idle
duration values below the target residency of the preselected idle state is
computed and the governor walks the idle states shallower than the preselected
one and finds the deepest of them with the target residency within that average.
That idle state is then taken as the final candidate to ask for.
Still, at this point the governor may need to refine the idle state selection if
it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That
generally happens if the target residency of the idle state selected so far is
less than the tick period and the tick has not been stopped already (in a
previous iteration of the idle loop). Then, like in the ``menu`` governor
`case <menu-gov_>`_, the sleep length used in the previous computations may not
reflect the real time until the closest timer event and if it really is greater
than that time, a shallower state with a suitable target residency may need to
be selected.
.. kernel-doc:: drivers/cpuidle/governors/teo.c
:doc: teo-description
.. _idle-states-representation:

View File

@@ -365,6 +365,9 @@ argument is passed to the kernel in the command line.
inclusive) including both turbo and non-turbo P-states (see
`Turbo P-states Support`_).
This attribute is present only if the value exposed by it is the same
for all of the CPUs in the system.
The value of this attribute is not affected by the ``no_turbo``
setting described `below <no_turbo_attr_>`_.
@@ -374,6 +377,9 @@ argument is passed to the kernel in the command line.
Ratio of the `turbo range <turbo_>`_ size to the size of the entire
range of supported P-states, in percent.
This attribute is present only if the value exposed by it is the same
for all of the CPUs in the system.
This attribute is read-only.
.. _no_turbo_attr:

View File

@@ -1297,11 +1297,11 @@ This parameter can be used to control the soft lockup detector.
= =================================
The soft lockup detector monitors CPUs for threads that are hogging the CPUs
without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
from running. The mechanism depends on the CPUs ability to respond to timer
interrupts which are needed for the 'watchdog/N' threads to be woken up by
the watchdog timer function, otherwise the NMI watchdog — if enabled — can
detect a hard lockup condition.
without rescheduling voluntarily, and thus prevent the 'migration/N' threads
from running, causing the watchdog work to fail to execute. The mechanism depends
on the CPU's ability to respond to timer interrupts which are needed for the
watchdog work to be queued by the watchdog timer function, otherwise the NMI
watchdog — if enabled — can detect a hard lockup condition.
stack_erasing

View File

@@ -25,7 +25,6 @@ files can be found in mm/swap.c.
Currently, these files are in /proc/sys/vm:
- admin_reserve_kbytes
- block_dump
- compact_memory
- compaction_proactiveness
- compact_unevictable_allowed
@@ -65,7 +64,7 @@ Currently, these files are in /proc/sys/vm:
- overcommit_ratio
- page-cluster
- panic_on_oom
- percpu_pagelist_fraction
- percpu_pagelist_high_fraction
- stat_interval
- stat_refresh
- numa_stat
@@ -107,13 +106,6 @@ On x86_64 this is about 128MB.
Changing this takes effect whenever an application requests memory.
block_dump
==========
block_dump enables block I/O debugging when set to a nonzero value. More
information on block I/O debugging is in Documentation/admin-guide/laptops/laptop-mode.rst.
compact_memory
==============
@@ -806,22 +798,24 @@ panic_on_oom=2+kdump gives you very strong tool to investigate
why oom happens. You can get snapshot.
percpu_pagelist_fraction
========================
percpu_pagelist_high_fraction
=============================
This is the fraction of pages at most (high mark pcp->high) in each zone that
are allocated for each per cpu page list. The min value for this is 8. It
means that we don't allow more than 1/8th of pages in each zone to be
allocated in any single per_cpu_pagelist. This entry only changes the value
of hot per cpu pagelists. User can specify a number like 100 to allocate
1/100th of each zone to each per cpu page list.
This is the fraction of pages in each zone that can be stored on
per-cpu page lists. It is an upper boundary that is divided depending
on the number of online CPUs. The min value for this is 8 which means
that we do not allow more than 1/8th of pages in each zone to be stored
on per-cpu page lists. This entry only changes the value of hot per-cpu
page lists. A user can specify a number like 100 to allocate 1/100th of
each zone between per-cpu lists.
The batch value of each per cpu pagelist is also updated as a result. It is
set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
The batch value of each per-cpu page list remains the same regardless of
the value of the high fraction so allocation latencies are unaffected.
The initial value is zero. Kernel does not use this value at boot time to set
the high water marks for each per cpu page list. If the user writes '0' to this
sysctl, it will revert to this default behavior.
The initial value is zero. The kernel uses this value to set the pcp->high
mark based on the low watermark for the zone and the number of local
online CPUs. If the user writes '0' to this sysctl, it will revert to
this default behavior.
stat_interval
@@ -952,12 +946,12 @@ allocations, THP and hugetlbfs pages.
To make it sensible with respect to the watermark_scale_factor
parameter, the unit is in fractions of 10,000. The default value of
15,000 on !DISCONTIGMEM configurations means that up to 150% of the high
watermark will be reclaimed in the event of a pageblock being mixed due
to fragmentation. The level of reclaim is determined by the number of
fragmentation events that occurred in the recent past. If this value is
smaller than a pageblock then a pageblocks worth of pages will be reclaimed
(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature.
15,000 means that up to 150% of the high watermark will be reclaimed in the
event of a pageblock being mixed due to fragmentation. The level of reclaim
is determined by the number of fragmentation events that occurred in the
recent past. If this value is smaller than a pageblock then a pageblock's
worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor
of 0 will disable the feature.
watermark_scale_factor

View File

@@ -553,14 +553,21 @@ throughput sustainable with bfq, because updating the blkio.bfq.*
stats is rather costly, especially for some of the stats enabled by
CONFIG_BFQ_CGROUP_DEBUG.
Parameters to set
-----------------
Parameters
----------
For each group, there is only the following parameter to set.
For each group, the following parameters can be set:
weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
group inside its parent. Available values: 1..1000 (default 100). The
linear mapping between ioprio and weights, described at the beginning
weight
This specifies the default weight for the cgroup inside its parent.
Available values: 1..1000 (default: 100).
For cgroup v1, it is set by writing the value to `blkio.bfq.weight`.
For cgroup v2, it is set by writing the value to `io.bfq.weight`
(with an optional prefix of `default` and a space).
The linear mapping between ioprio and weights, described at the beginning
of the tunable section, is still valid, but all weights higher than
IOPRIO_BE_NR*10 are mapped to ioprio 0.
@@ -568,6 +575,15 @@ Recall that, if low-latency is set, then BFQ automatically raises the
weight of the queues associated with interactive and soft real-time
applications. Unset this tunable if you need/want to control weights.
weight_device
This specifies a per-device weight for the cgroup. The syntax is
`major:minor weight`. A weight of `0` may be used to reset to the default
weight.
For cgroup v1, it is set by writing the value to `blkio.bfq.weight_device`.
For cgroup v2, the file name is `io.bfq.weight`.
[1]
P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O

View File

@@ -12,6 +12,19 @@ BPF instruction-set.
The Cilium project also maintains a `BPF and XDP Reference Guide`_
that goes into great technical depth about the BPF Architecture.
libbpf
======
Libbpf is a userspace library for loading and interacting with bpf programs.
.. toctree::
:maxdepth: 1
libbpf/libbpf
libbpf/libbpf_api
libbpf/libbpf_build
libbpf/libbpf_naming_convention
BPF Type Format (BTF)
=====================
@@ -84,6 +97,7 @@ Other
:maxdepth: 1
ringbuf
llvm_reloc
.. Links:
.. _networking-filter: ../networking/filter.rst

View File

@@ -0,0 +1,14 @@
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
libbpf
======
This is documentation for libbpf, a userspace library for loading and
interacting with bpf programs.
All general BPF questions, including kernel functionality, libbpf APIs and
their application, should be sent to bpf@vger.kernel.org mailing list.
You can `subscribe <http://vger.kernel.org/vger-lists.html#bpf>`_ to the
mailing list or search its `archive <https://lore.kernel.org/bpf/>`_.
Please search the archive before asking new questions. It very well might
be that this was already addressed or answered before.

View File

@@ -0,0 +1,27 @@
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
API
===
This documentation is autogenerated from header files in libbpf, tools/lib/bpf.
.. kernel-doc:: tools/lib/bpf/libbpf.h
:internal:
.. kernel-doc:: tools/lib/bpf/bpf.h
:internal:
.. kernel-doc:: tools/lib/bpf/btf.h
:internal:
.. kernel-doc:: tools/lib/bpf/xsk.h
:internal:
.. kernel-doc:: tools/lib/bpf/bpf_tracing.h
:internal:
.. kernel-doc:: tools/lib/bpf/bpf_core_read.h
:internal:
.. kernel-doc:: tools/lib/bpf/bpf_endian.h
:internal:

View File

@@ -0,0 +1,37 @@
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
Building libbpf
===============
libelf and zlib are internal dependencies of libbpf and thus are required to link
against and must be installed on the system for applications to work.
pkg-config is used by default to find libelf, and the program called
can be overridden with PKG_CONFIG.
If using pkg-config at build time is not desired, it can be disabled by
setting NO_PKG_CONFIG=1 when calling make.
To build both static libbpf.a and shared libbpf.so:
.. code-block:: bash
$ cd src
$ make
To build only the static libbpf.a library in directory build/ and install it
together with libbpf headers in a staging directory root/:
.. code-block:: bash
$ cd src
$ mkdir build root
$ BUILD_STATIC_ONLY=y OBJDIR=build DESTDIR=root make install
To build both static libbpf.a and shared libbpf.so against a custom libelf
dependency installed in /build/root/ and install them together with libbpf
headers in a build directory /build/root/:
.. code-block:: bash
$ cd src
$ PKG_CONFIG_PATH=/build/root/lib64/pkgconfig DESTDIR=/build/root make

View File

@@ -1,7 +1,7 @@
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
libbpf API naming convention
============================
API naming convention
=====================
libbpf API provides access to a few logically separated groups of
functions and types. Every group has its own naming convention
@@ -10,14 +10,14 @@ new function or type is added to keep libbpf API clean and consistent.
All types and functions provided by libbpf API should have one of the
following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``,
``perf_buffer_``.
``btf_dump_``, ``ring_buffer_``, ``perf_buffer_``.
System call wrappers
--------------------
System call wrappers are simple wrappers for commands supported by
sys_bpf system call. These wrappers should go to ``bpf.h`` header file
and map one-on-one to corresponding commands.
and map one to one to corresponding commands.
For example ``bpf_map_lookup_elem`` wraps ``BPF_MAP_LOOKUP_ELEM``
command of sys_bpf, ``bpf_prog_attach`` wraps ``BPF_PROG_ATTACH``, etc.
@@ -49,10 +49,6 @@ object, ``bpf_object``, double underscore and ``open`` that defines the
purpose of the function to open ELF file and create ``bpf_object`` from
it.
Another example: ``bpf_program__load`` is named for corresponding
object, ``bpf_program``, that is separated from other part of the name
by double underscore.
All objects and corresponding functions other than BTF related should go
to ``libbpf.h``. BTF types and functions should go to ``btf.h``.
@@ -72,11 +68,7 @@ of both low-level ring access functions and high-level configuration
functions. These can be mixed and matched. Note that these functions
are not reentrant for performance reasons.
Please take a look at Documentation/networking/af_xdp.rst in the Linux
kernel source tree on how to use XDP sockets and for some common
mistakes in case you do not get any traffic up to user space.
libbpf ABI
ABI
==========
libbpf can be both linked statically or used as DSO. To avoid possible
@@ -116,7 +108,8 @@ This bump in ABI version is at most once per kernel development cycle.
For example, if current state of ``libbpf.map`` is:
.. code-block::
.. code-block:: c
LIBBPF_0.0.1 {
global:
bpf_func_a;
@@ -128,7 +121,8 @@ For example, if current state of ``libbpf.map`` is:
, and a new symbol ``bpf_func_c`` is being introduced, then
``libbpf.map`` should be changed like this:
.. code-block::
.. code-block:: c
LIBBPF_0.0.1 {
global:
bpf_func_a;
@@ -148,7 +142,7 @@ Format of version script and ways to handle ABI changes, including
incompatible ones, are described in detail in [1].
Stand-alone build
=================
-------------------
Under https://github.com/libbpf/libbpf there is a (semi-)automated
mirror of the mainline's version of libbpf for a stand-alone build.
@@ -157,12 +151,12 @@ However, all changes to libbpf's code base must be upstreamed through
the mainline kernel tree.
License
=======
-------------------
libbpf is dual-licensed under LGPL 2.1 and BSD 2-Clause.
Links
=====
-------------------
[1] https://www.akkadia.org/drepper/dsohowto.pdf
(Chapter 3. Maintaining APIs and ABIs).

View File

@@ -0,0 +1,240 @@
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
====================
BPF LLVM Relocations
====================
This document describes LLVM BPF backend relocation types.
Relocation Record
=================
LLVM BPF backend records each relocation with the following 16-byte
ELF structure::
typedef struct
{
Elf64_Addr r_offset; // Offset from the beginning of section.
Elf64_Xword r_info; // Relocation type and symbol index.
} Elf64_Rel;
For example, for the following code::
int g1 __attribute__((section("sec")));
int g2 __attribute__((section("sec")));
static volatile int l1 __attribute__((section("sec")));
static volatile int l2 __attribute__((section("sec")));
int test() {
return g1 + g2 + l1 + l2;
}
Compiled with ``clang -target bpf -O2 -c test.c``, the following is
the code with ``llvm-objdump -dr test.o``::
0: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll
0000000000000000: R_BPF_64_64 g1
2: 61 11 00 00 00 00 00 00 r1 = *(u32 *)(r1 + 0)
3: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0 ll
0000000000000018: R_BPF_64_64 g2
5: 61 20 00 00 00 00 00 00 r0 = *(u32 *)(r2 + 0)
6: 0f 10 00 00 00 00 00 00 r0 += r1
7: 18 01 00 00 08 00 00 00 00 00 00 00 00 00 00 00 r1 = 8 ll
0000000000000038: R_BPF_64_64 sec
9: 61 11 00 00 00 00 00 00 r1 = *(u32 *)(r1 + 0)
10: 0f 10 00 00 00 00 00 00 r0 += r1
11: 18 01 00 00 0c 00 00 00 00 00 00 00 00 00 00 00 r1 = 12 ll
0000000000000058: R_BPF_64_64 sec
13: 61 11 00 00 00 00 00 00 r1 = *(u32 *)(r1 + 0)
14: 0f 10 00 00 00 00 00 00 r0 += r1
15: 95 00 00 00 00 00 00 00 exit
There are four relocations in the above for four ``LD_imm64`` instructions.
The following ``llvm-readelf -r test.o`` shows the binary values of the four
relocations::
Relocation section '.rel.text' at offset 0x190 contains 4 entries:
Offset Info Type Symbol's Value Symbol's Name
0000000000000000 0000000600000001 R_BPF_64_64 0000000000000000 g1
0000000000000018 0000000700000001 R_BPF_64_64 0000000000000004 g2
0000000000000038 0000000400000001 R_BPF_64_64 0000000000000000 sec
0000000000000058 0000000400000001 R_BPF_64_64 0000000000000000 sec
Each relocation is represented by ``Offset`` (8 bytes) and ``Info`` (8 bytes).
For example, the first relocation corresponds to the first instruction
(Offset 0x0) and the corresponding ``Info`` indicates the relocation type
of ``R_BPF_64_64`` (type 1) and the entry in the symbol table (entry 6).
The following is the symbol table with ``llvm-readelf -s test.o``::
Symbol table '.symtab' contains 8 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
1: 0000000000000000 0 FILE LOCAL DEFAULT ABS test.c
2: 0000000000000008 4 OBJECT LOCAL DEFAULT 4 l1
3: 000000000000000c 4 OBJECT LOCAL DEFAULT 4 l2
4: 0000000000000000 0 SECTION LOCAL DEFAULT 4 sec
5: 0000000000000000 128 FUNC GLOBAL DEFAULT 2 test
6: 0000000000000000 4 OBJECT GLOBAL DEFAULT 4 g1
7: 0000000000000004 4 OBJECT GLOBAL DEFAULT 4 g2
The 6th entry is global variable ``g1`` with value 0.
Similarly, the second relocation is at ``.text`` offset ``0x18``, instruction 3,
for global variable ``g2`` which has a symbol value 4, the offset
from the start of the ``sec`` section.
The third and fourth relocations refer to static variables ``l1``
and ``l2``. From ``.rel.text`` section above, it is not clear
which symbols they really refer to as they both refer to
symbol table entry 4, symbol ``sec``, which has ``STT_SECTION`` type
and represents a section. So for static variable or function,
the section offset is written to the original insn
buffer, which is called ``A`` (addend). Looking at
above insn ``7`` and ``11``, they have section offset ``8`` and ``12``.
From symbol table, we can find that they correspond to entries ``2``
and ``3`` for ``l1`` and ``l2``.
In general, the ``A`` is 0 for global variables and functions,
and is the section offset or some computation result based on
section offset for static variables/functions. The non-section-offset
case refers to function calls. See below for more details.
Different Relocation Types
==========================
Six relocation types are supported. The following is an overview and
``S`` represents the value of the symbol in the symbol table::
Enum ELF Reloc Type Description BitSize Offset Calculation
0 R_BPF_NONE None
1 R_BPF_64_64 ld_imm64 insn 32 r_offset + 4 S + A
2 R_BPF_64_ABS64 normal data 64 r_offset S + A
3 R_BPF_64_ABS32 normal data 32 r_offset S + A
4 R_BPF_64_NODYLD32 .BTF[.ext] data 32 r_offset S + A
10 R_BPF_64_32 call insn 32 r_offset + 4 (S + A) / 8 - 1
For example, ``R_BPF_64_64`` relocation type is used for ``ld_imm64`` instruction.
The actual to-be-relocated data (0 or section offset)
is stored at ``r_offset + 4`` and the read/write
data bitsize is 32 (4 bytes). The relocation can be resolved with
the symbol value plus implicit addend. Note that the ``BitSize`` is 32 which
means the section offset must be less than or equal to ``UINT32_MAX`` and this
is enforced by LLVM BPF backend.
In another case, ``R_BPF_64_ABS64`` relocation type is used for normal 64-bit data.
The actual to-be-relocated data is stored at ``r_offset`` and the read/write data
bitsize is 64 (8 bytes). The relocation can be resolved with
the symbol value plus implicit addend.
Both ``R_BPF_64_ABS32`` and ``R_BPF_64_NODYLD32`` types are for 32-bit data.
But ``R_BPF_64_NODYLD32`` specifically refers to relocations in ``.BTF`` and
``.BTF.ext`` sections. For cases like bcc where llvm ``ExecutionEngine RuntimeDyld``
is involved, ``R_BPF_64_NODYLD32`` types of relocations should not be resolved
to actual function/variable address. Otherwise, ``.BTF`` and ``.BTF.ext``
become unusable by bcc and kernel.
Type ``R_BPF_64_32`` is used for call instruction. The call target section
offset is stored at ``r_offset + 4`` (32bit) and calculated as
``(S + A) / 8 - 1``.
Examples
========
Types ``R_BPF_64_64`` and ``R_BPF_64_32`` are used to resolve ``ld_imm64``
and ``call`` instructions. For example::
__attribute__((noinline)) __attribute__((section("sec1")))
int gfunc(int a, int b) {
return a * b;
}
static __attribute__((noinline)) __attribute__((section("sec1")))
int lfunc(int a, int b) {
return a + b;
}
int global __attribute__((section("sec2")));
int test(int a, int b) {
return gfunc(a, b) + lfunc(a, b) + global;
}
Compiled with ``clang -target bpf -O2 -c test.c``, we will have
following code with ``llvm-objdump -dr test.o``::
Disassembly of section .text:
0000000000000000 <test>:
0: bf 26 00 00 00 00 00 00 r6 = r2
1: bf 17 00 00 00 00 00 00 r7 = r1
2: 85 10 00 00 ff ff ff ff call -1
0000000000000010: R_BPF_64_32 gfunc
3: bf 08 00 00 00 00 00 00 r8 = r0
4: bf 71 00 00 00 00 00 00 r1 = r7
5: bf 62 00 00 00 00 00 00 r2 = r6
6: 85 10 00 00 02 00 00 00 call 2
0000000000000030: R_BPF_64_32 sec1
7: 0f 80 00 00 00 00 00 00 r0 += r8
8: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll
0000000000000040: R_BPF_64_64 global
10: 61 11 00 00 00 00 00 00 r1 = *(u32 *)(r1 + 0)
11: 0f 10 00 00 00 00 00 00 r0 += r1
12: 95 00 00 00 00 00 00 00 exit
Disassembly of section sec1:
0000000000000000 <gfunc>:
0: bf 20 00 00 00 00 00 00 r0 = r2
1: 2f 10 00 00 00 00 00 00 r0 *= r1
2: 95 00 00 00 00 00 00 00 exit
0000000000000018 <lfunc>:
3: bf 20 00 00 00 00 00 00 r0 = r2
4: 0f 10 00 00 00 00 00 00 r0 += r1
5: 95 00 00 00 00 00 00 00 exit
The first relocation corresponds to ``gfunc(a, b)`` where ``gfunc`` has a value of 0,
so the ``call`` instruction offset is ``(0 + 0)/8 - 1 = -1``.
The second relocation corresponds to ``lfunc(a, b)`` where ``lfunc`` has a section
offset ``0x18``, so the ``call`` instruction offset is ``(0 + 0x18)/8 - 1 = 2``.
The third relocation corresponds to ld_imm64 of ``global``, which has a section
offset ``0``.
The following is an example to show how R_BPF_64_ABS64 could be generated::
int global() { return 0; }
struct t { void *g; } gbl = { global };
Compiled with ``clang -target bpf -O2 -g -c test.c``, we will see a
relocation below in ``.data`` section with command
``llvm-readelf -r test.o``::
Relocation section '.rel.data' at offset 0x458 contains 1 entries:
Offset Info Type Symbol's Value Symbol's Name
0000000000000000 0000000700000002 R_BPF_64_ABS64 0000000000000000 global
The relocation says the first 8 bytes of the ``.data`` section should be
filled with the address of the ``global`` function.
With ``llvm-readelf`` output, we can see that dwarf sections have a bunch of
``R_BPF_64_ABS32`` and ``R_BPF_64_ABS64`` relocations::
Relocation section '.rel.debug_info' at offset 0x468 contains 13 entries:
Offset Info Type Symbol's Value Symbol's Name
0000000000000006 0000000300000003 R_BPF_64_ABS32 0000000000000000 .debug_abbrev
000000000000000c 0000000400000003 R_BPF_64_ABS32 0000000000000000 .debug_str
0000000000000012 0000000400000003 R_BPF_64_ABS32 0000000000000000 .debug_str
0000000000000016 0000000600000003 R_BPF_64_ABS32 0000000000000000 .debug_line
000000000000001a 0000000400000003 R_BPF_64_ABS32 0000000000000000 .debug_str
000000000000001e 0000000200000002 R_BPF_64_ABS64 0000000000000000 .text
000000000000002b 0000000400000003 R_BPF_64_ABS32 0000000000000000 .debug_str
0000000000000037 0000000800000002 R_BPF_64_ABS64 0000000000000000 gbl
0000000000000040 0000000400000003 R_BPF_64_ABS32 0000000000000000 .debug_str
......
The .BTF/.BTF.ext sections have R_BPF_64_NODYLD32 relocations::
Relocation section '.rel.BTF' at offset 0x538 contains 1 entries:
Offset Info Type Symbol's Value Symbol's Name
0000000000000084 0000000800000004 R_BPF_64_NODYLD32 0000000000000000 gbl
Relocation section '.rel.BTF.ext' at offset 0x548 contains 2 entries:
Offset Info Type Symbol's Value Symbol's Name
000000000000002c 0000000200000004 R_BPF_64_NODYLD32 0000000000000000 .text
0000000000000040 0000000200000004 R_BPF_64_NODYLD32 0000000000000000 .text

View File

@@ -146,7 +146,6 @@ Legacy
irq_domain_add_simple()
irq_domain_add_legacy()
irq_domain_add_legacy_isa()
irq_domain_create_simple()
irq_domain_create_legacy()

View File

@@ -513,9 +513,10 @@ Time and date
::
%pt[RT] YYYY-mm-ddTHH:MM:SS
%pt[RT]s YYYY-mm-dd HH:MM:SS
%pt[RT]d YYYY-mm-dd
%pt[RT]t HH:MM:SS
%pt[RT][dt][r]
%pt[RT][dt][r][s]
For printing date and time as represented by::
@@ -527,6 +528,10 @@ in human readable format.
By default year will be incremented by 1900 and month by 1.
Use %pt[RT]r (raw) to suppress this behaviour.
The %pt[RT]s (space) will override ISO 8601 separator by using ' ' (space)
instead of 'T' (Capital T) between date and time. It won't have any effect
when date or time is omitted.
Passed by reference.
struct clk

View File

@@ -447,11 +447,10 @@ When a test fails due to a failed ``kmalloc``::
When a test fails due to a missing KASAN report::
# kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:629
Expected kasan_data->report_expected == kasan_data->report_found, but
kasan_data->report_expected == 1
kasan_data->report_found == 0
not ok 28 - kmalloc_double_kzfree
# kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974
KASAN failure expected in "kfree_sensitive(ptr)", but none occurred
not ok 44 - kmalloc_double_kzfree
At the end the cumulative status of all KASAN tests is printed. On success::

View File

@@ -1,57 +0,0 @@
NVIDIA Tegra Activity Monitor
The activity monitor block collects statistics about the behaviour of other
components in the system. This information can be used to derive the rate at
which the external memory needs to be clocked in order to serve all requests
from the monitored clients.
Required properties:
- compatible: should be "nvidia,tegra<chip>-actmon"
- reg: offset and length of the register set for the device
- interrupts: standard interrupt property
- clocks: Must contain a phandle and clock specifier pair for each entry in
clock-names. See ../../clock/clock-bindings.txt for details.
- clock-names: Must include the following entries:
- actmon
- emc
- resets: Must contain an entry for each entry in reset-names. See
../../reset/reset.txt for details.
- reset-names: Must include the following entries:
- actmon
- operating-points-v2: See ../bindings/opp/opp.txt for details.
- interconnects: Should contain entries for memory clients sitting on
MC->EMC memory interconnect path.
- interconnect-names: Should include name of the interconnect path for each
interconnect entry. Consult TRM documentation for
information about available memory clients, see MEMORY
CONTROLLER section.
For each opp entry in 'operating-points-v2' table:
- opp-supported-hw: bitfield indicating SoC speedo ID mask
- opp-peak-kBps: peak bandwidth of the memory channel
Example:
dfs_opp_table: opp-table {
compatible = "operating-points-v2";
opp@12750000 {
opp-hz = /bits/ 64 <12750000>;
opp-supported-hw = <0x000F>;
opp-peak-kBps = <51000>;
};
...
};
actmon@6000c800 {
compatible = "nvidia,tegra124-actmon";
reg = <0x0 0x6000c800 0x0 0x400>;
interrupts = <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&tegra_car TEGRA124_CLK_ACTMON>,
<&tegra_car TEGRA124_CLK_EMC>;
clock-names = "actmon", "emc";
resets = <&tegra_car 119>;
reset-names = "actmon";
operating-points-v2 = <&dfs_opp_table>;
interconnects = <&mc TEGRA124_MC_MPCORER &emc>;
interconnect-names = "cpu";
};

View File

@@ -0,0 +1,126 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/devfreq/nvidia,tegra30-actmon.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: NVIDIA Tegra30 Activity Monitor
maintainers:
- Dmitry Osipenko <digetx@gmail.com>
- Jon Hunter <jonathanh@nvidia.com>
- Thierry Reding <thierry.reding@gmail.com>
description: |
The activity monitor block collects statistics about the behaviour of other
components in the system. This information can be used to derive the rate at
which the external memory needs to be clocked in order to serve all requests
from the monitored clients.
properties:
compatible:
enum:
- nvidia,tegra30-actmon
- nvidia,tegra114-actmon
- nvidia,tegra124-actmon
- nvidia,tegra210-actmon
reg:
maxItems: 1
clocks:
maxItems: 2
clock-names:
items:
- const: actmon
- const: emc
resets:
maxItems: 1
reset-names:
items:
- const: actmon
interrupts:
maxItems: 1
interconnects:
minItems: 1
maxItems: 12
interconnect-names:
minItems: 1
maxItems: 12
description:
Should include name of the interconnect path for each interconnect
entry. Consult TRM documentation for information about available
memory clients, see MEMORY CONTROLLER and ACTIVITY MONITOR sections.
operating-points-v2:
description:
Should contain freqs and voltages and opp-supported-hw property, which
is a bitfield indicating SoC speedo ID mask.
"#cooling-cells":
const: 2
required:
- compatible
- reg
- clocks
- clock-names
- resets
- reset-names
- interrupts
- interconnects
- interconnect-names
- operating-points-v2
- "#cooling-cells"
additionalProperties: false
examples:
- |
#include <dt-bindings/memory/tegra30-mc.h>
mc: memory-controller@7000f000 {
compatible = "nvidia,tegra30-mc";
reg = <0x7000f000 0x400>;
clocks = <&clk 32>;
clock-names = "mc";
interrupts = <0 77 4>;
#iommu-cells = <1>;
#reset-cells = <1>;
#interconnect-cells = <1>;
};
emc: external-memory-controller@7000f400 {
compatible = "nvidia,tegra30-emc";
reg = <0x7000f400 0x400>;
interrupts = <0 78 4>;
clocks = <&clk 57>;
nvidia,memory-controller = <&mc>;
operating-points-v2 = <&dvfs_opp_table>;
power-domains = <&domain>;
#interconnect-cells = <0>;
};
actmon@6000c800 {
compatible = "nvidia,tegra30-actmon";
reg = <0x6000c800 0x400>;
interrupts = <0 45 4>;
clocks = <&clk 119>, <&clk 57>;
clock-names = "actmon", "emc";
resets = <&rst 119>;
reset-names = "actmon";
operating-points-v2 = <&dvfs_opp_table>;
interconnects = <&mc TEGRA30_MC_MPCORER &emc>;
interconnect-names = "cpu-read";
#cooling-cells = <2>;
};

View File

@@ -145,6 +145,19 @@ properties:
required:
- affinity
clocks:
maxItems: 1
clock-names:
items:
- const: aclk
power-domains:
maxItems: 1
resets:
maxItems: 1
dependencies:
mbi-ranges: [ msi-controller ]
msi-controller: [ mbi-ranges ]

View File

@@ -29,6 +29,7 @@ properties:
- renesas,intc-ex-r8a774c0 # RZ/G2E
- renesas,intc-ex-r8a7795 # R-Car H3
- renesas,intc-ex-r8a7796 # R-Car M3-W
- renesas,intc-ex-r8a77961 # R-Car M3-W+
- renesas,intc-ex-r8a77965 # R-Car M3-N
- renesas,intc-ex-r8a77970 # R-Car V3M
- renesas,intc-ex-r8a77980 # R-Car V3H

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/ipmi/aspeed,ast2400-kcs-bmc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: ASPEED BMC KCS Devices
maintainers:
- Andrew Jeffery <andrew@aj.id.au>
description: |
The Aspeed BMC SoCs typically use the Keyboard-Controller-Style (KCS)
interfaces on the LPC bus for in-band IPMI communication with their host.
properties:
compatible:
oneOf:
- description: Channel ID derived from reg
items:
enum:
- aspeed,ast2400-kcs-bmc-v2
- aspeed,ast2500-kcs-bmc-v2
- aspeed,ast2600-kcs-bmc
- description: Old-style with explicit channel ID, no reg
deprecated: true
items:
enum:
- aspeed,ast2400-kcs-bmc
- aspeed,ast2500-kcs-bmc
interrupts:
maxItems: 1
reg:
# maxItems: 3
items:
- description: IDR register
- description: ODR register
- description: STR register
aspeed,lpc-io-reg:
$ref: '/schemas/types.yaml#/definitions/uint32-array'
minItems: 1
maxItems: 2
description: |
The host CPU LPC IO data and status addresses for the device. For most
channels the status address is derived from the data address, but the
status address may be optionally provided.
aspeed,lpc-interrupts:
$ref: "/schemas/types.yaml#/definitions/uint32-array"
minItems: 2
maxItems: 2
description: |
A 2-cell property expressing the LPC SerIRQ number and the interrupt
level/sense encoding (specified in the standard fashion).
Note that the generated interrupt is issued from the BMC to the host, and
thus the target interrupt controller is not captured by the BMC's
devicetree.
kcs_chan:
deprecated: true
$ref: '/schemas/types.yaml#/definitions/uint32'
description: The LPC channel number in the controller
kcs_addr:
deprecated: true
$ref: '/schemas/types.yaml#/definitions/uint32'
description: The host CPU IO map address
required:
- compatible
- interrupts
additionalProperties: false
allOf:
- if:
properties:
compatible:
contains:
enum:
- aspeed,ast2400-kcs-bmc
- aspeed,ast2500-kcs-bmc
then:
required:
- kcs_chan
- kcs_addr
else:
required:
- reg
- aspeed,lpc-io-reg
examples:
- |
#include <dt-bindings/interrupt-controller/irq.h>
kcs3: kcs@24 {
compatible = "aspeed,ast2600-kcs-bmc";
reg = <0x24 0x1>, <0x30 0x1>, <0x3c 0x1>;
aspeed,lpc-io-reg = <0xca2>;
aspeed,lpc-interrupts = <11 IRQ_TYPE_LEVEL_LOW>;
interrupts = <8>;
};

View File

@@ -1,33 +0,0 @@
# Aspeed KCS (Keyboard Controller Style) IPMI interface
The Aspeed SOCs (AST2400 and AST2500) are commonly used as BMCs
(Baseboard Management Controllers) and the KCS interface can be
used to perform in-band IPMI communication with their host.
## v1
Required properties:
- compatible : should be one of
"aspeed,ast2400-kcs-bmc"
"aspeed,ast2500-kcs-bmc"
- interrupts : interrupt generated by the controller
- kcs_chan : The LPC channel number in the controller
- kcs_addr : The host CPU IO map address
## v2
Required properties:
- compatible : should be one of
"aspeed,ast2400-kcs-bmc-v2"
"aspeed,ast2500-kcs-bmc-v2"
- reg : The address and size of the IDR, ODR and STR registers
- interrupts : interrupt generated by the controller
- aspeed,lpc-io-reg : The host CPU LPC IO address for the device
Example:
kcs3: kcs@24 {
compatible = "aspeed,ast2500-kcs-bmc-v2";
reg = <0x24 0x1>, <0x30 0x1>, <0x3c 0x1>;
aspeed,lpc-io-reg = <0xca2>;
interrupts = <8>;
status = "okay";
};

View File

@@ -26,6 +26,7 @@ properties:
oneOf:
- const: fsl,imx6sx-mu
- const: fsl,imx7ulp-mu
- const: fsl,imx8ulp-mu
- const: fsl,imx8-mu-scu
- items:
- enum:

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: "http://devicetree.org/schemas/mailbox/microchip,polarfire-soc-mailbox.yaml#"
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
title: Microchip PolarFire SoC (MPFS) MSS (microprocessor subsystem) mailbox controller
maintainers:
- Conor Dooley <conor.dooley@microchip.com>
properties:
compatible:
const: microchip,polarfire-soc-mailbox
reg:
items:
- description: mailbox data registers
- description: mailbox interrupt registers
interrupts:
maxItems: 1
"#mbox-cells":
const: 1
required:
- compatible
- reg
- interrupts
- "#mbox-cells"
additionalProperties: false
examples:
- |
soc {
#address-cells = <2>;
#size-cells = <2>;
mbox: mailbox@37020000 {
compatible = "microchip,polarfire-soc-mailbox";
reg = <0x0 0x37020000 0x0 0x1000>, <0x0 0x2000318c 0x0 0x40>;
interrupt-parent = <&L1>;
interrupts = <96>;
#mbox-cells = <1>;
};
};

View File

@@ -19,6 +19,7 @@ properties:
- qcom,ipq6018-apcs-apps-global
- qcom,ipq8074-apcs-apps-global
- qcom,msm8916-apcs-kpss-global
- qcom,msm8939-apcs-kpss-global
- qcom,msm8994-apcs-kpss-global
- qcom,msm8996-apcs-hmss-global
- qcom,msm8998-apcs-hmss-global
@@ -27,6 +28,7 @@ properties:
- qcom,sc8180x-apss-shared
- qcom,sdm660-apcs-hmss-global
- qcom,sdm845-apss-shared
- qcom,sm6125-apcs-hmss-global
- qcom,sm8150-apss-shared
reg:
@@ -75,6 +77,7 @@ allOf:
- qcom,sc7180-apss-shared
- qcom,sdm660-apcs-hmss-global
- qcom,sdm845-apss-shared
- qcom,sm6125-apcs-hmss-global
- qcom,sm8150-apss-shared
then:
properties:

View File

@@ -1,23 +0,0 @@
* Broadcom iProc MDIO bus controller
Required properties:
- compatible: should be "brcm,iproc-mdio"
- reg: address and length of the register set for the MDIO interface
- #size-cells: must be 1
- #address-cells: must be 0
Child nodes of this MDIO bus controller node are standard Ethernet PHY device
nodes as described in Documentation/devicetree/bindings/net/phy.txt
Example:
mdio@18002000 {
compatible = "brcm,iproc-mdio";
reg = <0x18002000 0x8>;
#size-cells = <1>;
#address-cells = <0>;
enet-gphy@0 {
reg = <0>;
};
};

View File

@@ -0,0 +1,38 @@
# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/brcm,iproc-mdio.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Broadcom iProc MDIO bus controller
maintainers:
- Rafał Miłecki <rafal@milecki.pl>
allOf:
- $ref: mdio.yaml#
properties:
compatible:
const: brcm,iproc-mdio
reg:
maxItems: 1
unevaluatedProperties: false
required:
- reg
examples:
- |
mdio@18002000 {
compatible = "brcm,iproc-mdio";
reg = <0x18002000 0x8>;
#address-cells = <1>;
#size-cells = <0>;
ethernet-phy@0 {
reg = <0>;
};
};

View File

@@ -1,80 +0,0 @@
Renesas R-Car CAN controller Device Tree Bindings
-------------------------------------------------
Required properties:
- compatible: "renesas,can-r8a7742" if CAN controller is a part of R8A7742 SoC.
"renesas,can-r8a7743" if CAN controller is a part of R8A7743 SoC.
"renesas,can-r8a7744" if CAN controller is a part of R8A7744 SoC.
"renesas,can-r8a7745" if CAN controller is a part of R8A7745 SoC.
"renesas,can-r8a77470" if CAN controller is a part of R8A77470 SoC.
"renesas,can-r8a774a1" if CAN controller is a part of R8A774A1 SoC.
"renesas,can-r8a774b1" if CAN controller is a part of R8A774B1 SoC.
"renesas,can-r8a774c0" if CAN controller is a part of R8A774C0 SoC.
"renesas,can-r8a774e1" if CAN controller is a part of R8A774E1 SoC.
"renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC.
"renesas,can-r8a7779" if CAN controller is a part of R8A7779 SoC.
"renesas,can-r8a7790" if CAN controller is a part of R8A7790 SoC.
"renesas,can-r8a7791" if CAN controller is a part of R8A7791 SoC.
"renesas,can-r8a7792" if CAN controller is a part of R8A7792 SoC.
"renesas,can-r8a7793" if CAN controller is a part of R8A7793 SoC.
"renesas,can-r8a7794" if CAN controller is a part of R8A7794 SoC.
"renesas,can-r8a7795" if CAN controller is a part of R8A7795 SoC.
"renesas,can-r8a7796" if CAN controller is a part of R8A77960 SoC.
"renesas,can-r8a77961" if CAN controller is a part of R8A77961 SoC.
"renesas,can-r8a77965" if CAN controller is a part of R8A77965 SoC.
"renesas,can-r8a77990" if CAN controller is a part of R8A77990 SoC.
"renesas,can-r8a77995" if CAN controller is a part of R8A77995 SoC.
"renesas,rcar-gen1-can" for a generic R-Car Gen1 compatible device.
"renesas,rcar-gen2-can" for a generic R-Car Gen2 or RZ/G1
compatible device.
"renesas,rcar-gen3-can" for a generic R-Car Gen3 or RZ/G2
compatible device.
When compatible with the generic version, nodes must list the
SoC-specific version corresponding to the platform first
followed by the generic version.
- reg: physical base address and size of the R-Car CAN register map.
- interrupts: interrupt specifier for the sole interrupt.
- clocks: phandles and clock specifiers for 3 CAN clock inputs.
- clock-names: 3 clock input name strings: "clkp1", "clkp2", and "can_clk".
- pinctrl-0: pin control group to be used for this controller.
- pinctrl-names: must be "default".
Required properties for R8A774A1, R8A774B1, R8A774C0, R8A774E1, R8A7795,
R8A77960, R8A77961, R8A77965, R8A77990, and R8A77995:
For the denoted SoCs, "clkp2" can be CANFD clock. This is a div6 clock and can
be used by both CAN and CAN FD controller at the same time. It needs to be
scaled to maximum frequency if any of these controllers use it. This is done
using the below properties:
- assigned-clocks: phandle of clkp2(CANFD) clock.
- assigned-clock-rates: maximum frequency of this clock.
Optional properties:
- renesas,can-clock-select: R-Car CAN Clock Source Select. Valid values are:
<0x0> (default) : Peripheral clock (clkp1)
<0x1> : Peripheral clock (clkp2)
<0x3> : External input clock
Example
-------
SoC common .dtsi file:
can0: can@e6e80000 {
compatible = "renesas,can-r8a7791", "renesas,rcar-gen2-can";
reg = <0 0xe6e80000 0 0x1000>;
interrupts = <0 186 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&mstp9_clks R8A7791_CLK_RCAN0>,
<&cpg_clocks R8A7791_CLK_RCAN>, <&can_clk>;
clock-names = "clkp1", "clkp2", "can_clk";
status = "disabled";
};
Board specific .dts file:
&can0 {
pinctrl-0 = <&can0_pins>;
pinctrl-names = "default";
status = "okay";
};

View File

@@ -1,107 +0,0 @@
Renesas R-Car CAN FD controller Device Tree Bindings
----------------------------------------------------
Required properties:
- compatible: Must contain one or more of the following:
- "renesas,rcar-gen3-canfd" for R-Car Gen3 and RZ/G2 compatible controllers.
- "renesas,r8a774a1-canfd" for R8A774A1 (RZ/G2M) compatible controller.
- "renesas,r8a774b1-canfd" for R8A774B1 (RZ/G2N) compatible controller.
- "renesas,r8a774c0-canfd" for R8A774C0 (RZ/G2E) compatible controller.
- "renesas,r8a774e1-canfd" for R8A774E1 (RZ/G2H) compatible controller.
- "renesas,r8a7795-canfd" for R8A7795 (R-Car H3) compatible controller.
- "renesas,r8a7796-canfd" for R8A7796 (R-Car M3-W) compatible controller.
- "renesas,r8a77965-canfd" for R8A77965 (R-Car M3-N) compatible controller.
- "renesas,r8a77970-canfd" for R8A77970 (R-Car V3M) compatible controller.
- "renesas,r8a77980-canfd" for R8A77980 (R-Car V3H) compatible controller.
- "renesas,r8a77990-canfd" for R8A77990 (R-Car E3) compatible controller.
- "renesas,r8a77995-canfd" for R8A77995 (R-Car D3) compatible controller.
When compatible with the generic version, nodes must list the
SoC-specific version corresponding to the platform first, followed by the
family-specific and/or generic versions.
- reg: physical base address and size of the R-Car CAN FD register map.
- interrupts: interrupt specifiers for the Channel & Global interrupts
- clocks: phandles and clock specifiers for 3 clock inputs.
- clock-names: 3 clock input name strings: "fck", "canfd", "can_clk".
- pinctrl-0: pin control group to be used for this controller.
- pinctrl-names: must be "default".
Required child nodes:
The controller supports two channels and each is represented as a child node.
The names of the child nodes are "channel0" and "channel1" respectively. Each
child node supports the "status" property only, which is used to
enable/disable the respective channel.
Required properties for R8A774A1, R8A774B1, R8A774C0, R8A774E1, R8A7795,
R8A7796, R8A77965, R8A77990, and R8A77995:
In the denoted SoCs, canfd clock is a div6 clock and can be used by both CAN
and CAN FD controller at the same time. It needs to be scaled to maximum
frequency if any of these controllers use it. This is done using the below
properties:
- assigned-clocks: phandle of canfd clock.
- assigned-clock-rates: maximum frequency of this clock.
Optional property:
The controller can operate in either CAN FD only mode (default) or
Classical CAN only mode. The mode is global to both the channels. In order to
enable the latter, define the following optional property.
- renesas,no-can-fd: puts the controller in Classical CAN only mode.
Example
-------
SoC common .dtsi file:
canfd: can@e66c0000 {
compatible = "renesas,r8a7795-canfd",
"renesas,rcar-gen3-canfd";
reg = <0 0xe66c0000 0 0x8000>;
interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&cpg CPG_MOD 914>,
<&cpg CPG_CORE R8A7795_CLK_CANFD>,
<&can_clk>;
clock-names = "fck", "canfd", "can_clk";
assigned-clocks = <&cpg CPG_CORE R8A7795_CLK_CANFD>;
assigned-clock-rates = <40000000>;
power-domains = <&cpg>;
status = "disabled";
channel0 {
status = "disabled";
};
channel1 {
status = "disabled";
};
};
Board specific .dts file:
E.g. below enables Channel 1 alone in the board in Classical CAN only mode.
&canfd {
pinctrl-0 = <&canfd1_pins>;
pinctrl-names = "default";
renesas,no-can-fd;
status = "okay";
channel1 {
status = "okay";
};
};
E.g. below enables Channel 0 alone in the board using External clock
as fCAN clock.
&canfd {
pinctrl-0 = <&canfd0_pins>, <&can_clk_pins>;
pinctrl-names = "default";
status = "okay";
channel0 {
status = "okay";
};
};

View File

@@ -0,0 +1,139 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/can/renesas,rcar-can.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas R-Car CAN Controller
maintainers:
- Sergei Shtylyov <sergei.shtylyov@gmail.com>
properties:
compatible:
oneOf:
- items:
- enum:
- renesas,can-r8a7778 # R-Car M1-A
- renesas,can-r8a7779 # R-Car H1
- const: renesas,rcar-gen1-can # R-Car Gen1
- items:
- enum:
- renesas,can-r8a7742 # RZ/G1H
- renesas,can-r8a7743 # RZ/G1M
- renesas,can-r8a7744 # RZ/G1N
- renesas,can-r8a7745 # RZ/G1E
- renesas,can-r8a77470 # RZ/G1C
- renesas,can-r8a7790 # R-Car H2
- renesas,can-r8a7791 # R-Car M2-W
- renesas,can-r8a7792 # R-Car V2H
- renesas,can-r8a7793 # R-Car M2-N
- renesas,can-r8a7794 # R-Car E2
- const: renesas,rcar-gen2-can # R-Car Gen2 and RZ/G1
- items:
- enum:
- renesas,can-r8a774a1 # RZ/G2M
- renesas,can-r8a774b1 # RZ/G2N
- renesas,can-r8a774c0 # RZ/G2E
- renesas,can-r8a774e1 # RZ/G2H
- renesas,can-r8a7795 # R-Car H3
- renesas,can-r8a7796 # R-Car M3-W
- renesas,can-r8a77961 # R-Car M3-W+
- renesas,can-r8a77965 # R-Car M3-N
- renesas,can-r8a77990 # R-Car E3
- renesas,can-r8a77995 # R-Car D3
- const: renesas,rcar-gen3-can # R-Car Gen3 and RZ/G2
reg:
maxItems: 1
interrupts:
maxItems: 1
clocks:
maxItems: 3
clock-names:
items:
- const: clkp1
- const: clkp2
- const: can_clk
power-domains:
maxItems: 1
resets:
maxItems: 1
renesas,can-clock-select:
$ref: /schemas/types.yaml#/definitions/uint32
enum: [ 0, 1, 3 ]
default: 0
description: |
R-Car CAN Clock Source Select. Valid values are:
<0x0> (default) : Peripheral clock (clkp1)
<0x1> : Peripheral clock (clkp2)
<0x3> : External input clock
assigned-clocks:
description:
Reference to the clkp2 (CANFD) clock.
On R-Car Gen3 and RZ/G2 SoCs, "clkp2" is the CANFD clock. This is a div6
clock and can be used by both CAN and CAN FD controllers at the same
time. It needs to be scaled to maximum frequency if any of these
controllers use it.
assigned-clock-rates:
description: Maximum frequency of the CANFD clock.
required:
- compatible
- reg
- interrupts
- clocks
- clock-names
- power-domains
allOf:
- $ref: can-controller.yaml#
- if:
not:
properties:
compatible:
contains:
const: renesas,rcar-gen1-can
then:
required:
- resets
- if:
properties:
compatible:
contains:
const: renesas,rcar-gen3-can
then:
required:
- assigned-clocks
- assigned-clock-rates
unevaluatedProperties: false
examples:
- |
#include <dt-bindings/clock/r8a7791-cpg-mssr.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/power/r8a7791-sysc.h>
can0: can@e6e80000 {
compatible = "renesas,can-r8a7791", "renesas,rcar-gen2-can";
reg = <0xe6e80000 0x1000>;
interrupts = <GIC_SPI 186 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&cpg CPG_MOD 916>,
<&cpg CPG_CORE R8A7791_CLK_RCAN>, <&can_clk>;
clock-names = "clkp1", "clkp2", "can_clk";
power-domains = <&sysc R8A7791_PD_ALWAYS_ON>;
resets = <&cpg 916>;
};

View File

@@ -0,0 +1,122 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/can/renesas,rcar-canfd.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas R-Car CAN FD Controller
maintainers:
- Fabrizio Castro <fabrizio.castro.jz@renesas.com>
allOf:
- $ref: can-controller.yaml#
properties:
compatible:
oneOf:
- items:
- enum:
- renesas,r8a774a1-canfd # RZ/G2M
- renesas,r8a774b1-canfd # RZ/G2N
- renesas,r8a774c0-canfd # RZ/G2E
- renesas,r8a774e1-canfd # RZ/G2H
- renesas,r8a7795-canfd # R-Car H3
- renesas,r8a7796-canfd # R-Car M3-W
- renesas,r8a77965-canfd # R-Car M3-N
- renesas,r8a77970-canfd # R-Car V3M
- renesas,r8a77980-canfd # R-Car V3H
- renesas,r8a77990-canfd # R-Car E3
- renesas,r8a77995-canfd # R-Car D3
- const: renesas,rcar-gen3-canfd # R-Car Gen3 and RZ/G2
reg:
maxItems: 1
interrupts:
items:
- description: Channel interrupt
- description: Global interrupt
clocks:
maxItems: 3
clock-names:
items:
- const: fck
- const: canfd
- const: can_clk
power-domains:
maxItems: 1
resets:
maxItems: 1
renesas,no-can-fd:
$ref: /schemas/types.yaml#/definitions/flag
description:
The controller can operate in either CAN FD only mode (default) or
Classical CAN only mode. The mode is global to both the channels.
Specify this property to put the controller in Classical CAN only mode.
assigned-clocks:
description:
Reference to the CANFD clock. The CANFD clock is a div6 clock and can be
used by both CAN (if present) and CAN FD controllers at the same time.
It needs to be scaled to maximum frequency if any of these controllers
use it.
assigned-clock-rates:
description: Maximum frequency of the CANFD clock.
patternProperties:
"^channel[01]$":
type: object
description:
The controller supports two channels and each is represented as a child
node. Each child node supports the "status" property only, which
is used to enable/disable the respective channel.
required:
- compatible
- reg
- interrupts
- clocks
- clock-names
- power-domains
- resets
- assigned-clocks
- assigned-clock-rates
- channel0
- channel1
unevaluatedProperties: false
examples:
- |
#include <dt-bindings/clock/r8a7795-cpg-mssr.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/power/r8a7795-sysc.h>
canfd: can@e66c0000 {
compatible = "renesas,r8a7795-canfd",
"renesas,rcar-gen3-canfd";
reg = <0xe66c0000 0x8000>;
interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&cpg CPG_MOD 914>,
<&cpg CPG_CORE R8A7795_CLK_CANFD>,
<&can_clk>;
clock-names = "fck", "canfd", "can_clk";
assigned-clocks = <&cpg CPG_CORE R8A7795_CLK_CANFD>;
assigned-clock-rates = <40000000>;
power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
resets = <&cpg 914>;
channel0 {
};
channel1 {
};
};

View File

@@ -81,6 +81,12 @@ Optional properties:
- gpio-controller: Boolean; if defined, MT7530's LED controller will run on
GPIO mode.
- #gpio-cells: Must be 2 if gpio-controller is defined.
- interrupt-controller: Boolean; Enables the internal interrupt controller.
If interrupt-controller is defined, the following properties are required.
- #interrupt-cells: Must be 1.
- interrupts: Parent interrupt for the interrupt controller.
See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of additional
required, optional properties and how the integrated switch subnodes must

View File

@@ -0,0 +1,132 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/dsa/nxp,sja1105.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: NXP SJA1105 Automotive Ethernet Switch Family Device Tree Bindings
description:
The SJA1105 SPI interface requires a CS-to-CLK time (t2 in UM10944.pdf) of at
least one half of t_CLK. At an SPI frequency of 1MHz, this means a minimum
cs_sck_delay of 500ns. Ensuring that this SPI timing requirement is observed
depends on the SPI bus master driver.
allOf:
- $ref: "dsa.yaml#"
maintainers:
- Vladimir Oltean <vladimir.oltean@nxp.com>
properties:
compatible:
enum:
- nxp,sja1105e
- nxp,sja1105t
- nxp,sja1105p
- nxp,sja1105q
- nxp,sja1105r
- nxp,sja1105s
- nxp,sja1110a
- nxp,sja1110b
- nxp,sja1110c
- nxp,sja1110d
reg:
maxItems: 1
# Optional container node for the 2 internal MDIO buses of the SJA1110
# (one for the internal 100base-T1 PHYs and the other for the single
# 100base-TX PHY). The "reg" property does not have physical significance.
# The PHY addresses to port correspondence is as follows: for 100base-T1,
# port 5 has PHY 1, port 6 has PHY 2 etc, while for 100base-TX, port 1 has
# PHY 1.
mdios:
type: object
properties:
'#address-cells':
const: 1
'#size-cells':
const: 0
patternProperties:
"^mdio@[0-1]$":
type: object
allOf:
- $ref: "http://devicetree.org/schemas/net/mdio.yaml#"
properties:
compatible:
oneOf:
- enum:
- nxp,sja1110-base-t1-mdio
- nxp,sja1110-base-tx-mdio
reg:
oneOf:
- enum:
- 0
- 1
required:
- compatible
- reg
required:
- compatible
- reg
unevaluatedProperties: false
examples:
- |
spi {
#address-cells = <1>;
#size-cells = <0>;
ethernet-switch@1 {
reg = <0x1>;
compatible = "nxp,sja1105t";
ethernet-ports {
#address-cells = <1>;
#size-cells = <0>;
port@0 {
phy-handle = <&rgmii_phy6>;
phy-mode = "rgmii-id";
reg = <0>;
};
port@1 {
phy-handle = <&rgmii_phy3>;
phy-mode = "rgmii-id";
reg = <1>;
};
port@2 {
phy-handle = <&rgmii_phy4>;
phy-mode = "rgmii-id";
reg = <2>;
};
port@3 {
phy-mode = "rgmii-id";
reg = <3>;
};
port@4 {
ethernet = <&enet2>;
phy-mode = "rgmii";
reg = <4>;
fixed-link {
speed = <1000>;
full-duplex;
};
};
};
};
};

View File

@@ -3,6 +3,7 @@
Required properties:
- compatible: should be one of:
"qca,qca8327"
"qca,qca8334"
"qca,qca8337"
@@ -20,6 +21,10 @@ described in dsa/dsa.txt. If the QCA8K switch is connect to a SoC's external
mdio-bus each subnode describing a port needs to have a valid phandle
referencing the internal PHY it is connected to. This is because there's no
N:N mapping of port and PHY id.
To declare the internal mdio-bus configuration, declare an mdio node in the
switch node and declare the phandle for the port referencing the internal
PHY it is connected to. In this config an internal mdio-bus is registered and
the mdio MASTER is used for communication.
Don't use mixed external and internal mdio-bus configurations, as this is
not supported by the hardware.
@@ -149,26 +154,61 @@ for the internal master mdio-bus configuration:
port@1 {
reg = <1>;
label = "lan1";
phy-mode = "internal";
phy-handle = <&phy_port1>;
};
port@2 {
reg = <2>;
label = "lan2";
phy-mode = "internal";
phy-handle = <&phy_port2>;
};
port@3 {
reg = <3>;
label = "lan3";
phy-mode = "internal";
phy-handle = <&phy_port3>;
};
port@4 {
reg = <4>;
label = "lan4";
phy-mode = "internal";
phy-handle = <&phy_port4>;
};
port@5 {
reg = <5>;
label = "wan";
phy-mode = "internal";
phy-handle = <&phy_port5>;
};
};
mdio {
#address-cells = <1>;
#size-cells = <0>;
phy_port1: phy@0 {
reg = <0>;
};
phy_port2: phy@1 {
reg = <1>;
};
phy_port3: phy@2 {
reg = <2>;
};
phy_port4: phy@3 {
reg = <3>;
};
phy_port5: phy@4 {
reg = <4>;
};
};
};

View File

@@ -1,156 +0,0 @@
NXP SJA1105 switch driver
=========================
Required properties:
- compatible:
Must be one of:
- "nxp,sja1105e"
- "nxp,sja1105t"
- "nxp,sja1105p"
- "nxp,sja1105q"
- "nxp,sja1105r"
- "nxp,sja1105s"
Although the device ID could be detected at runtime, explicit bindings
are required in order to be able to statically check their validity.
For example, SGMII can only be specified on port 4 of R and S devices,
and the non-SGMII devices, while pin-compatible, are not equal in terms
of support for RGMII internal delays (supported on P/Q/R/S, but not on
E/T).
Optional properties:
- sja1105,role-mac:
- sja1105,role-phy:
Boolean properties that can be assigned under each port node. By
default (unless otherwise specified) a port is configured as MAC if it
is driving a PHY (phy-handle is present) or as PHY if it is PHY-less
(fixed-link specified, presumably because it is connected to a MAC).
The effect of this property (in either its implicit or explicit form)
is:
- In the case of MII or RMII it specifies whether the SJA1105 port is a
clock source or sink for this interface (not applicable for RGMII
where there is a Tx and an Rx clock).
- In the case of RGMII it affects the behavior regarding internal
delays:
1. If sja1105,role-mac is specified, and the phy-mode property is one
of "rgmii-id", "rgmii-txid" or "rgmii-rxid", then the entity
designated to apply the delay/clock skew necessary for RGMII
is the PHY. The SJA1105 MAC does not apply any internal delays.
2. If sja1105,role-phy is specified, and the phy-mode property is one
of the above, the designated entity to apply the internal delays
is the SJA1105 MAC (if hardware-supported). This is only supported
by the second-generation (P/Q/R/S) hardware. On a first-generation
E or T device, it is an error to specify an RGMII phy-mode other
than "rgmii" for a port that is in fixed-link mode. In that case,
the clock skew must either be added by the MAC at the other end of
the fixed-link, or by PCB serpentine traces on the board.
These properties are required, for example, in the case where SJA1105
ports are at both ends of a MII/RMII PHY-less setup. One end would need
to have sja1105,role-mac, while the other sja1105,role-phy.
See Documentation/devicetree/bindings/net/dsa/dsa.txt for the list of standard
DSA required and optional properties.
Other observations
------------------
The SJA1105 SPI interface requires a CS-to-CLK time (t2 in UM10944) of at least
one half of t_CLK. At an SPI frequency of 1MHz, this means a minimum
cs_sck_delay of 500ns. Ensuring that this SPI timing requirement is observed
depends on the SPI bus master driver.
Example
-------
Ethernet switch connected via SPI to the host, CPU port wired to enet2:
arch/arm/boot/dts/ls1021a-tsn.dts:
/* SPI controller of the LS1021 */
&dspi0 {
sja1105@1 {
reg = <0x1>;
#address-cells = <1>;
#size-cells = <0>;
compatible = "nxp,sja1105t";
spi-max-frequency = <4000000>;
fsl,spi-cs-sck-delay = <1000>;
fsl,spi-sck-cs-delay = <1000>;
ports {
#address-cells = <1>;
#size-cells = <0>;
port@0 {
/* ETH5 written on chassis */
label = "swp5";
phy-handle = <&rgmii_phy6>;
phy-mode = "rgmii-id";
reg = <0>;
/* Implicit "sja1105,role-mac;" */
};
port@1 {
/* ETH2 written on chassis */
label = "swp2";
phy-handle = <&rgmii_phy3>;
phy-mode = "rgmii-id";
reg = <1>;
/* Implicit "sja1105,role-mac;" */
};
port@2 {
/* ETH3 written on chassis */
label = "swp3";
phy-handle = <&rgmii_phy4>;
phy-mode = "rgmii-id";
reg = <2>;
/* Implicit "sja1105,role-mac;" */
};
port@3 {
/* ETH4 written on chassis */
phy-handle = <&rgmii_phy5>;
label = "swp4";
phy-mode = "rgmii-id";
reg = <3>;
/* Implicit "sja1105,role-mac;" */
};
port@4 {
/* Internal port connected to eth2 */
ethernet = <&enet2>;
phy-mode = "rgmii";
reg = <4>;
/* Implicit "sja1105,role-phy;" */
fixed-link {
speed = <1000>;
full-duplex;
};
};
};
};
};
/* MDIO controller of the LS1021 */
&mdio0 {
/* BCM5464 */
rgmii_phy3: ethernet-phy@3 {
reg = <0x3>;
};
rgmii_phy4: ethernet-phy@4 {
reg = <0x4>;
};
rgmii_phy5: ethernet-phy@5 {
reg = <0x5>;
};
rgmii_phy6: ethernet-phy@6 {
reg = <0x6>;
};
};
/* Ethernet master port of the LS1021 */
&enet2 {
phy-connection-type = "rgmii";
status = "ok";
fixed-link {
speed = <1000>;
full-duplex;
};
};

View File

@@ -68,6 +68,7 @@ properties:
- tbi
- rev-mii
- rmii
- rev-rmii
# RX and TX delays are added by the MAC when required
- rgmii
@@ -97,6 +98,7 @@ properties:
- 10gbase-kr
- usxgmii
- 10gbase-r
- 25gbase-r
phy-mode:
$ref: "#/properties/phy-connection-type"

View File

@@ -0,0 +1,76 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/ingenic,mac.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Bindings for MAC in Ingenic SoCs
maintainers:
- 周琰杰 (Zhou Yanjie) <zhouyanjie@wanyeetech.com>
description:
The Ethernet Media Access Controller in Ingenic SoCs.
properties:
compatible:
enum:
- ingenic,jz4775-mac
- ingenic,x1000-mac
- ingenic,x1600-mac
- ingenic,x1830-mac
- ingenic,x2000-mac
reg:
maxItems: 1
interrupts:
maxItems: 1
interrupt-names:
const: macirq
clocks:
maxItems: 1
clock-names:
const: stmmaceth
mode-reg:
description: An extra syscon register that controls the ethernet interface and timing delay
rx-clk-delay-ps:
description: RGMII receive clock delay defined in picoseconds
tx-clk-delay-ps:
description: RGMII transmit clock delay defined in picoseconds
required:
- compatible
- reg
- interrupts
- interrupt-names
- clocks
- clock-names
- mode-reg
additionalProperties: false
examples:
- |
#include <dt-bindings/clock/x1000-cgu.h>
mac: ethernet@134b0000 {
compatible = "ingenic,x1000-mac";
reg = <0x134b0000 0x2000>;
interrupt-parent = <&intc>;
interrupts = <55>;
interrupt-names = "macirq";
clocks = <&cgu X1000_CLK_MAC>;
clock-names = "stmmaceth";
mode-reg = <&mac_phy_ctrl>;
};
...

View File

@@ -0,0 +1,226 @@
# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/microchip,sparx5-switch.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Microchip Sparx5 Ethernet switch controller
maintainers:
- Steen Hegelund <steen.hegelund@microchip.com>
- Lars Povlsen <lars.povlsen@microchip.com>
description: |
The SparX-5 Enterprise Ethernet switch family provides a rich set of
Enterprise switching features such as advanced TCAM-based VLAN and
QoS processing enabling delivery of differentiated services, and
security through TCAM-based frame processing using versatile content
aware processor (VCAP).
IPv4/IPv6 Layer 3 (L3) unicast and multicast routing is supported
with up to 18K IPv4/9K IPv6 unicast LPM entries and up to 9K IPv4/3K
IPv6 (S,G) multicast groups.
L3 security features include source guard and reverse path
forwarding (uRPF) tasks. Additional L3 features include VRF-Lite and
IP tunnels (IP over GRE/IP).
The SparX-5 switch family targets managed Layer 2 and Layer 3
equipment in SMB, SME, and Enterprise where high port count
1G/2.5G/5G/10G switching with 10G/25G aggregation links is required.
properties:
$nodename:
pattern: "^switch@[0-9a-f]+$"
compatible:
const: microchip,sparx5-switch
reg:
items:
- description: cpu target
- description: devices target
- description: general control block target
reg-names:
items:
- const: cpu
- const: devices
- const: gcb
interrupts:
minItems: 1
items:
- description: register based extraction
- description: frame dma based extraction
interrupt-names:
minItems: 1
items:
- const: xtr
- const: fdma
resets:
items:
- description: Reset controller used for switch core reset (soft reset)
reset-names:
items:
- const: switch
mac-address: true
ethernet-ports:
type: object
patternProperties:
"^port@[0-9a-f]+$":
type: object
properties:
'#address-cells':
const: 1
'#size-cells':
const: 0
reg:
description: Switch port number
phys:
maxItems: 1
description:
phandle of an Ethernet SerDes PHY. This defines which SerDes
instance will handle the Ethernet traffic.
phy-mode:
description:
This specifies the interface used by the Ethernet SerDes towards
the PHY or SFP.
microchip,bandwidth:
description: Specifies bandwidth in Mbit/s allocated to the port.
$ref: "/schemas/types.yaml#/definitions/uint32"
maximum: 25000
phy-handle:
description:
phandle of an Ethernet PHY. This is optional and if provided it
points to the cuPHY used by the Ethernet SerDes.
sfp:
description:
phandle of an SFP. This is optional and used when not specifying
a cuPHY. It points to the SFP node that describes the SFP used by
the Ethernet SerDes.
managed: true
microchip,sd-sgpio:
description:
Index of the port's Signal Detect SGPIO in the set of 384 SGPIOs.
This is optional, and only needed if the default used index is
not correct.
$ref: "/schemas/types.yaml#/definitions/uint32"
minimum: 0
maximum: 383
required:
- reg
- phys
- phy-mode
- microchip,bandwidth
oneOf:
- required:
- phy-handle
- required:
- sfp
- managed
required:
- compatible
- reg
- reg-names
- interrupts
- interrupt-names
- resets
- reset-names
- ethernet-ports
additionalProperties: false
examples:
- |
#include <dt-bindings/interrupt-controller/arm-gic.h>
switch: switch@600000000 {
compatible = "microchip,sparx5-switch";
reg = <0 0x401000>,
<0x10004000 0x7fc000>,
<0x11010000 0xaf0000>;
reg-names = "cpu", "devices", "gcb";
interrupts = <GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "xtr";
resets = <&reset 0>;
reset-names = "switch";
ethernet-ports {
#address-cells = <1>;
#size-cells = <0>;
port0: port@0 {
reg = <0>;
microchip,bandwidth = <1000>;
phys = <&serdes 13>;
phy-handle = <&phy0>;
phy-mode = "qsgmii";
};
/* ... */
/* Then the 25G interfaces */
port60: port@60 {
reg = <60>;
microchip,bandwidth = <25000>;
phys = <&serdes 29>;
phy-mode = "10gbase-r";
sfp = <&sfp_eth60>;
managed = "in-band-status";
microchip,sd-sgpio = <365>;
};
port61: port@61 {
reg = <61>;
microchip,bandwidth = <25000>;
phys = <&serdes 30>;
phy-mode = "10gbase-r";
sfp = <&sfp_eth61>;
managed = "in-band-status";
microchip,sd-sgpio = <369>;
};
port62: port@62 {
reg = <62>;
microchip,bandwidth = <25000>;
phys = <&serdes 31>;
phy-mode = "10gbase-r";
sfp = <&sfp_eth62>;
managed = "in-band-status";
microchip,sd-sgpio = <373>;
};
port63: port@63 {
reg = <63>;
microchip,bandwidth = <25000>;
phys = <&serdes 32>;
phy-mode = "10gbase-r";
sfp = <&sfp_eth63>;
managed = "in-band-status";
microchip,sd-sgpio = <377>;
};
/* Finally the Management interface */
port64: port@64 {
reg = <64>;
microchip,bandwidth = <1000>;
phys = <&serdes 0>;
phy-handle = <&phy64>;
phy-mode = "sgmii";
mac-address = [ 00 00 00 01 02 03 ];
};
};
};
...
# vim: set ts=2 sw=2 sts=2 tw=80 et cc=80 ft=yaml :

View File

@@ -27,6 +27,9 @@ properties:
reg:
maxItems: 1
clocks:
maxItems: 1
wake-gpios:
maxItems: 1
description:
@@ -80,6 +83,8 @@ examples:
en-gpios = <&gpf1 4 GPIO_ACTIVE_HIGH>;
wake-gpios = <&gpj0 2 GPIO_ACTIVE_HIGH>;
clocks = <&rpmcc 20>;
};
};
# UART example on Raspberry Pi

View File

@@ -44,6 +44,7 @@ description:
properties:
compatible:
enum:
- qcom,msm8998-ipa
- qcom,sc7180-ipa
- qcom,sc7280-ipa
- qcom,sdm845-ipa

View File

@@ -1,69 +0,0 @@
Qualcomm Bluetooth Chips
---------------------
This documents the binding structure and common properties for serial
attached Qualcomm devices.
Serial attached Qualcomm devices shall be a child node of the host UART
device the slave device is attached to.
Required properties:
- compatible: should contain one of the following:
* "qcom,qca6174-bt"
* "qcom,qca9377-bt"
* "qcom,wcn3990-bt"
* "qcom,wcn3991-bt"
* "qcom,wcn3998-bt"
* "qcom,qca6390-bt"
Optional properties for compatible string qcom,qca6174-bt:
- enable-gpios: gpio specifier used to enable chip
- clocks: clock provided to the controller (SUSCLK_32KHZ)
- firmware-name: specify the name of nvm firmware to load
Optional properties for compatible string qcom,qca9377-bt:
- max-speed: see Documentation/devicetree/bindings/serial/serial.yaml
Required properties for compatible string qcom,wcn399x-bt:
- vddio-supply: VDD_IO supply regulator handle.
- vddxo-supply: VDD_XO supply regulator handle.
- vddrf-supply: VDD_RF supply regulator handle.
- vddch0-supply: VDD_CH0 supply regulator handle.
Optional properties for compatible string qcom,wcn399x-bt:
- max-speed: see Documentation/devicetree/bindings/serial/serial.yaml
- firmware-name: specify the name of nvm firmware to load
- clocks: clock provided to the controller
Examples:
serial@7570000 {
label = "BT-UART";
status = "okay";
bluetooth {
compatible = "qcom,qca6174-bt";
enable-gpios = <&pm8994_gpios 19 GPIO_ACTIVE_HIGH>;
clocks = <&divclk4>;
firmware-name = "nvm_00440302.bin";
};
};
serial@898000 {
bluetooth {
compatible = "qcom,wcn3990-bt";
vddio-supply = <&vreg_s4a_1p8>;
vddxo-supply = <&vreg_l7a_1p8>;
vddrf-supply = <&vreg_l17a_1p3>;
vddch0-supply = <&vreg_l25a_3p3>;
max-speed = <3200000>;
firmware-name = "crnv21.bin";
clocks = <&rpmhcc RPMH_RF_CLK2>;
};
};

View File

@@ -0,0 +1,183 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/qualcomm-bluetooth.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Qualcomm Bluetooth Chips
maintainers:
- Balakrishna Godavarthi <bgodavar@codeaurora.org>
- Rocky Liao <rjliao@codeaurora.org>
description:
This binding describes Qualcomm UART-attached bluetooth chips.
properties:
compatible:
enum:
- qcom,qca6174-bt
- qcom,qca9377-bt
- qcom,wcn3990-bt
- qcom,wcn3991-bt
- qcom,wcn3998-bt
- qcom,qca6390-bt
- qcom,wcn6750-bt
enable-gpios:
maxItems: 1
description: gpio specifier used to enable chip
swctrl-gpios:
maxItems: 1
description: gpio specifier is used to find status
of clock supply to SoC
clocks:
maxItems: 1
description: clock provided to the controller (SUSCLK_32KHZ)
vddio-supply:
description: VDD_IO supply regulator handle
vddxo-supply:
description: VDD_XO supply regulator handle
vddrf-supply:
description: VDD_RF supply regulator handle
vddch0-supply:
description: VDD_CH0 supply regulator handle
vddaon-supply:
description: VDD_AON supply regulator handle
vddbtcxmx-supply:
description: VDD_BT_CXMX supply regulator handle
vddrfacmn-supply:
description: VDD_RFA_CMN supply regulator handle
vddrfa0p8-supply:
description: VDD_RFA_0P8 supply regulator handle
vddrfa1p7-supply:
description: VDD_RFA_1P7 supply regulator handle
vddrfa1p2-supply:
description: VDD_RFA_1P2 supply regulator handle
vddrfa2p2-supply:
description: VDD_RFA_2P2 supply regulator handle
vddasd-supply:
description: VDD_ASD supply regulator handle
max-speed:
description: see Documentation/devicetree/bindings/serial/serial.yaml
firmware-name:
description: specify the name of nvm firmware to load
local-bd-address:
description: see Documentation/devicetree/bindings/net/bluetooth.txt
required:
- compatible
additionalProperties: false
allOf:
- if:
properties:
compatible:
contains:
enum:
- qcom,qca6174-bt
then:
required:
- enable-gpios
- clocks
- if:
properties:
compatible:
contains:
enum:
- qcom,wcn3990-bt
- qcom,wcn3991-bt
- qcom,wcn3998-bt
then:
required:
- vddio-supply
- vddxo-supply
- vddrf-supply
- vddch0-supply
- if:
properties:
compatible:
contains:
enum:
- qcom,wcn6750-bt
then:
required:
- enable-gpios
- swctrl-gpios
- vddio-supply
- vddaon-supply
- vddbtcxmx-supply
- vddrfacmn-supply
- vddrfa0p8-supply
- vddrfa1p7-supply
- vddrfa1p2-supply
- vddasd-supply
examples:
- |
#include <dt-bindings/gpio/gpio.h>
serial {
bluetooth {
compatible = "qcom,qca6174-bt";
enable-gpios = <&pm8994_gpios 19 GPIO_ACTIVE_HIGH>;
clocks = <&divclk4>;
firmware-name = "nvm_00440302.bin";
};
};
- |
serial {
bluetooth {
compatible = "qcom,wcn3990-bt";
vddio-supply = <&vreg_s4a_1p8>;
vddxo-supply = <&vreg_l7a_1p8>;
vddrf-supply = <&vreg_l17a_1p3>;
vddch0-supply = <&vreg_l25a_3p3>;
max-speed = <3200000>;
firmware-name = "crnv21.bin";
};
};
- |
serial {
bluetooth {
compatible = "qcom,wcn6750-bt";
pinctrl-names = "default";
pinctrl-0 = <&bt_en_default>;
enable-gpios = <&tlmm 85 GPIO_ACTIVE_HIGH>;
swctrl-gpios = <&tlmm 86 GPIO_ACTIVE_HIGH>;
vddio-supply = <&vreg_l19b_1p8>;
vddaon-supply = <&vreg_s7b_0p9>;
vddbtcxmx-supply = <&vreg_s7b_0p9>;
vddrfacmn-supply = <&vreg_s7b_0p9>;
vddrfa0p8-supply = <&vreg_s7b_0p9>;
vddrfa1p7-supply = <&vreg_s1b_1p8>;
vddrfa1p2-supply = <&vreg_s8b_1p2>;
vddrfa2p2-supply = <&vreg_s1c_2p2>;
vddasd-supply = <&vreg_l11c_2p8>;
max-speed = <3200000>;
firmware-name = "msnv11.bin";
};
};

View File

@@ -0,0 +1,45 @@
# SPDX-License-Identifier: GPL-2.0+
%YAML 1.2
---
$id: http://devicetree.org/schemas/net/realtek,rtl82xx.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Realtek RTL82xx PHY
maintainers:
- Andrew Lunn <andrew@lunn.ch>
- Florian Fainelli <f.fainelli@gmail.com>
- Heiner Kallweit <hkallweit1@gmail.com>
description:
Bindings for Realtek RTL82xx PHYs
allOf:
- $ref: ethernet-phy.yaml#
properties:
realtek,clkout-disable:
type: boolean
description:
Disable CLKOUT clock, CLKOUT clock default is enabled after hardware reset.
realtek,aldps-enable:
type: boolean
description:
Enable ALDPS mode, ALDPS mode default is disabled after hardware reset.
unevaluatedProperties: false
examples:
- |
mdio {
#address-cells = <1>;
#size-cells = <0>;
ethphy1: ethernet-phy@1 {
reg = <1>;
realtek,clkout-disable;
realtek,aldps-enable;
};
};

View File

@@ -19,10 +19,12 @@ select:
- rockchip,rk3128-gmac
- rockchip,rk3228-gmac
- rockchip,rk3288-gmac
- rockchip,rk3308-gmac
- rockchip,rk3328-gmac
- rockchip,rk3366-gmac
- rockchip,rk3368-gmac
- rockchip,rk3399-gmac
- rockchip,rk3568-gmac
- rockchip,rv1108-gmac
required:
- compatible
@@ -32,17 +34,23 @@ allOf:
properties:
compatible:
items:
oneOf:
- items:
- enum:
- rockchip,px30-gmac
- rockchip,rk3128-gmac
- rockchip,rk3228-gmac
- rockchip,rk3288-gmac
- rockchip,rk3308-gmac
- rockchip,rk3328-gmac
- rockchip,rk3366-gmac
- rockchip,rk3368-gmac
- rockchip,rk3399-gmac
- rockchip,rv1108-gmac
- items:
- enum:
- rockchip,rk3568-gmac
- const: snps,dwmac-4.20a
clocks:
minItems: 5

View File

@@ -51,11 +51,20 @@ properties:
- allwinner,sun8i-r40-emac
- allwinner,sun8i-v3s-emac
- allwinner,sun50i-a64-emac
- loongson,ls2k-dwmac
- loongson,ls7a-dwmac
- amlogic,meson6-dwmac
- amlogic,meson8b-dwmac
- amlogic,meson8m2-dwmac
- amlogic,meson-gxbb-dwmac
- amlogic,meson-axg-dwmac
- loongson,ls2k-dwmac
- loongson,ls7a-dwmac
- ingenic,jz4775-mac
- ingenic,x1000-mac
- ingenic,x1600-mac
- ingenic,x1830-mac
- ingenic,x2000-mac
- rockchip,px30-gmac
- rockchip,rk3128-gmac
- rockchip,rk3228-gmac
@@ -310,6 +319,11 @@ allOf:
- allwinner,sun8i-r40-emac
- allwinner,sun8i-v3s-emac
- allwinner,sun50i-a64-emac
- ingenic,jz4775-mac
- ingenic,x1000-mac
- ingenic,x1600-mac
- ingenic,x1830-mac
- ingenic,x2000-mac
- snps,dwxgmac
- snps,dwxgmac-2.10
- st,spear600-gmac
@@ -353,6 +367,13 @@ allOf:
- allwinner,sun8i-r40-emac
- allwinner,sun8i-v3s-emac
- allwinner,sun50i-a64-emac
- loongson,ls2k-dwmac
- loongson,ls7a-dwmac
- ingenic,jz4775-mac
- ingenic,x1000-mac
- ingenic,x1600-mac
- ingenic,x1830-mac
- ingenic,x2000-mac
- snps,dwmac-4.00
- snps,dwmac-4.10a
- snps,dwmac-4.20a

View File

@@ -0,0 +1,35 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: "http://devicetree.org/schemas/soc/microchip/microchip,polarfire-soc-sys-controller.yaml#"
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
title: Microchip PolarFire SoC (MPFS) MSS (microprocessor subsystem) system controller
maintainers:
- Conor Dooley <conor.dooley@microchip.com>
description: |
The PolarFire SoC system controller is communicated with via a mailbox.
This document describes the bindings for the client portion of that mailbox.
properties:
mboxes:
maxItems: 1
compatible:
const: microchip,polarfire-soc-sys-controller
required:
- compatible
- mboxes
additionalProperties: false
examples:
- |
syscontroller: syscontroller {
compatible = "microchip,polarfire-soc-sys-controller";
mboxes = <&mbox 0>;
};

View File

@@ -1,9 +1,8 @@
.. SPDX-License-Identifier: GPL-2.0+
.. |u8| replace:: :c:type:`u8 <u8>`
.. |u16| replace:: :c:type:`u16 <u16>`
.. |ssam_cdev_request| replace:: :c:type:`struct ssam_cdev_request <ssam_cdev_request>`
.. |ssam_cdev_request_flags| replace:: :c:type:`enum ssam_cdev_request_flags <ssam_cdev_request_flags>`
.. |ssam_cdev_event| replace:: :c:type:`struct ssam_cdev_event <ssam_cdev_event>`
==============================
User-Space EC Interface (cdev)
@@ -23,6 +22,40 @@ These IOCTLs and their respective input/output parameter structs are defined in
A small python library and scripts for accessing this interface can be found
at https://github.com/linux-surface/surface-aggregator-module/tree/master/scripts/ssam.
.. contents::
Receiving Events
================
Events can be received by reading from the device-file. They are represented by
the |ssam_cdev_event| datatype.
Before events are available to be read, however, the desired notifiers must be
registered via the ``SSAM_CDEV_NOTIF_REGISTER`` IOCTL. Notifiers are, in
essence, callbacks, called when the EC sends an event. They are, in this
interface, associated with a specific target category and device-file-instance.
They forward any event of this category to the buffer of the corresponding
instance, from which it can then be read.
Notifiers themselves do not enable events on the EC. Thus, it may additionally
be necessary to enable events via the ``SSAM_CDEV_EVENT_ENABLE`` IOCTL. While
notifiers work per-client (i.e. per-device-file-instance), events are enabled
globally, for the EC and all of its clients (regardless of userspace or
non-userspace). The ``SSAM_CDEV_EVENT_ENABLE`` and ``SSAM_CDEV_EVENT_DISABLE``
IOCTLs take care of reference counting the events, such that an event is
enabled as long as there is a client that has requested it.
Note that enabled events are not automatically disabled once the client
instance is closed. Therefore any client process (or group of processes) should
balance their event enable calls with the corresponding event disable calls. It
is, however, perfectly valid to enable and disable events on different client
instances. For example, it is valid to set up notifiers and read events on
client instance ``A``, enable those events on instance ``B`` (note that these
will also be received by A since events are enabled/disabled globally), and
after no more events are desired, disable the previously enabled events via
instance ``C``.
Controller IOCTLs
=================
@@ -45,9 +78,33 @@ The following IOCTLs are provided:
- ``REQUEST``
- Perform synchronous SAM request.
* - ``0xA5``
- ``2``
- ``W``
- ``NOTIF_REGISTER``
- Register event notifier.
``REQUEST``
-----------
* - ``0xA5``
- ``3``
- ``W``
- ``NOTIF_UNREGISTER``
- Unregister event notifier.
* - ``0xA5``
- ``4``
- ``W``
- ``EVENT_ENABLE``
- Enable event source.
* - ``0xA5``
- ``5``
- ``W``
- ``EVENT_DISABLE``
- Disable event source.
``SSAM_CDEV_REQUEST``
---------------------
Defined as ``_IOWR(0xA5, 1, struct ssam_cdev_request)``.
@@ -82,6 +139,66 @@ submitted, and completed (i.e. handed back to user-space) successfully from
inside the IOCTL, but the request ``status`` member may still be negative in
case the actual execution of the request failed after it has been submitted.
A full definition of the argument struct is provided below:
A full definition of the argument struct is provided below.
``SSAM_CDEV_NOTIF_REGISTER``
----------------------------
Defined as ``_IOW(0xA5, 2, struct ssam_cdev_notifier_desc)``.
Register a notifier for the event target category specified in the given
notifier description with the specified priority. Notifier registration is
required to receive events, but does not enable events themselves. After a
notifier for a specific target category has been registered, all events of that
category will be forwarded to the userspace client and can then be read from
the device file instance. Note that events may have to be enabled, e.g. via the
``SSAM_CDEV_EVENT_ENABLE`` IOCTL, before the EC will send them.
Only one notifier can be registered per target category and client instance. If
a notifier has already been registered, this IOCTL will fail with ``-EEXIST``.
Notifiers will automatically be removed when the device file instance is
closed.
``SSAM_CDEV_NOTIF_UNREGISTER``
------------------------------
Defined as ``_IOW(0xA5, 3, struct ssam_cdev_notifier_desc)``.
Unregisters the notifier associated with the specified target category. The
priority field will be ignored by this IOCTL. If no notifier has been
registered for this client instance and the given category, this IOCTL will
fail with ``-ENOENT``.
``SSAM_CDEV_EVENT_ENABLE``
--------------------------
Defined as ``_IOW(0xA5, 4, struct ssam_cdev_event_desc)``.
Enable the event associated with the given event descriptor.
Note that this call will not register a notifier itself, it will only enable
events on the controller. If you want to receive events by reading from the
device file, you will need to register the corresponding notifier(s) on that
instance.
Events are not automatically disabled when the device file is closed. This must
be done manually, via a call to the ``SSAM_CDEV_EVENT_DISABLE`` IOCTL.
``SSAM_CDEV_EVENT_DISABLE``
---------------------------
Defined as ``_IOW(0xA5, 5, struct ssam_cdev_event_desc)``.
Disable the event associated with the given event descriptor.
Note that this will not unregister any notifiers. Events may still be received
and forwarded to user-space after this call. The only safe way of stopping
events from being received is unregistering all previously registered
notifiers.
Structures and Enums
====================
.. kernel-doc:: include/uapi/linux/surface_aggregator/cdev.h

View File

@@ -4,14 +4,14 @@ Journal (jbd2)
--------------
Introduced in ext3, the ext4 filesystem employs a journal to protect the
filesystem against corruption in the case of a system crash. A small
continuous region of disk (default 128MiB) is reserved inside the
filesystem as a place to land “important” data writes on-disk as quickly
as possible. Once the important data transaction is fully written to the
disk and flushed from the disk write cache, a record of the data being
committed is also written to the journal. At some later point in time,
the journal code writes the transactions to their final locations on
disk (this could involve a lot of seeking or a lot of small
filesystem against metadata inconsistencies in the case of a system crash. Up
to 10,240,000 file system blocks (see man mke2fs(8) for more details on journal
size limits) can be reserved inside the filesystem as a place to land
important data writes on-disk as quickly as possible. Once the important
data transaction is fully written to the disk and flushed from the disk write
cache, a record of the data being committed is also written to the journal. At
some later point in time, the journal code writes the transactions to their
final locations on disk (this could involve a lot of seeking or a lot of small
read-write-erases) before erasing the commit record. Should the system
crash during the second slow write, the journal can be replayed all the
way to the latest commit record, guaranteeing the atomicity of whatever
@@ -731,3 +731,26 @@ point, the refcount for inode 11 is not reliable, but that gets fixed by the
replay of last inode 11 tag. Thus, by converting a non-idempotent procedure
into a series of idempotent outcomes, fast commits ensured idempotence during
the replay.
Journal Checkpoint
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checkpointing the journal ensures all transactions and their associated buffers
are submitted to the disk. In-progress transactions are waited upon and included
in the checkpoint. Checkpointing is used internally during critical updates to
the filesystem including journal recovery, filesystem resizing, and freeing of
the journal_t structure.
A journal checkpoint can be triggered from userspace via the ioctl
EXT4_IOC_CHECKPOINT. This ioctl takes a single, u64 argument for flags.
Currently, three flags are supported. First, EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN
can be used to verify input to the ioctl. It returns error if there is any
invalid input, otherwise it returns success without performing
any checkpointing. This can be used to check whether the ioctl exists on a
system and to verify there are no issues with arguments or flags. The
other two flags are EXT4_IOC_CHECKPOINT_FLAG_DISCARD and
EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT. These flags cause the journal blocks to be
discarded or zero-filled, respectively, after the journal checkpoint is
complete. EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT
cannot both be set. The ioctl may be useful when snapshotting a system or for
complying with content deletion SLOs.

View File

@@ -480,7 +480,7 @@ prototypes::
locking rules:
======================= ===================
ops bd_mutex
ops open_mutex
======================= ===================
open: yes
release: yes

View File

@@ -0,0 +1,199 @@
.. SPDX-License-Identifier: GPL-2.0
=========================
MDIO bus and PHYs in ACPI
=========================
The PHYs on an MDIO bus [1] are probed and registered using
fwnode_mdiobus_register_phy().
Later, for connecting these PHYs to their respective MACs, the PHYs registered
on the MDIO bus have to be referenced.
This document introduces two _DSD properties that are to be used
for connecting PHYs on the MDIO bus [3] to the MAC layer.
These properties are defined in accordance with the "Device
Properties UUID For _DSD" [2] document and the
daffd814-6eba-4d8c-8a91-bc9bbf4aa301 UUID must be used in the Device
Data Descriptors containing them.
phy-handle
----------
For each MAC node, a device property "phy-handle" is used to reference
the PHY that is registered on an MDIO bus. This is mandatory for
network interfaces that have PHYs connected to MAC via MDIO bus.
During the MDIO bus driver initialization, PHYs on this bus are probed
using the _ADR object as shown below and are registered on the MDIO bus.
.. code-block:: none
Scope(\_SB.MDI0)
{
Device(PHY1) {
Name (_ADR, 0x1)
} // end of PHY1
Device(PHY2) {
Name (_ADR, 0x2)
} // end of PHY2
}
Later, during the MAC driver initialization, the registered PHY devices
have to be retrieved from the MDIO bus. For this, the MAC driver needs
references to the previously registered PHYs which are provided
as device object references (e.g. \_SB.MDI0.PHY1).
phy-mode
--------
The "phy-mode" _DSD property is used to describe the connection to
the PHY. The valid values for "phy-mode" are defined in [4].
managed
-------
Optional property, which specifies the PHY management type.
The valid values for "managed" are defined in [4].
fixed-link
----------
The "fixed-link" is described by a data-only subnode of the
MAC port, which is linked in the _DSD package via
hierarchical data extension (UUID dbb8e3e6-5886-4ba6-8795-1319f52a966b
in accordance with [5] "_DSD Implementation Guide" document).
The subnode should comprise a required property ("speed") and
possibly the optional ones - the complete list of parameters and
their values is specified in [4].
The following ASL example illustrates the usage of these properties.
DSDT entry for MDIO node
------------------------
The MDIO bus has an SoC component (MDIO controller) and a platform
component (PHYs on the MDIO bus).
a) Silicon Component
This node describes the MDIO controller, MDI0
---------------------------------------------
.. code-block:: none
Scope(_SB)
{
Device(MDI0) {
Name(_HID, "NXP0006")
Name(_CCA, 1)
Name(_UID, 0)
Name(_CRS, ResourceTemplate() {
Memory32Fixed(ReadWrite, MDI0_BASE, MDI_LEN)
Interrupt(ResourceConsumer, Level, ActiveHigh, Shared)
{
MDI0_IT
}
}) // end of _CRS for MDI0
} // end of MDI0
}
b) Platform Component
The PHY1 and PHY2 nodes represent the PHYs connected to MDIO bus MDI0
---------------------------------------------------------------------
.. code-block:: none
Scope(\_SB.MDI0)
{
Device(PHY1) {
Name (_ADR, 0x1)
} // end of PHY1
Device(PHY2) {
Name (_ADR, 0x2)
} // end of PHY2
}
DSDT entries representing MAC nodes
-----------------------------------
Below are the MAC nodes where PHY nodes are referenced.
phy-mode and phy-handle are used as explained earlier.
------------------------------------------------------
.. code-block:: none
Scope(\_SB.MCE0.PR17)
{
Name (_DSD, Package () {
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
Package () {
Package (2) {"phy-mode", "rgmii-id"},
Package (2) {"phy-handle", \_SB.MDI0.PHY1}
}
})
}
Scope(\_SB.MCE0.PR18)
{
Name (_DSD, Package () {
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
Package () {
Package (2) {"phy-mode", "rgmii-id"},
Package (2) {"phy-handle", \_SB.MDI0.PHY2}
}
})
}
MAC node example where "managed" property is specified.
-------------------------------------------------------
.. code-block:: none
Scope(\_SB.PP21.ETH0)
{
Name (_DSD, Package () {
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
Package () {
Package () {"phy-mode", "sgmii"},
Package () {"managed", "in-band-status"}
}
})
}
MAC node example with a "fixed-link" subnode.
---------------------------------------------
.. code-block:: none
Scope(\_SB.PP21.ETH1)
{
Name (_DSD, Package () {
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
Package () {
Package () {"phy-mode", "sgmii"},
},
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
Package () {
Package () {"fixed-link", "LNK0"}
}
})
Name (LNK0, Package(){ // Data-only subnode of port
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
Package () {
Package () {"speed", 1000},
Package () {"full-duplex", 1}
}
})
}
References
==========
[1] Documentation/networking/phy.rst
[2] https://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf
[3] Documentation/firmware-guide/acpi/DSD-properties-rules.rst
[4] Documentation/devicetree/bindings/net/ethernet-controller.yaml
[5] https://github.com/UEFI/DSD-Guide/blob/main/dsd-guide.pdf

View File

@@ -11,6 +11,7 @@ ACPI Support
dsd/graph
dsd/data-node-references
dsd/leds
dsd/phy
enumeration
osi
method-customizing

View File

@@ -27,32 +27,134 @@ these MAP frames and send them to appropriate PDN's.
2. Packet format
================
a. MAP packet (data / control)
a. MAP packet v1 (data / control)
MAP header has the same endianness of the IP packet.
MAP header fields are in big endian format.
Packet format::
Bit 0 1 2-7 8-15 16-31
Function Command / Data Reserved Pad Multiplexer ID Payload length
Bit 32-x
Function Raw Bytes
Function Raw bytes
Command (1)/ Data (0) bit value is to indicate if the packet is a MAP command
or data packet. Control packet is used for transport level flow control. Data
or data packet. Command packet is used for transport level flow control. Data
packets are standard IP packets.
Reserved bits are usually zeroed out and to be ignored by receiver.
Reserved bits must be zero when sent and ignored when received.
Padding is number of bytes to be added for 4 byte alignment if required by
hardware.
Padding is the number of bytes to be appended to the payload to
ensure 4 byte alignment.
Multiplexer ID is to indicate the PDN on which data has to be sent.
Payload length includes the padding length but does not include MAP header
length.
b. MAP packet (command specific)::
b. MAP packet v4 (data / control)
MAP header fields are in big endian format.
Packet format::
Bit 0 1 2-7 8-15 16-31
Function Command / Data Reserved Pad Multiplexer ID Payload length
Bit 32-(x-33) (x-32)-x
Function Raw bytes Checksum offload header
Command (1)/ Data (0) bit value is to indicate if the packet is a MAP command
or data packet. Command packet is used for transport level flow control. Data
packets are standard IP packets.
Reserved bits must be zero when sent and ignored when received.
Padding is the number of bytes to be appended to the payload to
ensure 4 byte alignment.
Multiplexer ID is to indicate the PDN on which data has to be sent.
Payload length includes the padding length but does not include MAP header
length.
Checksum offload header, has the information about the checksum processing done
by the hardware. Checksum offload header fields are in big endian format.
Packet format::
Bit 0-14 15 16-31
Function Reserved Valid Checksum start offset
Bit 32-47 48-63
Function Checksum length Checksum value
Reserved bits must be zero when sent and ignored when received.
Valid bit indicates whether the partial checksum is calculated and is valid.
Set to 1, if it is valid. Set to 0 otherwise.
Padding is the number of bytes to be appended to the payload to
ensure 4 byte alignment.
Checksum start offset indicates the offset in bytes from the beginning of the
IP header, from which the modem computed the checksum.
Checksum length is the length in bytes starting from CKSUM_START_OFFSET,
over which the checksum is computed.
Checksum value, indicates the checksum computed.
c. MAP packet v5 (data / control)
MAP header fields are in big endian format.
Packet format::
Bit 0 1 2-7 8-15 16-31
Function Command / Data Next header Pad Multiplexer ID Payload length
Bit 32-x
Function Raw bytes
Command (1)/ Data (0) bit value is to indicate if the packet is a MAP command
or data packet. Command packet is used for transport level flow control. Data
packets are standard IP packets.
Next header is used to indicate the presence of another header, currently is
limited to checksum header.
Padding is the number of bytes to be appended to the payload to
ensure 4 byte alignment.
Multiplexer ID is to indicate the PDN on which data has to be sent.
Payload length includes the padding length but does not include MAP header
length.
d. Checksum offload header v5
Checksum offload header fields are in big endian format.
Bit 0 - 6 7 8-15 16-31
Function Header Type Next Header Checksum Valid Reserved
Header Type is to indicate the type of header, this usually is set to CHECKSUM
Header types
= ==========================================
0 Reserved
1 Reserved
2 checksum header
Checksum Valid is to indicate whether the header checksum is valid. Value of 1
implies that checksum is calculated on this packet and is valid, value of 0
indicates that the calculated packet checksum is invalid.
Reserved bits must be zero when sent and ignored when received.
e. MAP packet v1/v5 (command specific)::
Bit 0 1 2-7 8 - 15 16 - 31
Function Command Reserved Pad Multiplexer ID Payload length
@@ -74,7 +176,7 @@ Command types
3 is for error during processing of commands
= ==========================================
c. Aggregation
f. Aggregation
Aggregation is multiple MAP packets (can be data or command) delivered to
rmnet in a single linear skb. rmnet will process the individual

View File

@@ -11,12 +11,12 @@ ENA is a networking interface designed to make good use of modern CPU
features and system architectures.
The ENA device exposes a lightweight management interface with a
minimal set of memory mapped registers and extendable command set
minimal set of memory mapped registers and extendible command set
through an Admin Queue.
The driver supports a range of ENA devices, is link-speed independent
(i.e., the same driver is used for 10GbE, 25GbE, 40GbE, etc.), and has
a negotiated and extendable feature set.
(i.e., the same driver is used for 10GbE, 25GbE, 40GbE, etc), and has
a negotiated and extendible feature set.
Some ENA devices support SR-IOV. This driver is used for both the
SR-IOV Physical Function (PF) and Virtual Function (VF) devices.
@@ -27,9 +27,9 @@ is advertised by the device via the Admin Queue), a dedicated MSI-X
interrupt vector per Tx/Rx queue pair, adaptive interrupt moderation,
and CPU cacheline optimized data placement.
The ENA driver supports industry standard TCP/IP offload features such
as checksum offload and TCP transmit segmentation offload (TSO).
Receive-side scaling (RSS) is supported for multi-core scaling.
The ENA driver supports industry standard TCP/IP offload features such as
checksum offload. Receive-side scaling (RSS) is supported for multi-core
scaling.
The ENA driver and its corresponding devices implement health
monitoring mechanisms such as watchdog, enabling the device and driver
@@ -38,7 +38,6 @@ debug logs.
Some of the ENA devices support a working mode called Low-latency
Queue (LLQ), which saves several more microseconds.
ENA Source Code Directory Structure
===================================
@@ -53,7 +52,6 @@ ena_eth_io_defs.h Definition of ENA data path interface.
ena_common_defs.h Common definitions for ena_com layer.
ena_regs_defs.h Definition of ENA PCI memory-mapped (MMIO) registers.
ena_netdev.[ch] Main Linux kernel driver.
ena_syfsfs.[ch] Sysfs files.
ena_ethtool.c ethtool callbacks.
ena_pci_id_tbl.h Supported device IDs.
================= ======================================================
@@ -69,7 +67,7 @@ ENA management interface is exposed by means of:
- Asynchronous Event Notification Queue (AENQ)
ENA device MMIO Registers are accessed only during driver
initialization and are not involved in further normal device
initialization and are not used during further normal device
operation.
AQ is used for submitting management commands, and the
@@ -112,16 +110,15 @@ The events are:
ACQ and AENQ share the same MSI-X vector.
Keep-Alive is a special mechanism that allows monitoring of the
device's health. The driver maintains a watchdog (WD) handler which,
if fired, logs the current state and statistics then resets and
restarts the ENA device and driver. A Keep-Alive event is delivered by
the device every second. The driver re-arms the WD upon reception of a
Keep-Alive event. A missed Keep-Alive event causes the WD handler to
fire.
Keep-Alive is a special mechanism that allows monitoring the device's health.
A Keep-Alive event is delivered by the device every second.
The driver maintains a watchdog (WD) handler which logs the current state and
statistics. If the keep-alive events aren't delivered as expected the WD resets
the device and the driver.
Data Path Interface
===================
I/O operations are based on Tx and Rx Submission Queues (Tx SQ and Rx
SQ correspondingly). Each SQ has a completion queue (CQ) associated
with it.
@@ -131,26 +128,24 @@ physical memory.
The ENA driver supports two Queue Operation modes for Tx SQs:
- Regular mode
* In this mode the Tx SQs reside in the host's memory. The ENA
- **Regular mode:**
In this mode the Tx SQs reside in the host's memory. The ENA
device fetches the ENA Tx descriptors and packet data from host
memory.
- Low Latency Queue (LLQ) mode or "push-mode".
* In this mode the driver pushes the transmit descriptors and the
- **Low Latency Queue (LLQ) mode or "push-mode":**
In this mode the driver pushes the transmit descriptors and the
first 128 bytes of the packet directly to the ENA device memory
space. The rest of the packet payload is fetched by the
device. For this operation mode, the driver uses a dedicated PCI
device memory BAR, which is mapped with write-combine capability.
The Rx SQs support only the regular mode.
Note: Not all ENA devices support LLQ, and this feature is negotiated
**Note that** not all ENA devices support LLQ, and this feature is negotiated
with the device upon initialization. If the ENA device does not
support LLQ mode, the driver falls back to the regular mode.
The Rx SQs support only the regular mode.
The driver supports multi-queue for both Tx and Rx. This has various
benefits:
@@ -165,6 +160,7 @@ benefits:
Interrupt Modes
===============
The driver assigns a single MSI-X vector per queue pair (for both Tx
and Rx directions). The driver assigns an additional dedicated MSI-X vector
for management (for ACQ and AENQ).
@@ -190,20 +186,21 @@ unmasked by the driver after NAPI processing is complete.
Interrupt Moderation
====================
ENA driver and device can operate in conventional or adaptive interrupt
moderation mode.
In conventional mode the driver instructs device to postpone interrupt
**In conventional mode** the driver instructs device to postpone interrupt
posting according to static interrupt delay value. The interrupt delay
value can be configured through ethtool(8). The following ethtool
parameters are supported by the driver: tx-usecs, rx-usecs
value can be configured through `ethtool(8)`. The following `ethtool`
parameters are supported by the driver: ``tx-usecs``, ``rx-usecs``
In adaptive interrupt moderation mode the interrupt delay value is
**In adaptive interrupt** moderation mode the interrupt delay value is
updated by the driver dynamically and adjusted every NAPI cycle
according to the traffic nature.
Adaptive coalescing can be switched on/off through ethtool(8)
adaptive_rx on|off parameter.
Adaptive coalescing can be switched on/off through `ethtool(8)`'s
:code:`adaptive_rx on|off` parameter.
More information about Adaptive Interrupt Moderation (DIM) can be found in
Documentation/networking/net_dim.rst
@@ -214,17 +211,10 @@ The rx_copybreak is initialized by default to ENA_DEFAULT_RX_COPYBREAK
and can be configured by the ETHTOOL_STUNABLE command of the
SIOCETHTOOL ioctl.
SKB
===
The driver-allocated SKB for frames received from Rx handling using
NAPI context. The allocation method depends on the size of the packet.
If the frame length is larger than rx_copybreak, napi_get_frags()
is used, otherwise netdev_alloc_skb_ip_align() is used, the buffer
content is copied (by CPU) to the SKB, and the buffer is recycled.
Statistics
==========
The user can obtain ENA device and driver statistics using ethtool.
The user can obtain ENA device and driver statistics using `ethtool`.
The driver can collect regular or extended statistics (including
per-queue stats) from the device.
@@ -232,22 +222,23 @@ In addition the driver logs the stats to syslog upon device reset.
MTU
===
The driver supports an arbitrarily large MTU with a maximum that is
negotiated with the device. The driver configures MTU using the
SetFeature command (ENA_ADMIN_MTU property). The user can change MTU
via ip(8) and similar legacy tools.
via `ip(8)` and similar legacy tools.
Stateless Offloads
==================
The ENA driver supports:
- TSO over IPv4/IPv6
- TSO with ECN
- IPv4 header checksum offload
- TCP/UDP over IPv4/IPv6 checksum offloads
RSS
===
- The ENA device supports RSS that allows flexible Rx traffic
steering.
- Toeplitz and CRC32 hash functions are supported.
@@ -260,41 +251,42 @@ RSS
function delivered in the Rx CQ descriptor is set in the received
SKB.
- The user can provide a hash key, hash function, and configure the
indirection table through ethtool(8).
indirection table through `ethtool(8)`.
DATA PATH
=========
Tx
--
ena_start_xmit() is called by the stack. This function does the following:
:code:`ena_start_xmit()` is called by the stack. This function does the following:
- Maps data buffers (skb->data and frags).
- Populates ena_buf for the push buffer (if the driver and device are
in push mode.)
- Maps data buffers (``skb->data`` and frags).
- Populates ``ena_buf`` for the push buffer (if the driver and device are
in push mode).
- Prepares ENA bufs for the remaining frags.
- Allocates a new request ID from the empty req_id ring. The request
- Allocates a new request ID from the empty ``req_id`` ring. The request
ID is the index of the packet in the Tx info. This is used for
out-of-order TX completions.
out-of-order Tx completions.
- Adds the packet to the proper place in the Tx ring.
- Calls ena_com_prepare_tx(), an ENA communication layer that converts
the ena_bufs to ENA descriptors (and adds meta ENA descriptors as
needed.)
- Calls :code:`ena_com_prepare_tx()`, an ENA communication layer that converts
the ``ena_bufs`` to ENA descriptors (and adds meta ENA descriptors as
needed).
* This function also copies the ENA descriptors and the push buffer
to the Device memory space (if in push mode.)
to the Device memory space (if in push mode).
- Writes doorbell to the ENA device.
- Writes a doorbell to the ENA device.
- When the ENA device finishes sending the packet, a completion
interrupt is raised.
- The interrupt handler schedules NAPI.
- The ena_clean_tx_irq() function is called. This function handles the
- The :code:`ena_clean_tx_irq()` function is called. This function handles the
completion descriptors generated by the ENA, with a single
completion descriptor per completed packet.
* req_id is retrieved from the completion descriptor. The tx_info of
the packet is retrieved via the req_id. The data buffers are
unmapped and req_id is returned to the empty req_id ring.
* ``req_id`` is retrieved from the completion descriptor. The ``tx_info`` of
the packet is retrieved via the ``req_id``. The data buffers are
unmapped and ``req_id`` is returned to the empty ``req_id`` ring.
* The function stops when the completion descriptors are completed or
the budget is reached.
@@ -303,12 +295,11 @@ Rx
- When a packet is received from the ENA device.
- The interrupt handler schedules NAPI.
- The ena_clean_rx_irq() function is called. This function calls
ena_rx_pkt(), an ENA communication layer function, which returns the
number of descriptors used for a new unhandled packet, and zero if
- The :code:`ena_clean_rx_irq()` function is called. This function calls
:code:`ena_com_rx_pkt()`, an ENA communication layer function, which returns the
number of descriptors used for a new packet, and zero if
no new packet is found.
- Then it calls the ena_clean_rx_irq() function.
- ena_eth_rx_skb() checks packet length:
- :code:`ena_rx_skb()` checks packet length:
* If the packet is small (len < rx_copybreak), the driver allocates
a SKB for the new packet, and copies the packet payload into the
@@ -317,9 +308,10 @@ Rx
- In this way the original data buffer is not passed to the stack
and is reused for future Rx packets.
* Otherwise the function unmaps the Rx buffer, then allocates the
new SKB structure and hooks the Rx buffer to the SKB frags.
* Otherwise the function unmaps the Rx buffer, sets the first
descriptor as `skb`'s linear part and the other descriptors as the
`skb`'s frags.
- The new SKB is updated with the necessary information (protocol,
checksum hw verify result, etc.), and then passed to the network
stack, using the NAPI interface function napi_gro_receive().
checksum hw verify result, etc.), and then passed to the network
stack, using the NAPI interface function :code:`napi_gro_receive()`.

View File

@@ -47,13 +47,24 @@ The driver interacts with the device in the following ways:
- Transmit and Receive Queues
- See description below
Descriptor Formats
------------------
GVE supports two descriptor formats: GQI and DQO. These two formats have
entirely different descriptors, which will be described below.
Registers
---------
All registers are MMIO and big endian.
All registers are MMIO.
The registers are used for initializing and configuring the device as well as
querying device status in response to management interrupts.
Endianness
----------
- Admin Queue messages and registers are all Big Endian.
- GQI descriptors and datapath registers are Big Endian.
- DQO descriptors and datapath registers are Little Endian.
Admin Queue (AQ)
----------------
The Admin Queue is a PAGE_SIZE memory block, treated as an array of AQ
@@ -97,10 +108,10 @@ the queues associated with that interrupt.
The handler for these irqs schedule the napi for that block to run
and poll the queues.
Traffic Queues
--------------
gVNIC's queues are composed of a descriptor ring and a buffer and are
assigned to a notification block.
GQI Traffic Queues
------------------
GQI queues are composed of a descriptor ring and a buffer and are assigned to a
notification block.
The descriptor rings are power-of-two-sized ring buffers consisting of
fixed-size descriptors. They advance their head pointer using a __be32
@@ -121,3 +132,35 @@ Receive
The buffers for receive rings are put into a data ring that is the same
length as the descriptor ring and the head and tail pointers advance over
the rings together.
DQO Traffic Queues
------------------
- Every TX and RX queue is assigned a notification block.
- TX and RX buffer queues, which send descriptors to the device, use MMIO
doorbells to notify the device of new descriptors.
- RX and TX completion queues, which receive descriptors from the device, use a
"generation bit" to know when a descriptor was populated by the device. The
driver initializes all bits with the "current generation". The device will
populate received descriptors with the "next generation" which is inverted
from the current generation. When the ring wraps, the current/next generation
are swapped.
- It's the driver's responsibility to ensure that the RX and TX completion
queues are not overrun. This can be accomplished by limiting the number of
descriptors posted to HW.
- TX packets have a 16 bit completion_tag and RX buffers have a 16 bit
buffer_id. These will be returned on the TX completion and RX queues
respectively to let the driver know which packet/buffer was completed.
Transmit
~~~~~~~~
A packet's buffers are DMA mapped for the device to access before transmission.
After the packet was successfully transmitted, the buffers are unmapped.
Receive
~~~~~~~
The driver posts fixed sized buffers to HW on the RX buffer queue. The packet
received on the associated RX queue may span multiple descriptors.

View File

@@ -12,6 +12,7 @@ Contents
- `Enabling the driver and kconfig options`_
- `Devlink info`_
- `Devlink parameters`_
- `Bridge offload`_
- `mlx5 subfunction`_
- `mlx5 function attributes`_
- `Devlink health reporters`_
@@ -217,6 +218,37 @@ users try to enable them.
$ devlink dev eswitch set pci/0000:06:00.0 mode switchdev
Bridge offload
==============
The mlx5 driver implements support for offloading bridge rules when in switchdev
mode. Linux bridge FDBs are automatically offloaded when mlx5 switchdev
representor is attached to bridge.
- Change device to switchdev mode::
$ devlink dev eswitch set pci/0000:06:00.0 mode switchdev
- Attach mlx5 switchdev representor 'enp8s0f0' to bridge netdev 'bridge1'::
$ ip link set enp8s0f0 master bridge1
VLANs
-----
The following bridge VLAN functions are supported by mlx5:
- VLAN filtering (including multiple VLANs per port)::
$ ip link set bridge1 type bridge vlan_filtering 1
$ bridge vlan add dev enp8s0f0 vid 2-3
- VLAN push on bridge ingress::
$ bridge vlan add dev enp8s0f0 vid 3 pvid
- VLAN pop on bridge egress::
$ bridge vlan add dev enp8s0f0 vid 3 untagged
mlx5 subfunction
================
mlx5 supports subfunction management using devlink port (see :ref:`Documentation/networking/devlink/devlink-port.rst <devlink_port>`) interface.
@@ -568,3 +600,59 @@ tc and eswitch offloads tracepoints:
$ cat /sys/kernel/debug/tracing/trace
...
kworker/u48:7-2221 [009] ...1 1475.387435: mlx5e_rep_neigh_update: netdev: ens1f0 MAC: 24:8a:07:9a:17:9a IPv4: 1.1.1.10 IPv6: ::ffff:1.1.1.10 neigh_connected=1
Bridge offloads tracepoints:
- mlx5_esw_bridge_fdb_entry_init: trace bridge FDB entry offloaded to mlx5::
$ echo mlx5:mlx5_esw_bridge_fdb_entry_init >> set_event
$ cat /sys/kernel/debug/tracing/trace
...
kworker/u20:9-2217 [003] ...1 318.582243: mlx5_esw_bridge_fdb_entry_init: net_device=enp8s0f0_0 addr=e4:fd:05:08:00:02 vid=0 flags=0 used=0
- mlx5_esw_bridge_fdb_entry_cleanup: trace bridge FDB entry deleted from mlx5::
$ echo mlx5:mlx5_esw_bridge_fdb_entry_cleanup >> set_event
$ cat /sys/kernel/debug/tracing/trace
...
ip-2581 [005] ...1 318.629871: mlx5_esw_bridge_fdb_entry_cleanup: net_device=enp8s0f0_1 addr=e4:fd:05:08:00:03 vid=0 flags=0 used=16
- mlx5_esw_bridge_fdb_entry_refresh: trace bridge FDB entry offload refreshed in
mlx5::
$ echo mlx5:mlx5_esw_bridge_fdb_entry_refresh >> set_event
$ cat /sys/kernel/debug/tracing/trace
...
kworker/u20:8-3849 [003] ...1 466716: mlx5_esw_bridge_fdb_entry_refresh: net_device=enp8s0f0_0 addr=e4:fd:05:08:00:02 vid=3 flags=0 used=0
- mlx5_esw_bridge_vlan_create: trace bridge VLAN object add on mlx5
representor::
$ echo mlx5:mlx5_esw_bridge_vlan_create >> set_event
$ cat /sys/kernel/debug/tracing/trace
...
ip-2560 [007] ...1 318.460258: mlx5_esw_bridge_vlan_create: vid=1 flags=6
- mlx5_esw_bridge_vlan_cleanup: trace bridge VLAN object delete from mlx5
representor::
$ echo mlx5:mlx5_esw_bridge_vlan_cleanup >> set_event
$ cat /sys/kernel/debug/tracing/trace
...
bridge-2582 [007] ...1 318.653496: mlx5_esw_bridge_vlan_cleanup: vid=2 flags=8
- mlx5_esw_bridge_vport_init: trace mlx5 vport assigned with bridge upper
device::
$ echo mlx5:mlx5_esw_bridge_vport_init >> set_event
$ cat /sys/kernel/debug/tracing/trace
...
ip-2560 [007] ...1 318.458915: mlx5_esw_bridge_vport_init: vport_num=1
- mlx5_esw_bridge_vport_cleanup: trace mlx5 vport removed from bridge upper
device::
$ echo mlx5:mlx5_esw_bridge_vport_cleanup >> set_event
$ cat /sys/kernel/debug/tracing/trace
...
ip-5387 [000] ...1 573713: mlx5_esw_bridge_vport_cleanup: vport_num=1

View File

@@ -18,6 +18,7 @@ Contents:
qlogic/index
wan/index
wifi/index
wwan/index
.. only:: subproject and html

View File

@@ -0,0 +1,18 @@
.. SPDX-License-Identifier: GPL-2.0-only
WWAN Device Drivers
===================
Contents:
.. toctree::
:maxdepth: 2
iosm
.. only:: subproject and html
Indices
=======
* :ref:`genindex`

View File

@@ -0,0 +1,96 @@
.. SPDX-License-Identifier: GPL-2.0-only
.. Copyright (C) 2020-21 Intel Corporation
.. _iosm_driver_doc:
===========================================
IOSM Driver for Intel M.2 PCIe based Modems
===========================================
The IOSM (IPC over Shared Memory) driver is a WWAN PCIe host driver developed
for Linux or Chrome platforms for data exchange over the PCIe interface between
the host platform and an Intel M.2 modem. The driver exposes an interface conforming to the
MBIM protocol [1]. Any front end application (e.g. Modem Manager) could easily
manage the MBIM interface to enable data communication towards WWAN.
Basic usage
===========
MBIM functions are inactive when unmanaged. The IOSM driver only provides a
userspace interface MBIM "WWAN PORT" representing MBIM control channel and does
not play any role in managing the functionality. It is the job of a userspace
application to detect port enumeration and enable MBIM functionality.
Examples of a few such userspace applications are:
- mbimcli (included with the libmbim [2] library), and
- Modem Manager [3]
Management applications must carry out the actions below to establish an
MBIM IP session:
- open the MBIM control channel
- configure network connection settings
- connect to network
- configure IP network interface
Management application development
==================================
The driver and userspace interfaces are described below. The MBIM protocol is
described in [1] Mobile Broadband Interface Model v1.0 Errata-1.
MBIM control channel userspace ABI
----------------------------------
/dev/wwan0mbim0 character device
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The driver exposes an MBIM interface to the MBIM function by implementing
MBIM WWAN Port. The userspace end of the control channel pipe is a
/dev/wwan0mbim0 character device. Application shall use this interface for
MBIM protocol communication.
Fragmentation
~~~~~~~~~~~~~
The userspace application is responsible for all control message fragmentation
and defragmentation as per MBIM specification.
/dev/wwan0mbim0 write()
~~~~~~~~~~~~~~~~~~~~~~~
The MBIM control messages from the management application must not exceed the
negotiated control message size.
/dev/wwan0mbim0 read()
~~~~~~~~~~~~~~~~~~~~~~
The management application must accept control messages of up to the negotiated
control message size.
MBIM data channel userspace ABI
-------------------------------
wwan0-X network device
~~~~~~~~~~~~~~~~~~~~~~
The IOSM driver exposes IP link interface "wwan0-X" of type "wwan" for IP
traffic. Iproute network utility is used for creating "wwan0-X" network
interface and for associating it with MBIM IP session. The Driver supports
up to 8 IP sessions for simultaneous IP communication.
The userspace management application is responsible for creating new IP link
prior to establishing MBIM IP session where the SessionId is greater than 0.
For example, creating new IP link for a MBIM IP session with SessionId 1:
ip link add dev wwan0-1 parentdev-name wwan0 type wwan linkid 1
The driver will automatically map the "wwan0-1" network device to MBIM IP
session 1.
References
==========
[1] "MBIM (Mobile Broadband Interface Model) Errata-1"
- https://www.usb.org/document-library/
[2] libmbim - "a glib-based library for talking to WWAN modems and
devices which speak the Mobile Interface Broadband Model (MBIM)
protocol"
- http://www.freedesktop.org/wiki/Software/libmbim/
[3] Modem Manager - "a DBus-activated daemon which controls mobile
broadband (2G/3G/4G) devices and connections"
- http://www.freedesktop.org/wiki/Software/ModemManager/

View File

@@ -164,6 +164,41 @@ device to instantiate the subfunction device on particular PCI function.
A subfunction device is created on the :ref:`Documentation/driver-api/auxiliary_bus.rst <auxiliary_bus>`.
At this point a matching subfunction driver binds to the subfunction's auxiliary device.
Rate object management
======================
Devlink provides an API to manage tx rates of a single devlink port or a group.
This is done through rate objects, which can be one of the two types:
``leaf``
Represents a single devlink port; created/destroyed by the driver. Since a leaf
has a 1:1 mapping to its devlink port, in user space it is referred to as
``pci/<bus_addr>/<port_index>``;
``node``
Represents a group of rate objects (leafs and/or nodes); created/deleted by
request from the userspace; initially empty (no rate objects added). In
userspace it is referred to as ``pci/<bus_addr>/<node_name>``, where
``node_name`` can be any identifier, except decimal number, to avoid
collisions with leafs.
The API allows configuring the following rate object parameters:
``tx_share``
Minimum TX rate value shared among all other rate objects, or rate objects
that are part of the parent group, if it is a part of the same group.
``tx_max``
Maximum TX rate value.
``parent``
Parent node name. Parent node rate limits are considered as additional limits
to all node children limits. ``tx_max`` is an upper limit for children.
``tx_share`` is a total bandwidth distributed among children.
Driver implementations are allowed to support both or either rate object types
and setting methods of their parameters.
Terms and Definitions
=====================

View File

@@ -497,6 +497,7 @@ drivers:
* Documentation/networking/devlink/netdevsim.rst
* Documentation/networking/devlink/mlxsw.rst
* Documentation/networking/devlink/prestera.rst
.. _Generic-Packet-Trap-Groups:

View File

@@ -46,3 +46,4 @@ parameters, info versions, and other features it supports.
qed
ti-cpsw-switch
am65-nuss-cpsw-switch
prestera

View File

@@ -57,6 +57,32 @@ entries, FIB rule entries and nexthops that the driver will allow.
$ devlink resource set netdevsim/netdevsim0 path /nexthops size 16
$ devlink dev reload netdevsim/netdevsim0
Rate objects
============
The ``netdevsim`` driver supports rate objects management, which includes:
- registering/unregistering leaf rate objects per VF devlink port;
- creation/deletion of node rate objects;
- setting tx_share and tx_max rate values for any rate object type;
- setting parent node for any rate object type.
Rate nodes and their parameters are exposed in ``netdevsim`` debugfs in RO mode.
For example, a rate node created with the name ``some_group``:
.. code:: shell
$ ls /sys/kernel/debug/netdevsim/netdevsim0/rate_groups/some_group
rate_parent tx_max tx_share
Same parameters are exposed for leaf objects in corresponding ports directories.
For ex.:
.. code:: shell
$ ls /sys/kernel/debug/netdevsim/netdevsim0/ports/1
dev ethtool rate_parent tx_max tx_share
Driver-specific Traps
=====================

View File

@@ -0,0 +1,141 @@
.. SPDX-License-Identifier: GPL-2.0
========================
prestera devlink support
========================
This document describes the devlink features implemented by the ``prestera``
device driver.
Driver-specific Traps
=====================
.. list-table:: List of Driver-specific Traps Registered by ``prestera``
:widths: 5 5 90
* - Name
- Type
- Description
.. list-table:: List of Driver-specific Traps Registered by ``prestera``
:widths: 5 5 90
* - Name
- Type
- Description
* - ``arp_bc``
- ``trap``
- Traps ARP broadcast packets (both requests/responses)
* - ``is_is``
- ``trap``
- Traps IS-IS packets
* - ``ospf``
- ``trap``
- Traps OSPF packets
* - ``ip_bc_mac``
- ``trap``
- Traps IPv4 packets with broadcast DA Mac address
* - ``stp``
- ``trap``
- Traps STP BPDU
* - ``lacp``
- ``trap``
- Traps LACP packets
* - ``lldp``
- ``trap``
- Traps LLDP packets
* - ``router_mc``
- ``trap``
- Traps multicast packets
* - ``vrrp``
- ``trap``
- Traps VRRP packets
* - ``dhcp``
- ``trap``
- Traps DHCP packets
* - ``mtu_error``
- ``trap``
- Traps (exception) packets that exceeded port's MTU
* - ``mac_to_me``
- ``trap``
- Traps packets with switch-port's DA Mac address
* - ``ttl_error``
- ``trap``
- Traps (exception) IPv4 packets whose TTL exceeded
* - ``ipv4_options``
- ``trap``
- Traps (exception) packets due to the malformed IPV4 header options
* - ``ip_default_route``
- ``trap``
- Traps packets that have no specific IP interface (IP to me) and no forwarding prefix
* - ``local_route``
- ``trap``
- Traps packets that have been sent to one of switch IP interfaces addresses
* - ``ipv4_icmp_redirect``
- ``trap``
- Traps (exception) IPV4 ICMP redirect packets
* - ``arp_response``
- ``trap``
- Traps ARP reply packets that have switch-port's DA Mac address
* - ``acl_code_0``
- ``trap``
- Traps packets that have ACL priority set to 0 (tc pref 0)
* - ``acl_code_1``
- ``trap``
- Traps packets that have ACL priority set to 1 (tc pref 1)
* - ``acl_code_2``
- ``trap``
- Traps packets that have ACL priority set to 2 (tc pref 2)
* - ``acl_code_3``
- ``trap``
- Traps packets that have ACL priority set to 3 (tc pref 3)
* - ``acl_code_4``
- ``trap``
- Traps packets that have ACL priority set to 4 (tc pref 4)
* - ``acl_code_5``
- ``trap``
- Traps packets that have ACL priority set to 5 (tc pref 5)
* - ``acl_code_6``
- ``trap``
- Traps packets that have ACL priority set to 6 (tc pref 6)
* - ``acl_code_7``
- ``trap``
- Traps packets that have ACL priority set to 7 (tc pref 7)
* - ``ipv4_bgp``
- ``trap``
- Traps IPv4 BGP packets
* - ``ssh``
- ``trap``
- Traps SSH packets
* - ``telnet``
- ``trap``
- Traps Telnet packets
* - ``icmp``
- ``trap``
- Traps ICMP packets
* - ``rxdma_drop``
- ``drop``
- Drops packets (RxDMA) due to the lack of ingress buffers etc.
* - ``port_no_vlan``
- ``drop``
- Drops packets due to faulty-configured network or due to internal bug (config issue).
* - ``local_port``
- ``drop``
- Drops packets whose decision (FDB entry) is to bridge packet back to the incoming port/trunk.
* - ``invalid_sa``
- ``drop``
- Drops packets with multicast source MAC address.
* - ``illegal_ip_addr``
- ``drop``
- Drops packets with illegal SIP/DIP multicast/unicast addresses.
* - ``illegal_ipv4_hdr``
- ``drop``
- Drops packets with illegal IPV4 header.
* - ``ip_uc_dip_da_mismatch``
- ``drop``
- Drops packets with destination MAC being unicast, but destination IP address being multicast.
* - ``ip_sip_is_zero``
- ``drop``
- Drops packets with zero (0) IPV4 source address.
* - ``met_red``
- ``drop``
- Drops non-conforming packets (dropped by Ingress policer, metering drop), e.g. packet rate exceeded configured bandwidth.

View File

@@ -292,3 +292,71 @@ configuration.
# bring up the bridge devices
ip link set br0 up
Forwarding database (FDB) management
------------------------------------
The existing DSA switches do not have the necessary hardware support to keep
the software FDB of the bridge in sync with the hardware tables, so the two
tables are managed separately (``bridge fdb show`` queries both, and depending
on whether the ``self`` or ``master`` flags are being used, a ``bridge fdb
add`` or ``bridge fdb del`` command acts upon entries from one or both tables).
Up until kernel v4.14, DSA only supported user space management of bridge FDB
entries using the bridge bypass operations (which do not update the software
FDB, just the hardware one) using the ``self`` flag (which is optional and can
be omitted).
.. code-block:: sh
bridge fdb add dev swp0 00:01:02:03:04:05 self static
# or shorthand
bridge fdb add dev swp0 00:01:02:03:04:05 static
Due to a bug, the bridge bypass FDB implementation provided by DSA did not
distinguish between ``static`` and ``local`` FDB entries (``static`` are meant
to be forwarded, while ``local`` are meant to be locally terminated, i.e. sent
to the host port). Instead, all FDB entries with the ``self`` flag (implicit or
explicit) are treated by DSA as ``static`` even if they are ``local``.
.. code-block:: sh
# This command:
bridge fdb add dev swp0 00:01:02:03:04:05 static
# behaves the same for DSA as this command:
bridge fdb add dev swp0 00:01:02:03:04:05 local
# or shorthand, because the 'local' flag is implicit if 'static' is not
# specified, it also behaves the same as:
bridge fdb add dev swp0 00:01:02:03:04:05
The last command is an incorrect way of adding a static bridge FDB entry to a
DSA switch using the bridge bypass operations, and works by mistake. Other
drivers will treat an FDB entry added by the same command as ``local`` and as
such, will not forward it, as opposed to DSA.
Between kernel v4.14 and v5.14, DSA has supported in parallel two modes of
adding a bridge FDB entry to the switch: the bridge bypass discussed above, as
well as a new mode using the ``master`` flag which installs FDB entries in the
software bridge too.
.. code-block:: sh
bridge fdb add dev swp0 00:01:02:03:04:05 master static
Since kernel v5.14, DSA has gained stronger integration with the bridge's
software FDB, and the support for its bridge bypass FDB implementation (using
the ``self`` flag) has been removed. This results in the following changes:
.. code-block:: sh
# This is the only valid way of adding an FDB entry that is supported,
# compatible with v4.14 kernels and later:
bridge fdb add dev swp0 00:01:02:03:04:05 master static
# This command is no longer buggy and the entry is properly treated as
# 'local' instead of being forwarded:
bridge fdb add dev swp0 00:01:02:03:04:05
# This command no longer installs a static FDB entry to hardware:
bridge fdb add dev swp0 00:01:02:03:04:05 static
Script writers are therefore encouraged to use the ``master static`` set of
flags when working with bridge FDB entries on DSA switch interfaces.

View File

@@ -93,14 +93,15 @@ A tagging protocol may tag all packets with switch tags of the same length, or
the tag length might vary (for example packets with PTP timestamps might
require an extended switch tag, or there might be one tag length on TX and a
different one on RX). Either way, the tagging protocol driver must populate the
``struct dsa_device_ops::overhead`` with the length in octets of the longest
switch frame header. The DSA framework will automatically adjust the MTU of the
master interface to accomodate for this extra size in order for DSA user ports
to support the standard MTU (L2 payload length) of 1500 octets. The ``overhead``
is also used to request from the network stack, on a best-effort basis, the
allocation of packets with a ``needed_headroom`` or ``needed_tailroom``
sufficient such that the act of pushing the switch tag on transmission of a
packet does not cause it to reallocate due to lack of memory.
``struct dsa_device_ops::needed_headroom`` and/or ``struct dsa_device_ops::needed_tailroom``
with the length in octets of the longest switch frame header/trailer. The DSA
framework will automatically adjust the MTU of the master interface to
accommodate for this extra size in order for DSA user ports to support the
standard MTU (L2 payload length) of 1500 octets. The ``needed_headroom`` and
``needed_tailroom`` properties are also used to request from the network stack,
on a best-effort basis, the allocation of packets with enough extra space such
that the act of pushing the switch tag on transmission of a packet does not
cause it to reallocate due to lack of memory.
Even though applications are not expected to parse DSA-specific frame headers,
the format on the wire of the tagging protocol represents an Application Binary
@@ -169,8 +170,8 @@ The job of this method is to prepare the skb in a way that the switch will
understand what egress port the packet is for (and not deliver it towards other
ports). Typically this is fulfilled by pushing a frame header. Checking for
insufficient size in the skb headroom or tailroom is unnecessary provided that
the ``overhead`` and ``tail_tag`` properties were filled out properly, because
DSA ensures there is enough space before calling this method.
the ``needed_headroom`` and ``needed_tailroom`` properties were filled out
properly, because DSA ensures there is enough space before calling this method.
The reception of a packet goes through the tagger's ``rcv`` function. The
passed ``struct sk_buff *skb`` has ``skb->data`` pointing at

View File

@@ -5,7 +5,7 @@ NXP SJA1105 switch driver
Overview
========
The NXP SJA1105 is a family of 6 devices:
The NXP SJA1105 is a family of 10 SPI-managed automotive switches:
- SJA1105E: First generation, no TTEthernet
- SJA1105T: First generation, TTEthernet
@@ -13,9 +13,11 @@ The NXP SJA1105 is a family of 6 devices:
- SJA1105Q: Second generation, TTEthernet, no SGMII
- SJA1105R: Second generation, no TTEthernet, SGMII
- SJA1105S: Second generation, TTEthernet, SGMII
These are SPI-managed automotive switches, with all ports being gigabit
capable, and supporting MII/RMII/RGMII and optionally SGMII on one port.
- SJA1110A: Third generation, TTEthernet, SGMII, integrated 100base-T1 and
100base-TX PHYs
- SJA1110B: Third generation, TTEthernet, SGMII, 100base-T1, 100base-TX
- SJA1110C: Third generation, TTEthernet, SGMII, 100base-T1, 100base-TX
- SJA1110D: Third generation, TTEthernet, SGMII, 100base-T1
Being automotive parts, their configuration interface is geared towards
set-and-forget use, with minimal dynamic interaction at runtime. They
@@ -579,3 +581,54 @@ A board would need to hook up the PHYs connected to the switch to any other
MDIO bus available to Linux within the system (e.g. to the DSA master's MDIO
bus). Link state management then works by the driver manually keeping in sync
(over SPI commands) the MAC link speed with the settings negotiated by the PHY.
By comparison, the SJA1110 supports an MDIO slave access point over which its
internal 100base-T1 PHYs can be accessed from the host. This is, however, not
used by the driver; instead, the internal 100base-T1 and 100base-TX PHYs are
accessed through SPI commands, modeled in Linux as virtual MDIO buses.
The microcontroller attached to the SJA1110 port 0 also has an MDIO controller
operating in master mode, however the driver does not support this either,
since the microcontroller gets disabled when the Linux driver operates.
Discrete PHYs connected to the switch ports should have their MDIO interface
attached to an MDIO controller from the host system and not to the switch,
similar to SJA1105.
Port compatibility matrix
-------------------------
The SJA1105 port compatibility matrix is:
===== ============== ============== ==============
Port SJA1105E/T SJA1105P/Q SJA1105R/S
===== ============== ============== ==============
0 xMII xMII xMII
1 xMII xMII xMII
2 xMII xMII xMII
3 xMII xMII xMII
4 xMII xMII SGMII
===== ============== ============== ==============
The SJA1110 port compatibility matrix is:
===== ============== ============== ============== ==============
Port SJA1110A SJA1110B SJA1110C SJA1110D
===== ============== ============== ============== ==============
0 RevMII (uC) RevMII (uC) RevMII (uC) RevMII (uC)
1 100base-TX 100base-TX 100base-TX
or SGMII SGMII
2 xMII xMII xMII xMII
or SGMII or SGMII
3 xMII xMII xMII
or SGMII or SGMII SGMII
or 2500base-X or 2500base-X or 2500base-X
4 SGMII SGMII SGMII SGMII
or 2500base-X or 2500base-X or 2500base-X or 2500base-X
5 100base-T1 100base-T1 100base-T1 100base-T1
6 100base-T1 100base-T1 100base-T1 100base-T1
7 100base-T1 100base-T1 100base-T1 100base-T1
8 100base-T1 100base-T1 n/a n/a
9 100base-T1 100base-T1 n/a n/a
10 100base-T1 n/a n/a n/a
===== ============== ============== ============== ==============

View File

@@ -1363,8 +1363,8 @@ in an implementation specific way.
``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP
module parameters. This does not mean autonegotiation.
MODULE_EEPROM
=============
MODULE_EEPROM_GET
=================
Fetch module EEPROM data dump.
This interface is designed to allow dumps of at most 1/2 page at once. This
@@ -1383,12 +1383,14 @@ Request contents:
``ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS`` u8 page I2C address
======================================= ====== ==========================
If ``ETHTOOL_A_MODULE_EEPROM_BANK`` is not specified, bank 0 is assumed.
Kernel response contents:
+---------------------------------------------+--------+---------------------+
| ``ETHTOOL_A_MODULE_EEPROM_HEADER`` | nested | reply header |
+---------------------------------------------+--------+---------------------+
| ``ETHTOOL_A_MODULE_EEPROM_DATA`` | nested | array of bytes from |
| ``ETHTOOL_A_MODULE_EEPROM_DATA`` | binary | array of bytes from |
| | | module EEPROM |
+---------------------------------------------+--------+---------------------+

View File

@@ -99,6 +99,35 @@ fib_multipath_hash_policy - INTEGER
- 0 - Layer 3
- 1 - Layer 4
- 2 - Layer 3 or inner Layer 3 if present
- 3 - Custom multipath hash. Fields used for multipath hash calculation
are determined by fib_multipath_hash_fields sysctl
fib_multipath_hash_fields - UNSIGNED INTEGER
When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
fields used for multipath hash calculation are determined by this
sysctl.
This value is a bitmask which enables various fields for multipath hash
calculation.
Possible fields are:
====== ============================
0x0001 Source IP address
0x0002 Destination IP address
0x0004 IP protocol
0x0008 Unused (Flow Label)
0x0010 Source port
0x0020 Destination port
0x0040 Inner source IP address
0x0080 Inner destination IP address
0x0100 Inner IP protocol
0x0200 Inner Flow Label
0x0400 Inner source port
0x0800 Inner destination port
====== ============================
Default: 0x0007 (source IP, destination IP and IP protocol)
fib_sync_mem - UNSIGNED INTEGER
Amount of dirty memory from fib entries that can be backlogged before
@@ -732,6 +761,31 @@ tcp_syncookies - INTEGER
network connections you can set this knob to 2 to enable
unconditional generation of syncookies.
tcp_migrate_req - BOOLEAN
The incoming connection is tied to a specific listening socket when
the initial SYN packet is received during the three-way handshake.
When a listener is closed, in-flight request sockets during the
handshake and established sockets in the accept queue are aborted.
If the listener has SO_REUSEPORT enabled, other listeners on the
same port should have been able to accept such connections. This
option makes it possible to migrate such child sockets to another
listener after close() or shutdown().
The BPF_SK_REUSEPORT_SELECT_OR_MIGRATE type of eBPF program should
usually be used to define the policy to pick an alive listener.
Otherwise, the kernel will randomly pick an alive listener only if
this option is enabled.
Note that migration between listeners with different settings may
crash applications. Let's say migration happens from listener A to
B, and only B has TCP_SAVE_SYN enabled. B cannot read SYN data from
the requests migrated from A. To avoid such a situation, cancel
migration by returning SK_DROP in the type of eBPF program, or
disable this option.
Default: 0
tcp_fastopen - INTEGER
Enable TCP Fast Open (RFC7413) to send and accept data in the opening
SYN packet.
@@ -1743,6 +1797,35 @@ fib_multipath_hash_policy - INTEGER
- 0 - Layer 3 (source and destination addresses plus flow label)
- 1 - Layer 4 (standard 5-tuple)
- 2 - Layer 3 or inner Layer 3 if present
- 3 - Custom multipath hash. Fields used for multipath hash calculation
are determined by fib_multipath_hash_fields sysctl
fib_multipath_hash_fields - UNSIGNED INTEGER
When fib_multipath_hash_policy is set to 3 (custom multipath hash), the
fields used for multipath hash calculation are determined by this
sysctl.
This value is a bitmask which enables various fields for multipath hash
calculation.
Possible fields are:
====== ============================
0x0001 Source IP address
0x0002 Destination IP address
0x0004 IP protocol
0x0008 Flow Label
0x0010 Source port
0x0020 Destination port
0x0040 Inner source IP address
0x0080 Inner destination IP address
0x0100 Inner IP protocol
0x0200 Inner Flow Label
0x0400 Inner source port
0x0800 Inner destination port
====== ============================
Default: 0x0007 (source IP, destination IP and IP protocol)
anycast_src_echo_reply - BOOLEAN
Controls the use of anycast addresses as source addresses for ICMPv6
@@ -2751,6 +2834,18 @@ encap_port - INTEGER
Default: 0
plpmtud_probe_interval - INTEGER
The time interval (in milliseconds) for the PLPMTUD probe timer,
which is configured to expire after this period to receive an
acknowledgment to a probe packet. This is also the time interval
between the probes for the current pmtu when the probe search
is done.
PLPMTUD will be disabled when 0 is set, and other values for it
must be >= 5000.
Default: 0
``/proc/sys/net/core/*``
========================

View File

@@ -7,13 +7,13 @@ MPTCP Sysfs variables
/proc/sys/net/mptcp/* Variables
===============================
enabled - INTEGER
enabled - BOOLEAN
Control whether MPTCP sockets can be created.
MPTCP sockets can be created if the value is nonzero. This is
a per-namespace sysctl.
MPTCP sockets can be created if the value is 1. This is a
per-namespace sysctl.
Default: 1
Default: 1 (enabled)
add_addr_timeout - INTEGER (seconds)
Set the timeout after which an ADD_ADDR control message will be
@@ -24,3 +24,24 @@ add_addr_timeout - INTEGER (seconds)
sysctl.
Default: 120
checksum_enabled - BOOLEAN
Control whether DSS checksum can be enabled.
DSS checksum can be enabled if the value is nonzero. This is a
per-namespace sysctl.
Default: 0
allow_join_initial_addr_port - BOOLEAN
Allow peers to send join requests to the IP address and port number used
by the initial subflow if the value is 1. This controls a flag that is
sent to the peer at connection time, and whether such join requests are
accepted or denied.
Joins to addresses advertised with ADD_ADDR are not affected by this
value.
This is a per-namespace sysctl.
Default: 1

View File

@@ -177,3 +177,27 @@ nf_conntrack_gre_timeout_stream - INTEGER (seconds)
This extended timeout will be used in case there is a GRE stream
detected.
nf_flowtable_tcp_timeout - INTEGER (seconds)
default 30
Control offload timeout for tcp connections.
TCP connections may be offloaded from nf conntrack to nf flow table.
Once aged, the connection is returned to nf conntrack with tcp pickup timeout.
nf_flowtable_tcp_pickup - INTEGER (seconds)
default 120
TCP connection timeout after being aged from nf flow table offload.
nf_flowtable_udp_timeout - INTEGER (seconds)
default 30
Control offload timeout for udp connections.
UDP connections may be offloaded from nf conntrack to nf flow table.
Once aged, the connection is returned to nf conntrack with udp pickup timeout.
nf_flowtable_udp_pickup - INTEGER (seconds)
default 30
UDP connection timeout after being aged from nf flow table offload.

View File

@@ -292,6 +292,12 @@ Some of the interface modes are described below:
Note: due to legacy usage, some 10GBASE-R usage incorrectly makes
use of this definition.
``PHY_INTERFACE_MODE_25GBASER``
This is the IEEE 802.3 PCS Clause 107 defined 25GBASE-R protocol.
The PCS is identical to 10GBASE-R, i.e. 64B/66B encoded
running 2.5 times as fast, giving a fixed bit rate of 25.78125 Gbaud.
Please refer to the IEEE standard for further information.
``PHY_INTERFACE_MODE_100BASEX``
This defines IEEE 802.3 Clause 24. The link operates at a fixed data
rate of 125Mbps using a 4B/5B encoding scheme, resulting in an underlying

View File

@@ -378,7 +378,11 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
`int pm_runtime_get_sync(struct device *dev);`
- increment the device's usage counter, run pm_runtime_resume(dev) and
return its result
return its result;
note that it does not drop the device's usage counter on errors, so
consider using pm_runtime_resume_and_get() instead of it, especially
if its return value is checked by the caller, as this is likely to
result in cleaner code.
`int pm_runtime_get_if_in_use(struct device *dev);`
- return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
@@ -827,6 +831,15 @@ or driver about runtime power changes. Instead, the driver for the device's
parent must take responsibility for telling the device's driver when the
parent's power state changes.
Note that, in some cases, it may not be desirable for subsystems/drivers to call
pm_runtime_no_callbacks() for their devices. This could be because a subset of
the runtime PM callbacks needs to be implemented, because a platform-dependent
PM domain could get attached to the device, or because the device is power managed
through a supplier device link. For these reasons and to avoid boilerplate code
in subsystems/drivers, the PM core allows runtime PM callbacks to be
unassigned. More precisely, if a callback pointer is NULL, the PM core will act
as though there was a callback and it returned 0.
9. Autosuspend, or automatically-delayed suspends
=================================================

View File

@@ -325,7 +325,7 @@ Code Seq# Include File Comments
0xA3 90-9F linux/dtlk.h
0xA4 00-1F uapi/linux/tee.h Generic TEE subsystem
0xA4 00-1F uapi/asm/sgx.h <mailto:linux-sgx@vger.kernel.org>
0xA5 01 linux/surface_aggregator/cdev.h Microsoft Surface Platform System Aggregator
0xA5 01-05 linux/surface_aggregator/cdev.h Microsoft Surface Platform System Aggregator
<mailto:luzmaximilian@gmail.com>
0xA5 20-2F linux/surface_aggregator/dtx.h Microsoft Surface DTX driver
<mailto:luzmaximilian@gmail.com>

View File

@@ -14,15 +14,11 @@ for the CPU. Then there could be several contiguous ranges at
completely distinct addresses. And, don't forget about NUMA, where
different memory banks are attached to different CPUs.
Linux abstracts this diversity using one of the three memory models:
FLATMEM, DISCONTIGMEM and SPARSEMEM. Each architecture defines what
Linux abstracts this diversity using one of the two memory models:
FLATMEM and SPARSEMEM. Each architecture defines what
memory models it supports, what the default memory model is and
whether it is possible to manually override that default.
.. note::
At time of this writing, DISCONTIGMEM is considered deprecated,
although it is still in use by several architectures.
All the memory models track the status of physical page frames using
struct page arranged in one or more arrays.
@@ -63,43 +59,6 @@ straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the
The `ARCH_PFN_OFFSET` defines the first page frame number for
systems with physical memory starting at address different from 0.
DISCONTIGMEM
============
The DISCONTIGMEM model treats the physical memory as a collection of
`nodes` similarly to how Linux NUMA support does. For each node Linux
constructs an independent memory management subsystem represented by
`struct pglist_data` (or `pg_data_t` for short). Among other
things, `pg_data_t` holds the `node_mem_map` array that maps
physical pages belonging to that node. The `node_start_pfn` field of
`pg_data_t` is the number of the first page frame belonging to that
node.
The architecture setup code should call :c:func:`free_area_init_node` for
each node in the system to initialize the `pg_data_t` object and its
`node_mem_map`.
Every `node_mem_map` behaves exactly as FLATMEM's `mem_map` -
every physical page frame in a node has a `struct page` entry in the
`node_mem_map` array. When DISCONTIGMEM is enabled, a portion of the
`flags` field of the `struct page` encodes the node number of the
node hosting that page.
The conversion between a PFN and the `struct page` in the
DISCONTIGMEM model became slightly more complex as it has to determine
which node hosts the physical page and which `pg_data_t` object
holds the `struct page`.
Architectures that support DISCONTIGMEM provide :c:func:`pfn_to_nid`
to convert PFN to the node number. The opposite conversion helper
:c:func:`page_to_nid` is generic as it uses the node number encoded in
page->flags.
Once the node number is known, the PFN can be used to index
appropriate `node_mem_map` array to access the `struct page` and
the offset of the `struct page` from the `node_mem_map` plus
`node_start_pfn` is the PFN of that page.
SPARSEMEM
=========

View File

@@ -973,7 +973,7 @@ F: drivers/net/ethernet/amd/xgbe/
AMD SENSOR FUSION HUB DRIVER
M: Nehal Shah <nehal-bakulchandra.shah@amd.com>
M: Sandeep Singh <sandeep.singh@amd.com>
M: Basavaraj Natikar <basavaraj.natikar@amd.com>
L: linux-input@vger.kernel.org
S: Maintained
F: Documentation/hid/amd-sfh*
@@ -4447,6 +4447,18 @@ F: include/linux/compiler-clang.h
F: scripts/clang-tools/
K: \b(?i:clang|llvm)\b
CLANG CONTROL FLOW INTEGRITY SUPPORT
M: Sami Tolvanen <samitolvanen@google.com>
M: Kees Cook <keescook@chromium.org>
R: Nathan Chancellor <nathan@kernel.org>
R: Nick Desaulniers <ndesaulniers@google.com>
L: clang-built-linux@googlegroups.com
S: Supported
B: https://github.com/ClangBuiltLinux/linux/issues
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/clang/features
F: include/linux/cfi.h
F: kernel/cfi.c
CLEANCACHE API
M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
L: linux-kernel@vger.kernel.org
@@ -5187,7 +5199,14 @@ DELL WMI NOTIFICATIONS DRIVER
M: Matthew Garrett <mjg59@srcf.ucam.org>
M: Pali Rohár <pali@kernel.org>
S: Maintained
F: drivers/platform/x86/dell/dell-wmi.c
F: drivers/platform/x86/dell/dell-wmi-base.c
DELL WMI HARDWARE PRIVACY SUPPORT
M: Perry Yuan <Perry.Yuan@dell.com>
L: Dell.Client.Kernel@dell.com
L: platform-driver-x86@vger.kernel.org
S: Maintained
F: drivers/platform/x86/dell/dell-wmi-privacy.c
DELTA ST MEDIA DRIVER
M: Hugues Fruchet <hugues.fruchet@foss.st.com>
@@ -6460,10 +6479,11 @@ F: Documentation/filesystems/ecryptfs.rst
F: fs/ecryptfs/
EDAC-AMD64
M: Borislav Petkov <bp@alien8.de>
M: Yazen Ghannam <yazen.ghannam@amd.com>
L: linux-edac@vger.kernel.org
S: Maintained
S: Supported
F: drivers/edac/amd64_edac*
F: drivers/edac/mce_amd*
EDAC-ARMADA
M: Jan Luebbe <jlu@pengutronix.de>
@@ -6827,6 +6847,8 @@ F: Documentation/devicetree/bindings/net/mdio*
F: Documentation/devicetree/bindings/net/qca,ar803x.yaml
F: Documentation/networking/phy.rst
F: drivers/net/mdio/
F: drivers/net/mdio/acpi_mdio.c
F: drivers/net/mdio/fwnode_mdio.c
F: drivers/net/mdio/of_mdio.c
F: drivers/net/pcs/
F: drivers/net/phy/
@@ -9150,6 +9172,7 @@ F: Documentation/networking/device_drivers/ethernet/intel/
F: drivers/net/ethernet/intel/
F: drivers/net/ethernet/intel/*/
F: include/linux/avf/virtchnl.h
F: include/linux/net/intel/iidc.h
INTEL FRAMEBUFFER DRIVER (excluding 810 and 815)
M: Maik Broemme <mbroemme@libmpq.org>
@@ -9404,6 +9427,11 @@ S: Maintained
F: arch/x86/include/asm/intel_scu_ipc.h
F: drivers/platform/x86/intel_scu_*
INTEL SKYLAKE INT3472 ACPI DEVICE DRIVER
M: Daniel Scally <djrscally@gmail.com>
S: Maintained
F: drivers/platform/x86/intel/int3472/
INTEL SPEED SELECT TECHNOLOGY
M: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
L: platform-driver-x86@vger.kernel.org
@@ -9424,7 +9452,7 @@ F: include/linux/firmware/intel/stratix10-smc.h
F: include/linux/firmware/intel/stratix10-svc-client.h
INTEL TELEMETRY DRIVER
M: Rajneesh Bhardwaj <rajneesh.bhardwaj@linux.intel.com>
M: Rajneesh Bhardwaj <irenic.rajneesh@gmail.com>
M: "David E. Box" <david.e.box@linux.intel.com>
L: platform-driver-x86@vger.kernel.org
S: Maintained
@@ -9469,6 +9497,13 @@ L: Dell.Client.Kernel@dell.com
S: Maintained
F: drivers/platform/x86/intel-wmi-thunderbolt.c
INTEL WWAN IOSM DRIVER
M: M Chetan Kumar <m.chetan.kumar@intel.com>
M: Intel Corporation <linuxwwan@intel.com>
L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/wwan/iosm/
INTEL(R) TRACE HUB
M: Alexander Shishkin <alexander.shishkin@linux.intel.com>
S: Supported
@@ -10879,6 +10914,7 @@ S: Maintained
F: drivers/mailbox/
F: include/linux/mailbox_client.h
F: include/linux/mailbox_controller.h
F: include/dt-bindings/mailbox/
F: Documentation/devicetree/bindings/mailbox/
MAILBOX ARM MHUv2
@@ -12215,7 +12251,7 @@ M: Maximilian Luz <luzmaximilian@gmail.com>
L: platform-driver-x86@vger.kernel.org
S: Maintained
W: https://github.com/linux-surface/surface-aggregator-module
C: irc://chat.freenode.net/##linux-surface
C: irc://irc.libera.chat/linux-surface
F: Documentation/driver-api/surface_aggregator/
F: drivers/platform/surface/aggregator/
F: drivers/platform/surface/surface_acpi_notify.c
@@ -12411,6 +12447,12 @@ F: Documentation/userspace-api/media/drivers/meye*
F: drivers/media/pci/meye/
F: include/uapi/linux/meye.h
MOTORCOMM PHY DRIVER
M: Peter Geis <pgwipeout@gmail.com>
L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/phy/motorcomm.c
MOXA SMARTIO/INDUSTIO/INTELLIO SERIAL CARD
S: Orphan
F: Documentation/driver-api/serial/moxa-smartio.rst
@@ -12682,6 +12724,7 @@ W: http://www.netfilter.org/
W: http://www.iptables.org/
W: http://www.nftables.org/
Q: http://patchwork.ozlabs.org/project/netfilter-devel/list/
C: irc://irc.libera.chat/netfilter
T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git
F: include/linux/netfilter*
@@ -13225,6 +13268,7 @@ M: Vladimir Oltean <olteanv@gmail.com>
L: linux-kernel@vger.kernel.org
S: Maintained
F: drivers/net/dsa/sja1105
F: drivers/net/pcs/pcs-xpcs-nxp.c
NXP TDA998X DRM DRIVER
M: Russell King <linux@armlinux.org.uk>
@@ -15612,6 +15656,13 @@ F: include/linux/rpmsg/
F: include/uapi/linux/rpmsg.h
F: samples/rpmsg/
REMOTE PROCESSOR MESSAGING (RPMSG) WWAN CONTROL DRIVER
M: Stephan Gerhold <stephan@gerhold.net>
L: netdev@vger.kernel.org
L: linux-remoteproc@vger.kernel.org
S: Maintained
F: drivers/net/wwan/rpmsg_wwan_ctrl.c
RENESAS CLOCK DRIVERS
M: Geert Uytterhoeven <geert+renesas@glider.be>
L: linux-renesas-soc@vger.kernel.org
@@ -15741,6 +15792,14 @@ F: arch/riscv/
N: riscv
K: riscv
RISC-V/MICROCHIP POLARFIRE SOC SUPPORT
M: Lewis Hanly <lewis.hanly@microchip.com>
L: linux-riscv@lists.infradead.org
S: Supported
F: drivers/mailbox/mailbox-mpfs.c
F: drivers/soc/microchip/
F: include/soc/microchip/mpfs.h
RNBD BLOCK DRIVERS
M: Md. Haris Iqbal <haris.iqbal@ionos.com>
M: Jack Wang <jinpu.wang@ionos.com>
@@ -17717,6 +17776,7 @@ M: Jose Abreu <Jose.Abreu@synopsys.com>
L: netdev@vger.kernel.org
S: Supported
F: drivers/net/pcs/pcs-xpcs.c
F: drivers/net/pcs/pcs-xpcs.h
F: include/linux/pcs/pcs-xpcs.h
SYNOPSYS DESIGNWARE I2C DRIVER
@@ -18219,6 +18279,13 @@ W: http://thinkwiki.org/wiki/Ibm-acpi
T: git git://repo.or.cz/linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git
F: drivers/platform/x86/thinkpad_acpi.c
THINKPAD LMI DRIVER
M: Mark Pearson <markpearson@lenovo.com>
L: platform-driver-x86@vger.kernel.org
S: Maintained
F: Documentation/ABI/testing/sysfs-class-firmware-attributes
F: drivers/platform/x86/think-lmi.?
THUNDERBOLT DMA TRAFFIC TEST DRIVER
M: Isaac Hazan <isaac.hazan@intel.com>
L: linux-usb@vger.kernel.org
@@ -19642,6 +19709,7 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/pmladek/printk.git
F: Documentation/core-api/printk-formats.rst
F: lib/test_printf.c
F: lib/test_scanf.c
F: lib/vsprintf.c
VT1211 HARDWARE MONITOR DRIVER
@@ -19825,6 +19893,16 @@ F: Documentation/core-api/workqueue.rst
F: include/linux/workqueue.h
F: kernel/workqueue.c
WWAN DRIVERS
M: Loic Poulain <loic.poulain@linaro.org>
M: Sergey Ryazanov <ryazanov.s.a@gmail.com>
R: Johannes Berg <johannes@sipsolutions.net>
L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/wwan/
F: include/linux/wwan.h
F: include/uapi/linux/wwan.h
X-POWERS AXP288 PMIC DRIVERS
M: Hans de Goede <hdegoede@redhat.com>
S: Maintained

View File

@@ -285,6 +285,13 @@ config ARCH_THREAD_STACK_ALLOCATOR
config ARCH_WANTS_DYNAMIC_TASK_STRUCT
bool
config ARCH_WANTS_NO_INSTR
bool
help
An architecture should select this if it uses the noinstr macro on
functions to tell the toolchain not to instrument them, and the absence
of instrumentation in those functions is required for correctness.
config ARCH_32BIT_OFF_T
bool
depends on !64BIT

View File

@@ -549,29 +549,12 @@ config NR_CPUS
MARVEL support can handle a maximum of 32 CPUs, all the others
with working support have a maximum of 4 CPUs.
config ARCH_DISCONTIGMEM_ENABLE
bool "Discontiguous Memory Support"
depends on BROKEN
help
Say Y to support efficient handling of discontiguous physical memory,
for architectures which are either NUMA (Non-Uniform Memory Access)
or have huge holes in the physical address space for other reasons.
See <file:Documentation/vm/numa.rst> for more.
config ARCH_SPARSEMEM_ENABLE
bool "Sparse Memory Support"
help
Say Y to support efficient handling of discontiguous physical memory,
for systems that have huge holes in the physical address space.
config NUMA
bool "NUMA Support (EXPERIMENTAL)"
depends on DISCONTIGMEM && BROKEN
help
Say Y to compile the kernel to support NUMA (Non-Uniform Memory
Access). This option is for configuring high-end multiprocessor
server machines. If in doubt, say N.
config ALPHA_WTINT
bool "Use WTINT" if ALPHA_SRM || ALPHA_GENERIC
default y if ALPHA_QEMU
@@ -596,11 +579,6 @@ config ALPHA_WTINT
If unsure, say N.
config NODES_SHIFT
int
default "7"
depends on NEED_MULTIPLE_NODES
# LARGE_VMALLOC is racy, if you *really* need it then fix it first
config ALPHA_LARGE_VMALLOC
bool

View File

@@ -99,12 +99,6 @@ struct alpha_machine_vector
const char *vector_name;
/* NUMA information */
int (*pa_to_nid)(unsigned long);
int (*cpuid_to_nid)(int);
unsigned long (*node_mem_start)(int);
unsigned long (*node_mem_size)(int);
/* System specific parameters. */
union {
struct {

View File

@@ -1,100 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Written by Kanoj Sarcar (kanoj@sgi.com) Aug 99
* Adapted for the alpha wildfire architecture Jan 2001.
*/
#ifndef _ASM_MMZONE_H_
#define _ASM_MMZONE_H_
#ifdef CONFIG_DISCONTIGMEM
#include <asm/smp.h>
/*
* Following are macros that are specific to this numa platform.
*/
extern pg_data_t node_data[];
#define alpha_pa_to_nid(pa) \
(alpha_mv.pa_to_nid \
? alpha_mv.pa_to_nid(pa) \
: (0))
#define node_mem_start(nid) \
(alpha_mv.node_mem_start \
? alpha_mv.node_mem_start(nid) \
: (0UL))
#define node_mem_size(nid) \
(alpha_mv.node_mem_size \
? alpha_mv.node_mem_size(nid) \
: ((nid) ? (0UL) : (~0UL)))
#define pa_to_nid(pa) alpha_pa_to_nid(pa)
#define NODE_DATA(nid) (&node_data[(nid)])
#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn)
#if 1
#define PLAT_NODE_DATA_LOCALNR(p, n) \
(((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn)
#else
static inline unsigned long
PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
{
unsigned long temp;
temp = p >> PAGE_SHIFT;
return temp - PLAT_NODE_DATA(n)->gendata.node_start_pfn;
}
#endif
/*
* Following are macros that each numa implementation must define.
*/
/*
* Given a kernel address, find the home node of the underlying memory.
*/
#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr))
/*
* Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory
* and returns the kaddr corresponding to first physical page in the
* node's mem_map.
*/
#define LOCAL_BASE_ADDR(kaddr) \
((unsigned long)__va(NODE_DATA(kvaddr_to_nid(kaddr))->node_start_pfn \
<< PAGE_SHIFT))
/* XXX: FIXME -- nyc */
#define kern_addr_valid(kaddr) (0)
#define mk_pte(page, pgprot) \
({ \
pte_t pte; \
unsigned long pfn; \
\
pfn = page_to_pfn(page) << 32; \
pte_val(pte) = pfn | pgprot_val(pgprot); \
\
pte; \
})
#define pte_page(x) \
({ \
unsigned long kvirt; \
struct page * __xx; \
\
kvirt = (unsigned long)__va(pte_val(x) >> (32-PAGE_SHIFT)); \
__xx = virt_to_page(kvirt); \
\
__xx; \
})
#define pfn_to_nid(pfn) pa_to_nid(((u64)(pfn) << PAGE_SHIFT))
#define pfn_valid(pfn) \
(((pfn) - node_start_pfn(pfn_to_nid(pfn))) < \
node_spanned_pages(pfn_to_nid(pfn))) \
#endif /* CONFIG_DISCONTIGMEM */
#endif /* _ASM_MMZONE_H_ */

View File

@@ -206,7 +206,6 @@ extern unsigned long __zero_page(void);
#define page_to_pa(page) (page_to_pfn(page) << PAGE_SHIFT)
#define pte_pfn(pte) (pte_val(pte) >> 32)
#ifndef CONFIG_DISCONTIGMEM
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
#define mk_pte(page, pgprot) \
({ \
@@ -215,7 +214,6 @@ extern unsigned long __zero_page(void);
pte_val(pte) = (page_to_pfn(page) << 32) | pgprot_val(pgprot); \
pte; \
})
#endif
extern inline pte_t pfn_pte(unsigned long physpfn, pgprot_t pgprot)
{ pte_t pte; pte_val(pte) = (PHYS_TWIDDLE(physpfn) << 32) | pgprot_val(pgprot); return pte; }
@@ -330,9 +328,7 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })
#ifndef CONFIG_DISCONTIGMEM
#define kern_addr_valid(addr) (1)
#endif
#define pte_ERROR(e) \
printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))

View File

@@ -7,45 +7,6 @@
#include <linux/numa.h>
#include <asm/machvec.h>
#ifdef CONFIG_NUMA
static inline int cpu_to_node(int cpu)
{
int node;
if (!alpha_mv.cpuid_to_nid)
return 0;
node = alpha_mv.cpuid_to_nid(cpu);
#ifdef DEBUG_NUMA
BUG_ON(node < 0);
#endif
return node;
}
extern struct cpumask node_to_cpumask_map[];
/* FIXME: This is dumb, recalculating every time. But simple. */
static const struct cpumask *cpumask_of_node(int node)
{
int cpu;
if (node == NUMA_NO_NODE)
return cpu_all_mask;
cpumask_clear(&node_to_cpumask_map[node]);
for_each_online_cpu(cpu) {
if (cpu_to_node(cpu) == node)
cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}
return &node_to_cpumask_map[node];
}
#define cpumask_of_pcibus(bus) (cpu_online_mask)
#endif /* !CONFIG_NUMA */
# include <asm-generic/topology.h>
#endif /* _ASM_ALPHA_TOPOLOGY_H */

View File

@@ -127,6 +127,8 @@
#define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#define SO_NETNS_COOKIE 71
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64

View File

@@ -287,8 +287,7 @@ io7_init_hose(struct io7 *io7, int port)
/*
* Set up window 0 for scatter-gather 8MB at 8MB.
*/
hose->sg_isa = iommu_arena_new_node(marvel_cpuid_to_nid(io7->pe),
hose, 0x00800000, 0x00800000, 0);
hose->sg_isa = iommu_arena_new_node(0, hose, 0x00800000, 0x00800000, 0);
hose->sg_isa->align_entry = 8; /* cache line boundary */
csrs->POx_WBASE[0].csr =
hose->sg_isa->dma_base | wbase_m_ena | wbase_m_sg;
@@ -305,8 +304,7 @@ io7_init_hose(struct io7 *io7, int port)
/*
* Set up window 2 for scatter-gather (up-to) 1GB at 3GB.
*/
hose->sg_pci = iommu_arena_new_node(marvel_cpuid_to_nid(io7->pe),
hose, 0xc0000000, 0x40000000, 0);
hose->sg_pci = iommu_arena_new_node(0, hose, 0xc0000000, 0x40000000, 0);
hose->sg_pci->align_entry = 8; /* cache line boundary */
csrs->POx_WBASE[2].csr =
hose->sg_pci->dma_base | wbase_m_ena | wbase_m_sg;
@@ -843,52 +841,7 @@ EXPORT_SYMBOL(marvel_ioportmap);
EXPORT_SYMBOL(marvel_ioread8);
EXPORT_SYMBOL(marvel_iowrite8);
#endif
/*
* NUMA Support
*/
/**********
* FIXME - for now each cpu is a node by itself
* -- no real support for striped mode
**********
*/
int
marvel_pa_to_nid(unsigned long pa)
{
int cpuid;
if ((pa >> 43) & 1) /* I/O */
cpuid = (~(pa >> 35) & 0xff);
else /* mem */
cpuid = ((pa >> 34) & 0x3) | ((pa >> (37 - 2)) & (0x1f << 2));
return marvel_cpuid_to_nid(cpuid);
}
int
marvel_cpuid_to_nid(int cpuid)
{
return cpuid;
}
unsigned long
marvel_node_mem_start(int nid)
{
unsigned long pa;
pa = (nid & 0x3) | ((nid & (0x1f << 2)) << 1);
pa <<= 34;
return pa;
}
unsigned long
marvel_node_mem_size(int nid)
{
return 16UL * 1024 * 1024 * 1024; /* 16GB */
}
/*
* AGP GART Support.
*/

View File

@@ -440,33 +440,6 @@ struct pci_ops wildfire_pci_ops =
.write = wildfire_write_config,
};
/*
* NUMA Support
*/
int wildfire_pa_to_nid(unsigned long pa)
{
return pa >> 36;
}
int wildfire_cpuid_to_nid(int cpuid)
{
/* assume 4 CPUs per node */
return cpuid >> 2;
}
unsigned long wildfire_node_mem_start(int nid)
{
/* 64GB per node */
return (unsigned long)nid * (64UL * 1024 * 1024 * 1024);
}
unsigned long wildfire_node_mem_size(int nid)
{
/* 64GB per node */
return 64UL * 1024 * 1024 * 1024;
}
#if DEBUG_DUMP_REGS
static void __init

View File

@@ -71,33 +71,6 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base,
if (align < mem_size)
align = mem_size;
#ifdef CONFIG_DISCONTIGMEM
arena = memblock_alloc_node(sizeof(*arena), align, nid);
if (!NODE_DATA(nid) || !arena) {
printk("%s: couldn't allocate arena from node %d\n"
" falling back to system-wide allocation\n",
__func__, nid);
arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES);
if (!arena)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(*arena));
}
arena->ptes = memblock_alloc_node(sizeof(*arena), align, nid);
if (!NODE_DATA(nid) || !arena->ptes) {
printk("%s: couldn't allocate arena ptes from node %d\n"
" falling back to system-wide allocation\n",
__func__, nid);
arena->ptes = memblock_alloc(mem_size, align);
if (!arena->ptes)
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, mem_size, align);
}
#else /* CONFIG_DISCONTIGMEM */
arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES);
if (!arena)
panic("%s: Failed to allocate %zu bytes\n", __func__,
@@ -107,8 +80,6 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base,
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
__func__, mem_size, align);
#endif /* CONFIG_DISCONTIGMEM */
spin_lock_init(&arena->lock);
arena->hose = hose;
arena->dma_base = base;

View File

@@ -49,10 +49,6 @@ extern void marvel_init_arch(void);
extern void marvel_kill_arch(int);
extern void marvel_machine_check(unsigned long, unsigned long);
extern void marvel_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
extern int marvel_pa_to_nid(unsigned long);
extern int marvel_cpuid_to_nid(int);
extern unsigned long marvel_node_mem_start(int);
extern unsigned long marvel_node_mem_size(int);
extern struct _alpha_agp_info *marvel_agp_info(void);
struct io7 *marvel_find_io7(int pe);
struct io7 *marvel_next_io7(struct io7 *prev);
@@ -101,10 +97,6 @@ extern void wildfire_init_arch(void);
extern void wildfire_kill_arch(int);
extern void wildfire_machine_check(unsigned long vector, unsigned long la_ptr);
extern void wildfire_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t);
extern int wildfire_pa_to_nid(unsigned long);
extern int wildfire_cpuid_to_nid(int);
extern unsigned long wildfire_node_mem_start(int);
extern unsigned long wildfire_node_mem_size(int);
/* console.c */
#ifdef CONFIG_VGA_HOSE

View File

@@ -79,11 +79,6 @@ int alpha_l3_cacheshape;
unsigned long alpha_verbose_mcheck = CONFIG_VERBOSE_MCHECK_ON;
#endif
#ifdef CONFIG_NUMA
struct cpumask node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_to_cpumask_map);
#endif
/* Which processor we booted from. */
int boot_cpuid;
@@ -305,7 +300,6 @@ move_initrd(unsigned long mem_limit)
}
#endif
#ifndef CONFIG_DISCONTIGMEM
static void __init
setup_memory(void *kernel_end)
{
@@ -389,9 +383,6 @@ setup_memory(void *kernel_end)
}
#endif /* CONFIG_BLK_DEV_INITRD */
}
#else
extern void setup_memory(void *);
#endif /* !CONFIG_DISCONTIGMEM */
int __init
page_is_ram(unsigned long pfn)
@@ -618,13 +609,6 @@ setup_arch(char **cmdline_p)
"VERBOSE_MCHECK "
#endif
#ifdef CONFIG_DISCONTIGMEM
"DISCONTIGMEM "
#ifdef CONFIG_NUMA
"NUMA "
#endif
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
"DEBUG_SPINLOCK "
#endif

View File

@@ -461,10 +461,5 @@ struct alpha_machine_vector marvel_ev7_mv __initmv = {
.kill_arch = marvel_kill_arch,
.pci_map_irq = marvel_map_irq,
.pci_swizzle = common_swizzle,
.pa_to_nid = marvel_pa_to_nid,
.cpuid_to_nid = marvel_cpuid_to_nid,
.node_mem_start = marvel_node_mem_start,
.node_mem_size = marvel_node_mem_size,
};
ALIAS_MV(marvel_ev7)

View File

@@ -337,10 +337,5 @@ struct alpha_machine_vector wildfire_mv __initmv = {
.kill_arch = wildfire_kill_arch,
.pci_map_irq = wildfire_map_irq,
.pci_swizzle = common_swizzle,
.pa_to_nid = wildfire_pa_to_nid,
.cpuid_to_nid = wildfire_cpuid_to_nid,
.node_mem_start = wildfire_node_mem_start,
.node_mem_size = wildfire_node_mem_size,
};
ALIAS_MV(wildfire)

View File

@@ -482,7 +482,7 @@
550 common process_madvise sys_process_madvise
551 common epoll_pwait2 sys_epoll_pwait2
552 common mount_setattr sys_mount_setattr
# 553 reserved for quotactl_path
553 common quotactl_fd sys_quotactl_fd
554 common landlock_create_ruleset sys_landlock_create_ruleset
555 common landlock_add_rule sys_landlock_add_rule
556 common landlock_restrict_self sys_landlock_restrict_self

View File

@@ -6,5 +6,3 @@
ccflags-y := -Werror
obj-y := init.o fault.o
obj-$(CONFIG_DISCONTIGMEM) += numa.o

View File

@@ -235,8 +235,6 @@ callback_init(void * kernel_end)
return kernel_end;
}
#ifndef CONFIG_DISCONTIGMEM
/*
* paging_init() sets up the memory map.
*/
@@ -257,7 +255,6 @@ void __init paging_init(void)
/* Initialize the kernel's ZERO_PGE. */
memset((void *)ZERO_PGE, 0, PAGE_SIZE);
}
#endif /* CONFIG_DISCONTIGMEM */
#if defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_SRM)
void

Some files were not shown because too many files have changed in this diff Show More