學習網址
https://gist.github.com/zshi-redhat/6c19f1acf1e306a47ef8a0707d7de39d
https://github.com/intel/SDN-NFV-Hands-on-Samples/tree/master/DPDK_in_Containers_Hands-on_Lab/dpdk-container-lab
# jq is used later to inspect the node's allocatable resources (kubectl ... | jq).
apt-get install jq
# Confirm the NICs advertise the SR-IOV PCIe capability.
lspci -vvv | grep "Single Root I/O"
BIOS 要開啟 SR-IOV VT-D ,找不到選項的話,主機有可能不支援SR-IOV
# Verify the running kernel was built with Intel IOMMU support (expect CONFIG_INTEL_IOMMU=y).
grep IOMMU /boot/config-`uname -r` | grep INTEL
>CONFIG_INTEL_IOMMU=y
在grub底下新增 intel_iommu=on iommu=pt
vim /etc/default/grub
...
GRUB_CMDLINE_LINUX="intel_iommu=on iommu=pt"
...
# Regenerate the GRUB config and reboot so intel_iommu=on iommu=pt take effect.
update-grub
init 6
# After the reboot, confirm the IOMMU is active and the kernel cmdline carries both flags.
dmesg | grep -e DMAR -e IOMMU
cat /proc/cmdline | grep iommu=pt
cat /proc/cmdline | grep intel_iommu=on
lspci | grep Ethernet
...
> 11:10.0 Ethernet controller: Intel Corporation I350 Ethernet Controller Virtual Function (rev 01)
...
I350 對應的是 igb(目前最新版本 igb-5.3.5.39.tar.gz),且有提到最多支援8個vf
按照驅動的README make install 完後載入kernel
https://downloadcenter.intel.com/download/13663/Intel-Network-Adapter-Driver-for-82575-6-82580-I350-and-I210-211-Based-Gigabit-Network-Connections-for-Linux-
X710 對應的是 i40e (目前最新版本 i40e-2.12.6.tar.gz),且有提到 X710 最多支援 32 個 vf 按照驅動的README make install 完後載入kernel https://downloadcenter.intel.com/download/24411/Intel-Network-Adapter-Driver-for-PCIe-40-Gigabit-Ethernet-Network-Connections-under-Linux-?wapkw=Intel%C2%AE%20Network%20Adapter%20Driver
# Build and install the out-of-tree igb driver for the I350 NIC.
# Replace <version> with the downloaded release, e.g. igb-5.3.5.39.
apt install -y make gcc
tar -zxvf igb-<version>.tar.gz
cd igb-<version>/src
make install
# Build and install the out-of-tree i40e driver for the X710 NIC.
# Replace <version> with the downloaded release, e.g. i40e-2.12.6.
apt install -y make gcc
tar -zxvf i40e-<version>.tar.gz
cd i40e-<version>/src
make install
這邊要抓 PHY NIC 對應的 Linux NIC 介面名稱
root@sdn-k8s-server-b3-1:~/i40e-2.12.6/src# lspci | grep "I350"
05:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
05:00.1 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
05:00.2 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
05:00.3 Ethernet controller: Intel Corporation I350 Gigabit Network Connection (rev 01)
root@sdn-k8s-server-b3-1:~/i40e-2.12.6/src# lspci | grep "X710"
08:00.0 Ethernet controller: Intel Corporation Ethernet Controller X710 for 10GbE SFP+ (rev 02)
08:00.1 Ethernet controller: Intel Corporation Ethernet Controller X710 for 10GbE SFP+ (rev 02)
08:00.2 Ethernet controller: Intel Corporation Ethernet Controller X710 for 10GbE SFP+ (rev 02)
08:00.3 Ethernet controller: Intel Corporation Ethernet Controller X710 for 10GbE SFP+ (rev 02)
root@sdn-k8s-server-b3-1:~/i40e-2.12.6/src# ls -l /sys/class/net/
total 0
lrwxrwxrwx 1 root root 0 Sep 18 05:39 eno1 -> ../../devices/pci0000:00/0000:00:1c.4/0000:02:00.0/net/eno1
lrwxrwxrwx 1 root root 0 Sep 18 05:39 eno2 -> ../../devices/pci0000:00/0000:00:1c.4/0000:02:00.1/net/eno2
lrwxrwxrwx 1 root root 0 Sep 18 05:39 eno3 -> ../../devices/pci0000:00/0000:00:1c.4/0000:02:00.2/net/eno3
lrwxrwxrwx 1 root root 0 Sep 18 05:39 eno4 -> ../../devices/pci0000:00/0000:00:1c.4/0000:02:00.3/net/eno4
lrwxrwxrwx 1 root root 0 Sep 23 03:33 ens1f0 -> ../../devices/pci0000:00/0000:00:03.0/0000:08:00.0/net/ens1f0
lrwxrwxrwx 1 root root 0 Sep 23 03:33 ens1f1 -> ../../devices/pci0000:00/0000:00:03.0/0000:08:00.1/net/ens1f1
lrwxrwxrwx 1 root root 0 Sep 23 03:33 ens1f2 -> ../../devices/pci0000:00/0000:00:03.0/0000:08:00.2/net/ens1f2
lrwxrwxrwx 1 root root 0 Sep 23 03:33 ens1f3 -> ../../devices/pci0000:00/0000:00:03.0/0000:08:00.3/net/ens1f3
lrwxrwxrwx 1 root root 0 Sep 18 05:39 ens2f0 -> ../../devices/pci0000:00/0000:00:02.0/0000:05:00.0/net/ens2f0
lrwxrwxrwx 1 root root 0 Sep 18 05:39 ens2f1 -> ../../devices/pci0000:00/0000:00:02.0/0000:05:00.1/net/ens2f1
lrwxrwxrwx 1 root root 0 Sep 18 05:39 ens2f2 -> ../../devices/pci0000:00/0000:00:02.0/0000:05:00.2/net/ens2f2
lrwxrwxrwx 1 root root 0 Sep 18 05:39 ens2f3 -> ../../devices/pci0000:00/0000:00:02.0/0000:05:00.3/net/ens2f3
lrwxrwxrwx 1 root root 0 Sep 18 05:39 lo -> ../../devices/virtual/net/lo
查看全部模組,如有先移除可能是舊版本,再加載入且指定vf數量
# Check whether an (older, possibly in-tree) igb module is loaded; remove it
# before loading the freshly built one with the VF count.
lsmod | grep igb
rmmod igb
# max_vfs takes one value per PF port: 2 VFs on each of the 4 I350 ports (8 total, the I350 max).
modprobe igb max_vfs=2,2,2,2
max_vfs=2,2,2,2 因為pf有4個 vf最多只能開8個 這樣寫就平均都開2個 也可以都開在同一個 如:max_vfs=8,0,0,0
or
echo 8 > /sys/class/net/ens1f3/device/sriov_numvfs
查看全部模組,如有先移除可能是舊版本,再加載入且指定vf數量,X710 最多有 32 個 vf 可以用,機房 X710 NIC 是 4 Port,這邊 max_vfs 就分開各開 8 個了,因為這版本的驅動設計在 kernel 3.8 之後不支援 max_vfs 對於單一網路介面設定,需要手動配置給 NIC I/O,我們這邊需要先基於 NIC 位置進行 vf 設定。
# Inspect, unload and reload the i40e module for the X710.
modinfo i40e
lsmod | grep i40e
rmmod i40e
# NOTE(review): per the note above, this driver version ignores max_vfs on kernels >= 3.8;
# the sysfs writes below are what actually create the VFs -- confirm max_vfs has any effect here.
modprobe i40e max_vfs=32
# Create 8 VFs on each X710 port via sysfs (VF creation takes a moment).
echo 8 > /sys/class/net/ens1f0/device/sriov_numvfs
echo 8 > /sys/class/net/ens1f1/device/sriov_numvfs
echo 8 > /sys/class/net/ens1f2/device/sriov_numvfs
echo 8 > /sys/class/net/ens1f3/device/sriov_numvfs
這邊指令可能要等一下,因為 vf 生成需要一下子。
ip link
https://www.linux-kvm.org/page/10G_NIC_performance:_VFIO_vs_virtio
http://doc.dpdk.org/guides/linux_gsg/linux_drivers.html
# Reload vfio-pci and rebind the I350 PF/VF devices to it (needed for DPDK / kubevirt).
rmmod vfio-pci
modprobe vfio-pci
# Shows each interface's [vendor:device] IDs and full PCI addresses.
lspci -nn -D | grep Ethernet
# PF (I350 PF is vendor:device 8086:1521)
echo "8086 1521" > /sys/bus/pci/drivers/vfio-pci/new_id
echo 0000:07:00.0 > /sys/bus/pci/devices/0000\:07\:00.0/driver/unbind
echo 0000:07:00.0 > /sys/bus/pci/drivers/vfio-pci/bind
# VF (I350 VF is vendor:device 8086:1520)
echo "8086 1520" > /sys/bus/pci/drivers/vfio-pci/new_id
echo 0000:61:10.1 > /sys/bus/pci/devices/0000\:61\:10.1/driver/unbind
echo 0000:61:10.1 > /sys/bus/pci/drivers/vfio-pci/bind
# Check the kernel log for bind/unbind errors.
dmesg
https://pci-ids.ucw.cz/read/PC/8086
git clone https://github.com/intel/multus-cni.git && cd multus-cni
nano ./images/multus-daemonset-pre1.16.yml
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: network-attachment-definitions.k8s.cni.cncf.io
spec:
  group: k8s.cni.cncf.io
  version: v1
  scope: Namespaced
  names:
    plural: network-attachment-definitions
    singular: network-attachment-definition
    kind: NetworkAttachmentDefinition
    shortNames:
    - net-attach-def
  validation:
    openAPIV3Schema:
      properties:
        spec:
          properties:
            config:
              type: string
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: multus
rules:
- apiGroups: ["k8s.cni.cncf.io"]
  resources:
  - '*'
  verbs:
  - '*'
- apiGroups:
  - ""
  resources:
  - pods
  - pods/status
  verbs:
  - get
  - update
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1beta1
metadata:
  name: multus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: multus
subjects:
- kind: ServiceAccount
  name: multus
  namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: multus
  namespace: kube-system
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: multus-cni-config
  namespace: kube-system
  labels:
    tier: node
    app: multus
data:
  # NOTE: If you'd prefer to manually apply a configuration file, you may create one here.
  # In the case you'd like to customize the Multus installation, you should change the arguments to the Multus pod
  # change the "args" line below from
  # - "--multus-conf-file=auto"
  # to:
  # "--multus-conf-file=/tmp/multus-conf/70-multus.conf"
  # Additionally -- you should ensure that the name "70-multus.conf" is the alphabetically first name in the
  # /etc/cni/net.d/ directory on each node, otherwise, it will not be used by the Kubelet.
  cni-conf.json: |
    {
      "name": "multus-cni-network",
      "type": "multus",
      "capabilities": {
        "portMappings": true
      },
      "delegates": [
        {
          "cniVersion": "0.3.1",
          "name": "default-cni-network",
          "plugins": [
            {
              "type": "flannel",
              "name": "flannel.1",
              "delegate": {
                "isDefaultGateway": true,
                "hairpinMode": true
              }
            },
            {
              "type": "portmap",
              "capabilities": {
                "portMappings": true
              }
            }
          ]
        }
      ],
      "kubeconfig": "/etc/cni/net.d/multus.d/multus.kubeconfig"
    }
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: kube-multus-ds-amd64
  namespace: kube-system
  labels:
    tier: node
    app: multus
spec:
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        tier: node
        app: multus
    spec:
      hostNetwork: true
      nodeSelector:
        beta.kubernetes.io/arch: amd64
      tolerations:
      - operator: Exists
        effect: NoSchedule
      serviceAccountName: multus
      containers:
      - name: kube-multus
        image: nfvpe/multus:v3.4
        command: ["/entrypoint.sh"]
        args:
        - "--multus-conf-file=auto"
        resources:
          requests:
            cpu: "100m"
            memory: "50Mi"
          limits:
            cpu: "100m"
            memory: "50Mi"
        securityContext:
          privileged: true
        volumeMounts:
        - name: cni
          mountPath: /host/etc/cni/net.d
        - name: cnibin
          mountPath: /host/opt/cni/bin
        - name: multus-cfg
          mountPath: /tmp/multus-conf
      volumes:
      - name: cni
        hostPath:
          path: /etc/cni/net.d
      - name: cnibin
        hostPath:
          path: /opt/cni/bin
      - name: multus-cfg
        configMap:
          name: multus-cni-config
          items:
          - key: cni-conf.json
            path: 70-multus.conf
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: kube-multus-ds-ppc64le
  namespace: kube-system
  labels:
    tier: node
    app: multus
spec:
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        tier: node
        app: multus
    spec:
      hostNetwork: true
      nodeSelector:
        beta.kubernetes.io/arch: ppc64le
      tolerations:
      - operator: Exists
        effect: NoSchedule
      serviceAccountName: multus
      containers:
      - name: kube-multus
        # ppc64le support requires multus:latest for now. support 3.3 or later.
        image: nfvpe/multus:latest-ppc64le
        command: ["/entrypoint.sh"]
        args:
        - "--multus-conf-file=auto"
        resources:
          requests:
            cpu: "100m"
            memory: "90Mi"
          limits:
            cpu: "100m"
            memory: "90Mi"
        securityContext:
          privileged: true
        volumeMounts:
        - name: cni
          mountPath: /host/etc/cni/net.d
        - name: cnibin
          mountPath: /host/opt/cni/bin
        - name: multus-cfg
          mountPath: /tmp/multus-conf
      volumes:
      - name: cni
        hostPath:
          path: /etc/cni/net.d
      - name: cnibin
        hostPath:
          path: /opt/cni/bin
      - name: multus-cfg
        configMap:
          name: multus-cni-config
          items:
          - key: cni-conf.json
            path: 70-multus.conf
# late 1.16
# Kubernetes 1.16 and later
kubectl apply -f ./images/multus-daemonset.yml
# Kubernetes before 1.16
kubectl apply -f ./images/multus-daemonset-pre1.16.yml
# Install Go 1.13 (required to build sriov-cni).
wget https://dl.google.com/go/go1.13.4.linux-amd64.tar.gz
tar -C /usr/local -xzf go1.13.4.linux-amd64.tar.gz
# Add the Go toolchain to PATH (the three export lines below go into ~/.bashrc).
vim ~/.bashrc
...
export GOROOT=/usr/local/go
export GOPATH=$HOME/go
export PATH=$GOPATH/bin:$GOROOT/bin:$PATH
source ~/.bashrc
# Build the SR-IOV CNI plugin and install the binary where kubelet looks for CNI plugins.
git clone https://github.com/intel/sriov-cni.git
cd sriov-cni
make
cp build/sriov /opt/cni/bin
git clone https://github.com/intel/sriov-network-device-plugin.git
cd sriov-network-device-plugin/deployments/
這個配置檔要讓device plugin能取得node上的實體網路介面
# Find the vendor/device IDs the device plugin selectors need.
lspci -nn -D | grep Ethernet
# <pci-address> is the full PCI address from lspci -D, e.g. 0000:08:02.0
cat /sys/bus/pci/devices/<pci-address>/vendor
cat /sys/bus/pci/devices/<pci-address>/device
X710: i40evf / I350: igbvf
nano configMap.yaml
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: sriovdp-config
  namespace: kube-system
data:
  # NOTE(review): device ID 1520 is the I350 VF (driver igbvf), while the drivers
  # list is X710/82599-family (i40evf/iavf/ixgbevf) -- X710 VFs are 154c. These
  # selectors look inconsistent; verify against the lspci -nn output on the node.
  config.json: |
    {
      "resourceList": [{
          "resourceName": "intel_sriov_netdevice",
          "trust": "on",
          "selectors": {
            "vendors": ["8086"],
            "devices": ["1520"],
            "drivers": ["i40evf", "iavf", "ixgbevf"]
          }
        }
      ]
    }
之前遇到的錯
devices -> 要寫 vf 的裝置 ID(I350 的 VF 是 1520),1521 是 pf 的裝置 ID,不要填成 pf 的
kubevirt一定要vfio-pci -> "drivers":["vfio-pci"]
# Deploy the device-plugin config, the NetworkAttachmentDefinition CRD instance,
# and the device-plugin DaemonSet for this cluster version.
kubectl apply -f configMap.yaml
kubectl apply -f sriov-crd.yaml
cd k8s-v1.10-v1.15
kubectl apply -f ./
# Verify the VF pool registered (expect intel.com/intel_sriov_netdevice > 0).
kubectl get node <node-name> -o json | jq '.status.allocatable'
root@zhengsheng-server-1:~/sriov-network-device-plugin/deployments/k8s-v1.10-v1.15# kubectl get node zhengsheng-server-1 -o json | jq '.status.allocatable'
{
"cpu": "24",
"ephemeral-storage": "112906362284",
"hugepages-1Gi": "0",
"hugepages-2Mi": "0",
"intel.com/intel_sriov_netdevice": "8",
"memory": "32833292Ki",
"pods": "110"
}
cd ~/sriov-network-device-plugin/deployments
# Launch the two test pods that each request an intel.com/intel_sriov_netdevice VF.
# (The -f flag is required; "kubectl apply pod-tc1.yaml" fails.)
kubectl apply -f pod-tc1.yaml
kubectl apply -f pod-tc2.yaml
root@sdn-k8s-server-b3-1:~/sriov-network-device-plugin/deployments# kubectl get pod -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
testpod1 1/1 Running 0 12m 10.244.1.4 10.0.0.218 <none> <none>
testpod2 1/1 Running 0 12m 10.244.1.5 10.0.0.218 <none> <none>
# Install iperf3 inside BOTH test pods (the second exec originally repeated
# testpod1; the iperf3 server below runs in testpod2).
kubectl exec -ti testpod1 -- bash
yum install iperf3 -y
kubectl exec -ti testpod2 -- bash
yum install iperf3 -y
這邊的 IP 會依照 SR-IOV 給定的子網路區段去配置,有要設定他分配的 IP 可以去找
~/sriov-network-device-plugin/deployments/sriov-crd.yaml
,預設會是10.56.217.xx
。
[root@testpod1 /]# ifconfig
eth0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1450
inet 10.244.1.4 netmask 255.255.255.0 broadcast 10.244.1.255
ether 96:df:cb:c0:a4:f4 txqueuelen 0 (Ethernet)
RX packets 8530 bytes 12116510 (11.5 MiB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 4510 bytes 304861 (297.7 KiB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
lo: flags=73<UP,LOOPBACK,RUNNING> mtu 65536
inet 127.0.0.1 netmask 255.0.0.0
loop txqueuelen 1000 (Local Loopback)
RX packets 0 bytes 0 (0.0 B)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 0 bytes 0 (0.0 B)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
net1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500
inet 10.56.217.2 netmask 255.255.255.0 broadcast 10.56.217.255
ether 82:83:e2:cb:d0:fe txqueuelen 1000 (Ethernet)
RX packets 257283 bytes 17449188 (16.6 MiB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 8598428 bytes 13017987010 (12.1 GiB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
[root@testpod2 /]# ifconfig
eth0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1450
inet 10.244.1.5 netmask 255.255.255.0 broadcast 10.244.1.255
ether 72:98:01:30:b1:cd txqueuelen 0 (Ethernet)
RX packets 8830 bytes 12586642 (12.0 MiB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 5526 bytes 390406 (381.2 KiB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
lo: flags=73<UP,LOOPBACK,RUNNING> mtu 65536
inet 127.0.0.1 netmask 255.0.0.0
loop txqueuelen 1000 (Local Loopback)
RX packets 0 bytes 0 (0.0 B)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 0 bytes 0 (0.0 B)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
net1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500
inet 10.56.217.3 netmask 255.255.255.0 broadcast 10.56.217.255
ether 7a:6e:fb:98:f8:51 txqueuelen 1000 (Ethernet)
RX packets 8591677 bytes 13003258888 (12.1 GiB)
RX errors 0 dropped 61 overruns 0 frame 0
TX packets 256138 bytes 17372836 (16.5 MiB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
這邊會用
iperf3
做區網通訊測試,另外就是要指定出去的介面,所以會使用剛剛的ifconfig
的 SR-IOV 分配出來的區網 IP,網卡是用X710
所以會是 10G 的速度。
[root@testpod1 /]# iperf3 -c 10.56.217.3 -B 10.56.217.2
Connecting to host 10.56.217.3, port 5201
[ 4] local 10.56.217.2 port 54533 connected to 10.56.217.3 port 5201
[ ID] Interval Transfer Bandwidth Retr Cwnd
[ 4] 0.00-1.00 sec 1.16 GBytes 9.95 Gbits/sec 1059 325 KBytes
[ 4] 1.00-2.00 sec 1.16 GBytes 9.98 Gbits/sec 942 515 KBytes
[ 4] 2.00-3.00 sec 1.16 GBytes 9.98 Gbits/sec 932 327 KBytes
[ 4] 3.00-4.00 sec 1.16 GBytes 9.98 Gbits/sec 1077 498 KBytes
[ 4] 4.00-5.00 sec 1.15 GBytes 9.91 Gbits/sec 799 344 KBytes
[ 4] 5.00-6.00 sec 1.16 GBytes 9.94 Gbits/sec 1098 393 KBytes
[ 4] 6.00-7.00 sec 1.16 GBytes 9.99 Gbits/sec 1116 499 KBytes
[ 4] 7.00-8.00 sec 1.15 GBytes 9.91 Gbits/sec 838 356 KBytes
[ 4] 8.00-9.00 sec 1.15 GBytes 9.92 Gbits/sec 996 362 KBytes
[ 4] 9.00-10.00 sec 1.16 GBytes 9.94 Gbits/sec 1069 564 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth Retr
[ 4] 0.00-10.00 sec 11.6 GBytes 9.95 Gbits/sec 9926 sender
[ 4] 0.00-10.00 sec 11.6 GBytes 9.95 Gbits/sec receiver
iperf Done.
[root@testpod2 /]# iperf3 -s -B 10.56.217.3
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 10.56.217.2, port 37883
[ 5] local 10.56.217.3 port 5201 connected to 10.56.217.2 port 54533
[ ID] Interval Transfer Bandwidth
[ 5] 0.00-1.00 sec 1.11 GBytes 9.53 Gbits/sec
[ 5] 1.00-2.00 sec 1.16 GBytes 9.98 Gbits/sec
[ 5] 2.00-3.00 sec 1.16 GBytes 9.98 Gbits/sec
[ 5] 3.00-4.00 sec 1.16 GBytes 9.97 Gbits/sec
[ 5] 4.00-5.00 sec 1.15 GBytes 9.92 Gbits/sec
[ 5] 5.00-6.00 sec 1.16 GBytes 9.93 Gbits/sec
[ 5] 6.00-7.00 sec 1.16 GBytes 9.99 Gbits/sec
[ 5] 7.00-8.00 sec 1.15 GBytes 9.91 Gbits/sec
[ 5] 8.00-9.00 sec 1.16 GBytes 9.93 Gbits/sec
[ 5] 9.00-10.00 sec 1.16 GBytes 9.94 Gbits/sec
[ 5] 10.00-10.04 sec 47.3 MBytes 9.66 Gbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bandwidth
[ 5] 0.00-10.04 sec 0.00 Bytes 0.00 bits/sec sender
[ 5] 0.00-10.04 sec 11.6 GBytes 9.91 Gbits/sec receiver