Skip to main content

alioth/virtio/dev/net/
tap.rs

1// Copyright 2024 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::cmp::max;
16use std::fmt::Debug;
17use std::fs::{File, OpenOptions};
18use std::io::{ErrorKind, IoSlice};
19use std::mem::MaybeUninit;
20use std::os::fd::{AsFd, AsRawFd};
21use std::os::unix::prelude::OpenOptionsExt;
22use std::path::Path;
23use std::sync::Arc;
24use std::sync::mpsc::Receiver;
25use std::thread::JoinHandle;
26
27use io_uring::cqueue::Entry as Cqe;
28use io_uring::opcode;
29use io_uring::types::Fd;
30use libc::{IFF_MULTI_QUEUE, IFF_NO_PI, IFF_TAP, IFF_VNET_HDR, O_NONBLOCK};
31use mio::event::Event;
32use mio::unix::SourceFd;
33use mio::{Interest, Registry, Token};
34use serde::Deserialize;
35use serde_aco::Help;
36use zerocopy::{FromBytes, IntoBytes};
37
38use crate::device::net::MacAddr;
39use crate::hv::IoeventFd;
40use crate::mem::mapped::RamBus;
41use crate::sync::notifier::Notifier;
42use crate::sys::if_tun::{TunFeature, tun_set_iff, tun_set_offload, tun_set_vnet_hdr_sz};
43use crate::virtio::dev::net::{
44    CtrlAck, CtrlClass, CtrlHdr, CtrlMq, CtrlMqParisSet, NetConfig, NetFeature, VirtioNetHdr,
45};
46use crate::virtio::dev::{DevParam, DeviceId, Result, Virtio, WakeEvent};
47use crate::virtio::queue::{
48    DescChain, QueueReg, Status, VirtQueue, copy_from_reader, copy_to_writer,
49};
50use crate::virtio::worker::WorkerApi;
51use crate::virtio::worker::io_uring::{ActiveIoUring, BufferAction, IoUring, VirtioIoUring};
52use crate::virtio::worker::mio::{ActiveMio, Mio, VirtioMio};
53use crate::virtio::{FEATURE_BUILT_IN, IrqSender, error};
54
55#[derive(Debug)]
56pub struct Net {
57    name: Arc<str>,
58    config: Arc<NetConfig>,
59    tap_sockets: Vec<File>,
60    feature: NetFeature,
61    driver_feature: NetFeature,
62    dev_tap: Option<Box<Path>>,
63    if_name: Option<String>,
64    api: WorkerApi,
65}
66
67#[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Help)]
68pub struct NetTapParam {
69    /// MAC address of the virtual NIC, e.g. 06:3a:76:53:da:3d.
70    pub mac: MacAddr,
71    /// Maximum transmission unit.
72    pub mtu: u16,
73    /// Number of pairs of transmit/receive queues. [default: 1]
74    #[serde(alias = "qp", default)]
75    pub queue_pairs: u16,
76    /// Path to the character device file of a tap interface.
77    ///
78    /// Required for MacVTap and IPVTap, e.g. /dev/tapX.
79    /// Optional for TUN/TAP. [default: /dev/net/tun]
80    pub tap: Option<Box<Path>>,
81    /// Name of a tap interface, e.g. tapX.
82    ///
83    /// Required for TUN/TAP. Optional for MacVTap and IPVTap.
84    #[serde(alias = "if")]
85    pub if_name: Option<String>,
86    /// System API for asynchronous IO.
87    #[serde(default)]
88    pub api: WorkerApi,
89}
90
91impl DevParam for NetTapParam {
92    type Device = Net;
93
94    fn build(self, name: impl Into<Arc<str>>) -> Result<Net> {
95        Net::new(self, name)
96    }
97}
98
99fn new_socket(dev_tap: Option<&Path>, blocking: bool) -> Result<File> {
100    let tap_dev = dev_tap.unwrap_or(Path::new("/dev/net/tun"));
101    let mut opt = OpenOptions::new();
102    opt.read(true).write(true);
103    if !blocking {
104        opt.custom_flags(O_NONBLOCK);
105    }
106    let socket = opt.open(tap_dev)?;
107    Ok(socket)
108}
109
110impl Net {
111    pub fn new(param: NetTapParam, name: impl Into<Arc<str>>) -> Result<Self> {
112        let mut socket = new_socket(
113            param.tap.as_deref(),
114            matches!(param.api, WorkerApi::IoUring),
115        )?;
116        let max_queue_pairs = max(param.queue_pairs, 1);
117        setup_socket(&mut socket, param.if_name.as_deref(), max_queue_pairs > 1)?;
118        let mut dev_feat = NetFeature::MAC
119            | NetFeature::MTU
120            | NetFeature::CSUM
121            | NetFeature::HOST_TSO4
122            | NetFeature::HOST_TSO6
123            | NetFeature::HOST_ECN
124            | NetFeature::HOST_UFO
125            | NetFeature::HOST_USO
126            | NetFeature::CTRL_VQ
127            | detect_tap_offload(&socket);
128        if max_queue_pairs > 1 {
129            dev_feat |= NetFeature::MQ;
130        }
131        let net = Net {
132            name: name.into(),
133            config: Arc::new(NetConfig {
134                mac: param.mac,
135                max_queue_pairs,
136                mtu: param.mtu,
137                ..Default::default()
138            }),
139            tap_sockets: vec![socket],
140            feature: dev_feat,
141            driver_feature: NetFeature::empty(),
142            dev_tap: param.tap,
143            if_name: param.if_name,
144            api: param.api,
145        };
146        Ok(net)
147    }
148
149    fn handle_ctrl_queue(
150        &mut self,
151        desc: &mut DescChain,
152        registry: Option<&Registry>,
153    ) -> Result<u32> {
154        let Some(header) = desc
155            .readable
156            .first()
157            .and_then(|b| CtrlHdr::read_from_bytes(b).ok())
158        else {
159            return error::InvalidBuffer.fail();
160        };
161        let Some(ack_byte) = desc.writable.first_mut().and_then(|v| v.first_mut()) else {
162            return error::InvalidBuffer.fail();
163        };
164        let ack = match header.class {
165            CtrlClass::MQ => match CtrlMq(header.command) {
166                CtrlMq::VQ_PARIS_SET => {
167                    let to_set = |b: &IoSlice| CtrlMqParisSet::read_from_bytes(b).ok();
168                    let Some(data) = desc.readable.get(1).and_then(to_set) else {
169                        return error::InvalidBuffer.fail();
170                    };
171                    let pairs = data.virtq_pairs as usize;
172                    self.tap_sockets.truncate(pairs);
173                    for index in self.tap_sockets.len()..pairs {
174                        let mut socket = new_socket(
175                            self.dev_tap.as_deref(),
176                            matches!(self.api, WorkerApi::IoUring),
177                        )?;
178                        setup_socket(&mut socket, self.if_name.as_deref(), true)?;
179                        enable_tap_offload(&mut socket, self.driver_feature)?;
180                        if let Some(r) = registry {
181                            r.register(
182                                &mut SourceFd(&socket.as_raw_fd()),
183                                Token(index),
184                                Interest::READABLE | Interest::WRITABLE,
185                            )?;
186                        }
187                        self.tap_sockets.push(socket);
188                    }
189                    log::info!("{}: using {pairs} pairs of queues", self.name);
190                    CtrlAck::OK
191                }
192                _ => CtrlAck::ERR,
193            },
194            _ => CtrlAck::ERR,
195        };
196        *ack_byte = ack.raw();
197        Ok(1)
198    }
199}
200
201impl Virtio for Net {
202    type Config = NetConfig;
203    type Feature = NetFeature;
204
205    fn id(&self) -> DeviceId {
206        DeviceId::NET
207    }
208
209    fn name(&self) -> &str {
210        &self.name
211    }
212
213    fn num_queues(&self) -> u16 {
214        let data_queues = self.config.max_queue_pairs << 1;
215        if self.feature.contains(NetFeature::CTRL_VQ) {
216            data_queues + 1
217        } else {
218            data_queues
219        }
220    }
221
222    fn config(&self) -> Arc<NetConfig> {
223        self.config.clone()
224    }
225
226    fn feature(&self) -> u128 {
227        self.feature.bits() | FEATURE_BUILT_IN
228    }
229
230    fn spawn_worker<S, E>(
231        self,
232        event_rx: Receiver<WakeEvent<S, E>>,
233        memory: Arc<RamBus>,
234        queue_regs: Arc<[QueueReg]>,
235    ) -> Result<(JoinHandle<()>, Arc<Notifier>)>
236    where
237        S: IrqSender,
238        E: IoeventFd,
239    {
240        match self.api {
241            WorkerApi::Mio => Mio::spawn_worker(self, event_rx, memory, queue_regs),
242            WorkerApi::IoUring => IoUring::spawn_worker(self, event_rx, memory, queue_regs),
243        }
244    }
245}
246
247impl VirtioMio for Net {
248    fn reset(&mut self, registry: &Registry) {
249        self.tap_sockets.truncate(1);
250        let _ = registry.deregister(&mut SourceFd(&self.tap_sockets[0].as_raw_fd()));
251    }
252
253    fn activate<'m, Q, S, E>(
254        &mut self,
255        feature: u128,
256        active_mio: &mut ActiveMio<'_, '_, 'm, Q, S, E>,
257    ) -> Result<()>
258    where
259        Q: VirtQueue<'m>,
260        S: IrqSender,
261        E: IoeventFd,
262    {
263        self.driver_feature = NetFeature::from_bits_retain(feature);
264        let socket = &mut self.tap_sockets[0];
265        enable_tap_offload(socket, self.driver_feature)?;
266        active_mio.poll.registry().register(
267            &mut SourceFd(&socket.as_raw_fd()),
268            Token(0),
269            Interest::READABLE | Interest::WRITABLE,
270        )?;
271        Ok(())
272    }
273
274    fn handle_event<'a, 'm, Q, S, E>(
275        &mut self,
276        event: &Event,
277        active_mio: &mut ActiveMio<'_, '_, 'm, Q, S, E>,
278    ) -> Result<()>
279    where
280        Q: VirtQueue<'m>,
281        S: IrqSender,
282        E: IoeventFd,
283    {
284        let token = event.token().0;
285        let irq_sender = active_mio.irq_sender;
286        if event.is_readable() {
287            let rx_queue_index = token << 1;
288            let Some(Some(queue)) = active_mio.queues.get_mut(rx_queue_index) else {
289                log::error!("{}: cannot find rx queue {rx_queue_index}", self.name);
290                return Ok(());
291            };
292            let Some(socket) = self.tap_sockets.get(token) else {
293                log::error!("{}: cannot find tap queue {token}", self.name);
294                return Ok(());
295            };
296            queue.handle_desc(rx_queue_index as u16, irq_sender, copy_from_reader(socket))?;
297        }
298        if event.is_writable() {
299            let tx_queue_index = (token << 1) + 1;
300            let Some(Some(queue)) = active_mio.queues.get_mut(tx_queue_index) else {
301                log::error!("{}: cannot find tx queue {tx_queue_index}", self.name);
302                return Ok(());
303            };
304            let Some(socket) = self.tap_sockets.get(token) else {
305                log::error!("{}: cannot find tap queue {token}", self.name);
306                return Ok(());
307            };
308            queue.handle_desc(tx_queue_index as u16, irq_sender, copy_to_writer(socket))?;
309        }
310        Ok(())
311    }
312
313    fn handle_queue<'m, Q, S, E>(
314        &mut self,
315        index: u16,
316        active_mio: &mut ActiveMio<'_, '_, 'm, Q, S, E>,
317    ) -> Result<()>
318    where
319        Q: VirtQueue<'m>,
320        S: IrqSender,
321        E: IoeventFd,
322    {
323        let Some(Some(queue)) = active_mio.queues.get_mut(index as usize) else {
324            log::error!("{}: invalid queue index {index}", self.name);
325            return Ok(());
326        };
327        let irq_sender = active_mio.irq_sender;
328        let registry = active_mio.poll.registry();
329        if index == self.config.max_queue_pairs * 2 {
330            return queue.handle_desc(index, irq_sender, |chain| {
331                let len = self.handle_ctrl_queue(chain, Some(registry))?;
332                Ok(Status::Done { len })
333            });
334        }
335        let Some(socket) = self.tap_sockets.get(index as usize >> 1) else {
336            log::error!("{}: invalid tap queue {}", self.name, index >> 1);
337            return Ok(());
338        };
339        if index & 1 == 0 {
340            queue.handle_desc(index, irq_sender, copy_from_reader(socket))
341        } else {
342            queue.handle_desc(index, irq_sender, copy_to_writer(socket))
343        }
344    }
345}
346
347impl VirtioIoUring for Net {
348    fn activate<'m, Q, S, E>(
349        &mut self,
350        feature: u128,
351        _ring: &mut ActiveIoUring<'_, '_, 'm, Q, S, E>,
352    ) -> Result<()>
353    where
354        S: IrqSender,
355        Q: VirtQueue<'m>,
356        E: IoeventFd,
357    {
358        self.driver_feature = NetFeature::from_bits_retain(feature);
359        let socket = &mut self.tap_sockets[0];
360        enable_tap_offload(socket, self.driver_feature)?;
361        Ok(())
362    }
363
364    fn handle_desc(&mut self, q_index: u16, chain: &mut DescChain) -> Result<BufferAction> {
365        if q_index == self.config.max_queue_pairs * 2 {
366            let len = self.handle_ctrl_queue(chain, None)?;
367            return Ok(BufferAction::Written(len));
368        }
369        let Some(socket) = self.tap_sockets.get(q_index as usize >> 1) else {
370            log::error!("{}: invalid tap queue {}", self.name, q_index >> 1);
371            return Ok(BufferAction::Written(0));
372        };
373        let entry = if q_index & 1 == 0 {
374            let writable = &chain.writable;
375            opcode::Readv::new(
376                Fd(socket.as_raw_fd()),
377                writable.as_ptr() as *const _,
378                writable.len() as _,
379            )
380            .build()
381        } else {
382            let readable = &chain.readable;
383            opcode::Writev::new(
384                Fd(socket.as_raw_fd()),
385                readable.as_ptr() as *const _,
386                readable.len() as _,
387            )
388            .build()
389        };
390        Ok(BufferAction::Sqe(entry))
391    }
392
393    fn complete_desc(&mut self, q_index: u16, _chain: &mut DescChain, cqe: &Cqe) -> Result<u32> {
394        let ret = cqe.result();
395        if ret < 0 {
396            let err = std::io::Error::from_raw_os_error(-ret);
397            log::error!("{}: failed to send/receive packet: {err}", self.name,);
398            return Ok(0);
399        }
400        if q_index & 1 == 0 {
401            Ok(ret as u32)
402        } else {
403            Ok(0)
404        }
405    }
406}
407
408fn setup_socket(file: &mut File, if_name: Option<&str>, mq: bool) -> Result<()> {
409    let mut tap_ifconfig = unsafe { MaybeUninit::<libc::ifreq>::zeroed().assume_init() };
410
411    if let Some(name) = if_name {
412        let name_len = std::cmp::min(tap_ifconfig.ifr_name.len() - 1, name.len());
413        tap_ifconfig.ifr_name.as_mut_bytes()[0..name_len]
414            .copy_from_slice(&name.as_bytes()[0..name_len]);
415    }
416
417    let mut flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
418    if mq {
419        flags |= IFF_MULTI_QUEUE;
420    }
421    tap_ifconfig.ifr_ifru.ifru_flags = flags as i16;
422
423    unsafe { tun_set_iff(file, &tap_ifconfig) }.or_else(|e| {
424        if e.kind() == ErrorKind::InvalidInput && !mq {
425            flags |= IFF_MULTI_QUEUE;
426            tap_ifconfig.ifr_ifru.ifru_flags = flags as i16;
427            unsafe { tun_set_iff(file, &tap_ifconfig) }
428        } else {
429            Err(e)
430        }
431    })?;
432
433    unsafe { tun_set_vnet_hdr_sz(file, &(size_of::<VirtioNetHdr>() as _)) }?;
434    Ok(())
435}
436
437fn detect_tap_offload(tap: &impl AsFd) -> NetFeature {
438    let mut tap_feature = TunFeature::all();
439    let mut dev_feat = NetFeature::GUEST_CSUM
440        | NetFeature::GUEST_TSO4
441        | NetFeature::GUEST_TSO6
442        | NetFeature::GUEST_ECN
443        | NetFeature::GUEST_UFO
444        | NetFeature::GUEST_USO4
445        | NetFeature::GUEST_USO6;
446    if unsafe { tun_set_offload(tap, tap_feature) }.is_ok() {
447        return dev_feat;
448    }
449    tap_feature &= !(TunFeature::USO4 | TunFeature::USO6);
450    dev_feat &= !(NetFeature::GUEST_USO4 | NetFeature::GUEST_USO6);
451    if unsafe { tun_set_offload(tap, tap_feature) }.is_ok() {
452        return dev_feat;
453    }
454    tap_feature &= !(TunFeature::UFO);
455    dev_feat &= !NetFeature::GUEST_UFO;
456    if unsafe { tun_set_offload(tap, tap_feature) }.is_ok() {
457        return dev_feat;
458    }
459    NetFeature::empty()
460}
461
462fn enable_tap_offload(tap: &mut File, feature: NetFeature) -> Result<()> {
463    let mut tap_feature = TunFeature::empty();
464    if feature.contains(NetFeature::GUEST_CSUM) {
465        tap_feature |= TunFeature::CSUM;
466    }
467    if feature.contains(NetFeature::GUEST_TSO4) {
468        tap_feature |= TunFeature::TSO4;
469    }
470    if feature.contains(NetFeature::GUEST_TSO6) {
471        tap_feature |= TunFeature::TSO6;
472    }
473    if feature.contains(NetFeature::GUEST_ECN) {
474        tap_feature |= TunFeature::TSO_ECN;
475    }
476    if feature.contains(NetFeature::GUEST_UFO) {
477        tap_feature |= TunFeature::UFO;
478    }
479    if feature.contains(NetFeature::GUEST_USO4) {
480        tap_feature |= TunFeature::USO4;
481    }
482    if feature.contains(NetFeature::GUEST_USO6) {
483        tap_feature |= TunFeature::USO6;
484    }
485    unsafe { tun_set_offload(tap, tap_feature) }?;
486    Ok(())
487}