diff --git a/.cargo/config b/.cargo/config index 11941090b1e..0678c0723c6 100644 --- a/.cargo/config +++ b/.cargo/config @@ -1,6 +1,6 @@ [build] -target-dir = "build/cargo_target" -target = "x86_64-unknown-linux-musl" +# target = "x86_64-unknown-linux-musl" +# target-dir = "build/cargo_target" [net] git-fetch-with-cli = true diff --git a/Cargo.lock b/Cargo.lock index ad8eb46456a..e72f72fbbf5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -668,9 +668,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" -version = "0.6.5" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" dependencies = [ "autocfg", ] @@ -715,15 +715,16 @@ version = "0.1.0" [[package]] name = "nix" -version = "0.23.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", - "cc", "cfg-if", "libc", "memoffset", + "pin-utils", + "static_assertions", ] [[package]] @@ -769,6 +770,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "polyval" version = "0.6.0" @@ -999,6 +1006,12 @@ dependencies = [ "versionize_derive", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" 
+ [[package]] name = "subtle" version = "2.4.1" @@ -1095,8 +1108,7 @@ dependencies = [ [[package]] name = "userfaultfd" version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fee2cdd3f8bdd0b98d7aa9ace35e7214a71888229d60c1cd1cd71b7c09c089d0" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=b11a187b5743847dda76ed8df5419c3607d21375#b11a187b5743847dda76ed8df5419c3607d21375" dependencies = [ "bitflags", "cfg-if", @@ -1109,8 +1121,7 @@ dependencies = [ [[package]] name = "userfaultfd-sys" version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cbcf2717fa856a7226499babbbccb07353ea2fc2b27defd38bd13b1227cc78" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=b11a187b5743847dda76ed8df5419c3607d21375#b11a187b5743847dda76ed8df5419c3607d21375" dependencies = [ "bindgen", "cc", diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 3bf1f5b270a..4d32b8a36b2 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -626,6 +626,19 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 55e5ddc23ab..178efe599b6 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -261,6 +261,31 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, 
+ { + "syscall": "memfd_create", + "comment": "Used to create a memory backed file descriptor that can be used to save memory to" + }, + { + "syscall": "nanosleep", + "comment": "Debugging sleep" + }, + { + "syscall": "copy_file_range", + "comment": "debugging" + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index c54280c3897..12fa1301fa0 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -17,6 +17,7 @@ use crate::request::logger::parse_put_logger; use crate::request::machine_configuration::{ parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config, }; +use crate::request::memory_backend::parse_put_memory_backend; use crate::request::metrics::parse_put_metrics; use crate::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use crate::request::net::{parse_patch_net, parse_put_net}; @@ -112,6 +113,7 @@ impl ParsedRequest { (Method::Put, "network-interfaces", Some(body)) => { parse_put_net(body, path_tokens.get(1)) } + (Method::Put, "memory-backend", Some(body)) => parse_put_memory_backend(body), (Method::Put, "shutdown-internal", None) => { Ok(ParsedRequest::new(RequestAction::ShutdownInternal)) } diff --git a/src/api_server/src/request/memory_backend.rs b/src/api_server/src/request/memory_backend.rs new file mode 100644 index 00000000000..b81c7f5fc78 --- /dev/null +++ b/src/api_server/src/request/memory_backend.rs @@ -0,0 +1,46 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use super::super::VmmAction; +use crate::parsed_request::{Error, ParsedRequest}; +use crate::request::Body; +use logger::{IncMetric, METRICS}; +use vmm::vmm_config::snapshot::MemBackendConfig; + +pub(crate) fn parse_put_memory_backend(body: &Body) -> Result { + METRICS.put_api_requests.memory_backend_cfg_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::SetMemoryBackend( + serde_json::from_slice::(body.raw()).map_err(|e| { + METRICS.put_api_requests.memory_backend_cfg_fails.inc(); + Error::SerdeJson(e) + })?, + ))) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use vmm::vmm_config::snapshot::MemBackendType; + + use super::*; + + #[test] + fn test_parse_memory_backing_file() { + assert!(parse_put_memory_backend(&Body::new("invalid_payload")).is_err()); + + let body = r#"{ + "backend_type": "File", + "backend_path": "./memory.snap" + }"#; + let same_body = MemBackendConfig { + backend_type: MemBackendType::File, + backend_path: PathBuf::from("./memory.snap"), + }; + let result = parse_put_memory_backend(&Body::new(body)); + assert!(result.is_ok()); + let parsed_req = result.unwrap_or_else(|_e| panic!("Failed test.")); + + assert!(parsed_req == ParsedRequest::new_sync(VmmAction::SetMemoryBackend(same_body))); + } +} diff --git a/src/api_server/src/request/mod.rs b/src/api_server/src/request/mod.rs index 75f9a0daef3..f58bce5b533 100644 --- a/src/api_server/src/request/mod.rs +++ b/src/api_server/src/request/mod.rs @@ -8,6 +8,7 @@ pub mod drive; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_backend; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/api_server/swagger/firecracker.yaml b/src/api_server/swagger/firecracker.yaml index a86d5e91446..f5a6383e44c 100644 --- a/src/api_server/swagger/firecracker.yaml +++ b/src/api_server/swagger/firecracker.yaml @@ -350,6 +350,29 @@ paths: description: Internal server error schema: $ref: "#/definitions/Error" + + 
+ /memory-backend: + put: + summary: Configures the memory backend that guest memory is kept in sync with while the VM is running + operationId: putMemoryBackend + parameters: + - name: body + in: body + description: The memory backend to use + required: true + schema: + $ref: "#/definitions/MemoryBackend" + responses: + 204: + description: Memory backend configured + 400: + description: Memory backend failed + schema: + $ref: "#/definitions/Error" + default: + description: Internal server error. + schema: + $ref: "#/definitions/Error" /metrics: put: diff --git a/src/cpuid/src/transformer/amd.rs b/src/cpuid/src/transformer/amd.rs index 5c90f5ebfb7..836ace1e979 100644 --- a/src/cpuid/src/transformer/amd.rs +++ b/src/cpuid/src/transformer/amd.rs @@ -147,6 +147,8 @@ impl CpuidTransformer for AmdCpuidTransformer { leaf_0x8000001d::LEAF_NUM => Some(amd::update_extended_cache_topology_entry), leaf_0x8000001e::LEAF_NUM => Some(amd::update_extended_apic_id_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + // Disable async PF, as it hangs the VM for some reason when loading from snapshot/uffd. 
+ 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } diff --git a/src/cpuid/src/transformer/common.rs b/src/cpuid/src/transformer/common.rs index a3ed9577ac9..6580b1583fb 100644 --- a/src/cpuid/src/transformer/common.rs +++ b/src/cpuid/src/transformer/common.rs @@ -69,6 +69,19 @@ pub fn update_brand_string_entry( Ok(()) } +// KVM feature bits +#[cfg(target_arch = "x86_64")] +const KVM_FEATURE_ASYNC_PF_INT_BIT: u32 = 14; + +pub fn disable_kvm_feature_async_pf( + entry: &mut kvm_cpuid_entry2, + _vm_spec: &VmSpec, +) -> Result<(), Error> { + entry.eax.write_bit(KVM_FEATURE_ASYNC_PF_INT_BIT, false); + + Ok(()) +} + pub fn update_cache_parameters_entry( entry: &mut kvm_cpuid_entry2, vm_spec: &VmSpec, diff --git a/src/cpuid/src/transformer/intel.rs b/src/cpuid/src/transformer/intel.rs index a5035c971b9..096c470034a 100644 --- a/src/cpuid/src/transformer/intel.rs +++ b/src/cpuid/src/transformer/intel.rs @@ -150,6 +150,8 @@ impl CpuidTransformer for IntelCpuidTransformer { leaf_0xa::LEAF_NUM => Some(intel::update_perf_mon_entry), leaf_0xb::LEAF_NUM => Some(intel::update_extended_topology_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + // Disable async PF, as it hangs the VM for some reason when loading from snapshot/uffd. 
+ 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } diff --git a/src/devices/src/virtio/balloon/utils.rs b/src/devices/src/virtio/balloon/utils.rs index 55d9d6792ad..1f8cd299458 100644 --- a/src/devices/src/virtio/balloon/utils.rs +++ b/src/devices/src/virtio/balloon/utils.rs @@ -68,7 +68,7 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> { pub(crate) fn remove_range( guest_memory: &GuestMemoryMmap, range: (GuestAddress, u64), - restored: bool, + _restored: bool, ) -> std::result::Result<(), RemoveRegionError> { let (guest_address, range_len) = range; @@ -80,25 +80,26 @@ pub(crate) fn remove_range( .get_host_address(guest_address) .map_err(|_| RemoveRegionError::AddressTranslation)?; - // Mmap a new anonymous region over the present one in order to create a hole. - // This workaround is (only) needed after resuming from a snapshot because the guest memory - // is mmaped from file as private and there is no `madvise` flag that works for this case. - if restored { - // SAFETY: The address and length are known to be valid. - let ret = unsafe { - libc::mmap( - phys_address.cast(), - range_len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if ret == libc::MAP_FAILED { - return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); - } - }; + // CodeSandbox: since we use UFFD handler, this is not needed for us. In fact, it breaks the UFFD handler + // if this happens right now, as it unregisters the UFFD handler for the given range. + // // Mmap a new anonymous region over the present one in order to create a hole. + // // This workaround is (only) needed after resuming from a snapshot because the guest memory + // // is mmaped from file as private and there is no `madvise` flag that works for this case. 
+ // if restored { + // let ret = unsafe { + // libc::mmap( + // phys_address as *mut _, + // range_len as usize, + // libc::PROT_READ | libc::PROT_WRITE, + // libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, + // -1, + // 0, + // ) + // }; + // if ret == libc::MAP_FAILED { + // return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); + // } + // }; // Madvise the region in order to mark it as not used. // SAFETY: The address and length are known to be valid. diff --git a/src/jailer/src/env.rs b/src/jailer/src/env.rs index 52b2d3bc2dc..d2058338c35 100644 --- a/src/jailer/src/env.rs +++ b/src/jailer/src/env.rs @@ -389,9 +389,8 @@ impl Env { // a new PathBuf, with something like chroot_dir.join(exec_file_name) ?! self.chroot_dir.push(exec_file_name); - // TODO: hard link instead of copy? This would save up disk space, but hard linking is - // not always possible :( - fs::copy(&self.exec_file_path, &self.chroot_dir).map_err(|err| { + // We hard link instead of copy for space savings and to retain the capabilities + fs::hard_link(&self.exec_file_path, &self.chroot_dir).map_err(|err| { Error::Copy(self.exec_file_path.clone(), self.chroot_dir.clone(), err) })?; diff --git a/src/logger/src/metrics.rs b/src/logger/src/metrics.rs index cdaa69664c4..84c7d739f62 100644 --- a/src/logger/src/metrics.rs +++ b/src/logger/src/metrics.rs @@ -403,6 +403,10 @@ pub struct PutRequestsMetrics { pub machine_cfg_count: SharedIncMetric, /// Number of failures in configuring the machine. pub machine_cfg_fails: SharedIncMetric, + /// Number of PUTs for configuring the memory backend. + pub memory_backend_cfg_count: SharedIncMetric, + /// Number of failures in configuring the memory backend. + pub memory_backend_cfg_fails: SharedIncMetric, /// Number of PUTs for initializing the metrics system. pub metrics_count: SharedIncMetric, /// Number of failures in initializing the metrics system. 
diff --git a/src/vm-memory/src/lib.rs b/src/vm-memory/src/lib.rs index 85ad72421fc..7c6f1cbd900 100644 --- a/src/vm-memory/src/lib.rs +++ b/src/vm-memory/src/lib.rs @@ -121,7 +121,7 @@ pub fn create_guest_memory( for region in regions { let flags = match region.0 { None => libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, - Some(_) => libc::MAP_NORESERVE | libc::MAP_PRIVATE, + Some(_) => libc::MAP_NORESERVE | libc::MAP_SHARED, }; let mmap_region = diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 2792d49cea7..6ae464ea4db 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -15,11 +15,15 @@ lazy_static = "1.4.0" libc = "0.2.117" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.78" -userfaultfd = "0.5.0" +userfaultfd = { git = "https://github.com/codesandbox/userfaultfd-rs.git", rev = "b11a187b5743847dda76ed8df5419c3607d21375", features = [ + "linux5_7", +] } versionize = "0.1.6" versionize_derive = "0.1.4" vm-allocator = "0.1.0" -derive_more = { version = "0.99.17", default-features = false, features = ["from"] } +derive_more = { version = "0.99.17", default-features = false, features = [ + "from", +] } thiserror = "1.0.32" arch = { path = "../arch" } @@ -28,7 +32,7 @@ logger = { path = "../logger" } mmds = { path = "../mmds" } rate_limiter = { path = "../rate_limiter" } seccompiler = { path = "../seccompiler" } -snapshot = { path = "../snapshot"} +snapshot = { path = "../snapshot" } utils = { path = "../utils" } virtio_gen = { path = "../virtio_gen" } vm-memory = { path = "../vm-memory" } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index ea0e9862dcf..2cda02fa14c 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,9 +5,15 @@ use std::convert::TryFrom; use std::fmt::{Display, Formatter}; +use std::fs::{File, OpenOptions}; use std::io::{self, Read, Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::net::UnixStream; +use std::path::Path; use 
std::sync::{Arc, Mutex}; +use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use utils::sock_ctrl_msg::ScmSocket; +use vm_memory::{FileOffset, GuestMemory}; use arch::InitrdConfig; #[cfg(target_arch = "x86_64")] @@ -29,7 +35,6 @@ use linux_loader::loader::KernelLoader; use logger::{error, warn, METRICS}; use seccompiler::BpfThreadMap; use snapshot::Persist; -use userfaultfd::Uffd; use utils::eventfd::EventFd; use utils::terminal::Terminal; use utils::time::TimestampUs; @@ -44,7 +49,7 @@ use crate::construct_kvm_mpidrs; use crate::device_manager::legacy::PortIODeviceManager; use crate::device_manager::mmio::MMIODeviceManager; use crate::device_manager::persist::MMIODevManagerConstructorArgs; -use crate::persist::{MicrovmState, MicrovmStateError}; +use crate::persist::{GuestRegionUffdMapping, MemoryDescriptor, MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -59,6 +64,8 @@ use crate::{device_manager, Error, EventManager, Vmm, VmmEventsObserver}; pub enum StartMicrovmError { /// Unable to attach block device to Vmm. AttachBlockDevice(io::Error), + /// Unable to create/open the memory backing file. + BackingMemoryFile(io::Error), /// This error is thrown by the minimal boot loader implementation. ConfigureSystem(arch::Error), /// Internal errors are due to resource exhaustion. @@ -95,6 +102,20 @@ pub enum StartMicrovmError { RestoreMicrovmState(MicrovmStateError), /// Unable to set VmResources. SetVmResources(VmConfigError), + /// Failed to create an UFFD Builder. + CreateUffdBuilder(userfaultfd::Error), + /// Unable to connect to UDS in order to send information regarding + /// handling guest memory page-fault events. + UdsConnection(io::Error), + /// Failed to register guest memory regions to UFFD. 
+ UffdMemoryRegionsRegister(userfaultfd::Error), + /// Failed to send guest memory layout and path to user fault FD used to handle + /// guest memory page faults. This information is sent to a UDS where a custom + /// page-fault handler process is listening. + UffdSend(kvm_ioctls::Error), + + /// Failed to get the memfd from the uffd socket + NoMemFdReceived, } impl std::error::Error for StartMicrovmError {} /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -113,6 +134,9 @@ impl Display for StartMicrovmError { write!(f, "Unable to attach block device to Vmm: {}", err) } ConfigureSystem(err) => write!(f, "System configuration error: {:?}", err), + BackingMemoryFile(err) => { + write!(f, "Unable to create the memory backing file: {}", err) + } CreateRateLimiter(err) => write!(f, "Cannot create RateLimiter: {}", err), CreateNetDevice(err) => { let mut err_msg = format!("{:?}", err); @@ -178,6 +202,13 @@ impl Display for StartMicrovmError { } RestoreMicrovmState(err) => write!(f, "Cannot restore microvm state. Error: {}", err), SetVmResources(err) => write!(f, "Cannot set vm resources. Error: {}", err), + CreateUffdBuilder(err) => write!(f, "Cannot create uffd socket. Error: {}", err), + UdsConnection(err) => write!(f, "Cannot connect to uffd socket. Error: {}", err), + UffdMemoryRegionsRegister(err) => { + write!(f, "Cannot uffd memory region register. Error: {}", err) + } + UffdSend(err) => write!(f, "Cannot send to uffd. 
Error: {}", err), + NoMemFdReceived => write!(f, "No memfd received from uffd."), } } } @@ -232,7 +263,7 @@ fn create_vmm_and_vcpus( instance_info: &InstanceInfo, event_manager: &mut EventManager, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, vcpu_count: u8, ) -> std::result::Result<(Vmm, Vec), StartMicrovmError> { @@ -298,7 +329,7 @@ fn create_vmm_and_vcpus( shutdown_exit_code: None, vm, guest_memory, - uffd, + memory_descriptor, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, @@ -332,8 +363,57 @@ pub fn build_microvm_for_boot( .ok_or(MissingKernelConfig)?; let track_dirty_pages = vm_resources.track_dirty_pages(); - let guest_memory = - create_guest_memory(vm_resources.vm_config().mem_size_mib, track_dirty_pages)?; + + let (guest_memory, memory_descriptor, _file) = + if let Some(ref backend_config) = vm_resources.memory_backend { + match backend_config.backend_type { + crate::vmm_config::snapshot::MemBackendType::File => { + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&backend_config.backend_path) + .map_err(BackingMemoryFile)?; + file.set_len((vm_resources.vm_config().mem_size_mib * 1024 * 1024) as u64) + .map_err(|e| { + error!("Failed to set backing memory file size: {}", e); + StartMicrovmError::BackingMemoryFile(e) + })?; + + let file = Arc::new(file); + + ( + create_guest_memory( + vm_resources.vm_config().mem_size_mib, + Some(file.clone()), + track_dirty_pages, + )?, + Some(MemoryDescriptor::File(file)), + None, + ) + } + crate::vmm_config::snapshot::MemBackendType::Uffd => { + let (mem, uffd, file) = create_uffd_guest_memory( + vm_resources.vm_config().mem_size_mib, + backend_config.backend_path.as_path(), + track_dirty_pages, + )?; + + (mem, Some(MemoryDescriptor::Uffd(uffd)), Some(file)) + } + } + } else { + ( + create_guest_memory( + vm_resources.vm_config().mem_size_mib, + None, + track_dirty_pages, + )?, + None, + None, + ) + }; + let 
vcpu_config = vm_resources.vcpu_config(); let entry_addr = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; @@ -345,7 +425,7 @@ pub fn build_microvm_for_boot( instance_info, event_manager, guest_memory, - None, + memory_descriptor, track_dirty_pages, vcpu_config.vcpu_count, )?; @@ -490,7 +570,7 @@ pub fn build_microvm_from_snapshot( event_manager: &mut EventManager, microvm_state: MicrovmState, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, @@ -504,7 +584,7 @@ pub fn build_microvm_from_snapshot( instance_info, event_manager, guest_memory.clone(), - uffd, + memory_descriptor, track_dirty_pages, vcpu_count, )?; @@ -595,21 +675,143 @@ pub fn build_microvm_from_snapshot( /// Creates GuestMemory of `mem_size_mib` MiB in size. pub fn create_guest_memory( mem_size_mib: usize, + backing_memory_file: Option>, track_dirty_pages: bool, ) -> std::result::Result { let mem_size = mem_size_mib << 20; let arch_mem_regions = arch::arch_memory_regions(mem_size); + let mut offset = 0_u64; vm_memory::create_guest_memory( &arch_mem_regions .iter() - .map(|(addr, size)| (None, *addr, *size)) + .map(|(addr, size)| { + let file_offset = backing_memory_file + .clone() + .map(|file| FileOffset::from_arc(file, offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) .collect::>()[..], track_dirty_pages, ) .map_err(StartMicrovmError::GuestMemoryMmap) } +/// Creates GuestMemory of `mem_size_mib` MiB in size. 
+pub fn create_uffd_guest_memory( + mem_size_mib: usize, + uds_socket_path: &Path, + track_dirty_pages: bool, +) -> std::result::Result<(GuestMemoryMmap, Uffd, Arc), StartMicrovmError> { + use StartMicrovmError::{CreateUffdBuilder, NoMemFdReceived, UdsConnection, UffdSend}; + + let mut socket = UnixStream::connect(uds_socket_path).map_err(UdsConnection)?; + + let mut buf = [0u8; 8]; + let (_, memfd) = socket.recv_with_fd(&mut buf).map_err(UffdSend)?; + + if memfd.is_none() { + return Err(NoMemFdReceived); + } + + let mem_size = mem_size_mib << 20; + let arch_mem_regions = arch::arch_memory_regions(mem_size); + let backing_memory_file = Arc::new(memfd.unwrap()); + + let mut offset = 0_u64; + let guest_memory = vm_memory::create_guest_memory( + &arch_mem_regions + .iter() + .map(|(addr, size)| { + let file_offset = Some(FileOffset::from_arc(backing_memory_file.clone(), offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) + .collect::>()[..], + track_dirty_pages, + ) + .map_err(StartMicrovmError::GuestMemoryMmap)?; + + let uffd = UffdBuilder::new() + .require_features( + FeatureFlags::EVENT_REMOVE + | FeatureFlags::EVENT_REMAP + | FeatureFlags::EVENT_FORK + | FeatureFlags::EVENT_UNMAP + | FeatureFlags::MISSING_SHMEM + | FeatureFlags::MINOR_SHMEM + | FeatureFlags::PAGEFAULT_FLAG_WP, + ) + .user_mode_only(false) + .create() + .map_err(CreateUffdBuilder)?; + + let mut backend_mappings = Vec::with_capacity(guest_memory.num_regions()); + let mut offset = 0; + for mem_region in guest_memory.iter() { + let host_base_addr = mem_region.as_ptr(); + let size = mem_region.size(); + + backend_mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: host_base_addr as u64, + size, + offset, + }); + offset += size as u64; + } + + // This is safe to unwrap() because we control the contents of the vector + // (i.e GuestRegionUffdMapping entries). 
+ let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); + + socket + .send_with_fd( + backend_mappings.as_bytes(), + // In the happy case we can close the fd since the other process has it open and is + // using it to serve us pages. + // + // The problem is that if other process crashes/exits, firecracker guest memory + // will simply revert to anon-mem behavior which would lead to silent errors and + // undefined behavior. + // + // To tackle this scenario, the page fault handler can notify Firecracker of any + // crashes/exits. There is no need for Firecracker to explicitly send its process ID. + // The external process can obtain Firecracker's PID by calling `getsockopt` with + // `libc::SO_PEERCRED` option like so: + // + // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; + // let mut ucred_size: u32 = mem::size_of::() as u32; + // libc::getsockopt( + // socket.as_raw_fd(), + // libc::SOL_SOCKET, + // libc::SO_PEERCRED, + // &mut val as *mut _ as *mut _, + // &mut ucred_size as *mut libc::socklen_t, + // ); + // + // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, + // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process + // connected to this socket. The returned credentials are those that were in effect + // at the time of the `connect` call. + // + // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the + // page fault handler process does not tear down Firecracker when necessary, the + // uffd will still be alive but with no one to serve faults, leading to guest freeze. + uffd.as_raw_fd(), + ) + .map_err(UffdSend)?; + + // Wait for UFFD to be ready. + // TODO: maybe add a timeout? 
+ let mut buf = [0; 2]; + socket.read_exact(&mut buf).map_err(UdsConnection)?; + + Ok((guest_memory, uffd, backing_memory_file)) +} + fn load_kernel( boot_config: &BootConfig, guest_memory: &GuestMemoryMmap, @@ -1114,7 +1316,7 @@ pub mod tests { } pub(crate) fn default_vmm() -> Vmm { - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); let vcpus_exit_evt = EventFd::new(libc::EFD_NONBLOCK) .map_err(Error::EventFd) @@ -1142,12 +1344,12 @@ pub mod tests { shutdown_exit_code: None, vm, guest_memory, - uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, #[cfg(target_arch = "x86_64")] pio_device_manager, + memory_descriptor: None, } } @@ -1329,13 +1531,13 @@ pub mod tests { // Case 1: create guest memory without dirty page tracking { - let guest_memory = create_guest_memory(mem_size, false).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, false).unwrap(); assert!(!is_dirty_tracking_enabled(&guest_memory)); } // Case 2: create guest memory with dirty page tracking { - let guest_memory = create_guest_memory(mem_size, true).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, true).unwrap(); assert!(is_dirty_tracking_enabled(&guest_memory)); } } @@ -1343,7 +1545,7 @@ pub mod tests { #[test] fn test_create_vcpus() { let vcpu_count = 2; - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); #[allow(unused_mut)] let mut vm = setup_kvm_vm(&guest_memory, false).unwrap(); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 665e2015dd3..256785062a8 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -50,10 +50,10 @@ use devices::virtio::{ use devices::BusDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use logger::{error, info, warn, LoggerError, MetricsError, METRICS}; +use persist::MemoryDescriptor; use 
rate_limiter::BucketUpdate; use seccompiler::BpfProgram; use snapshot::Persist; -use userfaultfd::Uffd; use utils::epoll::EventSet; use utils::eventfd::EventFd; use vm_memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; @@ -287,10 +287,6 @@ pub struct Vmm { // Guest VM core resources. vm: Vm, guest_memory: GuestMemoryMmap, - // Save UFFD in order to keep it open in the Firecracker process, as well. - // Since this field is never read again, we need to allow `dead_code`. - #[allow(dead_code)] - uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, @@ -299,6 +295,11 @@ pub struct Vmm { mmio_device_manager: MMIODeviceManager, #[cfg(target_arch = "x86_64")] pio_device_manager: PortIODeviceManager, + + // The mem file that should be mmaped. We need to keep a reference of the UFFD in the + // process so we allow dead_code + #[allow(dead_code)] + memory_descriptor: Option, } impl Vmm { diff --git a/src/vmm/src/memory_snapshot.rs b/src/vmm/src/memory_snapshot.rs index beebecb41c3..a0b560fb563 100644 --- a/src/vmm/src/memory_snapshot.rs +++ b/src/vmm/src/memory_snapshot.rs @@ -5,7 +5,9 @@ use std::fs::File; use std::io::SeekFrom; +use std::time::Instant; +use libc::{MAP_SHARED, PROT_WRITE}; use utils::{errno, get_page_size}; use versionize::{VersionMap, Versionize, VersionizeResult}; use versionize_derive::Versionize; @@ -117,7 +119,11 @@ impl SnapshotMemory for GuestMemoryMmap { let mut writer_offset = 0; let page_size = get_page_size()?; - self.iter() + let start = Instant::now(); + let mut total_written = 0; + + let res = self + .iter() .enumerate() .try_for_each(|(slot, region)| { let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); @@ -141,23 +147,37 @@ impl SnapshotMemory for GuestMemoryMmap { } write_size += page_size; } else if write_size > 0 { + let start = Instant::now(); // We are at the end of a batch of dirty pages. 
region.write_all_to( MemoryRegionAddress(dirty_batch_start), writer, write_size, )?; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + total_written += write_size; write_size = 0; } } } if write_size > 0 { + let start = Instant::now(); region.write_all_to( MemoryRegionAddress(dirty_batch_start), writer, write_size, )?; + total_written += write_size; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); } writer_offset += region.len(); if let Some(bitmap) = firecracker_bitmap { @@ -166,7 +186,15 @@ impl SnapshotMemory for GuestMemoryMmap { Ok(()) }) - .map_err(Error::WriteMemory) + .map_err(Error::WriteMemory); + + eprintln!( + "total write time: {}ms, total written: {}B", + start.elapsed().as_millis(), + total_written + ); + + res } /// Creates a GuestMemoryMmap backed by a `file` if present, otherwise backed @@ -190,6 +218,117 @@ impl SnapshotMemory for GuestMemoryMmap { } } +/// Dumps all pages of GuestMemoryMmap present in `dirty_bitmap` to a writer. 
+pub fn mem_dump_dirty( + mem_map: &GuestMemoryMmap, + fd: i32, + len: usize, + dirty_bitmap: &DirtyBitmap, +) -> std::result::Result<(), Error> { + let mut writer_offset = 0_u64; + let page_size = get_page_size()?; + + let start = Instant::now(); + let mut total_written = 0; + + let source_map = + unsafe { libc::mmap(std::ptr::null_mut(), len, PROT_WRITE, MAP_SHARED, fd, 0) }; + + let res = mem_map + .iter() + .enumerate() + .try_for_each(|(slot, region)| { + let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + let firecracker_bitmap = region.bitmap(); + let mut write_size = 0; + let mut dirty_batch_start: u64 = 0; + + let mmap_base = region.get_host_address(MemoryRegionAddress(0)).unwrap(); + for (i, v) in kvm_bitmap.iter().enumerate() { + for j in 0..64 { + let is_kvm_page_dirty = ((v >> j) & 1u64) != 0u64; + let page_offset = ((i * 64) + j) * page_size; + let is_firecracker_page_dirty = firecracker_bitmap.dirty_at(page_offset); + if is_kvm_page_dirty || is_firecracker_page_dirty { + // We are at the start of a new batch of dirty pages. + if write_size == 0 { + // Seek forward over the unmodified pages. 
+ dirty_batch_start = page_offset as u64; + } + write_size += page_size; + } else if write_size > 0 { + let start = Instant::now(); + + eprintln!( + "starting write of {}B (source {}, dest {})", + write_size, + dirty_batch_start, + writer_offset + dirty_batch_start + ); + unsafe { + std::ptr::copy_nonoverlapping( + mmap_base.offset((dirty_batch_start) as isize), + source_map.offset((writer_offset + dirty_batch_start) as isize) + as *mut u8, + write_size, + ); + } + + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + total_written += write_size; + write_size = 0; + } + } + } + + if write_size > 0 { + let start = Instant::now(); + + eprintln!( + "starting final write of {}B (source {}, dest {}) (total_size: {})", + write_size, + dirty_batch_start, + writer_offset + dirty_batch_start, + len + ); + unsafe { + std::ptr::copy_nonoverlapping( + mmap_base.offset((dirty_batch_start) as isize), + source_map.offset((writer_offset + dirty_batch_start) as isize) as *mut u8, + write_size, + ); + } + total_written += write_size; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + } + writer_offset += region.len(); + if let Some(bitmap) = firecracker_bitmap { + bitmap.reset(); + } + + Ok(()) + }) + .map_err(Error::WriteMemory); + + eprintln!( + "total write time: {}ms, total written: {}B", + start.elapsed().as_millis(), + total_written + ); + + eprintln!("memfd {}, len {}", fd, len); + + res +} + #[cfg(test)] mod tests { use std::collections::HashMap; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 5f47e0514e1..fb37af9b655 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -3,10 +3,12 @@ //! Defines state structures for saving/restoring a Firecracker microVM. 
+use std::ffi::CString; use std::fs::{File, OpenOptions}; -use std::io::{self, Write}; +use std::io::{self, Read, Write}; use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; +use std::os::unix::prelude::FromRawFd; use std::path::Path; use std::sync::{Arc, Mutex}; @@ -15,7 +17,9 @@ use arch::regs::{get_manufacturer_id_from_host, get_manufacturer_id_from_state}; #[cfg(target_arch = "x86_64")] use cpuid::common::{get_vendor_id_from_cpuid, get_vendor_id_from_host}; use devices::virtio::TYPE_NET; -use logger::{error, info, warn}; +use libc::memfd_create; +use logger::warn; +use logger::{error, info}; use seccompiler::BpfThreadMap; use serde::Serialize; use snapshot::Snapshot; @@ -28,7 +32,7 @@ use vm_memory::{GuestMemory, GuestMemoryMmap}; use crate::builder::{self, BuildMicrovmFromSnapshotError}; use crate::device_manager::persist::{DeviceStates, Error as DevicePersistError}; -use crate::memory_snapshot::{GuestMemoryState, SnapshotMemory}; +use crate::memory_snapshot::{mem_dump_dirty, GuestMemoryState, SnapshotMemory}; use crate::resources::VmResources; #[cfg(target_arch = "x86_64")] use crate::version_map::FC_V0_23_SNAP_VERSION; @@ -235,7 +239,9 @@ pub fn create_snapshot( version_map, )?; - snapshot_memory_to_file(vmm, &params.mem_file_path, &params.snapshot_type)?; + if params.snapshot_type == SnapshotType::Full { + snapshot_memory_to_file(vmm, &params.mem_file_path, &params.snapshot_type)?; + } Ok(()) } @@ -272,12 +278,28 @@ fn snapshot_memory_to_file( snapshot_type: &SnapshotType, ) -> std::result::Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; - let mut file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(mem_file_path) - .map_err(|err| MemoryBackingFile("open", err))?; + + let mut file = if mem_file_path.to_string_lossy() == "memfd" { + let fd = unsafe { + let memfd_name = CString::new("diff").unwrap(); + memfd_create(memfd_name.as_ptr(), 0) + }; + if fd == -1 { + return Err(MemoryBackingFile( + "memfd_create", + 
std::io::Error::last_os_error(), + )); + } + + unsafe { File::from_raw_fd(fd) } + } else { + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(mem_file_path) + .map_err(|err| MemoryBackingFile("open", err))? + }; // Set the length of the file to the full size of the memory area. let mem_size_mib = mem_size_mib(vmm.guest_memory()); @@ -287,16 +309,19 @@ fn snapshot_memory_to_file( match snapshot_type { SnapshotType::Diff => { let dirty_bitmap = vmm.get_dirty_bitmap().map_err(DirtyBitmap)?; - vmm.guest_memory() - .dump_dirty(&mut file, &dirty_bitmap) - .map_err(Memory) + + mem_dump_dirty( + vmm.guest_memory(), + file.as_raw_fd(), + (mem_size_mib * 1024 * 1024) as usize, + &dirty_bitmap, + ) + .map_err(Memory) } SnapshotType::Full => vmm.guest_memory().dump(&mut file).map_err(Memory), }?; - file.flush() - .map_err(|err| MemoryBackingFile("flush", err))?; - file.sync_all() - .map_err(|err| MemoryBackingFile("sync_all", err)) + + Ok(()) } /// Validate the microVM version and translate it to its corresponding snapshot data format. @@ -473,6 +498,16 @@ pub fn snapshot_state_sanity_check( Ok(()) } +/// Describes a descriptor that connects to the memory used by the VM. This could be either a file descriptor +/// or a UFFD descriptor. +#[derive(Debug)] +pub enum MemoryDescriptor { + /// A file descriptor that connects to the user fault process. + Uffd(Uffd), + /// A file descriptor of the backing memory file. + File(Arc<File>), +} + /// Error type for [`restore_from_snapshot`]. 
#[derive(Debug, thiserror::Error)] pub enum RestoreFromSnapshotError { @@ -518,29 +553,27 @@ pub fn restore_from_snapshot( let mem_backend_path = &params.mem_backend.backend_path; let mem_state = &microvm_state.memory_state; let track_dirty_pages = params.enable_diff_snapshots; + let (guest_memory, memory_descriptor) = match params.mem_backend.backend_type { + MemBackendType::File => { + let (guest_memory, file) = + guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) + .map_err(RestoreFromSnapshotGuestMemoryError::File)?; + (guest_memory, Some(MemoryDescriptor::File(Arc::new(file)))) + } + MemBackendType::Uffd => { + let (guest_memory, uffd) = + guest_memory_from_uffd(mem_backend_path, mem_state, track_dirty_pages) + .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?; - let (guest_memory, uffd) = match params.mem_backend.backend_type { - MemBackendType::File => ( - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) - .map_err(RestoreFromSnapshotGuestMemoryError::File)?, - None, - ), - MemBackendType::Uffd => guest_memory_from_uffd( - mem_backend_path, - mem_state, - track_dirty_pages, - // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device - // is present in the microVM state. 
- microvm_state.device_states.balloon_device.is_some(), - ) - .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?, + (guest_memory, uffd.map(MemoryDescriptor::Uffd)) + } }; builder::build_microvm_from_snapshot( instance_info, event_manager, microvm_state, guest_memory, - uffd, + memory_descriptor, track_dirty_pages, seccomp_filters, vm_resources, @@ -589,10 +622,16 @@ fn guest_memory_from_file( mem_file_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, -) -> std::result::Result { - let mem_file = File::open(mem_file_path)?; - let guest_mem = GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages)?; - Ok(guest_mem) +) -> std::result::Result<(GuestMemoryMmap, File), GuestMemoryFromFileError> { + let mem_file = OpenOptions::new() + .write(true) + .read(true) + .open(mem_file_path)?; + + Ok(( + GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages)?, + mem_file, + )) } /// Error type for [`guest_memory_from_uffd`] @@ -613,27 +652,46 @@ pub enum GuestMemoryFromUffdError { /// Failed to send file descriptor. 
#[error("Failed to sends file descriptor: {0}")] Send(#[from] utils::errno::Error), + + /// No memfd received + #[error("No memfd received")] + NoMemFdReceived, + /// Receiving memfd went wrong + #[error("Failed to receive memfd: {0}")] + Receive(utils::errno::Error), } -fn guest_memory_from_uffd( +pub(crate) fn guest_memory_from_uffd( mem_uds_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, - enable_balloon: bool, ) -> std::result::Result<(GuestMemoryMmap, Option), GuestMemoryFromUffdError> { - let guest_memory = GuestMemoryMmap::restore(None, mem_state, track_dirty_pages)?; + let mut socket = UnixStream::connect(mem_uds_path)?; - let mut uffd_builder = UffdBuilder::new(); + let mut buf = [0u8; 8]; + let (_, memfd) = socket + .recv_with_fd(&mut buf) + .map_err(GuestMemoryFromUffdError::Receive)?; - if enable_balloon { - // We enable this so that the page fault handler can add logic - // for treating madvise(MADV_DONTNEED) events triggerd by balloon inflation. - uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + if memfd.is_none() { + return Err(GuestMemoryFromUffdError::NoMemFdReceived); } - let uffd = uffd_builder - .close_on_exec(true) - .non_blocking(true) + let memfd = memfd.unwrap(); + + let guest_memory = GuestMemoryMmap::restore(Some(&memfd), mem_state, track_dirty_pages)?; + + let uffd = UffdBuilder::new() + .require_features( + FeatureFlags::EVENT_REMOVE + | FeatureFlags::EVENT_REMAP + | FeatureFlags::EVENT_FORK + | FeatureFlags::EVENT_UNMAP + | FeatureFlags::MISSING_SHMEM + | FeatureFlags::MINOR_SHMEM + | FeatureFlags::PAGEFAULT_FLAG_WP, + ) + .user_mode_only(false) .create() .map_err(GuestMemoryFromUffdError::Create)?; @@ -642,8 +700,6 @@ fn guest_memory_from_uffd( let host_base_addr = mem_region.as_ptr(); let size = mem_region.size(); - uffd.register(host_base_addr as _, size as _) - .map_err(GuestMemoryFromUffdError::Register)?; backend_mappings.push(GuestRegionUffdMapping { base_host_virt_addr: host_base_addr as u64, 
size, @@ -655,7 +711,6 @@ fn guest_memory_from_uffd( // (i.e GuestRegionUffdMapping entries). let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); - let socket = UnixStream::connect(mem_uds_path)?; socket.send_with_fd( backend_mappings.as_bytes(), // In the happy case we can close the fd since the other process has it open and is @@ -691,6 +746,11 @@ fn guest_memory_from_uffd( uffd.as_raw_fd(), )?; + // Wait for UFFD to be ready. + // TODO: maybe add a timeout? + let mut buf = [0; 2]; + socket.read_exact(&mut buf)?; + Ok((guest_memory, Some(uffd))) } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 974a73ab109..189aa4eb8b5 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -22,6 +22,7 @@ use crate::vmm_config::machine_config::{VmConfig, VmConfigError, VmUpdateConfig} use crate::vmm_config::metrics::{init_metrics, MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; +use crate::vmm_config::snapshot::MemBackendConfig; use crate::vmm_config::vsock::*; use crate::vstate::vcpu::VcpuConfig; @@ -119,6 +120,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, + /// When backed by a memory on boot, this should be set + pub memory_backend: Option<MemBackendConfig>, } impl VmResources { @@ -238,6 +241,16 @@ impl VmResources { self.vm_config.track_dirty_pages = dirty_page_tracking; } + /// Returns the config for the backing memory file + pub fn memory_backend(&self) -> Option<MemBackendConfig> { + self.memory_backend.clone() + } + + /// Sets the backing memory file + pub fn set_memory_backend(&mut self, backing_mem_file: MemBackendConfig) { + self.memory_backend.get_or_insert(backing_mem_file); + } + /// Returns the VmConfig. 
pub fn vm_config(&self) -> &VmConfig { &self.vm_config @@ -589,6 +602,7 @@ mod tests { mmds: None, boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, + memory_backend: None, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 5eb460ba676..8b4fe35c4c7 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -39,7 +39,9 @@ use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::{ NetworkInterfaceConfig, NetworkInterfaceError, NetworkInterfaceUpdateConfig, }; -use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, SnapshotType}; +use crate::vmm_config::snapshot::{ + CreateSnapshotParams, LoadSnapshotParams, MemBackendConfig, SnapshotType, +}; use crate::vmm_config::vsock::{VsockConfigError, VsockDeviceConfig}; use crate::vmm_config::{self, RateLimiterUpdate}; use crate::{EventManager, FcExitCode}; @@ -99,6 +101,9 @@ pub enum VmmAction { /// `BalloonDeviceConfig` as input. This action can only be called before the microVM /// has booted. SetBalloonDevice(BalloonDeviceConfig), + /// Set the memory backend for the VM. The VM will use this backend to handle its + /// memory. This action can only be called before the microVM has booted. + SetMemoryBackend(MemBackendConfig), /// Set the MMDS configuration. SetMmdsConfiguration(MmdsConfig), /// Set the vsock device or update the one that already exists using the @@ -431,6 +436,7 @@ impl<'a> PrebootApiController<'a> { SetBalloonDevice(config) => self.set_balloon_device(config), SetVsockDevice(config) => self.set_vsock_device(config), SetMmdsConfiguration(config) => self.set_mmds_config(config), + SetMemoryBackend(config) => self.set_memory_backend(config), StartMicroVm => self.start_microvm(), UpdateVmConfiguration(config) => self.update_vm_config(config), // Operations not allowed pre-boot. 
@@ -456,6 +462,13 @@ impl<'a> PrebootApiController<'a> { .map_err(VmmActionError::BalloonConfig) } + fn set_memory_backend(&mut self, cfg: MemBackendConfig) -> ActionResult { + self.boot_path = true; + self.vm_resources.memory_backend = Some(cfg); + + Ok(VmmData::Empty) + } + fn insert_block_device(&mut self, cfg: BlockDeviceConfig) -> ActionResult { self.boot_path = true; self.vm_resources @@ -667,6 +680,7 @@ impl RuntimeApiController { | InsertNetworkDevice(_) | LoadSnapshot(_) | SetBalloonDevice(_) + | SetMemoryBackend(_) | SetVsockDevice(_) | SetMmdsConfiguration(_) | StartMicroVm @@ -733,14 +747,14 @@ impl RuntimeApiController { fn create_snapshot(&mut self, create_params: &CreateSnapshotParams) -> ActionResult { log_dev_preview_warning("Virtual machine snapshots", None); - if create_params.snapshot_type == SnapshotType::Diff - && !self.vm_resources.track_dirty_pages() - { - return Err(VmmActionError::NotSupported( - "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." - .to_string(), - )); - } + // if create_params.snapshot_type == SnapshotType::Diff + // && !self.vm_resources.track_dirty_pages() + // { + // return Err(VmmActionError::NotSupported( + // "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." 
+ // .to_string(), + // )); + // } let mut locked_vmm = self.vmm.lock().unwrap(); let vm_cfg = self.vm_resources.vm_config(); @@ -887,6 +901,7 @@ mod tests { pub boot_timer: bool, // when `true`, all self methods are forced to fail pub force_errors: bool, + pub memory_backend: Option, } impl MockVmRes { diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index d7d73345f40..a6b66398130 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -23,7 +23,7 @@ pub enum SnapshotType { /// 1) A file that contains the guest memory to be loaded, /// 2) An UDS where a custom page-fault handler process is listening for /// the UFFD set up by Firecracker to handle its guest memory page faults. -#[derive(Debug, PartialEq, Eq, Deserialize)] +#[derive(Debug, Clone, Deserialize, Eq, PartialEq)] pub enum MemBackendType { /// Guest memory contents will be loaded from a file. File, @@ -86,7 +86,7 @@ pub struct LoadSnapshotConfig { } /// Stores the configuration used for managing snapshot memory. -#[derive(Debug, PartialEq, Eq, Deserialize)] +#[derive(Debug, Clone, Deserialize, Eq, PartialEq)] #[serde(deny_unknown_fields)] pub struct MemBackendConfig { /// Path to the backend used to handle the guest memory. 
diff --git a/tests/host_tools/uffd/Cargo.toml b/tests/host_tools/uffd/Cargo.toml index a1243eeb9ca..200cb20b0d6 100644 --- a/tests/host_tools/uffd/Cargo.toml +++ b/tests/host_tools/uffd/Cargo.toml @@ -9,9 +9,9 @@ utils = { path = "../../../src/utils" } libc = "0.2.121" nix = "0.23.0" -serde = { version = "1.0.136", features = ["derive"] } -serde_json = "1.0.79" -userfaultfd = "0.4.2" +serde = { version = ">=1.0.27", features = ["derive"] } +serde_json = ">=1.0.9" +userfaultfd = ">=0.5.0" [workspace] diff --git a/tools/devtool b/tools/devtool index a5a660b974f..cf513552ce5 100755 --- a/tools/devtool +++ b/tools/devtool @@ -510,6 +510,7 @@ run_devctr() { --rm \ --volume /dev:/dev \ --volume "$FC_ROOT_DIR:$CTR_FC_ROOT_DIR:z" \ + --mount type=bind,source=/usr/include/linux/userfaultfd.h,target=/usr/include/linux/userfaultfd.h \ --env OPT_LOCAL_IMAGES_PATH="$(dirname "$CTR_MICROVM_IMAGES_DIR")" \ --env PYTHONDONTWRITEBYTECODE=1 \ "$DEVCTR_IMAGE" "${ctr_args[@]}"