From efebfca680470b8d9c995fc81a9386bb68fdbf5d Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Thu, 26 May 2022 21:26:37 +0200 Subject: [PATCH 01/11] MMDS: Add support for lowercased headers Signed-off-by: Ives van Hoorne update test Co-authored-by: Luminita Voicu move test to valid headers --- docs/mmds/mmds-user-guide.md | 2 +- src/mmds/src/lib.rs | 6 ++--- src/mmds/src/token_headers.rs | 22 +++++++++++++++++-- .../integration_tests/functional/test_mmds.py | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/docs/mmds/mmds-user-guide.md b/docs/mmds/mmds-user-guide.md index c6bac05100e..5f9b3f012d0 100644 --- a/docs/mmds/mmds-user-guide.md +++ b/docs/mmds/mmds-user-guide.md @@ -249,7 +249,7 @@ The session must start with an HTTP `PUT` request that generates the session tok In order to be successful, the request must respect the following constraints: - must be directed towards `/latest/api/token` path -- must contain a `X-ametadata-token-ttl-seconds` header specifying the token lifetime +- must contain a `X-metadata-token-ttl-seconds` header specifying the token lifetime in seconds. The value cannot be lower than 1 or greater than 21600 (6 hours). - must not contain a `X-Forwarded-For` header. diff --git a/src/mmds/src/lib.rs b/src/mmds/src/lib.rs index 5360c5a0f0a..ae206ab7d6e 100644 --- a/src/mmds/src/lib.rs +++ b/src/mmds/src/lib.rs @@ -42,7 +42,7 @@ impl fmt::Display for Error { ), Error::NoTtlProvided => write!( f, - "Token time to live value not found. Use `X-metadata-token-ttl_seconds` header to \ + "Token time to live value not found. Use `X-metadata-token-ttl-seconds` header to \ specify the token's lifetime." ), Error::ResourceNotFound(ref uri) => { @@ -705,8 +705,8 @@ mod tests { assert_eq!( Error::NoTtlProvided.to_string(), - "Token time to live value not found. Use `X-metadata-token-ttl_seconds` header to \ - specify the token's lifetime." + "Token time to live value not found. Use `X-metadata-token-ttl-seconds` header to \ + specify the token's lifetime." ); assert_eq!( diff --git a/src/mmds/src/token_headers.rs b/src/mmds/src/token_headers.rs index 5d13d9a4245..1f03d0b4f4e 100644 --- a/src/mmds/src/token_headers.rs +++ b/src/mmds/src/token_headers.rs @@ -39,12 +39,19 @@ impl TokenHeaders { /// Return `TokenHeaders` from headers map. pub fn try_from(map: &HashMap) -> Result { let mut headers = Self::default(); + let lowercased_headers: HashMap = map + .iter() + .map(|(k, v)| (k.to_lowercase(), v.clone())) + .collect(); - if let Some(token) = map.get(TokenHeaders::X_METADATA_TOKEN) { + if let Some(token) = lowercased_headers.get(&TokenHeaders::X_METADATA_TOKEN.to_lowercase()) + { headers.x_metadata_token = Some(token.to_string()); } - if let Some(value) = map.get(TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS) { + if let Some(value) = + lowercased_headers.get(&TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS.to_lowercase()) + { match value.parse::() { Ok(seconds) => { headers.x_metadata_token_ttl_seconds = Some(seconds); @@ -127,6 +134,17 @@ mod tests { let headers = TokenHeaders::try_from(&map).unwrap(); assert_eq!(*headers.x_metadata_token().unwrap(), "".to_string()); + // Lowercased headers + let mut map: HashMap = HashMap::default(); + map.insert( + TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS + .to_string() + .to_lowercase(), + "60".to_string(), + ); + let headers = TokenHeaders::try_from(&map).unwrap(); + assert_eq!(headers.x_metadata_token_ttl_seconds().unwrap(), 60); + // Invalid value. let mut map: HashMap = HashMap::default(); map.insert( diff --git a/tests/integration_tests/functional/test_mmds.py b/tests/integration_tests/functional/test_mmds.py index 870bce0a70a..92f735436df 100644 --- a/tests/integration_tests/functional/test_mmds.py +++ b/tests/integration_tests/functional/test_mmds.py @@ -832,7 +832,7 @@ def test_mmds_v2_negative(test_microvm_with_api, network_config): cmd = f"curl -m 2 -s -X PUT http://{DEFAULT_IPV4}/latest/api/token" expected = ( "Token time to live value not found. Use " - "`X-metadata-token-ttl_seconds` header to specify " + "`X-metadata-token-ttl-seconds` header to specify " "the token's lifetime." ) _run_guest_cmd(ssh_connection, cmd, expected) From a43235462de3642f4fa2130c9722b60da3cd3298 Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Tue, 21 Jun 2022 12:25:56 +0200 Subject: [PATCH 02/11] Support shared mmap for running VMs This allows us to run VMs while streaming memory changes to disk support mmap shared update log add configuration for memory backing file Progress tweaks --- .../seccomp/aarch64-unknown-linux-musl.json | 13 +++ .../seccomp/x86_64-unknown-linux-musl.json | 13 +++ src/api_server/src/parsed_request.rs | 2 + .../src/request/memory_backing_file.rs | 42 ++++++++++ src/api_server/src/request/mod.rs | 1 + src/api_server/swagger/firecracker.yaml | 31 +++++++ src/logger/src/metrics.rs | 4 + src/vm-memory/src/lib.rs | 2 +- src/vmm/src/builder.rs | 60 ++++++++++---- src/vmm/src/lib.rs | 11 +-- src/vmm/src/persist.rs | 80 ++++++++++++++----- src/vmm/src/resources.rs | 14 ++++ src/vmm/src/rpc_interface.rs | 45 ++++++++--- src/vmm/src/vmm_config/memory_backing_file.rs | 31 +++++++ src/vmm/src/vmm_config/mod.rs | 2 + 15 files changed, 303 insertions(+), 48 deletions(-) create mode 100644 src/api_server/src/request/memory_backing_file.rs create mode 100644 src/vmm/src/vmm_config/memory_backing_file.rs diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index ca0e51381a5..5a2204f41d6 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -596,6 +596,19 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index b419581ea3f..02bc9629a6a 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -248,6 +248,19 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index 9b4cd6e64b9..7565853e1ab 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -17,6 +17,7 @@ use crate::request::logger::parse_put_logger; use crate::request::machine_configuration::{ parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config, }; +use crate::request::memory_backing_file::parse_put_memory_backing_file; use crate::request::metrics::parse_put_metrics; use crate::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use crate::request::net::{parse_patch_net, parse_put_net}; @@ -112,6 +113,7 @@ impl ParsedRequest { (Method::Put, "network-interfaces", Some(body)) => { parse_put_net(body, path_tokens.get(1)) } + (Method::Put, "memory-backing-file", Some(body)) => parse_put_memory_backing_file(body), (Method::Put, "shutdown-internal", None) => { Ok(ParsedRequest::new(RequestAction::ShutdownInternal)) } diff --git a/src/api_server/src/request/memory_backing_file.rs b/src/api_server/src/request/memory_backing_file.rs new file mode 100644 index 00000000000..e29798039bc --- /dev/null +++ b/src/api_server/src/request/memory_backing_file.rs @@ -0,0 +1,42 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::super::VmmAction; +use crate::parsed_request::{Error, ParsedRequest}; +use crate::request::Body; +use logger::{IncMetric, METRICS}; +use vmm::vmm_config::memory_backing_file::MemoryBackingFileConfig; + +pub(crate) fn parse_put_memory_backing_file(body: &Body) -> Result { + METRICS.put_api_requests.memory_backing_file_cfg_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::SetMemoryBackingFile( + serde_json::from_slice::(body.raw()).map_err(|e| { + METRICS.put_api_requests.memory_backing_file_cfg_fails.inc(); + Error::SerdeJson(e) + })?, + ))) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use super::*; + + #[test] + fn test_parse_memory_backing_file() { + assert!(parse_put_memory_backing_file(&Body::new("invalid_payload")).is_err()); + + let body = r#"{ + "path": "./memory.snap" + }"#; + let same_body = MemoryBackingFileConfig { + path: PathBuf::from("./memory.snap"), + }; + let result = parse_put_memory_backing_file(&Body::new(body)); + assert!(result.is_ok()); + let parsed_req = result.unwrap_or_else(|_e| panic!("Failed test.")); + + assert!(parsed_req == ParsedRequest::new_sync(VmmAction::SetMemoryBackingFile(same_body))); + } +} diff --git a/src/api_server/src/request/mod.rs b/src/api_server/src/request/mod.rs index 75f9a0daef3..c2f52110bf0 100644 --- a/src/api_server/src/request/mod.rs +++ b/src/api_server/src/request/mod.rs @@ -8,6 +8,7 @@ pub mod drive; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_backing_file; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/api_server/swagger/firecracker.yaml b/src/api_server/swagger/firecracker.yaml index 07d242532e6..14cae2b7a69 100644 --- a/src/api_server/swagger/firecracker.yaml +++ b/src/api_server/swagger/firecracker.yaml @@ -350,6 +350,29 @@ paths: description: Internal server error schema: $ref: "#/definitions/Error" + + /memory-backing-file: + put: + summary: Configures a memory backing file to sync the memory changes to during the runtime of the vm + operationId: putMemoryBackingFile + parameters: + - name: body + in: body + description: Path to memory backing file + required: true + schema: + $ref: "#/definitions/MemoryBackingFile" + responses: + 204: + description: Memory backing file configured + 400: + description: Memory backing file failed + schema: + $ref: "#/definitions/Error" + default: + description: Internal server error. + schema: + $ref: "#/definitions/Error" /metrics: put: @@ -1047,6 +1070,14 @@ definitions: tx_rate_limiter: $ref: "#/definitions/RateLimiter" + MemoryBackingFile: + type: object + required: + - path + properties: + path: + type: string + PartialDrive: type: object required: diff --git a/src/logger/src/metrics.rs b/src/logger/src/metrics.rs index 779689f04b8..aba286cc4d8 100644 --- a/src/logger/src/metrics.rs +++ b/src/logger/src/metrics.rs @@ -403,6 +403,10 @@ pub struct PutRequestsMetrics { pub machine_cfg_count: SharedIncMetric, /// Number of failures in configuring the machine. pub machine_cfg_fails: SharedIncMetric, + /// Number of PUTs for setting memory backing file. + pub memory_backing_file_cfg_count: SharedIncMetric, + /// Number of failures in configuring the machine. + pub memory_backing_file_cfg_fails: SharedIncMetric, /// Number of PUTs for initializing the metrics system. pub metrics_count: SharedIncMetric, /// Number of failures in initializing the metrics system. diff --git a/src/vm-memory/src/lib.rs b/src/vm-memory/src/lib.rs index 814f79098b4..1d18139dd99 100644 --- a/src/vm-memory/src/lib.rs +++ b/src/vm-memory/src/lib.rs @@ -117,7 +117,7 @@ pub fn create_guest_memory( for region in regions { let flags = match region.0 { None => libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, - Some(_) => libc::MAP_NORESERVE | libc::MAP_PRIVATE, + Some(_) => libc::MAP_NORESERVE | libc::MAP_SHARED, }; let mmap_region = diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index bffb43e47cf..c096d8cb132 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,9 +5,11 @@ use std::convert::TryFrom; use std::fmt::{Display, Formatter}; +use std::fs::File; use std::io::{self, Read, Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; use std::sync::{Arc, Mutex}; +use vm_memory::FileOffset; use arch::InitrdConfig; #[cfg(target_arch = "x86_64")] @@ -28,7 +30,6 @@ use linux_loader::loader::KernelLoader; use logger::{error, warn, METRICS}; use seccompiler::BpfThreadMap; use snapshot::Persist; -use userfaultfd::Uffd; use utils::eventfd::EventFd; use utils::terminal::Terminal; use utils::time::TimestampUs; @@ -43,7 +44,7 @@ use crate::construct_kvm_mpidrs; use crate::device_manager::legacy::PortIODeviceManager; use crate::device_manager::mmio::MMIODeviceManager; use crate::device_manager::persist::MMIODevManagerConstructorArgs; -use crate::persist::{MicrovmState, MicrovmStateError}; +use crate::persist::{MemoryDescriptor, MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -58,6 +59,8 @@ use crate::{device_manager, mem_size_mib, Error, EventManager, Vmm, VmmEventsObs pub enum StartMicrovmError { /// Unable to attach block device to Vmm. AttachBlockDevice(io::Error), + /// Unable to create the memory backing file. + BackingMemoryFile(io::Error), /// This error is thrown by the minimal boot loader implementation. ConfigureSystem(arch::Error), /// Internal errors are due to resource exhaustion. @@ -112,6 +115,9 @@ impl Display for StartMicrovmError { write!(f, "Unable to attach block device to Vmm: {}", err) } ConfigureSystem(err) => write!(f, "System configuration error: {:?}", err), + BackingMemoryFile(err) => { + write!(f, "Unable to create the memory backing file: {}", err) + } CreateRateLimiter(err) => write!(f, "Cannot create RateLimiter: {}", err), CreateNetDevice(err) => { let mut err_msg = format!("{:?}", err); @@ -231,7 +237,7 @@ fn create_vmm_and_vcpus( instance_info: &InstanceInfo, event_manager: &mut EventManager, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, vcpu_count: u8, ) -> std::result::Result<(Vmm, Vec), StartMicrovmError> { @@ -297,7 +303,7 @@ fn create_vmm_and_vcpus( shutdown_exit_code: None, vm, guest_memory, - uffd, + memory_descriptor, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, @@ -329,8 +335,23 @@ pub fn build_microvm_for_boot( let boot_config = vm_resources.boot_source().ok_or(MissingKernelConfig)?; let track_dirty_pages = vm_resources.track_dirty_pages(); - let guest_memory = - create_guest_memory(vm_resources.vm_config().mem_size_mib, track_dirty_pages)?; + + let backing_memory_file = if let Some(ref file) = vm_resources.backing_memory_file { + file.set_len((vm_resources.vm_config().mem_size_mib * 1024 * 1024) as u64) + .map_err(|e| { + error!("Failed to set backing memory file size: {}", e); + StartMicrovmError::BackingMemoryFile(e) + })?; + + Some(file.clone()) + } else { + None + }; + let guest_memory = create_guest_memory( + vm_resources.vm_config().mem_size_mib, + backing_memory_file.clone(), + track_dirty_pages, + )?; let vcpu_config = vm_resources.vcpu_config(); let entry_addr = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; @@ -362,7 +383,7 @@ pub fn build_microvm_for_boot( instance_info, event_manager, guest_memory, - None, + backing_memory_file.map(MemoryDescriptor::File), track_dirty_pages, vcpu_config.vcpu_count, )?; @@ -451,7 +472,7 @@ pub fn build_microvm_from_snapshot( event_manager: &mut EventManager, microvm_state: MicrovmState, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, @@ -466,7 +487,7 @@ pub fn build_microvm_from_snapshot( instance_info, event_manager, guest_memory.clone(), - uffd, + memory_descriptor, track_dirty_pages, vcpu_count, )?; @@ -581,15 +602,24 @@ pub fn build_microvm_from_snapshot( /// Creates GuestMemory of `mem_size_mib` MiB in size. pub fn create_guest_memory( mem_size_mib: usize, + backing_memory_file: Option>, track_dirty_pages: bool, ) -> std::result::Result { let mem_size = mem_size_mib << 20; let arch_mem_regions = arch::arch_memory_regions(mem_size); + let mut offset = 0_u64; vm_memory::create_guest_memory( &arch_mem_regions .iter() - .map(|(addr, size)| (None, *addr, *size)) + .map(|(addr, size)| { + let file_offset = backing_memory_file + .clone() + .map(|file| FileOffset::from_arc(file, offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) .collect::>()[..], track_dirty_pages, ) @@ -1068,7 +1098,7 @@ pub mod tests { } pub(crate) fn default_vmm() -> Vmm { - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); let vcpus_exit_evt = EventFd::new(libc::EFD_NONBLOCK) .map_err(Error::EventFd) @@ -1096,12 +1126,12 @@ pub mod tests { shutdown_exit_code: None, vm, guest_memory, - uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, #[cfg(target_arch = "x86_64")] pio_device_manager, + memory_descriptor: None, } } @@ -1283,13 +1313,13 @@ pub mod tests { // Case 1: create guest memory without dirty page tracking { - let guest_memory = create_guest_memory(mem_size, false).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, false).unwrap(); assert!(!is_dirty_tracking_enabled(&guest_memory)); } // Case 2: create guest memory with dirty page tracking { - let guest_memory = create_guest_memory(mem_size, true).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, true).unwrap(); assert!(is_dirty_tracking_enabled(&guest_memory)); } } @@ -1297,7 +1327,7 @@ pub mod tests { #[test] fn test_create_vcpus() { let vcpu_count = 2; - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); #[allow(unused_mut)] let mut vm = setup_kvm_vm(&guest_memory, false).unwrap(); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index d71932525b5..90975ac6acc 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -50,10 +50,10 @@ use devices::virtio::{ use devices::BusDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use logger::{error, info, warn, LoggerError, MetricsError, METRICS}; +use persist::MemoryDescriptor; use rate_limiter::BucketUpdate; use seccompiler::BpfProgram; use snapshot::Persist; -use userfaultfd::Uffd; use utils::epoll::EventSet; use utils::eventfd::EventFd; use vm_memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; @@ -260,10 +260,6 @@ pub struct Vmm { // Guest VM core resources. vm: Vm, guest_memory: GuestMemoryMmap, - // Save UFFD in order to keep it open in the Firecracker process, as well. - // Since this field is never read again, we need to allow `dead_code`. - #[allow(dead_code)] - uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, @@ -272,6 +268,11 @@ pub struct Vmm { mmio_device_manager: MMIODeviceManager, #[cfg(target_arch = "x86_64")] pio_device_manager: PortIODeviceManager, + + // The mem file that should be mmaped. We need to keep a reference of the UFFD in the + // process so we allow dead_code + #[allow(dead_code)] + memory_descriptor: Option, } impl Vmm { diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 309cbce46b3..0ff5ca1f9da 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -325,6 +325,27 @@ fn snapshot_memory_to_file( snapshot_type: &SnapshotType, ) -> std::result::Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; + if OpenOptions::new().read(true).open(mem_file_path).is_ok() + && snapshot_type == &SnapshotType::Diff + { + // // The memory file already exists. + // // We're going to use the msync behaviour + // for region in vmm.guest_memory().iter() { + // info!("msyncing memory region"); + // unsafe { + // if libc::msync(region.as_ptr() as _, region.len() as _, libc::MS_SYNC) == -1 { + // return Err(CreateSnapshotError::Memory( + // memory_snapshot::Error::CreateRegion(vm_memory::MmapRegionError::Mmap( + // std::io::Error::last_os_error(), + // )), + // )); + // } + // }; + // } + + return Ok(()); + } + let mut file = OpenOptions::new() .write(true) .create(true) @@ -483,6 +504,16 @@ pub fn snapshot_state_sanity_check( Ok(()) } +/// Describes a descriptor that connects to the memory used by the VM. This could either be the a file descriptor +/// or a UFFD descriptor. +#[derive(Debug)] +pub enum MemoryDescriptor { + /// A file descriptor that connects to the user fault process. + Uffd(Uffd), + /// A file descriptor of the backing memory file. + File(Arc), +} + /// Loads a Microvm snapshot producing a 'paused' Microvm. pub fn restore_from_snapshot( instance_info: &InstanceInfo, @@ -501,26 +532,31 @@ pub fn restore_from_snapshot( let mem_backend_path = ¶ms.mem_backend.backend_path; let mem_state = µvm_state.memory_state; let track_dirty_pages = params.enable_diff_snapshots; - let (guest_memory, uffd) = match params.mem_backend.backend_type { - MemBackendType::File => ( - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)?, - None, - ), - MemBackendType::Uffd => guest_memory_from_uffd( - mem_backend_path, - mem_state, - track_dirty_pages, - // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device - // is present in the microVM state. - microvm_state.device_states.balloon_device.is_some(), - )?, + let (guest_memory, memory_descriptor) = match params.mem_backend.backend_type { + MemBackendType::File => { + let (guest_memory, file) = + guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)?; + (guest_memory, Some(MemoryDescriptor::File(Arc::new(file)))) + } + MemBackendType::Uffd => { + let (guest_memory, uffd) = guest_memory_from_uffd( + mem_backend_path, + mem_state, + track_dirty_pages, + // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device + // is present in the microVM state. + microvm_state.device_states.balloon_device.is_some(), + )?; + + (guest_memory, uffd.map(MemoryDescriptor::Uffd)) + } }; builder::build_microvm_from_snapshot( instance_info, event_manager, microvm_state, guest_memory, - uffd, + memory_descriptor, track_dirty_pages, seccomp_filters, vm_resources, @@ -545,11 +581,19 @@ fn guest_memory_from_file( mem_file_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, -) -> std::result::Result { +) -> std::result::Result<(GuestMemoryMmap, File), LoadSnapshotError> { use self::LoadSnapshotError::{DeserializeMemory, MemoryBackingFile}; - let mem_file = File::open(mem_file_path).map_err(MemoryBackingFile)?; - GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages) - .map_err(DeserializeMemory) + let mem_file = OpenOptions::new() + .write(true) + .read(true) + .open(mem_file_path) + .map_err(MemoryBackingFile)?; + + Ok(( + GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages) + .map_err(DeserializeMemory)?, + mem_file, + )) } fn guest_memory_from_uffd( diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 9a5257ac307..fb9c3b11e61 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::From; +use std::fs::File; use std::sync::{Arc, Mutex, MutexGuard}; use logger::info; @@ -117,6 +118,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, + /// When backed by a memory file, this should be set + pub backing_memory_file: Option>, } impl VmResources { @@ -236,6 +239,16 @@ impl VmResources { self.vm_config.track_dirty_pages = dirty_page_tracking; } + /// Returns the config for the backing memory file + pub fn backing_memory_file(&self) -> Option> { + self.backing_memory_file.clone() + } + + /// Sets the backing memory file + pub fn set_backing_memory_file(&mut self, backing_mem_file: Arc) { + self.backing_memory_file.get_or_insert(backing_mem_file); + } + /// Returns the VmConfig. pub fn vm_config(&self) -> &VmConfig { &self.vm_config @@ -575,6 +588,7 @@ mod tests { mmds: None, boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, + backing_memory_file: None, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 2a79237d3f1..87f9f2aae2a 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt::{Display, Formatter}; -use std::result; use std::sync::{Arc, Mutex, MutexGuard}; +use std::{fs, result}; use logger::*; use mmds::data_store::{self, Mmds}; @@ -34,6 +34,7 @@ use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, Drive use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::logger::{LoggerConfig, LoggerConfigError}; use crate::vmm_config::machine_config::{VmConfig, VmConfigError, VmUpdateConfig}; +use crate::vmm_config::memory_backing_file::{MemoryBackingFileConfig, MemoryBackingFileError}; use crate::vmm_config::metrics::{MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::{ @@ -99,6 +100,9 @@ pub enum VmmAction { /// `BalloonDeviceConfig` as input. This action can only be called before the microVM /// has booted. SetBalloonDevice(BalloonDeviceConfig), + /// Set the memory backing file for the VM. The VM will use this backing file to store its + /// memory. This action can only be called before the microVM has booted. + SetMemoryBackingFile(MemoryBackingFileConfig), /// Set the MMDS configuration. SetMmdsConfiguration(MmdsConfig), /// Set the vsock device or update the one that already exists using the @@ -130,6 +134,8 @@ pub enum VmmAction { pub enum VmmActionError { /// The action `SetBalloonDevice` failed because of bad user input. BalloonConfig(BalloonConfigError), + /// The action `SetMemoryBackingFile` failed because of bad user input. + MemoryBackingFile(MemoryBackingFileError), /// The action `ConfigureBootSource` failed because of bad user input. BootSource(BootSourceConfigError), /// The action `CreateSnapshot` failed. @@ -182,6 +188,7 @@ impl Display for VmmActionError { match self { BalloonConfig(err) => err.to_string(), BootSource(err) => err.to_string(), + MemoryBackingFile(err) => err.to_string(), CreateSnapshot(err) => err.to_string(), DriveConfig(err) => err.to_string(), InternalVmm(err) => format!("Internal Vmm error: {}", err), @@ -422,6 +429,7 @@ impl<'a> PrebootApiController<'a> { SetBalloonDevice(config) => self.set_balloon_device(config), SetVsockDevice(config) => self.set_vsock_device(config), SetMmdsConfiguration(config) => self.set_mmds_config(config), + SetMemoryBackingFile(config) => self.set_backing_memory_file(config), StartMicroVm => self.start_microvm(), UpdateVmConfiguration(config) => self.update_vm_config(config), // Operations not allowed pre-boot. @@ -447,6 +455,23 @@ impl<'a> PrebootApiController<'a> { .map_err(VmmActionError::BalloonConfig) } + fn set_backing_memory_file(&mut self, cfg: MemoryBackingFileConfig) -> ActionResult { + self.boot_path = true; + let file = fs::OpenOptions::new() + .create(true) + .read(true) + .write(true) + .open(cfg.path) + .map(Arc::new) + .map_err(|e| { + VmmActionError::MemoryBackingFile(MemoryBackingFileError::CreateFile(e)) + })?; + + self.vm_resources.backing_memory_file = Some(file); + + Ok(VmmData::Empty) + } + fn insert_block_device(&mut self, cfg: BlockDeviceConfig) -> ActionResult { self.boot_path = true; self.vm_resources @@ -654,6 +679,7 @@ impl RuntimeApiController { | InsertNetworkDevice(_) | LoadSnapshot(_) | SetBalloonDevice(_) + | SetMemoryBackingFile(_) | SetVsockDevice(_) | SetMmdsConfiguration(_) | StartMicroVm @@ -720,14 +746,14 @@ impl RuntimeApiController { fn create_snapshot(&mut self, create_params: &CreateSnapshotParams) -> ActionResult { log_dev_preview_warning("Virtual machine snapshots", None); - if create_params.snapshot_type == SnapshotType::Diff - && !self.vm_resources.track_dirty_pages() - { - return Err(VmmActionError::NotSupported( - "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." - .to_string(), - )); - } + // if create_params.snapshot_type == SnapshotType::Diff + // && !self.vm_resources.track_dirty_pages() + // { + // return Err(VmmActionError::NotSupported( + // "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." + // .to_string(), + // )); + // } let mut locked_vmm = self.vmm.lock().unwrap(); let create_start_us = utils::time::get_time_us(utils::time::ClockType::Monotonic); @@ -862,6 +888,7 @@ mod tests { pub boot_timer: bool, // when `true`, all self methods are forced to fail pub force_errors: bool, + pub backing_memory_file: Option>, } impl MockVmRes { diff --git a/src/vmm/src/vmm_config/memory_backing_file.rs b/src/vmm/src/vmm_config/memory_backing_file.rs new file mode 100644 index 00000000000..f5f5bd9ce6f --- /dev/null +++ b/src/vmm/src/vmm_config/memory_backing_file.rs @@ -0,0 +1,31 @@ +use std::{ + fmt::{Display, Formatter}, + io, + path::PathBuf, +}; + +use serde::{Deserialize, Serialize}; + +/// Keeps the Memory Backing file configuration. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +#[serde(deny_unknown_fields)] +pub struct MemoryBackingFileConfig { + /// Location of the memory backing file. + pub path: PathBuf, +} + +/// Errors associated with the operations allowed on a memory backing file. +#[derive(Debug)] +pub enum MemoryBackingFileError { + /// Failed to create the block device + CreateFile(io::Error), +} + +impl Display for MemoryBackingFileError { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + use self::MemoryBackingFileError::*; + match self { + CreateFile(e) => write!(f, "Unable to create the memory backing file: {}", e), + } + } +} diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index c509fce888f..61b235a0fbb 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -23,6 +23,8 @@ pub mod instance_info; pub mod logger; /// Wrapper for configuring the memory and CPU of the microVM. pub mod machine_config; +/// Wrapper for configuring the backing memory file. +pub mod memory_backing_file; /// Wrapper for configuring the metrics. pub mod metrics; /// Wrapper for configuring the MMDS. From 41643714f571f281ac9777ff72254591856ea0da Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Wed, 31 Aug 2022 12:05:46 +0200 Subject: [PATCH 03/11] fix: VMs getting stuck when starting from snapshot Fixes https://github.com/firecracker-microvm/firecracker/issues/3043 Fixes https://github.com/firecracker-microvm/firecracker/issues/3020 --- src/cpuid/src/transformer/amd.rs | 3 +++ src/cpuid/src/transformer/common.rs | 13 +++++++++++++ src/cpuid/src/transformer/intel.rs | 2 ++ 3 files changed, 18 insertions(+) diff --git a/src/cpuid/src/transformer/amd.rs b/src/cpuid/src/transformer/amd.rs index c1db38260a5..261e88ef4e4 100644 --- a/src/cpuid/src/transformer/amd.rs +++ b/src/cpuid/src/transformer/amd.rs @@ -147,6 +147,9 @@ impl CpuidTransformer for AmdCpuidTransformer { leaf_0x8000001d::LEAF_NUM => Some(amd::update_extended_cache_topology_entry), leaf_0x8000001e::LEAF_NUM => Some(amd::update_extended_apic_id_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + + // hypervisor stuff + 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } diff --git a/src/cpuid/src/transformer/common.rs b/src/cpuid/src/transformer/common.rs index ba89fc35f74..59de658fba5 100644 --- a/src/cpuid/src/transformer/common.rs +++ b/src/cpuid/src/transformer/common.rs @@ -69,6 +69,19 @@ pub fn update_brand_string_entry( Ok(()) } +// KVM feature bits +#[cfg(target_arch = "x86_64")] +const KVM_FEATURE_ASYNC_PF_INT_BIT: u32 = 14; + +pub fn disable_kvm_feature_async_pf( + entry: &mut kvm_cpuid_entry2, + vm_spec: &VmSpec, +) -> Result<(), Error> { + entry.eax.write_bit(KVM_FEATURE_ASYNC_PF_INT_BIT, false); + + Ok(()) +} + pub fn update_cache_parameters_entry( entry: &mut kvm_cpuid_entry2, vm_spec: &VmSpec, diff --git a/src/cpuid/src/transformer/intel.rs b/src/cpuid/src/transformer/intel.rs index 8505b668932..439058c8780 100644 --- a/src/cpuid/src/transformer/intel.rs +++ b/src/cpuid/src/transformer/intel.rs @@ -126,6 +126,8 @@ impl CpuidTransformer for IntelCpuidTransformer { leaf_0xa::LEAF_NUM => Some(intel::update_perf_mon_entry), leaf_0xb::LEAF_NUM => Some(intel::update_extended_topology_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + // hypervisor stuff + 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } From ab6fe7395f99f8993b1940ab3ea1151c46e7545e Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Sun, 9 Oct 2022 23:40:12 +0200 Subject: [PATCH 04/11] Progress --- .../seccomp/x86_64-unknown-linux-musl.json | 12 ++ src/api_server/src/parsed_request.rs | 4 +- src/api_server/src/request/memory_backend.rs | 46 ++++ .../src/request/memory_backing_file.rs | 42 ---- src/api_server/src/request/mod.rs | 2 +- src/api_server/swagger/firecracker.yaml | 22 +- src/cpuid/src/transformer/common.rs | 2 +- src/logger/src/metrics.rs | 4 +- src/vmm/src/builder.rs | 199 ++++++++++++++++-- src/vmm/src/memory_snapshot.rs | 143 ++++++++++++- src/vmm/src/persist.rs | 128 +++++++---- src/vmm/src/resources.rs | 16 +- src/vmm/src/rpc_interface.rs | 34 +-- src/vmm/src/vmm_config/memory_backing_file.rs | 31 --- src/vmm/src/vmm_config/mod.rs | 2 - src/vmm/src/vmm_config/snapshot.rs | 4 +- 16 files changed, 502 insertions(+), 189 deletions(-) create mode 100644 src/api_server/src/request/memory_backend.rs delete mode 100644 src/api_server/src/request/memory_backing_file.rs delete mode 100644 src/vmm/src/vmm_config/memory_backing_file.rs diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 02bc9629a6a..51c90224393 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -261,6 +261,18 @@ } ] }, + { + "syscall": "memfd_create", + "comment": "Used to create a memory backed file descriptor that can be used to save memory to" + }, + { + "syscall": "nanosleep", + "comment": "Debugging sleep" + }, + { + "syscall": "copy_file_range", + "comment": "debugging" + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index 7565853e1ab..b81e3391b0e 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -17,7 +17,7 @@ use crate::request::logger::parse_put_logger; use crate::request::machine_configuration::{ parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config, }; -use crate::request::memory_backing_file::parse_put_memory_backing_file; +use crate::request::memory_backend::parse_put_memory_backend; use crate::request::metrics::parse_put_metrics; use crate::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use crate::request::net::{parse_patch_net, parse_put_net}; @@ -113,7 +113,7 @@ impl ParsedRequest { (Method::Put, "network-interfaces", Some(body)) => { parse_put_net(body, path_tokens.get(1)) } - (Method::Put, "memory-backing-file", Some(body)) => parse_put_memory_backing_file(body), + (Method::Put, "memory-backend", Some(body)) => parse_put_memory_backend(body), (Method::Put, "shutdown-internal", None) => { Ok(ParsedRequest::new(RequestAction::ShutdownInternal)) } diff --git a/src/api_server/src/request/memory_backend.rs b/src/api_server/src/request/memory_backend.rs new file mode 100644 index 00000000000..b81c7f5fc78 --- /dev/null +++ b/src/api_server/src/request/memory_backend.rs @@ -0,0 +1,46 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::super::VmmAction; +use crate::parsed_request::{Error, ParsedRequest}; +use crate::request::Body; +use logger::{IncMetric, METRICS}; +use vmm::vmm_config::snapshot::MemBackendConfig; + +pub(crate) fn parse_put_memory_backend(body: &Body) -> Result { + METRICS.put_api_requests.memory_backend_cfg_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::SetMemoryBackend( + serde_json::from_slice::(body.raw()).map_err(|e| { + METRICS.put_api_requests.memory_backend_cfg_fails.inc(); + Error::SerdeJson(e) + })?, + ))) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use vmm::vmm_config::snapshot::MemBackendType; + + use super::*; + + #[test] + fn test_parse_memory_backing_file() { + assert!(parse_put_memory_backend(&Body::new("invalid_payload")).is_err()); + + let body = r#"{ + "backend_type": "File", + "backend_path": "./memory.snap" + }"#; + let same_body = MemBackendConfig { + backend_type: MemBackendType::File, + backend_path: PathBuf::from("./memory.snap"), + }; + let result = parse_put_memory_backend(&Body::new(body)); + assert!(result.is_ok()); + let parsed_req = result.unwrap_or_else(|_e| panic!("Failed test.")); + + assert!(parsed_req == ParsedRequest::new_sync(VmmAction::SetMemoryBackend(same_body))); + } +} diff --git a/src/api_server/src/request/memory_backing_file.rs b/src/api_server/src/request/memory_backing_file.rs deleted file mode 100644 index e29798039bc..00000000000 --- a/src/api_server/src/request/memory_backing_file.rs +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -use super::super::VmmAction; -use crate::parsed_request::{Error, ParsedRequest}; -use crate::request::Body; -use logger::{IncMetric, METRICS}; -use vmm::vmm_config::memory_backing_file::MemoryBackingFileConfig; - -pub(crate) fn parse_put_memory_backing_file(body: &Body) -> Result { - METRICS.put_api_requests.memory_backing_file_cfg_count.inc(); - Ok(ParsedRequest::new_sync(VmmAction::SetMemoryBackingFile( - serde_json::from_slice::(body.raw()).map_err(|e| { - METRICS.put_api_requests.memory_backing_file_cfg_fails.inc(); - Error::SerdeJson(e) - })?, - ))) -} - -#[cfg(test)] -mod tests { - use std::path::PathBuf; - - use super::*; - - #[test] - fn test_parse_memory_backing_file() { - assert!(parse_put_memory_backing_file(&Body::new("invalid_payload")).is_err()); - - let body = r#"{ - "path": "./memory.snap" - }"#; - let same_body = MemoryBackingFileConfig { - path: PathBuf::from("./memory.snap"), - }; - let result = parse_put_memory_backing_file(&Body::new(body)); - assert!(result.is_ok()); - let parsed_req = result.unwrap_or_else(|_e| panic!("Failed test.")); - - assert!(parsed_req == ParsedRequest::new_sync(VmmAction::SetMemoryBackingFile(same_body))); - } -} diff --git a/src/api_server/src/request/mod.rs b/src/api_server/src/request/mod.rs index c2f52110bf0..f58bce5b533 100644 --- a/src/api_server/src/request/mod.rs +++ b/src/api_server/src/request/mod.rs @@ -8,7 +8,7 @@ pub mod drive; pub mod instance_info; pub mod logger; pub mod machine_configuration; -pub mod memory_backing_file; +pub mod memory_backend; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/api_server/swagger/firecracker.yaml b/src/api_server/swagger/firecracker.yaml index 14cae2b7a69..88f0d2697a5 100644 --- a/src/api_server/swagger/firecracker.yaml +++ b/src/api_server/swagger/firecracker.yaml @@ -351,22 +351,22 @@ paths: schema: $ref: "#/definitions/Error" - /memory-backing-file: + /memory-backend: put: - summary: Configures a memory backing file to sync the memory changes to during the runtime of the vm - operationId: putMemoryBackingFile + summary: Configures a memory backend to sync the memory changes from during the runtime of the vm + operationId: putMemoryBackend parameters: - name: body in: body - description: Path to memory backing file + description: The memory backend to use required: true schema: - $ref: "#/definitions/MemoryBackingFile" + $ref: "#/definitions/MemoryBackend" responses: 204: - description: Memory backing file configured + description: Memory backend configured 400: - description: Memory backing file failed + description: Memory backend failed schema: $ref: "#/definitions/Error" default: @@ -1070,14 +1070,6 @@ definitions: tx_rate_limiter: $ref: "#/definitions/RateLimiter" - MemoryBackingFile: - type: object - required: - - path - properties: - path: - type: string - PartialDrive: type: object required: diff --git a/src/cpuid/src/transformer/common.rs b/src/cpuid/src/transformer/common.rs index 59de658fba5..ea2592a07da 100644 --- a/src/cpuid/src/transformer/common.rs +++ b/src/cpuid/src/transformer/common.rs @@ -75,7 +75,7 @@ const KVM_FEATURE_ASYNC_PF_INT_BIT: u32 = 14; pub fn disable_kvm_feature_async_pf( entry: &mut kvm_cpuid_entry2, - vm_spec: &VmSpec, + _vm_spec: &VmSpec, ) -> Result<(), Error> { entry.eax.write_bit(KVM_FEATURE_ASYNC_PF_INT_BIT, false); diff --git a/src/logger/src/metrics.rs b/src/logger/src/metrics.rs index aba286cc4d8..f009004362f 100644 --- a/src/logger/src/metrics.rs +++ b/src/logger/src/metrics.rs @@ -404,9 +404,9 @@ pub struct PutRequestsMetrics { /// Number of failures in configuring the machine. pub machine_cfg_fails: SharedIncMetric, /// Number of PUTs for setting memory backing file. - pub memory_backing_file_cfg_count: SharedIncMetric, + pub memory_backend_cfg_count: SharedIncMetric, /// Number of failures in configuring the machine. - pub memory_backing_file_cfg_fails: SharedIncMetric, + pub memory_backend_cfg_fails: SharedIncMetric, /// Number of PUTs for initializing the metrics system. pub metrics_count: SharedIncMetric, /// Number of failures in initializing the metrics system. diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index c096d8cb132..681faf0f440 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -4,12 +4,18 @@ //! Enables pre-boot setup, instantiation and booting of a Firecracker VMM. use std::convert::TryFrom; +use std::ffi::CString; use std::fmt::{Display, Formatter}; -use std::fs::File; +use std::fs::{File, OpenOptions}; use std::io::{self, Read, Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::net::UnixStream; +use std::os::unix::prelude::FromRawFd; +use std::path::Path; use std::sync::{Arc, Mutex}; -use vm_memory::FileOffset; +use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use utils::sock_ctrl_msg::ScmSocket; +use vm_memory::{FileOffset, GuestMemory, GuestMemoryRegion}; use arch::InitrdConfig; #[cfg(target_arch = "x86_64")] @@ -44,7 +50,7 @@ use crate::construct_kvm_mpidrs; use crate::device_manager::legacy::PortIODeviceManager; use crate::device_manager::mmio::MMIODeviceManager; use crate::device_manager::persist::MMIODevManagerConstructorArgs; -use crate::persist::{MemoryDescriptor, MicrovmState, MicrovmStateError}; +use crate::persist::{GuestRegionUffdMapping, MemoryDescriptor, MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -59,7 +65,7 @@ use crate::{device_manager, mem_size_mib, Error, EventManager, Vmm, VmmEventsObs pub enum StartMicrovmError { /// Unable to attach block device to Vmm. AttachBlockDevice(io::Error), - /// Unable to create the memory backing file. + /// Unable to create/open the memory backing file. BackingMemoryFile(io::Error), /// This error is thrown by the minimal boot loader implementation. ConfigureSystem(arch::Error), @@ -97,6 +103,17 @@ pub enum StartMicrovmError { RestoreMicrovmState(MicrovmStateError), /// Unable to set VmResources. SetVmResources(VmConfigError), + /// Failed to create an UFFD Builder. + CreateUffdBuilder(userfaultfd::Error), + /// Unable to connect to UDS in order to send information regarding + /// handling guest memory page-fault events. + UdsConnection(io::Error), + /// Failed to register guest memory regions to UFFD. + UffdMemoryRegionsRegister(userfaultfd::Error), + /// Failed to send guest memory layout and path to user fault FD used to handle + /// guest memory page faults. This information is sent to a UDS where a custom + /// page-fault handler process is listening. + UffdSend(kvm_ioctls::Error), } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -183,6 +200,12 @@ impl Display for StartMicrovmError { } RestoreMicrovmState(err) => write!(f, "Cannot restore microvm state. Error: {}", err), SetVmResources(err) => write!(f, "Cannot set vm resources. Error: {}", err), + CreateUffdBuilder(err) => write!(f, "Cannot create uffd socket. Error: {}", err), + UdsConnection(err) => write!(f, "Cannot connect to uffd socket. Error: {}", err), + UffdMemoryRegionsRegister(err) => { + write!(f, "Cannot uffd memory region register. Error: {}", err) + } + UffdSend(err) => write!(f, "Cannot send to uffd. Error: {}", err), } } } @@ -336,22 +359,53 @@ pub fn build_microvm_for_boot( let track_dirty_pages = vm_resources.track_dirty_pages(); - let backing_memory_file = if let Some(ref file) = vm_resources.backing_memory_file { - file.set_len((vm_resources.vm_config().mem_size_mib * 1024 * 1024) as u64) - .map_err(|e| { - error!("Failed to set backing memory file size: {}", e); - StartMicrovmError::BackingMemoryFile(e) - })?; + let (guest_memory, memory_descriptor) = + if let Some(ref backend_config) = vm_resources.memory_backend { + match backend_config.backend_type { + crate::vmm_config::snapshot::MemBackendType::File => { + let file = OpenOptions::new() + .read(true) + .write(true) + .open(&backend_config.backend_path) + .map_err(BackingMemoryFile)?; + file.set_len((vm_resources.vm_config().mem_size_mib * 1024 * 1024) as u64) + .map_err(|e| { + error!("Failed to set backing memory file size: {}", e); + StartMicrovmError::BackingMemoryFile(e) + })?; + + let file = Arc::new(file); + + ( + create_guest_memory( + vm_resources.vm_config().mem_size_mib, + Some(file.clone()), + track_dirty_pages, + )?, + Some(MemoryDescriptor::File(file)), + ) + } + crate::vmm_config::snapshot::MemBackendType::Uffd => { + let (mem, uffd) = create_uffd_guest_memory( + vm_resources.vm_config().mem_size_mib, + backend_config.backend_path.as_path(), + track_dirty_pages, + )?; + + (mem, Some(MemoryDescriptor::Uffd(uffd))) + } + } + } else { + ( + create_guest_memory( + vm_resources.vm_config().mem_size_mib, + None, + track_dirty_pages, + )?, + None, + ) + }; - Some(file.clone()) - } else { - None - }; - let guest_memory = create_guest_memory( - vm_resources.vm_config().mem_size_mib, - backing_memory_file.clone(), - track_dirty_pages, - )?; let vcpu_config = vm_resources.vcpu_config(); let entry_addr = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; @@ -383,7 +437,7 @@ pub fn build_microvm_for_boot( instance_info, event_manager, guest_memory, - backing_memory_file.map(MemoryDescriptor::File), + memory_descriptor, track_dirty_pages, vcpu_config.vcpu_count, )?; @@ -626,6 +680,111 @@ pub fn create_guest_memory( .map_err(StartMicrovmError::GuestMemoryMmap) } +/// Creates GuestMemory of `mem_size_mib` MiB in size. +pub fn create_uffd_guest_memory( + mem_size_mib: usize, + uds_socket_path: &Path, + track_dirty_pages: bool, +) -> std::result::Result<(GuestMemoryMmap, Uffd), StartMicrovmError> { + use StartMicrovmError::{ + BackingMemoryFile, CreateUffdBuilder, UdsConnection, UffdMemoryRegionsRegister, UffdSend, + }; + let mem_size = mem_size_mib << 20; + let arch_mem_regions = arch::arch_memory_regions(mem_size); + + let backing_memory_file = unsafe { + let mem_name = CString::new("vm-memory").expect("CString::new failed"); + Arc::new(File::from_raw_fd(libc::memfd_create(mem_name.as_ptr(), 0))) + }; + backing_memory_file + .set_len(mem_size as u64) + .map_err(BackingMemoryFile)?; + + let mut offset = 0_u64; + let guest_memory = vm_memory::create_guest_memory( + &arch_mem_regions + .iter() + .map(|(addr, size)| { + let file_offset = Some(FileOffset::from_arc(backing_memory_file.clone(), offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) + .collect::>()[..], + track_dirty_pages, + ) + .map_err(StartMicrovmError::GuestMemoryMmap)?; + + let mut uffd_builder = UffdBuilder::new(); + uffd_builder.require_features(FeatureFlags::EVENT_REMOVE | FeatureFlags::PAGEFAULT_FLAG_WP); + + let uffd = uffd_builder + .close_on_exec(true) + .non_blocking(true) + .create() + .map_err(CreateUffdBuilder)?; + + let mut backend_mappings = Vec::with_capacity(guest_memory.num_regions()); + let mut offset = 0; + for mem_region in guest_memory.iter() { + let host_base_addr = mem_region.as_ptr(); + let size = mem_region.size(); + + uffd.register(host_base_addr as _, size as _) + .map_err(UffdMemoryRegionsRegister)?; + backend_mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: host_base_addr as u64, + size, + offset, + }); + offset += mem_region.len(); + } + + // This is safe to unwrap() because we control the contents of the vector + // (i.e GuestRegionUffdMapping entries). + let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); + + let socket = UnixStream::connect(uds_socket_path).map_err(UdsConnection)?; + socket + .send_with_fd( + backend_mappings.as_bytes(), + // In the happy case we can close the fd since the other process has it open and is + // using it to serve us pages. + // + // The problem is that if other process crashes/exits, firecracker guest memory + // will simply revert to anon-mem behavior which would lead to silent errors and + // undefined behavior. + // + // To tackle this scenario, the page fault handler can notify Firecracker of any + // crashes/exits. There is no need for Firecracker to explicitly send its process ID. + // The external process can obtain Firecracker's PID by calling `getsockopt` with + // `libc::SO_PEERCRED` option like so: + // + // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; + // let mut ucred_size: u32 = mem::size_of::() as u32; + // libc::getsockopt( + // socket.as_raw_fd(), + // libc::SOL_SOCKET, + // libc::SO_PEERCRED, + // &mut val as *mut _ as *mut _, + // &mut ucred_size as *mut libc::socklen_t, + // ); + // + // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, + // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process + // connected to this socket. The returned credentials are those that were in effect + // at the time of the `connect` call. + // + // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the + // page fault handler process does not tear down Firecracker when necessary, the + // uffd will still be alive but with no one to serve faults, leading to guest freeze. + uffd.as_raw_fd(), + ) + .map_err(UffdSend)?; + + Ok((guest_memory, uffd)) +} + fn load_kernel( boot_config: &BootConfig, guest_memory: &GuestMemoryMmap, diff --git a/src/vmm/src/memory_snapshot.rs b/src/vmm/src/memory_snapshot.rs index 9eebe371da0..8e5293bcfcd 100644 --- a/src/vmm/src/memory_snapshot.rs +++ b/src/vmm/src/memory_snapshot.rs @@ -6,7 +6,9 @@ use std::fmt::{Display, Formatter}; use std::fs::File; use std::io::SeekFrom; +use std::time::Instant; +use libc::{MAP_SHARED, PROT_WRITE}; use utils::{errno, get_page_size}; use versionize::{VersionMap, Versionize, VersionizeResult}; use versionize_derive::Versionize; @@ -126,7 +128,11 @@ impl SnapshotMemory for GuestMemoryMmap { let mut writer_offset = 0; let page_size = get_page_size()?; - self.iter() + let start = Instant::now(); + let mut total_written = 0; + + let res = self + .iter() .enumerate() .try_for_each(|(slot, region)| { let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); @@ -150,23 +156,37 @@ impl SnapshotMemory for GuestMemoryMmap { } write_size += page_size; } else if write_size > 0 { + let start = Instant::now(); // We are at the end of a batch of dirty pages. region.write_all_to( MemoryRegionAddress(dirty_batch_start), writer, write_size, )?; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + total_written += write_size; write_size = 0; } } } if write_size > 0 { + let start = Instant::now(); region.write_all_to( MemoryRegionAddress(dirty_batch_start), writer, write_size, )?; + total_written += write_size; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); } writer_offset += region.len(); if let Some(bitmap) = firecracker_bitmap { @@ -175,7 +195,15 @@ impl SnapshotMemory for GuestMemoryMmap { Ok(()) }) - .map_err(Error::WriteMemory) + .map_err(Error::WriteMemory); + + eprintln!( + "total write time: {}ms, total written: {}B", + start.elapsed().as_millis(), + total_written + ); + + res } /// Creates a GuestMemoryMmap backed by a `file` if present, otherwise backed @@ -199,6 +227,117 @@ impl SnapshotMemory for GuestMemoryMmap { } } +/// Dumps all pages of GuestMemoryMmap present in `dirty_bitmap` to a writer. +pub fn mem_dump_dirty( + mem_map: &GuestMemoryMmap, + fd: i32, + len: usize, + dirty_bitmap: &DirtyBitmap, +) -> std::result::Result<(), Error> { + let mut writer_offset = 0_u64; + let page_size = get_page_size()?; + + let start = Instant::now(); + let mut total_written = 0; + + let source_map = + unsafe { libc::mmap(std::ptr::null_mut(), len, PROT_WRITE, MAP_SHARED, fd, 0) }; + + let res = mem_map + .iter() + .enumerate() + .try_for_each(|(slot, region)| { + let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + let firecracker_bitmap = region.bitmap(); + let mut write_size = 0; + let mut dirty_batch_start: u64 = 0; + + let mmap_base = region.get_host_address(MemoryRegionAddress(0)).unwrap(); + for (i, v) in kvm_bitmap.iter().enumerate() { + for j in 0..64 { + let is_kvm_page_dirty = ((v >> j) & 1u64) != 0u64; + let page_offset = ((i * 64) + j) * page_size; + let is_firecracker_page_dirty = firecracker_bitmap.dirty_at(page_offset); + if is_kvm_page_dirty || is_firecracker_page_dirty { + // We are at the start of a new batch of dirty pages. + if write_size == 0 { + // Seek forward over the unmodified pages. + dirty_batch_start = page_offset as u64; + } + write_size += page_size; + } else if write_size > 0 { + let start = Instant::now(); + + eprintln!( + "starting write of {}B (source {}, dest {})", + write_size, + dirty_batch_start, + writer_offset + dirty_batch_start + ); + unsafe { + std::ptr::copy_nonoverlapping( + mmap_base.offset((dirty_batch_start) as isize), + source_map.offset((writer_offset + dirty_batch_start) as isize) + as *mut u8, + write_size, + ); + } + + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + total_written += write_size; + write_size = 0; + } + } + } + + if write_size > 0 { + let start = Instant::now(); + + eprintln!( + "starting final write of {}B (source {}, dest {}) (total_size: {})", + write_size, + dirty_batch_start, + writer_offset + dirty_batch_start, + len + ); + unsafe { + std::ptr::copy_nonoverlapping( + mmap_base.offset((dirty_batch_start) as isize), + source_map.offset((writer_offset + dirty_batch_start) as isize) as *mut u8, + write_size, + ); + } + total_written += write_size; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + } + writer_offset += region.len(); + if let Some(bitmap) = firecracker_bitmap { + bitmap.reset(); + } + + Ok(()) + }) + .map_err(Error::WriteMemory); + + eprintln!( + "total write time: {}ms, total written: {}B", + start.elapsed().as_millis(), + total_written + ); + + eprintln!("memfd {}, len {}", fd, len); + + res +} + #[cfg(test)] mod tests { use std::collections::HashMap; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 0ff5ca1f9da..8ee7cb107f9 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -3,19 +3,23 @@ //! Defines state structures for saving/restoring a Firecracker microVM. +use std::ffi::CString; use std::fmt::{Display, Formatter}; use std::fs::{File, OpenOptions}; use std::io::{self, Write}; use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; +use std::os::unix::prelude::FromRawFd; use std::path::Path; use std::sync::{Arc, Mutex}; +use std::time::Instant; #[cfg(target_arch = "aarch64")] use arch::regs::{get_manufacturer_id_from_host, get_manufacturer_id_from_state}; #[cfg(target_arch = "x86_64")] use cpuid::common::{get_vendor_id_from_cpuid, get_vendor_id_from_host}; use devices::virtio::TYPE_NET; +use libc::memfd_create; use logger::{error, info}; use seccompiler::BpfThreadMap; use serde::Serialize; @@ -29,7 +33,7 @@ use vm_memory::{GuestMemory, GuestMemoryMmap}; use crate::builder::{self, StartMicrovmError}; use crate::device_manager::persist::{DeviceStates, Error as DevicePersistError}; -use crate::memory_snapshot::{GuestMemoryState, SnapshotMemory}; +use crate::memory_snapshot::{mem_dump_dirty, GuestMemoryState, SnapshotMemory}; use crate::resources::VmResources; #[cfg(target_arch = "x86_64")] use crate::version_map::FC_V0_23_SNAP_VERSION; @@ -325,52 +329,84 @@ fn snapshot_memory_to_file( snapshot_type: &SnapshotType, ) -> std::result::Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; - if OpenOptions::new().read(true).open(mem_file_path).is_ok() - && snapshot_type == &SnapshotType::Diff - { - // // The memory file already exists. - // // We're going to use the msync behaviour - // for region in vmm.guest_memory().iter() { - // info!("msyncing memory region"); - // unsafe { - // if libc::msync(region.as_ptr() as _, region.len() as _, libc::MS_SYNC) == -1 { - // return Err(CreateSnapshotError::Memory( - // memory_snapshot::Error::CreateRegion(vm_memory::MmapRegionError::Mmap( - // std::io::Error::last_os_error(), - // )), - // )); - // } - // }; - // } - - return Ok(()); - } + // if OpenOptions::new().read(true).open(mem_file_path).is_ok() + // && snapshot_type == &SnapshotType::Diff + // { + // // The memory file already exists. + // // We're going to use the msync behaviour + // for region in vmm.guest_memory().iter() { + // info!("msyncing memory region"); + // unsafe { + // if libc::msync(region.as_ptr() as _, region.len() as _, libc::MS_SYNC) == -1 { + // return Err(CreateSnapshotError::Memory( + // memory_snapshot::Error::CreateRegion(vm_memory::MmapRegionError::Mmap( + // std::io::Error::last_os_error(), + // )), + // )); + // } + // }; + // } + + // return Ok(()); + // } + + eprintln!("snapshot_memory_to_file"); + eprintln!("{}", mem_file_path.to_string_lossy()); + let start = Instant::now(); + let mut file = if mem_file_path.to_string_lossy() == "memfd" { + let fd = unsafe { + let memfd_name = CString::new("diff").unwrap(); + memfd_create(memfd_name.as_ptr(), 0) + }; + if fd == -1 { + return Err(MemoryBackingFile( + "memfd_create", + std::io::Error::last_os_error(), + )); + } - let mut file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(mem_file_path) - .map_err(|err| MemoryBackingFile("open", err))?; + unsafe { File::from_raw_fd(fd) } + } else { + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(mem_file_path) + .map_err(|err| MemoryBackingFile("open", err))? + }; // Set the length of the file to the full size of the memory area. let mem_size_mib = mem_size_mib(vmm.guest_memory()); + // // Set the length of the file to the full size of the memory area. + // memory_file + // .set_len((mem_size_mib * 1024 * 1024) as u64) + // .map_err(|err| MemoryBackingFile("set_length", err))?; + file.set_len((mem_size_mib * 1024 * 1024) as u64) .map_err(|err| MemoryBackingFile("set_length", err))?; match snapshot_type { SnapshotType::Diff => { let dirty_bitmap = vmm.get_dirty_bitmap().map_err(DirtyBitmap)?; - vmm.guest_memory() - .dump_dirty(&mut file, &dirty_bitmap) - .map_err(Memory) + + mem_dump_dirty( + vmm.guest_memory(), + file.as_raw_fd(), + (mem_size_mib * 1024 * 1024) as usize, + &dirty_bitmap, + ) + .map_err(Memory) + + // vmm.guest_memory() + // .dump_dirty(&mut memory_file, &dirty_bitmap) + // .map_err(Memory) } SnapshotType::Full => vmm.guest_memory().dump(&mut file).map_err(Memory), }?; - file.flush() - .map_err(|err| MemoryBackingFile("flush", err))?; - file.sync_all() - .map_err(|err| MemoryBackingFile("sync_all", err)) + + eprintln!("snapshot_memory_to_file took {:?}", start.elapsed()); + + Ok(()) } /// Validate the microVM version and translate it to its corresponding snapshot data format. @@ -596,25 +632,41 @@ fn guest_memory_from_file( )) } -fn guest_memory_from_uffd( +pub(crate) fn guest_memory_from_uffd( mem_uds_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, enable_balloon: bool, ) -> std::result::Result<(GuestMemoryMmap, Option), LoadSnapshotError> { use self::LoadSnapshotError::{ - CreateUffdBuilder, DeserializeMemory, UdsConnection, UffdMemoryRegionsRegister, UffdSend, + CreateUffdBuilder, DeserializeMemory, MemoryBackingFile, UdsConnection, + UffdMemoryRegionsRegister, UffdSend, }; + let backing_memory_file = unsafe { + let mem_name = CString::new("vm-memory").expect("CString::new failed"); + File::from_raw_fd(libc::memfd_create(mem_name.as_ptr(), 0)) + }; + backing_memory_file + .set_len( + mem_state + .regions + .iter() + .map(|region| region.size) + .sum::() as u64, + ) + .map_err(MemoryBackingFile)?; + let guest_memory = - GuestMemoryMmap::restore(None, mem_state, track_dirty_pages).map_err(DeserializeMemory)?; + GuestMemoryMmap::restore(Some(&backing_memory_file), mem_state, track_dirty_pages) + .map_err(DeserializeMemory)?; let mut uffd_builder = UffdBuilder::new(); if enable_balloon { // We enable this so that the page fault handler can add logic // for treating madvise(MADV_DONTNEED) events triggerd by balloon inflation. - uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + uffd_builder.require_features(FeatureFlags::EVENT_REMOVE | FeatureFlags::PAGEFAULT_FLAG_WP); } let uffd = uffd_builder diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index fb9c3b11e61..94c4c7d05fa 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::From; -use std::fs::File; use std::sync::{Arc, Mutex, MutexGuard}; use logger::info; @@ -21,6 +20,7 @@ use crate::vmm_config::machine_config::{VmConfig, VmConfigError, VmUpdateConfig} use crate::vmm_config::metrics::{init_metrics, MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; +use crate::vmm_config::snapshot::MemBackendConfig; use crate::vmm_config::vsock::*; use crate::vstate::vcpu::VcpuConfig; @@ -118,8 +118,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, - /// When backed by a memory file, this should be set - pub backing_memory_file: Option>, + /// When backed by a memory on boot, this should be set + pub memory_backend: Option, } impl VmResources { @@ -240,13 +240,13 @@ impl VmResources { } /// Returns the config for the backing memory file - pub fn backing_memory_file(&self) -> Option> { - self.backing_memory_file.clone() + pub fn memory_backend(&self) -> Option { + self.memory_backend.clone() } /// Sets the backing memory file - pub fn set_backing_memory_file(&mut self, backing_mem_file: Arc) { - self.backing_memory_file.get_or_insert(backing_mem_file); + pub fn set_memory_backend(&mut self, backing_mem_file: MemBackendConfig) { + self.memory_backend.get_or_insert(backing_mem_file); } /// Returns the VmConfig. @@ -588,7 +588,7 @@ mod tests { mmds: None, boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, - backing_memory_file: None, + memory_backend: None, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 87f9f2aae2a..4c263c048b7 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt::{Display, Formatter}; +use std::result; use std::sync::{Arc, Mutex, MutexGuard}; -use std::{fs, result}; use logger::*; use mmds::data_store::{self, Mmds}; @@ -34,13 +34,14 @@ use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, Drive use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::logger::{LoggerConfig, LoggerConfigError}; use crate::vmm_config::machine_config::{VmConfig, VmConfigError, VmUpdateConfig}; -use crate::vmm_config::memory_backing_file::{MemoryBackingFileConfig, MemoryBackingFileError}; use crate::vmm_config::metrics::{MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::{ NetworkInterfaceConfig, NetworkInterfaceError, NetworkInterfaceUpdateConfig, }; -use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, SnapshotType}; +use crate::vmm_config::snapshot::{ + CreateSnapshotParams, LoadSnapshotParams, MemBackendConfig, SnapshotType, +}; use crate::vmm_config::vsock::{VsockConfigError, VsockDeviceConfig}; use crate::vmm_config::{self, RateLimiterUpdate}; use crate::{EventManager, FcExitCode}; @@ -100,9 +101,9 @@ pub enum VmmAction { /// `BalloonDeviceConfig` as input. This action can only be called before the microVM /// has booted. SetBalloonDevice(BalloonDeviceConfig), - /// Set the memory backing file for the VM. The VM will use this backing file to store its + /// Set the memory backend for the VM. The VM will use this backend to handle its /// memory. This action can only be called before the microVM has booted. - SetMemoryBackingFile(MemoryBackingFileConfig), + SetMemoryBackend(MemBackendConfig), /// Set the MMDS configuration. SetMmdsConfiguration(MmdsConfig), /// Set the vsock device or update the one that already exists using the @@ -134,8 +135,6 @@ pub enum VmmAction { pub enum VmmActionError { /// The action `SetBalloonDevice` failed because of bad user input. BalloonConfig(BalloonConfigError), - /// The action `SetMemoryBackingFile` failed because of bad user input. - MemoryBackingFile(MemoryBackingFileError), /// The action `ConfigureBootSource` failed because of bad user input. BootSource(BootSourceConfigError), /// The action `CreateSnapshot` failed. @@ -188,7 +187,6 @@ impl Display for VmmActionError { match self { BalloonConfig(err) => err.to_string(), BootSource(err) => err.to_string(), - MemoryBackingFile(err) => err.to_string(), CreateSnapshot(err) => err.to_string(), DriveConfig(err) => err.to_string(), InternalVmm(err) => format!("Internal Vmm error: {}", err), @@ -429,7 +427,7 @@ impl<'a> PrebootApiController<'a> { SetBalloonDevice(config) => self.set_balloon_device(config), SetVsockDevice(config) => self.set_vsock_device(config), SetMmdsConfiguration(config) => self.set_mmds_config(config), - SetMemoryBackingFile(config) => self.set_backing_memory_file(config), + SetMemoryBackend(config) => self.set_memory_backend(config), StartMicroVm => self.start_microvm(), UpdateVmConfiguration(config) => self.update_vm_config(config), // Operations not allowed pre-boot. @@ -455,19 +453,9 @@ impl<'a> PrebootApiController<'a> { .map_err(VmmActionError::BalloonConfig) } - fn set_backing_memory_file(&mut self, cfg: MemoryBackingFileConfig) -> ActionResult { + fn set_memory_backend(&mut self, cfg: MemBackendConfig) -> ActionResult { self.boot_path = true; - let file = fs::OpenOptions::new() - .create(true) - .read(true) - .write(true) - .open(cfg.path) - .map(Arc::new) - .map_err(|e| { - VmmActionError::MemoryBackingFile(MemoryBackingFileError::CreateFile(e)) - })?; - - self.vm_resources.backing_memory_file = Some(file); + self.vm_resources.memory_backend = Some(cfg); Ok(VmmData::Empty) } @@ -679,7 +667,7 @@ impl RuntimeApiController { | InsertNetworkDevice(_) | LoadSnapshot(_) | SetBalloonDevice(_) - | SetMemoryBackingFile(_) + | SetMemoryBackend(_) | SetVsockDevice(_) | SetMmdsConfiguration(_) | StartMicroVm @@ -888,7 +876,7 @@ mod tests { pub boot_timer: bool, // when `true`, all self methods are forced to fail pub force_errors: bool, - pub backing_memory_file: Option>, + pub memory_backend: Option, } impl MockVmRes { diff --git a/src/vmm/src/vmm_config/memory_backing_file.rs b/src/vmm/src/vmm_config/memory_backing_file.rs deleted file mode 100644 index f5f5bd9ce6f..00000000000 --- a/src/vmm/src/vmm_config/memory_backing_file.rs +++ /dev/null @@ -1,31 +0,0 @@ -use std::{ - fmt::{Display, Formatter}, - io, - path::PathBuf, -}; - -use serde::{Deserialize, Serialize}; - -/// Keeps the Memory Backing file configuration. -#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] -#[serde(deny_unknown_fields)] -pub struct MemoryBackingFileConfig { - /// Location of the memory backing file. - pub path: PathBuf, -} - -/// Errors associated with the operations allowed on a memory backing file. -#[derive(Debug)] -pub enum MemoryBackingFileError { - /// Failed to create the block device - CreateFile(io::Error), -} - -impl Display for MemoryBackingFileError { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - use self::MemoryBackingFileError::*; - match self { - CreateFile(e) => write!(f, "Unable to create the memory backing file: {}", e), - } - } -} diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index 61b235a0fbb..c509fce888f 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -23,8 +23,6 @@ pub mod instance_info; pub mod logger; /// Wrapper for configuring the memory and CPU of the microVM. pub mod machine_config; -/// Wrapper for configuring the backing memory file. -pub mod memory_backing_file; /// Wrapper for configuring the metrics. pub mod metrics; /// Wrapper for configuring the MMDS. diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index c8634b66980..d06a92f7d3a 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -28,7 +28,7 @@ impl Default for SnapshotType { /// 1) A file that contains the guest memory to be loaded, /// 2) An UDS where a custom page-fault handler process is listening for /// the UFFD set up by Firecracker to handle its guest memory page faults. -#[derive(Debug, Deserialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, PartialEq)] pub enum MemBackendType { /// Guest memory contents will be loaded from a file. File, @@ -91,7 +91,7 @@ pub struct LoadSnapshotConfig { } /// Stores the configuration used for managing snapshot memory. -#[derive(Debug, Deserialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, PartialEq)] #[serde(deny_unknown_fields)] pub struct MemBackendConfig { /// Path to the backend used to handle the guest memory. From 182554dff4a83e9c9b61f586e532d888bea7e64a Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Fri, 4 Nov 2022 14:04:24 +0100 Subject: [PATCH 05/11] changes --- .cargo/config | 4 +- Cargo.lock | 46 +++++------ src/cpuid/src/transformer/intel.rs | 4 +- src/jailer/src/env.rs | 5 +- src/vmm/Cargo.toml | 10 ++- src/vmm/src/builder.rs | 73 +++++++++++------- src/vmm/src/persist.rs | 118 ++++++++++------------------- tests/host_tools/uffd/Cargo.toml | 2 +- tools/devtool | 1 + 9 files changed, 117 insertions(+), 146 deletions(-) diff --git a/.cargo/config b/.cargo/config index ef7728c6c98..0678c0723c6 100644 --- a/.cargo/config +++ b/.cargo/config @@ -1,6 +1,6 @@ [build] -target = "x86_64-unknown-linux-musl" -target-dir = "build/cargo_target" +# target = "x86_64-unknown-linux-musl" +# target-dir = "build/cargo_target" [net] git-fetch-with-cli = true diff --git a/Cargo.lock b/Cargo.lock index 338b2b1a74e..4ccc0cc02a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,7 +17,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cipher", "cpufeatures", "opaque-debug", @@ -111,9 +111,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.59.2" +version = "0.60.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" dependencies = [ "bitflags", "cexpr", @@ -182,12 +182,6 @@ dependencies = [ "nom", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.0" @@ -292,7 +286,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -302,7 +296,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] @@ -313,7 +307,7 @@ version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", "lazy_static", "memoffset", @@ -326,7 +320,7 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "lazy_static", ] @@ -464,7 +458,7 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "wasi", ] @@ -595,7 +589,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "winapi", ] @@ -614,7 +608,7 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -691,7 +685,7 @@ checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" dependencies = [ "bitflags", "cc", - "cfg-if 1.0.0", + "cfg-if", "libc", "memoffset", ] @@ -778,7 +772,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8419d2b623c7c0896ff2d5d96e2cb4ede590fed28fcc34934f4c33c036e620a1" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cpufeatures", "opaque-debug", "universal-hash", @@ -1142,12 +1136,11 @@ dependencies = [ [[package]] name = "userfaultfd" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b738009e099b4ded1ecf19dfb7631f69c24f16e0af6d29fd9b3f54a092aca46" +version = "0.5.0" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5#3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5" dependencies = [ "bitflags", - "cfg-if 1.0.0", + "cfg-if", "libc", "nix", "thiserror", @@ -1156,13 +1149,12 @@ dependencies = [ [[package]] name = "userfaultfd-sys" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a4be003c705d2c8dc1234d473856945e291bb998ac2e2d83e70328d964d7458" +version = "0.4.2" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5#3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5" dependencies = [ "bindgen", "cc", - "cfg-if 0.1.10", + "cfg-if", ] [[package]] @@ -1321,7 +1313,7 @@ version = "0.2.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "wasm-bindgen-macro", ] diff --git a/src/cpuid/src/transformer/intel.rs b/src/cpuid/src/transformer/intel.rs index 439058c8780..5272ac4437b 100644 --- a/src/cpuid/src/transformer/intel.rs +++ b/src/cpuid/src/transformer/intel.rs @@ -126,8 +126,8 @@ impl CpuidTransformer for IntelCpuidTransformer { leaf_0xa::LEAF_NUM => Some(intel::update_perf_mon_entry), leaf_0xb::LEAF_NUM => Some(intel::update_extended_topology_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), - // hypervisor stuff - 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), + // // hypervisor stuff + // 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } diff --git a/src/jailer/src/env.rs b/src/jailer/src/env.rs index 405b74e4337..bd8c39219b2 100644 --- a/src/jailer/src/env.rs +++ b/src/jailer/src/env.rs @@ -371,9 +371,8 @@ impl Env { // a new PathBuf, with something like chroot_dir.join(exec_file_name) ?! self.chroot_dir.push(exec_file_name); - // TODO: hard link instead of copy? This would save up disk space, but hard linking is - // not always possible :( - fs::copy(&self.exec_file_path, &self.chroot_dir).map_err(|err| { + // We hard link instead of copy for space savings and to retain the capabilities + fs::hard_link(&self.exec_file_path, &self.chroot_dir).map_err(|err| { Error::Copy(self.exec_file_path.clone(), self.chroot_dir.clone(), err) })?; diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 68a64a4d23c..50a5bd78232 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -14,12 +14,16 @@ libc = ">=0.2.39" linux-loader = ">=0.4.0" serde = { version = ">=1.0.27", features = ["derive"] } serde_json = ">=1.0.9" -userfaultfd = ">=0.4.0" +userfaultfd = { git = "https://github.com/codesandbox/userfaultfd-rs.git", rev = "3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5", features = [ + "linux5_7", +] } versionize = ">=0.1.6" versionize_derive = ">=0.1.3" vm-superio = ">=0.4.0" vm-allocator = "0.1.0" -derive_more = { version = "0.99.17", default-features = false, features = ["from"] } +derive_more = { version = "0.99.17", default-features = false, features = [ + "from", +] } arch = { path = "../arch" } devices = { path = "../devices" } @@ -27,7 +31,7 @@ logger = { path = "../logger" } mmds = { path = "../mmds" } rate_limiter = { path = "../rate_limiter" } seccompiler = { path = "../seccompiler" } -snapshot = { path = "../snapshot"} +snapshot = { path = "../snapshot" } utils = { path = "../utils" } virtio_gen = { path = "../virtio_gen" } vm-memory = { path = "../vm-memory" } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 681faf0f440..a83c6baeadc 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -13,9 +13,9 @@ use std::os::unix::net::UnixStream; use std::os::unix::prelude::FromRawFd; use std::path::Path; use std::sync::{Arc, Mutex}; -use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use userfaultfd::{FeatureFlags, RegisterMode, Uffd, UffdBuilder}; use utils::sock_ctrl_msg::ScmSocket; -use vm_memory::{FileOffset, GuestMemory, GuestMemoryRegion}; +use vm_memory::{FileOffset, GuestMemory}; use arch::InitrdConfig; #[cfg(target_arch = "x86_64")] @@ -114,6 +114,9 @@ pub enum StartMicrovmError { /// guest memory page faults. This information is sent to a UDS where a custom /// page-fault handler process is listening. UffdSend(kvm_ioctls::Error), + + /// Failed to get the memfd from the uffd socket + NoMemFdReceived, } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -206,6 +209,7 @@ impl Display for StartMicrovmError { write!(f, "Cannot uffd memory region register. Error: {}", err) } UffdSend(err) => write!(f, "Cannot send to uffd. Error: {}", err), + NoMemFdReceived => write!(f, "No memfd received from uffd."), } } } @@ -359,13 +363,14 @@ pub fn build_microvm_for_boot( let track_dirty_pages = vm_resources.track_dirty_pages(); - let (guest_memory, memory_descriptor) = + let (guest_memory, memory_descriptor, _file) = if let Some(ref backend_config) = vm_resources.memory_backend { match backend_config.backend_type { crate::vmm_config::snapshot::MemBackendType::File => { let file = OpenOptions::new() .read(true) .write(true) + .create(true) .open(&backend_config.backend_path) .map_err(BackingMemoryFile)?; file.set_len((vm_resources.vm_config().mem_size_mib * 1024 * 1024) as u64) @@ -383,16 +388,17 @@ pub fn build_microvm_for_boot( track_dirty_pages, )?, Some(MemoryDescriptor::File(file)), + None, ) } crate::vmm_config::snapshot::MemBackendType::Uffd => { - let (mem, uffd) = create_uffd_guest_memory( + let (mem, uffd, file) = create_uffd_guest_memory( vm_resources.vm_config().mem_size_mib, backend_config.backend_path.as_path(), track_dirty_pages, )?; - (mem, Some(MemoryDescriptor::Uffd(uffd))) + (mem, Some(MemoryDescriptor::Uffd(uffd)), Some(file)) } } } else { @@ -403,6 +409,7 @@ pub fn build_microvm_for_boot( track_dirty_pages, )?, None, + None, ) }; @@ -685,20 +692,21 @@ pub fn create_uffd_guest_memory( mem_size_mib: usize, uds_socket_path: &Path, track_dirty_pages: bool, -) -> std::result::Result<(GuestMemoryMmap, Uffd), StartMicrovmError> { - use StartMicrovmError::{ - BackingMemoryFile, CreateUffdBuilder, UdsConnection, UffdMemoryRegionsRegister, UffdSend, - }; +) -> std::result::Result<(GuestMemoryMmap, Uffd, Arc), StartMicrovmError> { + use StartMicrovmError::{CreateUffdBuilder, NoMemFdReceived, UdsConnection, UffdSend}; + + let mut socket = UnixStream::connect(uds_socket_path).map_err(UdsConnection)?; + + let mut buf = [0u8; 8]; + let (_, memfd) = socket.recv_with_fd(&mut buf).map_err(UffdSend)?; + + if memfd.is_none() { + return Err(NoMemFdReceived); + } + let mem_size = mem_size_mib << 20; let arch_mem_regions = arch::arch_memory_regions(mem_size); - - let backing_memory_file = unsafe { - let mem_name = CString::new("vm-memory").expect("CString::new failed"); - Arc::new(File::from_raw_fd(libc::memfd_create(mem_name.as_ptr(), 0))) - }; - backing_memory_file - .set_len(mem_size as u64) - .map_err(BackingMemoryFile)?; + let backing_memory_file = Arc::new(memfd.unwrap()); let mut offset = 0_u64; let guest_memory = vm_memory::create_guest_memory( @@ -715,12 +723,17 @@ pub fn create_uffd_guest_memory( ) .map_err(StartMicrovmError::GuestMemoryMmap)?; - let mut uffd_builder = UffdBuilder::new(); - uffd_builder.require_features(FeatureFlags::EVENT_REMOVE | FeatureFlags::PAGEFAULT_FLAG_WP); - - let uffd = uffd_builder - .close_on_exec(true) - .non_blocking(true) + let uffd = UffdBuilder::new() + .require_features( + FeatureFlags::EVENT_REMOVE + | FeatureFlags::EVENT_REMAP + | FeatureFlags::EVENT_FORK + | FeatureFlags::EVENT_UNMAP + | FeatureFlags::MISSING_SHMEM + | FeatureFlags::MINOR_SHMEM + | FeatureFlags::PAGEFAULT_FLAG_WP, + ) + .user_mode_only(false) .create() .map_err(CreateUffdBuilder)?; @@ -730,21 +743,18 @@ pub fn create_uffd_guest_memory( let host_base_addr = mem_region.as_ptr(); let size = mem_region.size(); - uffd.register(host_base_addr as _, size as _) - .map_err(UffdMemoryRegionsRegister)?; backend_mappings.push(GuestRegionUffdMapping { base_host_virt_addr: host_base_addr as u64, size, offset, }); - offset += mem_region.len(); + offset += size as u64; } // This is safe to unwrap() because we control the contents of the vector // (i.e GuestRegionUffdMapping entries). let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); - let socket = UnixStream::connect(uds_socket_path).map_err(UdsConnection)?; socket .send_with_fd( backend_mappings.as_bytes(), @@ -782,7 +792,12 @@ pub fn create_uffd_guest_memory( ) .map_err(UffdSend)?; - Ok((guest_memory, uffd)) + // Wait for UFFD to be ready. + // TODO: maybe add a timeout? + let mut buf = [0; 2]; + socket.read_exact(&mut buf).map_err(UdsConnection)?; + + Ok((guest_memory, uffd, backing_memory_file)) } fn load_kernel( @@ -883,7 +898,7 @@ pub(crate) fn setup_kvm_vm( .map_err(Error::KvmContext) .map_err(Internal)?; let mut vm = Vm::new(kvm.fd()).map_err(Error::Vm).map_err(Internal)?; - vm.memory_init(&guest_memory, kvm.max_memslots(), track_dirty_pages) + vm.memory_init(guest_memory, kvm.max_memslots(), track_dirty_pages) .map_err(Error::Vm) .map_err(Internal)?; Ok(vm) diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 8ee7cb107f9..36667c0d578 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -6,7 +6,7 @@ use std::ffi::CString; use std::fmt::{Display, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{self, Write}; +use std::io::{self, Read, Write}; use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; use std::os::unix::prelude::FromRawFd; @@ -228,6 +228,8 @@ pub enum LoadSnapshotError { /// Unable to connect to UDS in order to send information regarding /// handling guest memory page-fault events. UdsConnection(io::Error), + /// We didn't get the memfd when handshaking with the uffd manager + NoMemFdReceived, /// Failed to register guest memory regions to UFFD. UffdMemoryRegionsRegister(userfaultfd::Error), /// Failed to send guest memory layout and path to user fault FD used to handle @@ -249,6 +251,7 @@ impl Display for LoadSnapshotError { } InvalidSnapshot(err) => write!(f, "Snapshot sanity check failed: {}", err), MemoryBackingFile(err) => write!(f, "Cannot open the memory file: {}", err), + NoMemFdReceived => write!(f, "No memory file descriptor received"), ResumeMicroVm(err) => write!( f, "Failed to resume microVM after loading snapshot: {}", @@ -280,7 +283,7 @@ pub fn create_snapshot( version_map: VersionMap, ) -> std::result::Result<(), CreateSnapshotError> { // Fail early from invalid target version. - let snapshot_data_version = get_snapshot_data_version(¶ms.version, &version_map, &vmm)?; + let snapshot_data_version = get_snapshot_data_version(¶ms.version, &version_map, vmm)?; let microvm_state = vmm .save_state() @@ -293,7 +296,9 @@ pub fn create_snapshot( version_map, )?; - snapshot_memory_to_file(vmm, ¶ms.mem_file_path, ¶ms.snapshot_type)?; + if params.snapshot_type == SnapshotType::Full { + snapshot_memory_to_file(vmm, ¶ms.mem_file_path, ¶ms.snapshot_type)?; + } Ok(()) } @@ -329,30 +334,7 @@ fn snapshot_memory_to_file( snapshot_type: &SnapshotType, ) -> std::result::Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; - // if OpenOptions::new().read(true).open(mem_file_path).is_ok() - // && snapshot_type == &SnapshotType::Diff - // { - // // The memory file already exists. - // // We're going to use the msync behaviour - // for region in vmm.guest_memory().iter() { - // info!("msyncing memory region"); - // unsafe { - // if libc::msync(region.as_ptr() as _, region.len() as _, libc::MS_SYNC) == -1 { - // return Err(CreateSnapshotError::Memory( - // memory_snapshot::Error::CreateRegion(vm_memory::MmapRegionError::Mmap( - // std::io::Error::last_os_error(), - // )), - // )); - // } - // }; - // } - - // return Ok(()); - // } - - eprintln!("snapshot_memory_to_file"); - eprintln!("{}", mem_file_path.to_string_lossy()); - let start = Instant::now(); + let mut file = if mem_file_path.to_string_lossy() == "memfd" { let fd = unsafe { let memfd_name = CString::new("diff").unwrap(); @@ -377,11 +359,7 @@ fn snapshot_memory_to_file( // Set the length of the file to the full size of the memory area. let mem_size_mib = mem_size_mib(vmm.guest_memory()); - // // Set the length of the file to the full size of the memory area. - // memory_file - // .set_len((mem_size_mib * 1024 * 1024) as u64) - // .map_err(|err| MemoryBackingFile("set_length", err))?; - + // Set the length of the file to the full size of the memory area. file.set_len((mem_size_mib * 1024 * 1024) as u64) .map_err(|err| MemoryBackingFile("set_length", err))?; @@ -396,16 +374,10 @@ fn snapshot_memory_to_file( &dirty_bitmap, ) .map_err(Memory) - - // vmm.guest_memory() - // .dump_dirty(&mut memory_file, &dirty_bitmap) - // .map_err(Memory) } SnapshotType::Full => vmm.guest_memory().dump(&mut file).map_err(Memory), }?; - eprintln!("snapshot_memory_to_file took {:?}", start.elapsed()); - Ok(()) } @@ -575,14 +547,8 @@ pub fn restore_from_snapshot( (guest_memory, Some(MemoryDescriptor::File(Arc::new(file)))) } MemBackendType::Uffd => { - let (guest_memory, uffd) = guest_memory_from_uffd( - mem_backend_path, - mem_state, - track_dirty_pages, - // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device - // is present in the microVM state. - microvm_state.device_states.balloon_device.is_some(), - )?; + let (guest_memory, uffd) = + guest_memory_from_uffd(mem_backend_path, mem_state, track_dirty_pages)?; (guest_memory, uffd.map(MemoryDescriptor::Uffd)) } @@ -636,42 +602,34 @@ pub(crate) fn guest_memory_from_uffd( mem_uds_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, - enable_balloon: bool, ) -> std::result::Result<(GuestMemoryMmap, Option), LoadSnapshotError> { - use self::LoadSnapshotError::{ - CreateUffdBuilder, DeserializeMemory, MemoryBackingFile, UdsConnection, - UffdMemoryRegionsRegister, UffdSend, - }; + use self::LoadSnapshotError::{CreateUffdBuilder, DeserializeMemory, UdsConnection, UffdSend}; - let backing_memory_file = unsafe { - let mem_name = CString::new("vm-memory").expect("CString::new failed"); - File::from_raw_fd(libc::memfd_create(mem_name.as_ptr(), 0)) - }; - backing_memory_file - .set_len( - mem_state - .regions - .iter() - .map(|region| region.size) - .sum::() as u64, - ) - .map_err(MemoryBackingFile)?; + let mut socket = UnixStream::connect(mem_uds_path).map_err(UdsConnection)?; - let guest_memory = - GuestMemoryMmap::restore(Some(&backing_memory_file), mem_state, track_dirty_pages) - .map_err(DeserializeMemory)?; + let mut buf = [0u8; 8]; + let (_, memfd) = socket.recv_with_fd(&mut buf).map_err(UffdSend)?; - let mut uffd_builder = UffdBuilder::new(); - - if enable_balloon { - // We enable this so that the page fault handler can add logic - // for treating madvise(MADV_DONTNEED) events triggerd by balloon inflation. - uffd_builder.require_features(FeatureFlags::EVENT_REMOVE | FeatureFlags::PAGEFAULT_FLAG_WP); + if memfd.is_none() { + return Err(LoadSnapshotError::NoMemFdReceived); } - let uffd = uffd_builder - .close_on_exec(true) - .non_blocking(true) + let memfd = memfd.unwrap(); + + let guest_memory = GuestMemoryMmap::restore(Some(&memfd), mem_state, track_dirty_pages) + .map_err(DeserializeMemory)?; + + let uffd = UffdBuilder::new() + .require_features( + FeatureFlags::EVENT_REMOVE + | FeatureFlags::EVENT_REMAP + | FeatureFlags::EVENT_FORK + | FeatureFlags::EVENT_UNMAP + | FeatureFlags::MISSING_SHMEM + | FeatureFlags::MINOR_SHMEM + | FeatureFlags::PAGEFAULT_FLAG_WP, + ) + .user_mode_only(false) .create() .map_err(CreateUffdBuilder)?; @@ -680,8 +638,6 @@ pub(crate) fn guest_memory_from_uffd( let host_base_addr = mem_region.as_ptr(); let size = mem_region.size(); - uffd.register(host_base_addr as _, size as _) - .map_err(UffdMemoryRegionsRegister)?; backend_mappings.push(GuestRegionUffdMapping { base_host_virt_addr: host_base_addr as u64, size, @@ -693,7 +649,6 @@ pub(crate) fn guest_memory_from_uffd( // (i.e GuestRegionUffdMapping entries). let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); - let socket = UnixStream::connect(mem_uds_path).map_err(UdsConnection)?; socket .send_with_fd( backend_mappings.as_bytes(), @@ -731,6 +686,11 @@ pub(crate) fn guest_memory_from_uffd( ) .map_err(UffdSend)?; + // Wait for UFFD to be ready. + // TODO: maybe add a timeout? + let mut buf = [0; 2]; + socket.read_exact(&mut buf).map_err(UdsConnection)?; + Ok((guest_memory, Some(uffd))) } diff --git a/tests/host_tools/uffd/Cargo.toml b/tests/host_tools/uffd/Cargo.toml index f83ed0f9c3f..725da59e84e 100644 --- a/tests/host_tools/uffd/Cargo.toml +++ b/tests/host_tools/uffd/Cargo.toml @@ -11,7 +11,7 @@ libc = ">=0.2.39" nix = "0.23.0" serde = { version = ">=1.0.27", features = ["derive"] } serde_json = ">=1.0.9" -userfaultfd = ">=0.4.0" +userfaultfd = ">=0.5.0" [workspace] diff --git a/tools/devtool b/tools/devtool index 6d5bc487cc2..fcca289378f 100755 --- a/tools/devtool +++ b/tools/devtool @@ -615,6 +615,7 @@ run_devctr() { --rm \ --volume /dev:/dev \ --volume "$FC_ROOT_DIR:$CTR_FC_ROOT_DIR:z" \ + --mount type=bind,source=/usr/include/linux/userfaultfd.h,target=/usr/include/linux/userfaultfd.h \ --env OPT_LOCAL_IMAGES_PATH="$(dirname "$CTR_MICROVM_IMAGES_DIR")" \ --env PYTHONDONTWRITEBYTECODE=1 \ "$DEVCTR_IMAGE" "${ctr_args[@]}" From 12b00cd87a6305941f309a5fa9eca59ed6861d5d Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Fri, 4 Nov 2022 14:26:20 +0100 Subject: [PATCH 06/11] remove warning --- src/vmm/src/builder.rs | 4 +--- src/vmm/src/persist.rs | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index a83c6baeadc..170e4ed6a0f 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -4,16 +4,14 @@ //! Enables pre-boot setup, instantiation and booting of a Firecracker VMM. use std::convert::TryFrom; -use std::ffi::CString; use std::fmt::{Display, Formatter}; use std::fs::{File, OpenOptions}; use std::io::{self, Read, Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::net::UnixStream; -use std::os::unix::prelude::FromRawFd; use std::path::Path; use std::sync::{Arc, Mutex}; -use userfaultfd::{FeatureFlags, RegisterMode, Uffd, UffdBuilder}; +use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; use utils::sock_ctrl_msg::ScmSocket; use vm_memory::{FileOffset, GuestMemory}; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 36667c0d578..60b40d7f5b0 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -12,7 +12,6 @@ use std::os::unix::net::UnixStream; use std::os::unix::prelude::FromRawFd; use std::path::Path; use std::sync::{Arc, Mutex}; -use std::time::Instant; #[cfg(target_arch = "aarch64")] use arch::regs::{get_manufacturer_id_from_host, get_manufacturer_id_from_state}; From 8ac760b67d97bb2c1d62ba5fea6f553bcccf96e8 Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Wed, 31 Aug 2022 12:05:46 +0200 Subject: [PATCH 07/11] disable async pf for intel --- src/cpuid/src/transformer/amd.rs | 3 +-- src/cpuid/src/transformer/intel.rs | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/cpuid/src/transformer/amd.rs b/src/cpuid/src/transformer/amd.rs index 261e88ef4e4..8fc6222f2d9 100644 --- a/src/cpuid/src/transformer/amd.rs +++ b/src/cpuid/src/transformer/amd.rs @@ -147,8 +147,7 @@ impl CpuidTransformer for AmdCpuidTransformer { leaf_0x8000001d::LEAF_NUM => Some(amd::update_extended_cache_topology_entry), leaf_0x8000001e::LEAF_NUM => Some(amd::update_extended_apic_id_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), - - // hypervisor stuff + // Disable async PF, as it hangs the VM for some reason when loading from snapshot/uffd. 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } diff --git a/src/cpuid/src/transformer/intel.rs b/src/cpuid/src/transformer/intel.rs index 5272ac4437b..db897b5ab31 100644 --- a/src/cpuid/src/transformer/intel.rs +++ b/src/cpuid/src/transformer/intel.rs @@ -126,8 +126,8 @@ impl CpuidTransformer for IntelCpuidTransformer { leaf_0xa::LEAF_NUM => Some(intel::update_perf_mon_entry), leaf_0xb::LEAF_NUM => Some(intel::update_extended_topology_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), - // // hypervisor stuff - // 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), + // Disable async PF, as it hangs the VM for some reason when loading from snapshot/uffd. + 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } From 1a7a3831dad1bc82fe3cdc2ce7b402143dd1e487 Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Fri, 16 Jun 2023 15:23:16 +0200 Subject: [PATCH 08/11] balloon: don't map over existing regions Firecracker used to map over the mmap when it was a restore. For CodeSandbox this is not needed, as we use UFFD and we don't have the limitation that the map was used for. In fact, this is problematic for the UFFD handler, because it implicitly unregisters the UFFD handler from the reclaimed range. --- src/devices/src/virtio/balloon/utils.rs | 38 +++++++++++++------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/src/devices/src/virtio/balloon/utils.rs b/src/devices/src/virtio/balloon/utils.rs index 53ec5d3efd0..e2185b23189 100644 --- a/src/devices/src/virtio/balloon/utils.rs +++ b/src/devices/src/virtio/balloon/utils.rs @@ -80,24 +80,26 @@ pub(crate) fn remove_range( .get_host_address(guest_address) .map_err(|_| RemoveRegionError::AddressTranslation)?; - // Mmap a new anonymous region over the present one in order to create a hole. - // This workaround is (only) needed after resuming from a snapshot because the guest memory - // is mmaped from file as private and there is no `madvise` flag that works for this case. - if restored { - let ret = unsafe { - libc::mmap( - phys_address as *mut _, - range_len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if ret == libc::MAP_FAILED { - return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); - } - }; + // CodeSandbox: since we use UFFD handler, this is not needed for us. In fact, it breaks the UFFD handler + // if this happens right now, as it unregisters the UFFD handler for the given range. + // // Mmap a new anonymous region over the present one in order to create a hole. + // // This workaround is (only) needed after resuming from a snapshot because the guest memory + // // is mmaped from file as private and there is no `madvise` flag that works for this case. + // if restored { + // let ret = unsafe { + // libc::mmap( + // phys_address as *mut _, + // range_len as usize, + // libc::PROT_READ | libc::PROT_WRITE, + // libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, + // -1, + // 0, + // ) + // }; + // if ret == libc::MAP_FAILED { + // return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); + // } + // }; // Madvise the region in order to mark it as not used. let ret = unsafe { From 32de822631ae0cc45894c3768a6a964863838ca3 Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Sat, 17 Jun 2023 15:17:48 +0200 Subject: [PATCH 09/11] update clippy --- src/devices/src/virtio/balloon/utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/devices/src/virtio/balloon/utils.rs b/src/devices/src/virtio/balloon/utils.rs index e2185b23189..2a3ed7af56c 100644 --- a/src/devices/src/virtio/balloon/utils.rs +++ b/src/devices/src/virtio/balloon/utils.rs @@ -68,7 +68,7 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> { pub(crate) fn remove_range( guest_memory: &GuestMemoryMmap, range: (GuestAddress, u64), - restored: bool, + _restored: bool, ) -> std::result::Result<(), RemoveRegionError> { let (guest_address, range_len) = range; From be2d754c3d8ccb90c7497c13f6303d95ba8196d2 Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Tue, 20 Jun 2023 16:47:08 +0200 Subject: [PATCH 10/11] Possible race condition between VMM thread and API thread (FC 1.2) https://github.com/firecracker-microvm/firecracker/pull/3591/commits/174e0425fc17ac98d4210943d8739f84cf31c535 --- src/firecracker/src/api_server_adapter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index 7934530e1c4..cd5725eedfb 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -71,6 +71,7 @@ impl MutEventSubscriber for ApiServerAdapter { let event_set = event.event_set(); if source == self.api_event_fd.as_raw_fd() && event_set == EventSet::IN { + let _ = self.api_event_fd.read(); match self.from_api.try_recv() { Ok(api_request) => { let request_is_pause = *api_request == VmmAction::Pause; @@ -101,7 +102,6 @@ impl MutEventSubscriber for ApiServerAdapter { panic!("The channel's sending half was disconnected. Cannot receive data."); } }; - let _ = self.api_event_fd.read(); } else { error!("Spurious EventManager event for handler: ApiServerAdapter"); } @@ -129,7 +129,7 @@ pub(crate) fn run_with_api( // FD to notify of API events. This is a blocking eventfd by design. // It is used in the config/pre-boot loop which is a simple blocking loop // which only consumes API events. - let api_event_fd = EventFd::new(0).expect("Cannot create API Eventfd."); + let api_event_fd = EventFd::new(libc::EFD_SEMAPHORE).expect("Cannot create API Eventfd."); // Channels for both directions between Vmm and Api threads. let (to_vmm, from_api) = channel(); From d8a9bed4c73be9a6a0e803b47c4b06e2b6ba1493 Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Tue, 20 Jun 2023 18:19:34 +0200 Subject: [PATCH 11/11] wip --- Cargo.lock | 36 ++++---- src/vmm/Cargo.toml | 2 +- src/vmm/src/persist.rs | 136 +++++++++++++++++------------ src/vmm/src/vmm_config/snapshot.rs | 4 +- 4 files changed, 105 insertions(+), 73 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a7037fc8f37..bb68600f756 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -364,7 +364,7 @@ dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.7.1", + "memoffset", "scopeguard", ] @@ -726,15 +726,6 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" -[[package]] -name = "memoffset" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" -dependencies = [ - "autocfg", -] - [[package]] name = "memoffset" version = "0.7.1" @@ -784,15 +775,16 @@ version = "0.1.0" [[package]] name = "nix" -version = "0.23.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", - "cc", "cfg-if", "libc", - "memoffset 0.6.5", + "memoffset", + "pin-utils", + "static_assertions", ] [[package]] @@ -854,6 +846,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "plotters" version = "0.3.4" @@ -1141,6 +1139,12 @@ dependencies = [ "versionize_derive", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "subtle" version = "2.4.1" @@ -1237,7 +1241,7 @@ dependencies = [ [[package]] name = "userfaultfd" version = "0.5.0" -source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5#3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=b11a187b5743847dda76ed8df5419c3607d21375#b11a187b5743847dda76ed8df5419c3607d21375" dependencies = [ "bitflags", "cfg-if", @@ -1250,7 +1254,7 @@ dependencies = [ [[package]] name = "userfaultfd-sys" version = "0.4.2" -source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5#3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=b11a187b5743847dda76ed8df5419c3607d21375#b11a187b5743847dda76ed8df5419c3607d21375" dependencies = [ "bindgen", "cc", diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index d3e92610032..6ae17d9ad8e 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -15,7 +15,7 @@ lazy_static = "1.4.0" libc = "0.2.117" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.78" -userfaultfd = { git = "https://github.com/codesandbox/userfaultfd-rs.git", rev = "3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5", features = [ +userfaultfd = { git = "https://github.com/codesandbox/userfaultfd-rs.git", rev = "b11a187b5743847dda76ed8df5419c3607d21375", features = [ "linux5_7", ] } versionize = "0.1.6" diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 71cb5bc2c84..db38852b7bf 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -3,6 +3,7 @@ //! Defines state structures for saving/restoring a Firecracker microVM. +use std::ffi::CString; use std::fs::{File, OpenOptions}; use std::io::{self, Read, Write}; use std::os::unix::io::AsRawFd; @@ -17,9 +18,13 @@ use arch::regs::{get_manufacturer_id_from_host, get_manufacturer_id_from_state}; use cpuid::common::{get_vendor_id_from_cpuid, get_vendor_id_from_host}; use devices::virtio::TYPE_NET; use libc::memfd_create; +use logger::warn; use logger::{error, info}; -use logger::{error, info, warn}; +use seccompiler::BpfThreadMap; +use serde::Serialize; +use snapshot::Snapshot; use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use utils::sock_ctrl_msg::ScmSocket; use versionize::{VersionMap, Versionize, VersionizeResult}; use versionize_derive::Versionize; use virtio_gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX; @@ -552,12 +557,14 @@ pub fn restore_from_snapshot( let (guest_memory, memory_descriptor) = match params.mem_backend.backend_type { MemBackendType::File => { let (guest_memory, file) = - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)?; + guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) + .map_err(RestoreFromSnapshotGuestMemoryError::File)?; (guest_memory, Some(MemoryDescriptor::File(Arc::new(file)))) } MemBackendType::Uffd => { let (guest_memory, uffd) = - guest_memory_from_uffd(mem_backend_path, mem_state, track_dirty_pages)?; + guest_memory_from_uffd(mem_backend_path, mem_state, track_dirty_pages) + .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?; (guest_memory, uffd.map(MemoryDescriptor::Uffd)) } @@ -616,41 +623,64 @@ fn guest_memory_from_file( mem_file_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, -) -> std::result::Result<(GuestMemoryMmap, File), LoadSnapshotError> { - use self::LoadSnapshotError::{DeserializeMemory, MemoryBackingFile}; +) -> std::result::Result<(GuestMemoryMmap, File), GuestMemoryFromFileError> { let mem_file = OpenOptions::new() .write(true) .read(true) - .open(mem_file_path) - .map_err(MemoryBackingFile)?; + .open(mem_file_path)?; Ok(( - GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages) - .map_err(DeserializeMemory)?, + GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages)?, mem_file, )) } +/// Error type for [`guest_memory_from_uffd`] +#[derive(Debug, thiserror::Error)] +pub enum GuestMemoryFromUffdError { + /// Failed to restore guest memory. + #[error("Failed to restore guest memory: {0}")] + Restore(#[from] crate::memory_snapshot::Error), + /// Failed to UFFD object. + #[error("Failed to UFFD object: {0}")] + Create(userfaultfd::Error), + /// Failed to register memory address range with the userfaultfd object. + #[error("Failed to register memory address range with the userfaultfd object: {0}")] + Register(userfaultfd::Error), + /// Failed to connect to UDS Unix stream. + #[error("Failed to connect to UDS Unix stream: {0}")] + Connect(#[from] std::io::Error), + /// Failed to send file descriptor. + #[error("Failed to sends file descriptor: {0}")] + Send(#[from] utils::errno::Error), + + /// No memfd received + #[error("No memfd received")] + NoMemFdReceived, + /// Receiving memfd went wrong + #[error("Failed to receive memfd: {0}")] + Receive(utils::errno::Error), +} + pub(crate) fn guest_memory_from_uffd( mem_uds_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, -) -> std::result::Result<(GuestMemoryMmap, Option), LoadSnapshotError> { - use self::LoadSnapshotError::{CreateUffdBuilder, DeserializeMemory, UdsConnection, UffdSend}; - - let mut socket = UnixStream::connect(mem_uds_path).map_err(UdsConnection)?; +) -> std::result::Result<(GuestMemoryMmap, Option), GuestMemoryFromUffdError> { + let mut socket = UnixStream::connect(mem_uds_path)?; let mut buf = [0u8; 8]; - let (_, memfd) = socket.recv_with_fd(&mut buf).map_err(UffdSend)?; + let (_, memfd) = socket + .recv_with_fd(&mut buf) + .map_err(GuestMemoryFromUffdError::Receive)?; if memfd.is_none() { - return Err(LoadSnapshotError::NoMemFdReceived); + return Err(GuestMemoryFromUffdError::NoMemFdReceived); } let memfd = memfd.unwrap(); - let guest_memory = GuestMemoryMmap::restore(Some(&memfd), mem_state, track_dirty_pages) - .map_err(DeserializeMemory)?; + let guest_memory = GuestMemoryMmap::restore(Some(&memfd), mem_state, track_dirty_pages)?; let uffd = UffdBuilder::new() .require_features( @@ -682,47 +712,45 @@ pub(crate) fn guest_memory_from_uffd( // (i.e GuestRegionUffdMapping entries). let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); - socket - .send_with_fd( - backend_mappings.as_bytes(), - // In the happy case we can close the fd since the other process has it open and is - // using it to serve us pages. - // - // The problem is that if other process crashes/exits, firecracker guest memory - // will simply revert to anon-mem behavior which would lead to silent errors and - // undefined behavior. - // - // To tackle this scenario, the page fault handler can notify Firecracker of any - // crashes/exits. There is no need for Firecracker to explicitly send its process ID. - // The external process can obtain Firecracker's PID by calling `getsockopt` with - // `libc::SO_PEERCRED` option like so: - // - // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; - // let mut ucred_size: u32 = mem::size_of::() as u32; - // libc::getsockopt( - // socket.as_raw_fd(), - // libc::SOL_SOCKET, - // libc::SO_PEERCRED, - // &mut val as *mut _ as *mut _, - // &mut ucred_size as *mut libc::socklen_t, - // ); - // - // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, - // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process - // connected to this socket. The returned credentials are those that were in effect - // at the time of the `connect` call. - // - // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the - // page fault handler process does not tear down Firecracker when necessary, the - // uffd will still be alive but with no one to serve faults, leading to guest freeze. - uffd.as_raw_fd(), - ) - .map_err(UffdSend)?; + socket.send_with_fd( + backend_mappings.as_bytes(), + // In the happy case we can close the fd since the other process has it open and is + // using it to serve us pages. + // + // The problem is that if other process crashes/exits, firecracker guest memory + // will simply revert to anon-mem behavior which would lead to silent errors and + // undefined behavior. + // + // To tackle this scenario, the page fault handler can notify Firecracker of any + // crashes/exits. There is no need for Firecracker to explicitly send its process ID. + // The external process can obtain Firecracker's PID by calling `getsockopt` with + // `libc::SO_PEERCRED` option like so: + // + // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; + // let mut ucred_size: u32 = mem::size_of::() as u32; + // libc::getsockopt( + // socket.as_raw_fd(), + // libc::SOL_SOCKET, + // libc::SO_PEERCRED, + // &mut val as *mut _ as *mut _, + // &mut ucred_size as *mut libc::socklen_t, + // ); + // + // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, + // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process + // connected to this socket. The returned credentials are those that were in effect + // at the time of the `connect` call. + // + // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the + // page fault handler process does not tear down Firecracker when necessary, the + // uffd will still be alive but with no one to serve faults, leading to guest freeze. + uffd.as_raw_fd(), + )?; // Wait for UFFD to be ready. // TODO: maybe add a timeout? let mut buf = [0; 2]; - socket.read_exact(&mut buf).map_err(UdsConnection)?; + socket.read_exact(&mut buf)?; Ok((guest_memory, Some(uffd))) } diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 009085e6e49..1d690f08e32 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -28,7 +28,7 @@ impl Default for SnapshotType { /// 1) A file that contains the guest memory to be loaded, /// 2) An UDS where a custom page-fault handler process is listening for /// the UFFD set up by Firecracker to handle its guest memory page faults. -#[derive(Debug, Clone, Deserialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, Eq, PartialEq)] pub enum MemBackendType { /// Guest memory contents will be loaded from a file. File, @@ -91,7 +91,7 @@ pub struct LoadSnapshotConfig { } /// Stores the configuration used for managing snapshot memory. -#[derive(Debug, Clone, Deserialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, Eq, PartialEq)] #[serde(deny_unknown_fields)] pub struct MemBackendConfig { /// Path to the backend used to handle the guest memory.