From c30f31c22cc89af1b50897bb2d7eea0cfa3e6e76 Mon Sep 17 00:00:00 2001 From: Rasmus Halland Date: Mon, 24 Feb 2020 02:13:30 +0100 Subject: [PATCH 1/3] Avoid opening all files for reading on windows It can be very expensive to do that, especially when it causes windows defender to read the files and scan them. --- src/utils/mod.rs | 29 ++++++++++---- src/utils/platform.rs | 93 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 103 insertions(+), 19 deletions(-) diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 7f0d853..2261bd5 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -85,15 +85,20 @@ pub fn get_dir_tree>( None }; + let mut fake_inode_counter = u64::max_value(); + let mut examine_dir_args = ExamineDirMutArsg { + data: &mut data, + fake_inode_counter: &mut fake_inode_counter, + file_count_no_permission: &mut permissions, + }; for b in top_level_names.iter() { examine_dir( b, apparent_size, &restricted_filesystems, ignore_directories, - &mut data, - &mut permissions, threads, + &mut examine_dir_args, ); } (permissions == 0, data) @@ -119,14 +124,19 @@ pub fn normalize_path>(path: P) -> PathBuf { path.as_ref().components().collect::() } +struct ExamineDirMutArsg<'a> { + data: &'a mut HashMap, + file_count_no_permission: &'a mut u64, + fake_inode_counter: &'a mut u64, +} + fn examine_dir>( top_dir: P, apparent_size: bool, filesystems: &Option>, ignore_directories: &Option>, - data: &mut HashMap, - file_count_no_permission: &mut u64, threads: Option, + mut_args: &mut ExamineDirMutArsg, ) { let top_dir = top_dir.as_ref(); let mut inodes: HashSet<(u64, u64)> = HashSet::new(); @@ -136,9 +146,12 @@ fn examine_dir>( if let Some(threads_to_start) = threads { iter = iter.num_threads(threads_to_start); } + 'entry: for entry in iter { if let Ok(e) = entry { - let maybe_size_and_inode = get_metadata(&e, apparent_size); + let maybe_size_and_inode = + get_metadata(&e, apparent_size, &mut mut_args.fake_inode_counter); + if let Some(dirs) = 
ignore_directories { let path = e.path(); let parts = path.components().collect::>(); @@ -156,13 +169,13 @@ fn examine_dir>( match maybe_size_and_inode { Some((size, inode, device)) => { if !should_ignore_file(apparent_size, filesystems, &mut inodes, inode, device) { - process_file_with_size_and_inode(top_dir, data, e, size) + process_file_with_size_and_inode(top_dir, mut_args.data, e, size) } } - None => *file_count_no_permission += 1, + None => *mut_args.file_count_no_permission += 1, } } else { - *file_count_no_permission += 1 + *mut_args.file_count_no_permission += 1 } } } diff --git a/src/utils/platform.rs b/src/utils/platform.rs index 144f9ce..8ef25a5 100644 --- a/src/utils/platform.rs +++ b/src/utils/platform.rs @@ -12,7 +12,11 @@ fn get_block_size() -> u64 { } #[cfg(target_family = "unix")] -pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, u64, u64)> { +pub fn get_metadata( + d: &DirEntry, + use_apparent_size: bool, + _fake_inode_counter: &mut u64, +) -> Option<(u64, u64, u64)> { use std::os::unix::fs::MetadataExt; d.metadata.as_ref().unwrap().as_ref().ok().map(|md| { if use_apparent_size { @@ -24,18 +28,85 @@ pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, u64, } #[cfg(target_family = "windows")] -pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, u64, u64)> { - use winapi_util::file::information; - use winapi_util::Handle; +pub fn get_metadata( + d: &DirEntry, + _use_apparent_size: bool, + fake_inode_counter: &mut u64, +) -> Option<(u64, u64, u64)> { + // On windows opening the file to get size, file ID and volume can be very + // expensive because 1) it causes a few system calls, and more importantly 2) it can cause + // windows defender to scan the file. 
+ // Therefore we try to avoid doing that for common cases, mainly those of + // plain files: - let h = Handle::from_path_any(d.path()).ok()?; - let info = information(&h).ok()?; + // The idea is to make do with the file size that we get from the OS for + // free as part of iterating a folder. Therefore we want to make sure that + // it makes sense to use that free size information: - Some(( - info.file_size(), - info.file_index(), - info.volume_serial_number(), - )) + // Volume boundaries: + // The user can ask us not to cross volume boundaries. If the DirEntry is a + // plain file and not a reparse point or other non-trivial stuff, we assume + // that the file is located on the same volume as the directory that + // contains it. + + // File ID: + // This optimization does deprive us of access to a file ID. As a + // workaround, we just make one up that hopefully does not collide with real + // file IDs. + // Hard links: Unresolved. We don't get inode/file index, so hard links + // count once for each link. Hopefully they are not too commonly in use on + // windows. + + // Size: + // We assume (naively?) that for the common cases the free size info is the + // same as one would get by doing the expensive thing. Sparse, encrypted and + // compressed files are not included in the common cases, as one can imagine + // there being more than one view on their size. + + // Savings in orders of magnitude in terms of time, io and cpu have been + // observed on hdd, windows 10, some 100Ks files taking up some hundreds of + // GBs: + // Consistently opening the file: 30 minutes. + // With this optimization: 8 sec. 
+ + fn get_metadata_expensive(d: &DirEntry) -> Option<(u64, u64, u64)> { + use winapi_util::file::information; + use winapi_util::Handle; + + let h = Handle::from_path_any(d.path()).ok()?; + let info = information(&h).ok()?; + + Some(( + info.file_size(), + info.file_index(), + info.volume_serial_number(), + )) + } + + match d.metadata { + Some(Ok(ref md)) => { + use std::os::windows::fs::MetadataExt; + const FILE_ATTRIBUTE_ARCHIVE: u32 = 0x20u32; + const FILE_ATTRIBUTE_READONLY: u32 = 0x1u32; + const FILE_ATTRIBUTE_HIDDEN: u32 = 0x2u32; + const FILE_ATTRIBUTE_SYSTEM: u32 = 0x4u32; + const FILE_ATTRIBUTE_NORMAL: u32 = 0x80u32; + const FILE_ATTRIBUTE_DIRECTORY: u32 = 0x10u32; + + let attr_filtered = md.file_attributes() + & !(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_READONLY | FILE_ATTRIBUTE_SYSTEM); + if attr_filtered == FILE_ATTRIBUTE_ARCHIVE + || attr_filtered == FILE_ATTRIBUTE_DIRECTORY + || md.file_attributes() == FILE_ATTRIBUTE_NORMAL + { + *fake_inode_counter -= 1; + Some((md.len(), *fake_inode_counter, 0xcafe_cafe_u64)) + } else { + get_metadata_expensive(&d) + } + } + _ => get_metadata_expensive(&d), + } } #[cfg(target_family = "unix")] From 2c58041885232949d4a946de340dd875c0310ec8 Mon Sep 17 00:00:00 2001 From: "andy.boot" Date: Thu, 27 Feb 2020 08:24:59 +0000 Subject: [PATCH 2/3] Clean up windows performance Instead of generating random values for the drive and inode counter on windows we return None instead --- src/utils/mod.rs | 63 ++++++++++++++++++++++++------------------- src/utils/platform.rs | 24 +++++------------ 2 files changed, 42 insertions(+), 45 deletions(-) diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 2261bd5..f4342bb 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -85,10 +85,8 @@ pub fn get_dir_tree>( None }; - let mut fake_inode_counter = u64::max_value(); let mut examine_dir_args = ExamineDirMutArsg { data: &mut data, - fake_inode_counter: &mut fake_inode_counter, file_count_no_permission: &mut permissions, }; for b in 
top_level_names.iter() { @@ -127,7 +125,6 @@ pub fn normalize_path>(path: P) -> PathBuf { struct ExamineDirMutArsg<'a> { data: &'a mut HashMap, file_count_no_permission: &'a mut u64, - fake_inode_counter: &'a mut u64, } fn examine_dir>( @@ -149,8 +146,7 @@ fn examine_dir>( 'entry: for entry in iter { if let Ok(e) = entry { - let maybe_size_and_inode = - get_metadata(&e, apparent_size, &mut mut_args.fake_inode_counter); + let maybe_size_and_inode = get_metadata(&e, apparent_size); if let Some(dirs) = ignore_directories { let path = e.path(); @@ -167,8 +163,9 @@ fn examine_dir>( } match maybe_size_and_inode { - Some((size, inode, device)) => { - if !should_ignore_file(apparent_size, filesystems, &mut inodes, inode, device) { + Some(data) => { + let (size, inode_device) = data; + if !should_ignore_file(apparent_size, filesystems, &mut inodes, inode_device) { process_file_with_size_and_inode(top_dir, mut_args.data, e, size) } } @@ -184,24 +181,29 @@ fn should_ignore_file( apparent_size: bool, restricted_filesystems: &Option>, inodes: &mut HashSet<(u64, u64)>, - inode: u64, - device: u64, + maybe_inode_device: Option<(u64, u64)>, ) -> bool { - // Ignore files on different devices (if flag applied) - if let Some(rs) = restricted_filesystems { - if !rs.contains(&device) { - return true; - } - } + match maybe_inode_device { + None => false, + Some(data) => { + let (inode, device) = data; + // Ignore files on different devices (if flag applied) + if let Some(rs) = restricted_filesystems { + if !rs.contains(&device) { + return true; + } + } - if !apparent_size { - // Ignore files already visited or symlinked - if inodes.contains(&(inode, device)) { - return true; + if !apparent_size { + // Ignore files already visited or symlinked + if inodes.contains(&(inode, device)) { + return true; + } + inodes.insert((inode, device)); + } + false } - inodes.insert((inode, device)); } - false } fn process_file_with_size_and_inode>( @@ -365,14 +367,19 @@ mod tests { let mut files = 
HashSet::new(); files.insert((10, 20)); - assert!(!should_ignore_file(true, &None, &mut files, 0, 0)); + assert!(!should_ignore_file(true, &None, &mut files, Some((0, 0)))); // New file is not known it will be inserted to the hashmp and should not be ignored - assert!(!should_ignore_file(false, &None, &mut files, 11, 12)); + assert!(!should_ignore_file( + false, + &None, + &mut files, + Some((11, 12)) + )); assert!(files.contains(&(11, 12))); // The same file will be ignored the second time - assert!(should_ignore_file(false, &None, &mut files, 11, 12)); + assert!(should_ignore_file(false, &None, &mut files, Some((11, 12)))); } #[test] @@ -386,11 +393,11 @@ mod tests { // If we are looking at a different device (disk) and the device flag is set // then apparent_size is irrelevant - we ignore files on other devices - assert!(should_ignore_file(false, &od, &mut files, 11, 12)); - assert!(should_ignore_file(true, &od, &mut files, 11, 12)); + assert!(should_ignore_file(false, &od, &mut files, Some((11, 12)))); + assert!(should_ignore_file(true, &od, &mut files, Some((11, 12)))); // We do not ignore files on the same device - assert!(!should_ignore_file(false, &od, &mut files, 2, 99)); - assert!(!should_ignore_file(true, &od, &mut files, 2, 99)); + assert!(!should_ignore_file(false, &od, &mut files, Some((2, 99)))); + assert!(!should_ignore_file(true, &od, &mut files, Some((2, 99)))); } } diff --git a/src/utils/platform.rs b/src/utils/platform.rs index 8ef25a5..121b643 100644 --- a/src/utils/platform.rs +++ b/src/utils/platform.rs @@ -12,27 +12,19 @@ fn get_block_size() -> u64 { } #[cfg(target_family = "unix")] -pub fn get_metadata( - d: &DirEntry, - use_apparent_size: bool, - _fake_inode_counter: &mut u64, -) -> Option<(u64, u64, u64)> { +pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> { use std::os::unix::fs::MetadataExt; d.metadata.as_ref().unwrap().as_ref().ok().map(|md| { if use_apparent_size { - (md.len(), md.ino(), 
md.dev()) + (md.len(), Some((md.ino(), md.dev()))) } else { - (md.blocks() * get_block_size(), md.ino(), md.dev()) + (md.blocks() * get_block_size(), Some((md.ino(), md.dev()))) } }) } #[cfg(target_family = "windows")] -pub fn get_metadata( - d: &DirEntry, - _use_apparent_size: bool, - fake_inode_counter: &mut u64, -) -> Option<(u64, u64, u64)> { +pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> { // On windows opening the file to get size, file ID and volume can be very // expensive because 1) it causes a few system calls, and more importantly 2) it can cause // windows defender to scan the file. @@ -69,7 +61,7 @@ pub fn get_metadata( // Consistently opening the file: 30 minutes. // With this optimization: 8 sec. - fn get_metadata_expensive(d: &DirEntry) -> Option<(u64, u64, u64)> { + fn get_metadata_expensive(d: &DirEntry) -> Option<(u64, Option<(u64, u64)>)> { use winapi_util::file::information; use winapi_util::Handle; @@ -78,8 +70,7 @@ pub fn get_metadata( Some(( info.file_size(), - info.file_index(), - info.volume_serial_number(), + Some((info.file_index(), info.volume_serial_number())), )) } @@ -99,8 +90,7 @@ pub fn get_metadata( || attr_filtered == FILE_ATTRIBUTE_DIRECTORY || md.file_attributes() == FILE_ATTRIBUTE_NORMAL { - *fake_inode_counter -= 1; - Some((md.len(), *fake_inode_counter, 0xcafe_cafe_u64)) + Some((md.len(), None)) } else { get_metadata_expensive(&d) } From efb455c73930d44d94c369cdfac2858e01d5c016 Mon Sep 17 00:00:00 2001 From: Rasmus Halland Date: Fri, 28 Feb 2020 02:09:32 +0100 Subject: [PATCH 3/3] Opening files on windows got a lot cheaper. We avoid passing FILE_READ_DATA to CreateFile. 
--- src/utils/platform.rs | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/utils/platform.rs b/src/utils/platform.rs index 121b643..633c3bc 100644 --- a/src/utils/platform.rs +++ b/src/utils/platform.rs @@ -61,11 +61,36 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti // Consistently opening the file: 30 minutes. // With this optimization: 8 sec. + use winapi_util::Handle; + fn handle_from_path_limited>(path: P) -> io::Result { + use std::fs::OpenOptions; + use std::os::windows::fs::OpenOptionsExt; + const FILE_READ_ATTRIBUTES: u32 = 0x0080; + + // So, it seems that it does not have to be that expensive to open + // files to get their info: Avoiding opening the file with the full + // GENERIC_READ is key: + + // https://docs.microsoft.com/en-us/windows/win32/secauthz/generic-access-rights: + // "For example, a Windows file object maps the GENERIC_READ bit to the + // READ_CONTROL and SYNCHRONIZE standard access rights and to the + // FILE_READ_DATA, FILE_READ_EA, and FILE_READ_ATTRIBUTES + // object-specific access rights" + + // The flag FILE_READ_DATA seems to be the expensive one, so we'll avoid + // that, and most of the other ones. Simply because it seems that we + // don't need them. + + let file = OpenOptions::new() + .access_mode(FILE_READ_ATTRIBUTES) + .open(path)?; + Ok(Handle::from_file(file)) + } + fn get_metadata_expensive(d: &DirEntry) -> Option<(u64, Option<(u64, u64)>)> { use winapi_util::file::information; - use winapi_util::Handle; - let h = Handle::from_path_any(d.path()).ok()?; + let h = handle_from_path_limited(d.path()).ok()?; let info = information(&h).ok()?; Some((