Avoid opening every file for reading on Windows

Opening each file can be very expensive, especially when it causes Windows Defender to read and scan the files.
This commit is contained in:
Rasmus Halland
2020-02-24 02:13:30 +01:00
parent 59f2cdfb84
commit c30f31c22c
2 changed files with 103 additions and 19 deletions
+21 -8
View File
@@ -85,15 +85,20 @@ pub fn get_dir_tree<P: AsRef<Path>>(
None
};
let mut fake_inode_counter = u64::max_value();
let mut examine_dir_args = ExamineDirMutArsg {
data: &mut data,
fake_inode_counter: &mut fake_inode_counter,
file_count_no_permission: &mut permissions,
};
for b in top_level_names.iter() {
examine_dir(
b,
apparent_size,
&restricted_filesystems,
ignore_directories,
&mut data,
&mut permissions,
threads,
&mut examine_dir_args,
);
}
(permissions == 0, data)
@@ -119,14 +124,19 @@ pub fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
path.as_ref().components().collect::<PathBuf>()
}
/// Bundle of the mutable state that `examine_dir` updates while walking a
/// directory tree, grouped so it travels as one argument.
struct ExamineDirMutArsg<'a> {
    /// Map handed to `process_file_with_size_and_inode` for every entry that
    /// is not ignored (presumably accumulated size per path; confirm against
    /// that helper).
    data: &'a mut HashMap<PathBuf, u64>,
    /// Counter passed to `get_metadata`. The caller seeds it with
    /// `u64::max_value()`; the Windows implementation decrements it to mint a
    /// made-up inode number when it skips opening the file, while the Unix
    /// implementation ignores it.
    fake_inode_counter: &'a mut u64,
    /// Incremented once for every entry that could not be iterated or whose
    /// metadata could not be obtained.
    file_count_no_permission: &'a mut u64,
}
fn examine_dir<P: AsRef<Path>>(
top_dir: P,
apparent_size: bool,
filesystems: &Option<HashSet<u64>>,
ignore_directories: &Option<Vec<PathBuf>>,
data: &mut HashMap<PathBuf, u64>,
file_count_no_permission: &mut u64,
threads: Option<usize>,
mut_args: &mut ExamineDirMutArsg,
) {
let top_dir = top_dir.as_ref();
let mut inodes: HashSet<(u64, u64)> = HashSet::new();
@@ -136,9 +146,12 @@ fn examine_dir<P: AsRef<Path>>(
if let Some(threads_to_start) = threads {
iter = iter.num_threads(threads_to_start);
}
'entry: for entry in iter {
if let Ok(e) = entry {
let maybe_size_and_inode = get_metadata(&e, apparent_size);
let maybe_size_and_inode =
get_metadata(&e, apparent_size, &mut mut_args.fake_inode_counter);
if let Some(dirs) = ignore_directories {
let path = e.path();
let parts = path.components().collect::<Vec<std::path::Component>>();
@@ -156,13 +169,13 @@ fn examine_dir<P: AsRef<Path>>(
match maybe_size_and_inode {
Some((size, inode, device)) => {
if !should_ignore_file(apparent_size, filesystems, &mut inodes, inode, device) {
process_file_with_size_and_inode(top_dir, data, e, size)
process_file_with_size_and_inode(top_dir, mut_args.data, e, size)
}
}
None => *file_count_no_permission += 1,
None => *mut_args.file_count_no_permission += 1,
}
} else {
*file_count_no_permission += 1
*mut_args.file_count_no_permission += 1
}
}
}
+82 -11
View File
@@ -12,7 +12,11 @@ fn get_block_size() -> u64 {
}
#[cfg(target_family = "unix")]
pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, u64, u64)> {
pub fn get_metadata(
d: &DirEntry,
use_apparent_size: bool,
_fake_inode_counter: &mut u64,
) -> Option<(u64, u64, u64)> {
use std::os::unix::fs::MetadataExt;
d.metadata.as_ref().unwrap().as_ref().ok().map(|md| {
if use_apparent_size {
@@ -24,18 +28,85 @@ pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, u64,
}
#[cfg(target_family = "windows")]
pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, u64, u64)> {
use winapi_util::file::information;
use winapi_util::Handle;
pub fn get_metadata(
d: &DirEntry,
_use_apparent_size: bool,
fake_inode_counter: &mut u64,
) -> Option<(u64, u64, u64)> {
// On Windows, opening a file to get its size, file ID and volume can be very
// expensive because 1) it costs several system calls and, more importantly,
// 2) it can cause Windows Defender to scan the file.
// Therefore we try to avoid doing that for the common cases, mainly plain
// files:
let h = Handle::from_path_any(d.path()).ok()?;
let info = information(&h).ok()?;
// The idea is to make do with the file size that we get from the OS for
// free as part of iterating a folder. Therefore we want to make sure that
// it makes sense to use that free size information:
Some((
info.file_size(),
info.file_index(),
info.volume_serial_number(),
))
// Volume boundaries:
// The user can ask us not to cross volume boundaries. If the DirEntry is a
// plain file and not a reparse point or other non-trivial stuff, we assume
// that the file is located on the same volume as the directory that
// contains it.
// File ID:
// This optimization does deprive us of access to a file ID. As a
// workaround, we just make one up that hopefully does not collide with real
// file IDs.
// Hard links: Unresolved. We don't get inode/file index, so hard links
// count once for each link. Hopefully they are not too commonly in use on
// windows.
// Size:
// We assume (naively?) that for the common cases the free size info is the
// same as one would get by doing the expensive thing. Sparse, encrypted and
// compressed files are not included in the common cases, as one can imagine
// there being more than one view of their size.
// Savings of orders of magnitude in time, I/O and CPU have been observed on
// an HDD under Windows 10, scanning a few hundred thousand files taking up
// some hundreds of GBs:
// Consistently opening the file: 30 minutes.
// With this optimization: 8 sec.
/// Slow path: opens the entry to query its real metadata.
///
/// Returns `(file size, file index, volume serial number)` as reported by
/// `winapi_util`, or `None` if the handle cannot be opened or the
/// information query fails.
fn get_metadata_expensive(d: &DirEntry) -> Option<(u64, u64, u64)> {
    use winapi_util::{file, Handle};

    // Opening the handle is the costly step the caller tries to avoid.
    let handle = Handle::from_path_any(d.path()).ok()?;
    file::information(&handle).ok().map(|details| {
        (
            details.file_size(),
            details.file_index(),
            details.volume_serial_number(),
        )
    })
}
match d.metadata {
Some(Ok(ref md)) => {
use std::os::windows::fs::MetadataExt;
const FILE_ATTRIBUTE_ARCHIVE: u32 = 0x20u32;
const FILE_ATTRIBUTE_READONLY: u32 = 0x1u32;
const FILE_ATTRIBUTE_HIDDEN: u32 = 0x2u32;
const FILE_ATTRIBUTE_SYSTEM: u32 = 0x4u32;
const FILE_ATTRIBUTE_NORMAL: u32 = 0x80u32;
const FILE_ATTRIBUTE_DIRECTORY: u32 = 0x10u32;
let attr_filtered = md.file_attributes()
& !(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_READONLY | FILE_ATTRIBUTE_SYSTEM);
if attr_filtered == FILE_ATTRIBUTE_ARCHIVE
|| attr_filtered == FILE_ATTRIBUTE_DIRECTORY
|| md.file_attributes() == FILE_ATTRIBUTE_NORMAL
{
*fake_inode_counter -= 1;
Some((md.len(), *fake_inode_counter, 0xcafe_cafe_u64))
} else {
get_metadata_expensive(&d)
}
}
_ => get_metadata_expensive(&d),
}
}
#[cfg(target_family = "unix")]