Merge pull request #74 from rasmushalland/win-perf

Avoid opening all files for reading on windows
This commit is contained in:
andy.boot
2020-03-01 14:49:47 +00:00
committed by GitHub
2 changed files with 148 additions and 42 deletions
+50 -30
View File
@@ -85,15 +85,18 @@ pub fn get_dir_tree<P: AsRef<Path>>(
None
};
let mut examine_dir_args = ExamineDirMutArsg {
data: &mut data,
file_count_no_permission: &mut permissions,
};
for b in top_level_names.iter() {
examine_dir(
b,
apparent_size,
&restricted_filesystems,
ignore_directories,
&mut data,
&mut permissions,
threads,
&mut examine_dir_args,
);
}
(permissions == 0, data)
@@ -119,14 +122,18 @@ pub fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
path.as_ref().components().collect::<PathBuf>()
}
/// Groups the mutable state that `examine_dir` updates while iterating, so
/// that it can be handed over as a single `&mut` argument.
struct ExamineDirMutArsg<'a> {
/// Map from path to a `u64` value (presumably the accumulated size — confirm
/// against `process_file_with_size_and_inode`, which receives this map for
/// every entry that is not ignored).
data: &'a mut HashMap<PathBuf, u64>,
/// Counter bumped by `examine_dir` for each entry that failed to iterate or
/// for which no metadata could be obtained.
file_count_no_permission: &'a mut u64,
}
fn examine_dir<P: AsRef<Path>>(
top_dir: P,
apparent_size: bool,
filesystems: &Option<HashSet<u64>>,
ignore_directories: &Option<Vec<PathBuf>>,
data: &mut HashMap<PathBuf, u64>,
file_count_no_permission: &mut u64,
threads: Option<usize>,
mut_args: &mut ExamineDirMutArsg,
) {
let top_dir = top_dir.as_ref();
let mut inodes: HashSet<(u64, u64)> = HashSet::new();
@@ -136,9 +143,11 @@ fn examine_dir<P: AsRef<Path>>(
if let Some(threads_to_start) = threads {
iter = iter.num_threads(threads_to_start);
}
'entry: for entry in iter {
if let Ok(e) = entry {
let maybe_size_and_inode = get_metadata(&e, apparent_size);
if let Some(dirs) = ignore_directories {
let path = e.path();
let parts = path.components().collect::<Vec<std::path::Component>>();
@@ -154,15 +163,16 @@ fn examine_dir<P: AsRef<Path>>(
}
match maybe_size_and_inode {
Some((size, inode, device)) => {
if !should_ignore_file(apparent_size, filesystems, &mut inodes, inode, device) {
process_file_with_size_and_inode(top_dir, data, e, size)
Some(data) => {
let (size, inode_device) = data;
if !should_ignore_file(apparent_size, filesystems, &mut inodes, inode_device) {
process_file_with_size_and_inode(top_dir, mut_args.data, e, size)
}
}
None => *file_count_no_permission += 1,
None => *mut_args.file_count_no_permission += 1,
}
} else {
*file_count_no_permission += 1
*mut_args.file_count_no_permission += 1
}
}
}
@@ -171,24 +181,29 @@ fn should_ignore_file(
apparent_size: bool,
restricted_filesystems: &Option<HashSet<u64>>,
inodes: &mut HashSet<(u64, u64)>,
inode: u64,
device: u64,
maybe_inode_device: Option<(u64, u64)>,
) -> bool {
// Ignore files on different devices (if flag applied)
if let Some(rs) = restricted_filesystems {
if !rs.contains(&device) {
return true;
}
}
match maybe_inode_device {
None => false,
Some(data) => {
let (inode, device) = data;
// Ignore files on different devices (if flag applied)
if let Some(rs) = restricted_filesystems {
if !rs.contains(&device) {
return true;
}
}
if !apparent_size {
// Ignore files already visited or symlinked
if inodes.contains(&(inode, device)) {
return true;
if !apparent_size {
// Ignore files already visited or symlinked
if inodes.contains(&(inode, device)) {
return true;
}
inodes.insert((inode, device));
}
false
}
inodes.insert((inode, device));
}
false
}
fn process_file_with_size_and_inode<P: AsRef<Path>>(
@@ -352,14 +367,19 @@ mod tests {
let mut files = HashSet::new();
files.insert((10, 20));
assert!(!should_ignore_file(true, &None, &mut files, 0, 0));
assert!(!should_ignore_file(true, &None, &mut files, Some((0, 0))));
// A new file is not yet known: it will be inserted into the hash set and should not be ignored
assert!(!should_ignore_file(false, &None, &mut files, 11, 12));
assert!(!should_ignore_file(
false,
&None,
&mut files,
Some((11, 12))
));
assert!(files.contains(&(11, 12)));
// The same file will be ignored the second time
assert!(should_ignore_file(false, &None, &mut files, 11, 12));
assert!(should_ignore_file(false, &None, &mut files, Some((11, 12))));
}
#[test]
@@ -373,11 +393,11 @@ mod tests {
// If we are looking at a different device (disk) and the device flag is set
// then apparent_size is irrelevant - we ignore files on other devices
assert!(should_ignore_file(false, &od, &mut files, 11, 12));
assert!(should_ignore_file(true, &od, &mut files, 11, 12));
assert!(should_ignore_file(false, &od, &mut files, Some((11, 12))));
assert!(should_ignore_file(true, &od, &mut files, Some((11, 12))));
// We do not ignore files on the same device
assert!(!should_ignore_file(false, &od, &mut files, 2, 99));
assert!(!should_ignore_file(true, &od, &mut files, 2, 99));
assert!(!should_ignore_file(false, &od, &mut files, Some((2, 99))));
assert!(!should_ignore_file(true, &od, &mut files, Some((2, 99))));
}
}
+98 -12
View File
@@ -12,30 +12,116 @@ fn get_block_size() -> u64 {
}
#[cfg(target_family = "unix")]
pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, u64, u64)> {
pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> {
use std::os::unix::fs::MetadataExt;
d.metadata.as_ref().unwrap().as_ref().ok().map(|md| {
if use_apparent_size {
(md.len(), md.ino(), md.dev())
(md.len(), Some((md.ino(), md.dev())))
} else {
(md.blocks() * get_block_size(), md.ino(), md.dev())
(md.blocks() * get_block_size(), Some((md.ino(), md.dev())))
}
})
}
#[cfg(target_family = "windows")]
pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, u64, u64)> {
use winapi_util::file::information;
pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> {
// On windows opening the file to get size, file ID and volume can be very
// expensive because 1) it causes a few system calls, and more importantly 2) it can cause
// windows defender to scan the file.
// Therefore we try to avoid doing that for common cases, mainly those of
// plain files:
// The idea is to make do with the file size that we get from the OS for
// free as part of iterating a folder. Therefore we want to make sure that
// it makes sense to use that free size information:
// Volume boundaries:
// The user can ask us not to cross volume boundaries. If the DirEntry is a
// plain file and not a reparse point or other non-trivial stuff, we assume
// that the file is located on the same volume as the directory that
// contains it.
// File ID:
// This optimization does deprive us of access to a file ID. On this fast
// path we therefore report no inode/device pair at all (`None`), so such
// entries cannot be checked for duplicates or for volume membership.
// Hard links: Unresolved. We don't get inode/file index, so hard links
// count once for each link. Hopefully they are not too commonly in use on
// windows.
// Size:
// We assume (naively?) that for the common cases the free size info is the
// same as one would get by doing the expensive thing. Sparse, encrypted and
// compressed files are not included in the common cases, as one can imagine
// there being more than one view on their size.
// Savings in orders of magnitude in terms of time, io and cpu have been
// observed on hdd, windows 10, some 100Ks files taking up some hundreds of
// GBs:
// Consistently opening the file: 30 minutes.
// With this optimization: 8 sec.
use winapi_util::Handle;
fn handle_from_path_limited<P: AsRef<Path>>(path: P) -> io::Result<Handle> {
use std::fs::OpenOptions;
use std::os::windows::fs::OpenOptionsExt;
const FILE_READ_ATTRIBUTES: u32 = 0x0080;
let h = Handle::from_path_any(d.path()).ok()?;
let info = information(&h).ok()?;
// So, it seems that it does not have to be that expensive to open
// files to get their info: Avoiding opening the file with the full
// GENERIC_READ is key:
Some((
info.file_size(),
info.file_index(),
info.volume_serial_number(),
))
// https://docs.microsoft.com/en-us/windows/win32/secauthz/generic-access-rights:
// "For example, a Windows file object maps the GENERIC_READ bit to the
// READ_CONTROL and SYNCHRONIZE standard access rights and to the
// FILE_READ_DATA, FILE_READ_EA, and FILE_READ_ATTRIBUTES
// object-specific access rights"
// The flag FILE_READ_DATA seems to be the expensive one, so we'll avoid
// that, and most of the other ones, simply because it seems that we
// don't need them.
let file = OpenOptions::new()
.access_mode(FILE_READ_ATTRIBUTES)
.open(path)?;
Ok(Handle::from_file(file))
}
/// Slow path: opens the entry through `handle_from_path_limited` and queries
/// its file information.
///
/// Returns `Some((size, Some((file_index, volume_serial_number))))` on
/// success, or `None` if either opening the handle or querying the
/// information fails.
fn get_metadata_expensive(d: &DirEntry) -> Option<(u64, Option<(u64, u64)>)> {
    use winapi_util::file::information;

    // Any failure to obtain a handle is reported as "no metadata".
    let handle = match handle_from_path_limited(d.path()) {
        Ok(handle) => handle,
        Err(_) => return None,
    };

    match information(&handle) {
        Ok(info) => {
            let size = info.file_size();
            let identity = (info.file_index(), info.volume_serial_number());
            Some((size, Some(identity)))
        }
        Err(_) => None,
    }
}
match d.metadata {
Some(Ok(ref md)) => {
use std::os::windows::fs::MetadataExt;
const FILE_ATTRIBUTE_ARCHIVE: u32 = 0x20u32;
const FILE_ATTRIBUTE_READONLY: u32 = 0x1u32;
const FILE_ATTRIBUTE_HIDDEN: u32 = 0x2u32;
const FILE_ATTRIBUTE_SYSTEM: u32 = 0x4u32;
const FILE_ATTRIBUTE_NORMAL: u32 = 0x80u32;
const FILE_ATTRIBUTE_DIRECTORY: u32 = 0x10u32;
let attr_filtered = md.file_attributes()
& !(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_READONLY | FILE_ATTRIBUTE_SYSTEM);
if attr_filtered == FILE_ATTRIBUTE_ARCHIVE
|| attr_filtered == FILE_ATTRIBUTE_DIRECTORY
|| md.file_attributes() == FILE_ATTRIBUTE_NORMAL
{
Some((md.len(), None))
} else {
get_metadata_expensive(&d)
}
}
_ => get_metadata_expensive(&d),
}
}
#[cfg(target_family = "unix")]