Move everything to the `ignore` crate instead of jwalk

This commit is contained in:
andy.boot
2020-03-22 17:49:43 +00:00
parent f096e82754
commit 1953e107c2
6 changed files with 305 additions and 381 deletions
+9 -24
View File
@@ -1,6 +1,9 @@
#[macro_use]
extern crate clap;
extern crate crossbeam_channel as channel;
extern crate ignore;
extern crate unicode_width;
extern crate walkdir;
use self::display::draw_it;
use crate::utils::is_a_parent_of;
@@ -8,7 +11,7 @@ use clap::{App, AppSettings, Arg};
use std::cmp::max;
use std::path::PathBuf;
use terminal_size::{terminal_size, Height, Width};
use utils::{find_big_ones, get_dir_tree, simplify_dir_names, sort, trim_deep_ones, Node};
use utils::{find_big_ones, get_dir_tree, simplify_dir_names, sort, Node};
mod display;
mod utils;
@@ -137,27 +140,9 @@ fn main() {
}
};
let temp_threads = options.value_of("threads").and_then(|threads| {
threads
.parse::<usize>()
.map_err(|_| eprintln!("Ignoring bad value for threads: {:?}", threads))
.ok()
});
// Bug in JWalk
// https://github.com/jessegrosjean/jwalk/issues/15
// We force it to use 2 threads if there is only 1 cpu
// as JWalk breaks if it tries to run on a single cpu
let threads = {
if temp_threads.is_none() && num_cpus::get() == 1 {
Some(2)
} else {
temp_threads
}
};
let depth = options.value_of("depth").and_then(|depth| {
depth
.parse::<u64>()
.parse::<usize>()
.map(|v| v + 1)
.map_err(|_| eprintln!("Ignoring bad value for depth"))
.ok()
@@ -181,13 +166,13 @@ fn main() {
&ignore_directories,
use_apparent_size,
limit_filesystem,
threads,
depth,
);
let sorted_data = sort(nodes);
let biggest_ones = {
match depth {
None => find_big_ones(sorted_data, number_of_lines + simplified_dirs.len()),
Some(d) => trim_deep_ones(sorted_data, d, &simplified_dirs),
Some(_) => sorted_data,
}
};
let tree = build_tree(biggest_ones, depth);
@@ -202,7 +187,7 @@ fn main() {
);
}
fn build_tree(biggest_ones: Vec<(PathBuf, u64)>, depth: Option<u64>) -> Node {
fn build_tree(biggest_ones: Vec<(PathBuf, u64)>, depth: Option<usize>) -> Node {
let mut top_parent = Node::default();
// assume sorted order
@@ -217,7 +202,7 @@ fn build_tree(biggest_ones: Vec<(PathBuf, u64)>, depth: Option<u64>) -> Node {
top_parent
}
fn recursively_build_tree(parent_node: &mut Node, new_node: Node, depth: Option<u64>) {
fn recursively_build_tree(parent_node: &mut Node, new_node: Node, depth: Option<usize>) {
let new_depth = match depth {
None => None,
Some(0) => return,
+120 -166
View File
@@ -1,14 +1,22 @@
use jwalk::DirEntry;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::collections::HashSet;
use std::iter::FromIterator;
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicBool;
use jwalk::WalkDir;
use channel::Receiver;
use std::thread::JoinHandle;
use ignore::{WalkBuilder, WalkState};
use std::sync::atomic;
use std::thread;
mod platform;
use self::platform::*;
type PathData = (PathBuf, u64, Option<(u64, u64)>);
#[derive(Debug, Default, Eq)]
pub struct Node {
pub name: PathBuf,
@@ -70,46 +78,121 @@ pub fn simplify_dir_names<P: AsRef<Path>>(filenames: Vec<P>) -> HashSet<PathBuf>
top_level_names
}
/// Builds an `ignore::WalkBuilder` that walks every path in `top_level_names`.
///
/// Symlink following, ignore-file handling, the git ignore rules and
/// hidden-file filtering are all explicitly switched off on the builder.
///
/// * `limit_filesystem` - when `true`, `same_file_system(true)` is set on the builder.
/// * `max_depth` - forwarded unchanged to `WalkBuilder::max_depth`.
///
/// # Panics
///
/// Panics if `top_level_names` is empty, because the builder needs at least
/// one starting path.
fn prepare_walk_dir_builder<P: AsRef<Path>>(
    top_level_names: &HashSet<P>,
    limit_filesystem: bool,
    max_depth: Option<usize>,
) -> WalkBuilder {
    let mut roots = top_level_names.iter();
    // The first root seeds the builder; the remaining ones are added below.
    let first_root = roots.next().unwrap();
    let mut walker = WalkBuilder::new(first_root);
    walker
        .follow_links(false)
        .ignore(false)
        .git_global(false)
        .git_ignore(false)
        .git_exclude(false)
        .hidden(false)
        .max_depth(max_depth);
    if limit_filesystem {
        walker.same_file_system(true);
    }
    for extra_root in roots {
        walker.add(extra_root);
    }
    walker
}
pub fn get_dir_tree<P: AsRef<Path>>(
top_level_names: &HashSet<P>,
ignore_directories: &Option<Vec<PathBuf>>,
apparent_size: bool,
limit_filesystem: bool,
threads: Option<usize>,
max_depth: Option<usize>,
) -> (bool, HashMap<PathBuf, u64>) {
let mut permissions = 0;
let mut data: HashMap<PathBuf, u64> = HashMap::new();
let restricted_filesystems = if limit_filesystem {
get_allowed_filesystems(top_level_names)
} else {
None
};
let (tx, rx) = channel::bounded::<PathData>(1000);
let mut examine_dir_args = ExamineDirMutArsg {
data: &mut data,
file_count_no_permission: &mut permissions,
};
for b in top_level_names.iter() {
examine_dir(
b,
apparent_size,
&restricted_filesystems,
ignore_directories,
threads,
&mut examine_dir_args,
);
}
(permissions == 0, data)
let permissions_flag = AtomicBool::new(true);
let t2 = HashSet::from_iter(top_level_names.iter().map(|p| p.as_ref().to_path_buf()));
let t = create_reader_thread(rx, t2, apparent_size);
let walk_dir_builder = prepare_walk_dir_builder(top_level_names, limit_filesystem, max_depth);
walk_dir_builder.build_parallel().run(|| {
let txc = tx.clone();
let pf = &permissions_flag;
Box::new(move |path| {
match path {
Ok(p) => {
if let Some(dirs) = ignore_directories {
let path = p.path();
let parts = path.components().collect::<Vec<std::path::Component>>();
for d in dirs {
let seq = d.components().collect::<Vec<std::path::Component>>();
if parts
.windows(seq.len())
.any(|window| window.iter().collect::<PathBuf>() == *d)
{
return WalkState::Continue;
}
}
}
let maybe_size_and_inode = get_metadata(&p, apparent_size);
match maybe_size_and_inode {
Some(data) => {
let (size, inode_device) = data;
txc.send((p.into_path(), size, inode_device)).unwrap();
}
None => {
pf.store(false, atomic::Ordering::Relaxed);
}
}
}
Err(_) => {
pf.store(false, atomic::Ordering::Relaxed);
}
};
WalkState::Continue
})
});
drop(tx);
let data = t.join().unwrap();
(permissions_flag.load(atomic::Ordering::SeqCst), data)
}
fn get_allowed_filesystems<P: AsRef<Path>>(top_level_names: &HashSet<P>) -> Option<HashSet<u64>> {
let mut limit_filesystems: HashSet<u64> = HashSet::new();
for file_name in top_level_names.iter() {
if let Ok(a) = get_filesystem(file_name) {
limit_filesystems.insert(a);
fn create_reader_thread(
rx: Receiver<PathData>,
top_level_names: HashSet<PathBuf>,
apparent_size: bool,
) -> JoinHandle<HashMap<PathBuf, u64>> {
// Receiver thread
thread::spawn(move || {
let mut hash: HashMap<PathBuf, u64> = HashMap::new();
let mut inodes: HashSet<(u64, u64)> = HashSet::new();
for dent in rx {
let (path, size, maybe_inode_device) = dent;
if should_ignore_file(apparent_size, &mut inodes, maybe_inode_device) {
continue;
} else {
for p in path.ancestors() {
let s = hash.entry(p.to_path_buf()).or_insert(0);
*s += size;
if top_level_names.contains(p) {
break;
}
}
}
}
}
Some(limit_filesystems)
hash
})
}
pub fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
@@ -122,64 +205,8 @@ pub fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
path.as_ref().components().collect::<PathBuf>()
}
/// Mutable state that `examine_dir` updates while it walks a directory.
struct ExamineDirMutArsg<'a> {
    /// Size totals keyed by path; `examine_dir` hands this map to
    /// `process_file_with_size_and_inode` for every file it counts.
    data: &'a mut HashMap<PathBuf, u64>,
    /// Incremented by `examine_dir` for each entry that is an error or for
    /// which `get_metadata` returns `None`.
    file_count_no_permission: &'a mut u64,
}
/// Walks `top_dir` with jwalk and records the size of every entry found.
///
/// * `apparent_size` - forwarded to `get_metadata` and `should_ignore_file`.
/// * `filesystems` - forwarded to `should_ignore_file`.
/// * `ignore_directories` - an entry is skipped when any consecutive run of
///   its path components equals one of these paths.
/// * `threads` - when set, the number of threads jwalk is asked to use.
/// * `mut_args` - receives the size totals and the count of unreadable entries.
fn examine_dir<P: AsRef<Path>>(
    top_dir: P,
    apparent_size: bool,
    filesystems: &Option<HashSet<u64>>,
    ignore_directories: &Option<Vec<PathBuf>>,
    threads: Option<usize>,
    mut_args: &mut ExamineDirMutArsg,
) {
    let top_dir = top_dir.as_ref();
    let mut seen_inodes: HashSet<(u64, u64)> = HashSet::new();

    let base_walker = WalkDir::new(top_dir)
        .preload_metadata(true)
        .skip_hidden(false);
    let walker = match threads {
        Some(threads_to_start) => base_walker.num_threads(threads_to_start),
        None => base_walker,
    };

    for entry in walker {
        // Entries the walker could not read count as permission failures.
        let e = match entry {
            Ok(e) => e,
            Err(_) => {
                *mut_args.file_count_no_permission += 1;
                continue;
            }
        };

        let maybe_size_and_inode = get_metadata(&e, apparent_size);

        // Skip the entry if any window of its components matches an ignored path.
        let is_ignored = ignore_directories.as_ref().map_or(false, |dirs| {
            let path = e.path();
            let parts = path.components().collect::<Vec<std::path::Component>>();
            dirs.iter().any(|dir| {
                let dir_len = dir.components().count();
                parts
                    .windows(dir_len)
                    .any(|window| window.iter().collect::<PathBuf>() == *dir)
            })
        });
        if is_ignored {
            continue;
        }

        match maybe_size_and_inode {
            Some((size, inode_device)) => {
                let skip =
                    should_ignore_file(apparent_size, filesystems, &mut seen_inodes, inode_device);
                if !skip {
                    process_file_with_size_and_inode(top_dir, mut_args.data, e, size)
                }
            }
            None => *mut_args.file_count_no_permission += 1,
        }
    }
}
fn should_ignore_file(
apparent_size: bool,
restricted_filesystems: &Option<HashSet<u64>>,
inodes: &mut HashSet<(u64, u64)>,
maybe_inode_device: Option<(u64, u64)>,
) -> bool {
@@ -187,13 +214,6 @@ fn should_ignore_file(
None => false,
Some(data) => {
let (inode, device) = data;
// Ignore files on different devices (if flag applied)
if let Some(rs) = restricted_filesystems {
if !rs.contains(&device) {
return true;
}
}
if !apparent_size {
// Ignore files already visited or symlinked
if inodes.contains(&(inode, device)) {
@@ -206,28 +226,6 @@ fn should_ignore_file(
}
}
/// Adds `size` to the running total of the entry's path and of each of its
/// ancestors, stopping once `top_dir` itself has been updated.
///
/// Keys are stored in `data` after passing through `normalize_path`.
fn process_file_with_size_and_inode<P: AsRef<Path>>(
    top_dir: P,
    data: &mut HashMap<PathBuf, u64>,
    e: DirEntry,
    size: u64,
) {
    let top_dir = top_dir.as_ref();
    let top_dir_is_root = top_dir.to_string_lossy() == "/";
    let entry_path = e.path();

    for ancestor in entry_path.ancestors() {
        // This is required due to bug in Jwalk that adds '/' to all sub dir lists
        // see: https://github.com/jessegrosjean/jwalk/issues/13
        let is_spurious_root = !top_dir_is_root && ancestor.to_string_lossy() == "/";
        if is_spurious_root {
            continue;
        }

        *data.entry(normalize_path(ancestor)).or_insert(0) += size;

        // Each path is a prefix of the other, so `ancestor` is `top_dir`: stop climbing.
        if ancestor.starts_with(top_dir) && top_dir.starts_with(ancestor) {
            break;
        }
    }
}
pub fn sort_by_size_first_name_second(a: &(PathBuf, u64), b: &(PathBuf, u64)) -> Ordering {
let result = b.1.cmp(&a.1);
if result == Ordering::Equal {
@@ -251,36 +249,6 @@ pub fn find_big_ones(new_l: Vec<(PathBuf, u64)>, max_to_show: usize) -> Vec<(Pat
}
}
/// Returns the number of components in `name`, not counting any
/// `Component::Prefix`.
///
/// Takes `&Path` rather than `&PathBuf`, so both borrowed paths and
/// `&PathBuf` (via deref coercion) are accepted.
fn depth_of_path(name: &Path) -> usize {
    // Filter required as paths can have some odd preliminary
    // ("Prefix") bits (for example, from windows, "\\?\" or "\\UNC\")
    name.components()
        .filter(|component| {
            if let std::path::Component::Prefix(_) = component {
                false
            } else {
                true
            }
        })
        .count()
}
/// Keeps only the entries of `input` that lie under one of `top_level_names`
/// and are at most `max_depth` components deeper than that name.
///
/// Results are grouped per top-level name, in the set's iteration order. An
/// entry that lies under several top-level names is emitted once for each.
pub fn trim_deep_ones(
    input: Vec<(PathBuf, u64)>,
    max_depth: u64,
    top_level_names: &HashSet<PathBuf>,
) -> Vec<(PathBuf, u64)> {
    let mut kept: Vec<(PathBuf, u64)> =
        Vec::with_capacity(input.len() * top_level_names.len());
    for root in top_level_names {
        let deepest_allowed = depth_of_path(root) + max_depth as usize;
        kept.extend(
            input
                .iter()
                .filter(|entry| {
                    entry.0.starts_with(root) && depth_of_path(&entry.0) <= deepest_allowed
                })
                .cloned(),
        );
    }
    kept
}
mod tests {
#[allow(unused_imports)]
use super::*;
@@ -367,19 +335,14 @@ mod tests {
let mut files = HashSet::new();
files.insert((10, 20));
assert!(!should_ignore_file(true, &None, &mut files, Some((0, 0))));
assert!(!should_ignore_file(true, &mut files, Some((0, 0))));
// A new file is not yet known: it will be inserted into the hash set and should not be ignored
assert!(!should_ignore_file(
false,
&None,
&mut files,
Some((11, 12))
));
assert!(!should_ignore_file(false, &mut files, Some((11, 12))));
assert!(files.contains(&(11, 12)));
// The same file will be ignored the second time
assert!(should_ignore_file(false, &None, &mut files, Some((11, 12))));
assert!(should_ignore_file(false, &mut files, Some((11, 12))));
}
#[test]
@@ -387,17 +350,8 @@ mod tests {
let mut files = HashSet::new();
files.insert((10, 20));
let mut devices = HashSet::new();
devices.insert(99);
let od = Some(devices);
// If we are looking at a different device (disk) and the device flag is set
// then apparent_size is irrelevant - we ignore files on other devices
assert!(should_ignore_file(false, &od, &mut files, Some((11, 12))));
assert!(should_ignore_file(true, &od, &mut files, Some((11, 12))));
// We do not ignore files on the same device
assert!(!should_ignore_file(false, &od, &mut files, Some((2, 99))));
assert!(!should_ignore_file(true, &od, &mut files, Some((2, 99))));
assert!(!should_ignore_file(false, &mut files, Some((2, 99))));
assert!(!should_ignore_file(true, &mut files, Some((2, 99))));
}
}
+12 -30
View File
@@ -1,8 +1,6 @@
use jwalk::DirEntry;
use ignore::DirEntry;
#[allow(unused_imports)]
use std::fs;
use std::io;
use std::path::Path;
#[cfg(target_family = "unix")]
fn get_block_size() -> u64 {
@@ -14,13 +12,12 @@ fn get_block_size() -> u64 {
#[cfg(target_family = "unix")]
pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> {
use std::os::unix::fs::MetadataExt;
d.metadata.as_ref().unwrap().as_ref().ok().map(|md| {
if use_apparent_size {
(md.len(), Some((md.ino(), md.dev())))
} else {
(md.blocks() * get_block_size(), Some((md.ino(), md.dev())))
}
})
let md = d.metadata().unwrap();
if use_apparent_size {
Some((md.len(), Some((md.ino(), md.dev()))))
} else {
Some((md.blocks() * get_block_size(), Some((md.ino(), md.dev()))))
}
}
#[cfg(target_family = "windows")]
@@ -61,6 +58,8 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti
// Consistently opening the file: 30 minutes.
// With this optimization: 8 sec.
use std::io;
use std::path::Path;
use winapi_util::Handle;
fn handle_from_path_limited<P: AsRef<Path>>(path: P) -> io::Result<Handle> {
use std::fs::OpenOptions;
@@ -99,9 +98,9 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti
))
}
match d.metadata {
Some(Ok(ref md)) => {
use std::os::windows::fs::MetadataExt;
use std::os::windows::fs::MetadataExt;
match d.metadata() {
Ok(ref md) => {
const FILE_ATTRIBUTE_ARCHIVE: u32 = 0x20u32;
const FILE_ATTRIBUTE_READONLY: u32 = 0x1u32;
const FILE_ATTRIBUTE_HIDDEN: u32 = 0x2u32;
@@ -123,20 +122,3 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti
_ => get_metadata_expensive(&d),
}
}
/// Returns the ID of the device that contains `file_path`.
///
/// # Errors
///
/// Returns the `io::Error` from `fs::metadata` if the metadata cannot be read.
#[cfg(target_family = "unix")]
pub fn get_filesystem<P: AsRef<Path>>(file_path: P) -> Result<u64, io::Error> {
    use std::os::unix::fs::MetadataExt;
    fs::metadata(file_path).map(|meta| meta.dev())
}
/// Returns the volume serial number reported for `file_path`.
///
/// # Errors
///
/// Returns the `io::Error` produced when opening a handle to the path or when
/// querying the handle's file information.
#[cfg(target_family = "windows")]
pub fn get_filesystem<P: AsRef<Path>>(file_path: P) -> Result<u64, io::Error> {
    use winapi_util::file::information;
    use winapi_util::Handle;

    Handle::from_path_any(file_path)
        .and_then(|handle| information(&handle))
        .map(|info| info.volume_serial_number())
}