Move everything to ignore instead of jwalk

2026-06-08 11:29:05 +03:00 · 2020-03-22 17:49:43 +00:00
parent f096e82754
commit 1953e107c2
6 changed files with 305 additions and 381 deletions
@@ -1,6 +1,9 @@
 #[macro_use]
 extern crate clap;
+extern crate crossbeam_channel as channel;
+extern crate ignore;
 extern crate unicode_width;
+extern crate walkdir;

 use self::display::draw_it;
 use crate::utils::is_a_parent_of;
@@ -8,7 +11,7 @@ use clap::{App, AppSettings, Arg};
 use std::cmp::max;
 use std::path::PathBuf;
 use terminal_size::{terminal_size, Height, Width};
-use utils::{find_big_ones, get_dir_tree, simplify_dir_names, sort, trim_deep_ones, Node};
+use utils::{find_big_ones, get_dir_tree, simplify_dir_names, sort, Node};

 mod display;
 mod utils;
@@ -137,27 +140,9 @@ fn main() {
        }
    };

-    let temp_threads = options.value_of("threads").and_then(|threads| {
-        threads
-            .parse::<usize>()
-            .map_err(|_| eprintln!("Ignoring bad value for threads: {:?}", threads))
-            .ok()
-    });
-    // Bug in JWalk
-    // https://github.com/jessegrosjean/jwalk/issues/15
-    // We force it to use 2 threads if there is only 1 cpu
-    // as JWalk breaks if it tries to run on a single cpu
-    let threads = {
-        if temp_threads.is_none() && num_cpus::get() == 1 {
-            Some(2)
-        } else {
-            temp_threads
-        }
-    };
-
    let depth = options.value_of("depth").and_then(|depth| {
        depth
-            .parse::<u64>()
+            .parse::<usize>()
            .map(|v| v + 1)
            .map_err(|_| eprintln!("Ignoring bad value for depth"))
            .ok()
@@ -181,13 +166,13 @@ fn main() {
        &ignore_directories,
        use_apparent_size,
        limit_filesystem,
-        threads,
+        depth,
    );
    let sorted_data = sort(nodes);
    let biggest_ones = {
        match depth {
            None => find_big_ones(sorted_data, number_of_lines + simplified_dirs.len()),
-            Some(d) => trim_deep_ones(sorted_data, d, &simplified_dirs),
+            Some(_) => sorted_data,
        }
    };
    let tree = build_tree(biggest_ones, depth);
@@ -202,7 +187,7 @@ fn main() {
    );
 }

-fn build_tree(biggest_ones: Vec<(PathBuf, u64)>, depth: Option<u64>) -> Node {
+fn build_tree(biggest_ones: Vec<(PathBuf, u64)>, depth: Option<usize>) -> Node {
    let mut top_parent = Node::default();

    // assume sorted order
@@ -217,7 +202,7 @@ fn build_tree(biggest_ones: Vec<(PathBuf, u64)>, depth: Option<u64>) -> Node {
    top_parent
 }

-fn recursively_build_tree(parent_node: &mut Node, new_node: Node, depth: Option<u64>) {
+fn recursively_build_tree(parent_node: &mut Node, new_node: Node, depth: Option<usize>) {
    let new_depth = match depth {
        None => None,
        Some(0) => return,
@@ -1,14 +1,22 @@
-use jwalk::DirEntry;
 use std::cmp::Ordering;
 use std::collections::HashMap;
 use std::collections::HashSet;
+use std::iter::FromIterator;
 use std::path::{Path, PathBuf};
+use std::sync::atomic::AtomicBool;

-use jwalk::WalkDir;
+use channel::Receiver;
+use std::thread::JoinHandle;
+
+use ignore::{WalkBuilder, WalkState};
+use std::sync::atomic;
+use std::thread;

 mod platform;
 use self::platform::*;

+type PathData = (PathBuf, u64, Option<(u64, u64)>);
+
 #[derive(Debug, Default, Eq)]
 pub struct Node {
    pub name: PathBuf,
@@ -70,46 +78,121 @@ pub fn simplify_dir_names<P: AsRef<Path>>(filenames: Vec<P>) -> HashSet<PathBuf>
    top_level_names
 }

+/// Builds an `ignore::WalkBuilder` rooted at every path in `top_level_names`.
+///
+/// Symlink following and all of the builder's ignore-file, git and hidden-file
+/// switches are set to `false`. When `limit_filesystem` is set the walk is
+/// asked to stay on the same file system, and `max_depth` is passed through
+/// to the builder unchanged.
+///
+/// # Panics
+/// Panics if `top_level_names` is empty, because the builder needs a first root.
+fn prepare_walk_dir_builder<P: AsRef<Path>>(
+    top_level_names: &HashSet<P>,
+    limit_filesystem: bool,
+    max_depth: Option<usize>,
+) -> WalkBuilder {
+    let mut roots = top_level_names.iter();
+    let first_root = roots.next().unwrap();
+
+    let mut walker = WalkBuilder::new(first_root);
+    walker
+        .follow_links(false)
+        .ignore(false)
+        .git_global(false)
+        .git_ignore(false)
+        .git_exclude(false)
+        .hidden(false);
+
+    if limit_filesystem {
+        walker.same_file_system(true);
+    }
+
+    walker.max_depth(max_depth);
+
+    // Every remaining root is walked by the same builder.
+    for extra_root in roots {
+        walker.add(extra_root);
+    }
+    walker
+}
+
 pub fn get_dir_tree<P: AsRef<Path>>(
    top_level_names: &HashSet<P>,
    ignore_directories: &Option<Vec<PathBuf>>,
    apparent_size: bool,
    limit_filesystem: bool,
-    threads: Option<usize>,
+    max_depth: Option<usize>,
 ) -> (bool, HashMap<PathBuf, u64>) {
-    let mut permissions = 0;
-    let mut data: HashMap<PathBuf, u64> = HashMap::new();
-    let restricted_filesystems = if limit_filesystem {
-        get_allowed_filesystems(top_level_names)
-    } else {
-        None
-    };
+    // Walks every root in `top_level_names` in parallel and returns
+    // `(no_errors, sizes)`: `sizes` maps each visited path and its ancestors
+    // (up to the matching root) to an accumulated size, and `no_errors` is
+    // `false` if any entry or its metadata could not be read.
+    //
+    // Entries whose path contains one of `ignore_directories` as a run of
+    // consecutive components are left out, and their subtree is not walked.
+    //
+    // NOTE(review): `max_depth` is applied to the walk itself, so entries
+    // below that depth are presumably never visited and therefore not added
+    // to their ancestors' totals - confirm this is intended.
+    //
+    // Walker threads only stat entries; the sums are built on a single
+    // reader thread fed through this bounded channel.
+    let (tx, rx) = channel::bounded::<PathData>(1000);

-    let mut examine_dir_args = ExamineDirMutArsg {
-        data: &mut data,
-        file_count_no_permission: &mut permissions,
-    };
-    for b in top_level_names.iter() {
-        examine_dir(
-            b,
-            apparent_size,
-            &restricted_filesystems,
-            ignore_directories,
-            threads,
-            &mut examine_dir_args,
-        );
-    }
-    (permissions == 0, data)
+    let permissions_flag = AtomicBool::new(true);
+
+    let t2 = HashSet::from_iter(top_level_names.iter().map(|p| p.as_ref().to_path_buf()));
+
+    let t = create_reader_thread(rx, t2, apparent_size);
+    let walk_dir_builder = prepare_walk_dir_builder(top_level_names, limit_filesystem, max_depth);
+
+    walk_dir_builder.build_parallel().run(|| {
+        // One sender clone per walker thread.
+        let txc = tx.clone();
+        let pf = &permissions_flag;
+        Box::new(move |entry_result| {
+            let entry = match entry_result {
+                Ok(entry) => entry,
+                Err(_) => {
+                    pf.store(false, atomic::Ordering::Relaxed);
+                    return WalkState::Continue;
+                }
+            };
+
+            if let Some(dirs) = ignore_directories {
+                let parts = entry
+                    .path()
+                    .components()
+                    .collect::<Vec<std::path::Component>>();
+                for d in dirs {
+                    let pattern_len = d.components().count();
+                    // `windows(0)` panics; an empty pattern cannot match anything.
+                    if pattern_len == 0 {
+                        continue;
+                    }
+                    if parts
+                        .windows(pattern_len)
+                        .any(|window| window.iter().collect::<PathBuf>() == *d)
+                    {
+                        // Skip prunes the whole subtree instead of visiting
+                        // every child only to filter it out again.
+                        return WalkState::Skip;
+                    }
+                }
+            }
+
+            match get_metadata(&entry, apparent_size) {
+                Some((size, inode_device)) => {
+                    txc.send((entry.into_path(), size, inode_device)).unwrap();
+                }
+                None => {
+                    pf.store(false, atomic::Ordering::Relaxed);
+                }
+            }
+            WalkState::Continue
+        })
+    });
+
+    // Dropping the last sender lets the reader thread's loop end.
+    drop(tx);
+    let data = t.join().unwrap();
+    (permissions_flag.load(atomic::Ordering::SeqCst), data)
 }

-fn get_allowed_filesystems<P: AsRef<Path>>(top_level_names: &HashSet<P>) -> Option<HashSet<u64>> {
-    let mut limit_filesystems: HashSet<u64> = HashSet::new();
-    for file_name in top_level_names.iter() {
-        if let Ok(a) = get_filesystem(file_name) {
-            limit_filesystems.insert(a);
+/// Spawns the thread that turns the stream of `(path, size, inode/device)`
+/// records from `rx` into a map of accumulated sizes.
+///
+/// Records rejected by `should_ignore_file` are dropped. For every other
+/// record the size is added to the path itself and to each ancestor, stopping
+/// after the first ancestor found in `top_level_names`. The map is returned
+/// through the join handle once the channel has no more records.
+fn create_reader_thread(
+    rx: Receiver<PathData>,
+    top_level_names: HashSet<PathBuf>,
+    apparent_size: bool,
+) -> JoinHandle<HashMap<PathBuf, u64>> {
+    thread::spawn(move || {
+        let mut totals: HashMap<PathBuf, u64> = HashMap::new();
+        let mut seen_inodes: HashSet<(u64, u64)> = HashSet::new();
+
+        for (path, size, maybe_inode_device) in rx {
+            if should_ignore_file(apparent_size, &mut seen_inodes, maybe_inode_device) {
+                continue;
+            }
+            for ancestor in path.ancestors() {
+                *totals.entry(ancestor.to_path_buf()).or_insert(0) += size;
+
+                // Do not count anything above the root this entry belongs to.
+                if top_level_names.contains(ancestor) {
+                    break;
+                }
+            }
         }
-    }
-    Some(limit_filesystems)
+        totals
+    })
 }

 pub fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
@@ -122,64 +205,8 @@ pub fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
    path.as_ref().components().collect::<PathBuf>()
 }

-struct ExamineDirMutArsg<'a> {
-    data: &'a mut HashMap<PathBuf, u64>,
-    file_count_no_permission: &'a mut u64,
-}
-
-fn examine_dir<P: AsRef<Path>>(
-    top_dir: P,
-    apparent_size: bool,
-    filesystems: &Option<HashSet<u64>>,
-    ignore_directories: &Option<Vec<PathBuf>>,
-    threads: Option<usize>,
-    mut_args: &mut ExamineDirMutArsg,
-) {
-    let top_dir = top_dir.as_ref();
-    let mut inodes: HashSet<(u64, u64)> = HashSet::new();
-    let mut iter = WalkDir::new(top_dir)
-        .preload_metadata(true)
-        .skip_hidden(false);
-    if let Some(threads_to_start) = threads {
-        iter = iter.num_threads(threads_to_start);
-    }
-
-    'entry: for entry in iter {
-        if let Ok(e) = entry {
-            let maybe_size_and_inode = get_metadata(&e, apparent_size);
-
-            if let Some(dirs) = ignore_directories {
-                let path = e.path();
-                let parts = path.components().collect::<Vec<std::path::Component>>();
-                for d in dirs {
-                    let seq = d.components().collect::<Vec<std::path::Component>>();
-                    if parts
-                        .windows(seq.len())
-                        .any(|window| window.iter().collect::<PathBuf>() == *d)
-                    {
-                        continue 'entry;
-                    }
-                }
-            }
-
-            match maybe_size_and_inode {
-                Some(data) => {
-                    let (size, inode_device) = data;
-                    if !should_ignore_file(apparent_size, filesystems, &mut inodes, inode_device) {
-                        process_file_with_size_and_inode(top_dir, mut_args.data, e, size)
-                    }
-                }
-                None => *mut_args.file_count_no_permission += 1,
-            }
-        } else {
-            *mut_args.file_count_no_permission += 1
-        }
-    }
-}
-
 fn should_ignore_file(
    apparent_size: bool,
-    restricted_filesystems: &Option<HashSet<u64>>,
    inodes: &mut HashSet<(u64, u64)>,
    maybe_inode_device: Option<(u64, u64)>,
 ) -> bool {
@@ -187,13 +214,6 @@ fn should_ignore_file(
        None => false,
        Some(data) => {
            let (inode, device) = data;
-            // Ignore files on different devices (if flag applied)
-            if let Some(rs) = restricted_filesystems {
-                if !rs.contains(&device) {
-                    return true;
-                }
-            }
-
            if !apparent_size {
                // Ignore files already visited or symlinked
                if inodes.contains(&(inode, device)) {
@@ -206,28 +226,6 @@ fn should_ignore_file(
    }
 }

-fn process_file_with_size_and_inode<P: AsRef<Path>>(
-    top_dir: P,
-    data: &mut HashMap<PathBuf, u64>,
-    e: DirEntry,
-    size: u64,
-) {
-    let top_dir = top_dir.as_ref();
-    // This path and all its parent paths have their counter incremented
-    for path in e.path().ancestors() {
-        // This is required due to bug in Jwalk that adds '/' to all sub dir lists
-        // see: https://github.com/jessegrosjean/jwalk/issues/13
-        if path.to_string_lossy() == "/" && top_dir.to_string_lossy() != "/" {
-            continue;
-        }
-        let s = data.entry(normalize_path(path)).or_insert(0);
-        *s += size;
-        if path.starts_with(top_dir) && top_dir.starts_with(path) {
-            break;
-        }
-    }
-}
-
 pub fn sort_by_size_first_name_second(a: &(PathBuf, u64), b: &(PathBuf, u64)) -> Ordering {
    let result = b.1.cmp(&a.1);
    if result == Ordering::Equal {
@@ -251,36 +249,6 @@ pub fn find_big_ones(new_l: Vec<(PathBuf, u64)>, max_to_show: usize) -> Vec<(Pat
    }
 }

-fn depth_of_path(name: &PathBuf) -> usize {
-    // Filter required as paths can have some odd preliminary
-    // ("Prefix") bits (for example, from windows, "\\?\" or "\\UNC\")
-    name.components()
-        .filter(|&c| match c {
-            std::path::Component::Prefix(_) => false,
-            _ => true,
-        })
-        .count()
-}
-
-pub fn trim_deep_ones(
-    input: Vec<(PathBuf, u64)>,
-    max_depth: u64,
-    top_level_names: &HashSet<PathBuf>,
-) -> Vec<(PathBuf, u64)> {
-    let mut result: Vec<(PathBuf, u64)> = Vec::with_capacity(input.len() * top_level_names.len());
-
-    for name in top_level_names {
-        let my_max_depth = depth_of_path(name) + max_depth as usize;
-
-        for &(ref k, ref v) in input.iter() {
-            if k.starts_with(name) && depth_of_path(k) <= my_max_depth {
-                result.push((k.clone(), *v));
-            }
-        }
-    }
-    result
-}
-
 mod tests {
    #[allow(unused_imports)]
    use super::*;
@@ -367,19 +335,14 @@ mod tests {
        let mut files = HashSet::new();
        files.insert((10, 20));

-        assert!(!should_ignore_file(true, &None, &mut files, Some((0, 0))));
+        assert!(!should_ignore_file(true, &mut files, Some((0, 0))));

        // New file is not known it will be inserted to the hashmp and should not be ignored
-        assert!(!should_ignore_file(
-            false,
-            &None,
-            &mut files,
-            Some((11, 12))
-        ));
+        assert!(!should_ignore_file(false, &mut files, Some((11, 12))));
        assert!(files.contains(&(11, 12)));

        // The same file will be ignored the second time
-        assert!(should_ignore_file(false, &None, &mut files, Some((11, 12))));
+        assert!(should_ignore_file(false, &mut files, Some((11, 12))));
    }

    #[test]
@@ -387,17 +350,8 @@ mod tests {
        let mut files = HashSet::new();
        files.insert((10, 20));

-        let mut devices = HashSet::new();
-        devices.insert(99);
-        let od = Some(devices);
-
-        // If we are looking at a different device (disk) and the device flag is set
-        // then apparent_size is irrelevant - we ignore files on other devices
-        assert!(should_ignore_file(false, &od, &mut files, Some((11, 12))));
-        assert!(should_ignore_file(true, &od, &mut files, Some((11, 12))));
-
        // We do not ignore files on the same device
-        assert!(!should_ignore_file(false, &od, &mut files, Some((2, 99))));
-        assert!(!should_ignore_file(true, &od, &mut files, Some((2, 99))));
+        assert!(!should_ignore_file(false, &mut files, Some((2, 99))));
+        assert!(!should_ignore_file(true, &mut files, Some((2, 99))));
    }
 }
@@ -1,8 +1,6 @@
-use jwalk::DirEntry;
+use ignore::DirEntry;
 #[allow(unused_imports)]
 use std::fs;
-use std::io;
-use std::path::Path;

 #[cfg(target_family = "unix")]
 fn get_block_size() -> u64 {
@@ -14,13 +12,12 @@ fn get_block_size() -> u64 {
 #[cfg(target_family = "unix")]
 pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> {
    use std::os::unix::fs::MetadataExt;
-    d.metadata.as_ref().unwrap().as_ref().ok().map(|md| {
-        if use_apparent_size {
-            (md.len(), Some((md.ino(), md.dev())))
-        } else {
-            (md.blocks() * get_block_size(), Some((md.ino(), md.dev())))
-        }
-    })
+    let md = d.metadata().unwrap();
+    if use_apparent_size {
+        Some((md.len(), Some((md.ino(), md.dev()))))
+    } else {
+        Some((md.blocks() * get_block_size(), Some((md.ino(), md.dev()))))
+    }
 }

 #[cfg(target_family = "windows")]
@@ -61,6 +58,8 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti
    // Consistently opening the file: 30 minutes.
    // With this optimization:         8 sec.

+    use std::io;
+    use std::path::Path;
    use winapi_util::Handle;
    fn handle_from_path_limited<P: AsRef<Path>>(path: P) -> io::Result<Handle> {
        use std::fs::OpenOptions;
@@ -99,9 +98,9 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti
        ))
    }

-    match d.metadata {
-        Some(Ok(ref md)) => {
-            use std::os::windows::fs::MetadataExt;
+    use std::os::windows::fs::MetadataExt;
+    match d.metadata() {
+        Ok(ref md) => {
            const FILE_ATTRIBUTE_ARCHIVE: u32 = 0x20u32;
            const FILE_ATTRIBUTE_READONLY: u32 = 0x1u32;
            const FILE_ATTRIBUTE_HIDDEN: u32 = 0x2u32;
@@ -123,20 +122,3 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti
        _ => get_metadata_expensive(&d),
    }
 }
-
-#[cfg(target_family = "unix")]
-pub fn get_filesystem<P: AsRef<Path>>(file_path: P) -> Result<u64, io::Error> {
-    use std::os::unix::fs::MetadataExt;
-    let metadata = fs::metadata(file_path)?;
-    Ok(metadata.dev())
-}
-
-#[cfg(target_family = "windows")]
-pub fn get_filesystem<P: AsRef<Path>>(file_path: P) -> Result<u64, io::Error> {
-    use winapi_util::file::information;
-    use winapi_util::Handle;
-
-    let h = Handle::from_path_any(file_path)?;
-    let info = information(&h)?;
-    Ok(info.volume_serial_number())
-}