Large refactor. Use rayon, 10X performance boost

Code changes:
Removed ignore & channel crates. Using a single reciever thread to build
a hashmap to prevend duplicate inodes being reported gave a severe
performance penalty

Using rayon crate with some hand crafted file traversal has improved
performance aprox 10X

Behaviour changes:

Removed parameter 'limit by filesystem' - don't think this is used, and
I only added it as it was easy to add with the ignore crate.

Sym links will now not appear in the output tree unless using '-s'
'apparent-size' flag

Change behaviour of multiple args so that it unifies them and
compares them under one tree instead of treating them
individually: https://github.com/bootandy/dust/issues/136
This commit is contained in:
andy.boot
2021-06-15 11:23:50 +01:00
parent c4a73d5921
commit 00c7ce8f15
14 changed files with 700 additions and 630 deletions
+176
View File
@@ -0,0 +1,176 @@
use std::fs;
use crate::node::Node;
use rayon::iter::ParallelBridge;
use rayon::prelude::ParallelIterator;
use std::path::PathBuf;
use std::sync::atomic;
use std::sync::atomic::AtomicBool;
use std::collections::HashSet;
use crate::node::build_node;
use std::fs::DirEntry;
pub fn walk_it(
dirs: HashSet<PathBuf>,
ignore_directories: HashSet<PathBuf>,
use_apparent_size: bool,
by_filecount: bool,
ignore_hidden: bool,
) -> (Vec<Node>, bool) {
let permissions_flag = AtomicBool::new(false);
let top_level_nodes: Vec<_> = dirs
.into_iter()
.filter_map(|d| {
let n = walk(
d,
&permissions_flag,
&ignore_directories,
use_apparent_size,
by_filecount,
ignore_hidden,
);
match n {
Some(n) => {
let mut inodes: HashSet<(u64, u64)> = HashSet::new();
clean_inodes(n, &mut inodes, use_apparent_size)
}
None => None,
}
})
.collect();
(top_level_nodes, permissions_flag.into_inner())
}
// Remove files which have the same inode, we don't want to double count them.
fn clean_inodes(
x: Node,
inodes: &mut HashSet<(u64, u64)>,
use_apparent_size: bool,
) -> Option<Node> {
if !use_apparent_size {
if let Some(id) = x.inode_device {
if inodes.contains(&id) {
return None;
}
inodes.insert(id);
}
}
let new_children: Vec<_> = x
.children
.into_iter()
.filter_map(|c| clean_inodes(c, inodes, use_apparent_size))
.collect();
return Some(Node {
name: x.name,
size: x.size + new_children.iter().map(|c| c.size).sum::<u64>(),
children: new_children,
inode_device: x.inode_device,
});
}
fn ignore_file(
entry: &DirEntry,
ignore_hidden: bool,
ignore_directories: &HashSet<PathBuf>,
) -> bool {
let is_dot_file = entry.file_name().to_str().unwrap_or("").starts_with('.');
let is_ignored_path = ignore_directories.contains(&entry.path());
(is_dot_file && ignore_hidden) || is_ignored_path
}
fn walk(
dir: PathBuf,
permissions_flag: &AtomicBool,
ignore_directories: &HashSet<PathBuf>,
use_apparent_size: bool,
by_filecount: bool,
ignore_hidden: bool,
) -> Option<Node> {
let mut children = vec![];
if let Ok(entries) = fs::read_dir(dir.clone()) {
children = entries
.into_iter()
.par_bridge()
.filter_map(|entry| {
if let Ok(ref entry) = entry {
// uncommenting the below line gives simpler code but
// rayon doesn't parallelise as well giving a 3X performance drop
// hence we unravel the recursion a bit
// return walk(entry.path(), permissions_flag, ignore_directories, use_apparent_size, by_filecount, ignore_hidden);
if !ignore_file(&entry, ignore_hidden, &ignore_directories) {
if let Ok(data) = entry.file_type() {
if data.is_dir() && !data.is_symlink() {
return walk(
entry.path(),
permissions_flag,
ignore_directories,
use_apparent_size,
by_filecount,
ignore_hidden,
);
}
return build_node(
entry.path(),
vec![],
use_apparent_size,
by_filecount,
);
}
}
} else {
permissions_flag.store(true, atomic::Ordering::Relaxed);
}
None
})
.collect();
} else {
permissions_flag.store(true, atomic::Ordering::Relaxed);
}
build_node(dir, children, use_apparent_size, by_filecount)
}
mod tests {
#[allow(unused_imports)]
use super::*;
#[cfg(test)]
fn create_node() -> Node {
Node {
name: PathBuf::new(),
size: 10,
children: vec![],
inode_device: Some((5, 6)),
}
}
#[test]
fn test_should_ignore_file() {
let mut inodes = HashSet::new();
let n = create_node();
// First time we insert the node
assert!(clean_inodes(n.clone(), &mut inodes, false) == Some(n.clone()));
// Second time is a duplicate - we ignore it
assert!(clean_inodes(n.clone(), &mut inodes, false) == None);
}
#[test]
fn test_should_not_ignore_files_if_using_apparent_size() {
let mut inodes = HashSet::new();
let n = create_node();
// If using apparent size we include Nodes, even if duplicate inodes
assert!(clean_inodes(n.clone(), &mut inodes, true) == Some(n.clone()));
assert!(clean_inodes(n.clone(), &mut inodes, true) == Some(n.clone()));
}
}
+48 -40
View File
@@ -1,6 +1,6 @@
extern crate ansi_term;
use crate::utils::{Errors, Node};
use crate::display_node::DisplayNode;
use self::ansi_term::Colour::Red;
use lscolors::{LsColors, Style};
@@ -60,7 +60,7 @@ impl DisplayData {
}
}
fn percent_size(&self, node: &Node) -> f32 {
fn percent_size(&self, node: &DisplayNode) -> f32 {
let result = node.size as f32 / self.base_size as f32;
if result.is_normal() {
result
@@ -83,7 +83,7 @@ impl DrawData<'_> {
}
// TODO: can we test this?
fn generate_bar(&self, node: &Node, level: usize) -> String {
fn generate_bar(&self, node: &DisplayNode, level: usize) -> String {
let chars_in_bar = self.percent_bar.chars().count();
let num_bars = chars_in_bar as f32 * self.display_data.percent_size(node);
let mut num_not_my_bar = (chars_in_bar as i32) - num_bars as i32;
@@ -107,21 +107,23 @@ impl DrawData<'_> {
#[allow(clippy::too_many_arguments)]
pub fn draw_it(
errors: Errors,
permission_error: bool,
use_full_path: bool,
is_reversed: bool,
no_colors: bool,
no_percents: bool,
terminal_width: usize,
by_filecount: bool,
root_node: Node,
option_root_node: Option<DisplayNode>,
) {
if errors.permissions {
if permission_error {
eprintln!("Did not have permissions for all directories");
}
if errors.not_found {
eprintln!("Not all directories were found");
if option_root_node.is_none() {
return;
}
let root_node = option_root_node.unwrap();
let num_chars_needed_on_left_most = if by_filecount {
let max_size = root_node.children.iter().map(|n| n.size).fold(0, max);
max_size.separate_with_commas().chars().count()
@@ -131,11 +133,8 @@ pub fn draw_it(
let terminal_width = terminal_width - 9 - num_chars_needed_on_left_most;
let num_indent_chars = 3;
let longest_string_length = root_node
.children
.iter()
.map(|c| find_longest_dir_name(&c, num_indent_chars, terminal_width, !use_full_path))
.fold(0, max);
let longest_string_length =
find_longest_dir_name(&root_node, num_indent_chars, terminal_width, !use_full_path);
let max_bar_length = if no_percents || longest_string_length >= terminal_width as usize {
0
@@ -145,27 +144,30 @@ pub fn draw_it(
let first_size_bar = repeat(BLOCKS[0]).take(max_bar_length).collect::<String>();
for c in root_node.get_children_from_node(is_reversed) {
let display_data = DisplayData {
short_paths: !use_full_path,
is_reversed,
colors_on: !no_colors,
by_filecount,
num_chars_needed_on_left_most,
base_size: c.size,
longest_string_length,
ls_colors: LsColors::from_env().unwrap_or_default(),
};
let draw_data = DrawData {
indent: "".to_string(),
percent_bar: first_size_bar.clone(),
display_data: &display_data,
};
display_node(c, &draw_data, true, true);
}
let display_data = DisplayData {
short_paths: !use_full_path,
is_reversed,
colors_on: !no_colors,
by_filecount,
num_chars_needed_on_left_most,
base_size: root_node.size,
longest_string_length,
ls_colors: LsColors::from_env().unwrap_or_default(),
};
let draw_data = DrawData {
indent: "".to_string(),
percent_bar: first_size_bar,
display_data: &display_data,
};
display_node(root_node, &draw_data, true, true);
}
fn find_longest_dir_name(node: &Node, indent: usize, terminal: usize, long_paths: bool) -> usize {
fn find_longest_dir_name(
node: &DisplayNode,
indent: usize,
terminal: usize,
long_paths: bool,
) -> usize {
let printable_name = get_printable_name(&node.name, long_paths);
let longest = min(
UnicodeWidthStr::width(&*printable_name) + 1 + indent,
@@ -179,7 +181,7 @@ fn find_longest_dir_name(node: &Node, indent: usize, terminal: usize, long_paths
.fold(longest, max)
}
fn display_node(node: Node, draw_data: &DrawData, is_biggest: bool, is_last: bool) {
fn display_node(node: DisplayNode, draw_data: &DrawData, is_biggest: bool, is_last: bool) {
// hacky way of working out how deep we are in the tree
let indent = draw_data.get_new_indent(!node.children.is_empty(), is_last);
let level = ((indent.chars().count() - 1) / 2) - 1;
@@ -254,11 +256,13 @@ fn get_printable_name<P: AsRef<Path>>(dir_name: &P, long_paths: bool) -> String
encode_u8(printable_name.display().to_string().as_bytes())
}
fn pad_or_trim_filename(node: &Node, indent: &str, display_data: &DisplayData) -> String {
fn pad_or_trim_filename(node: &DisplayNode, indent: &str, display_data: &DisplayData) -> String {
let name = get_printable_name(&node.name, display_data.short_paths);
let indent_and_name = format!("{} {}", indent, name);
let width = UnicodeWidthStr::width(&*indent_and_name);
assert!(display_data.longest_string_length >= width);
// Add spaces after the filename so we can draw the % used bar chart.
let name_and_padding = name
+ &(repeat(" ")
@@ -281,7 +285,7 @@ fn maybe_trim_filename(name_in: String, display_data: &DisplayData) -> String {
}
pub fn format_string(
node: &Node,
node: &DisplayNode,
indent: &str,
percent_bar: &str,
is_biggest: bool,
@@ -294,7 +298,7 @@ pub fn format_string(
}
fn get_name_percent(
node: &Node,
node: &DisplayNode,
indent: &str,
bar_chart: &str,
display_data: &DisplayData,
@@ -311,7 +315,7 @@ fn get_name_percent(
}
}
fn get_pretty_size(node: &Node, is_biggest: bool, display_data: &DisplayData) -> String {
fn get_pretty_size(node: &DisplayNode, is_biggest: bool, display_data: &DisplayData) -> String {
let output = if display_data.by_filecount {
let size_as_str = node.size.separate_with_commas();
let spaces_to_add =
@@ -328,7 +332,11 @@ fn get_pretty_size(node: &Node, is_biggest: bool, display_data: &DisplayData) ->
}
}
fn get_pretty_name(node: &Node, name_and_padding: String, display_data: &DisplayData) -> String {
fn get_pretty_name(
node: &DisplayNode,
name_and_padding: String,
display_data: &DisplayData,
) -> String {
if display_data.colors_on {
let meta_result = fs::metadata(node.name.clone());
let directory_color = display_data
@@ -379,7 +387,7 @@ mod tests {
#[test]
fn test_format_str() {
let n = Node {
let n = DisplayNode {
name: PathBuf::from("/short"),
size: 2_u64.pow(12), // This is 4.0K
children: vec![],
@@ -401,7 +409,7 @@ mod tests {
#[test]
fn test_format_str_long_name() {
let name = "very_long_name_longer_than_the_eighty_character_limit_very_long_name_this_bit_will_truncate";
let n = Node {
let n = DisplayNode {
name: PathBuf::from(name),
size: 2_u64.pow(12), // This is 4.0K
children: vec![],
+46
View File
@@ -0,0 +1,46 @@
use std::cmp::Ordering;
use std::path::PathBuf;
#[derive(Debug, Eq, Clone)]
pub struct DisplayNode {
pub name: PathBuf, //todo: consider moving to a string?
pub size: u64,
pub children: Vec<DisplayNode>,
}
impl Ord for DisplayNode {
fn cmp(&self, other: &Self) -> Ordering {
if self.size == other.size {
self.name.cmp(&other.name)
} else {
self.size.cmp(&other.size)
}
}
}
impl PartialOrd for DisplayNode {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for DisplayNode {
fn eq(&self, other: &Self) -> bool {
self.name == other.name && self.size == other.size && self.children == other.children
}
}
impl DisplayNode {
pub fn num_siblings(&self) -> u64 {
self.children.len() as u64
}
pub fn get_children_from_node(&self, is_reversed: bool) -> impl Iterator<Item = DisplayNode> {
if is_reversed {
let children: Vec<DisplayNode> = self.children.clone().into_iter().rev().collect();
children.into_iter()
} else {
self.children.clone().into_iter()
}
}
}
+104
View File
@@ -0,0 +1,104 @@
use crate::display_node::DisplayNode;
use crate::node::Node;
use std::collections::BinaryHeap;
use std::collections::HashSet;
use std::path::PathBuf;
pub fn get_by_depth(top_level_nodes: Vec<Node>, n: usize) -> Option<DisplayNode> {
if top_level_nodes.is_empty() {
// perhaps change this, bring back Error object?
return None;
}
let root = get_new_root(top_level_nodes);
Some(build_by_depth(&root, n - 1))
}
pub fn get_biggest(top_level_nodes: Vec<Node>, n: usize) -> Option<DisplayNode> {
if top_level_nodes.is_empty() {
// perhaps change this, bring back Error object?
return None;
}
let mut heap = BinaryHeap::new();
let number_top_level_nodes = top_level_nodes.len();
let root = get_new_root(top_level_nodes);
root.children.iter().for_each(|c| heap.push(c));
let mut allowed_nodes = HashSet::new();
allowed_nodes.insert(&root.name);
for _ in number_top_level_nodes..n {
let line = heap.pop();
match line {
Some(line) => {
line.children.iter().for_each(|c| heap.push(c));
allowed_nodes.insert(&line.name);
}
None => break,
}
}
recursive_rebuilder(&allowed_nodes, &root)
}
fn build_by_depth(node: &Node, depth: usize) -> DisplayNode {
let new_children = {
if depth == 0 {
vec![]
} else {
let mut new_children: Vec<_> = node
.children
.iter()
.map(|c| build_by_depth(c, depth - 1))
.collect();
new_children.sort();
new_children.reverse();
new_children
}
};
DisplayNode {
name: node.name.clone(),
size: node.size,
children: new_children,
}
}
fn get_new_root(top_level_nodes: Vec<Node>) -> Node {
if top_level_nodes.len() > 1 {
let total_size = top_level_nodes.iter().map(|node| node.size).sum();
Node {
name: PathBuf::from("(total)"),
size: total_size,
children: top_level_nodes,
inode_device: None,
}
} else {
top_level_nodes.into_iter().next().unwrap()
}
}
fn recursive_rebuilder<'a>(
allowed_nodes: &'a HashSet<&PathBuf>,
current: &Node,
) -> Option<DisplayNode> {
let mut new_children: Vec<_> = current
.children
.iter()
.filter_map(|c| {
if allowed_nodes.contains(&c.name) {
recursive_rebuilder(allowed_nodes, c)
} else {
None
}
})
.collect();
new_children.sort();
new_children.reverse();
let newnode = DisplayNode {
name: current.name.clone(),
size: current.size,
children: new_children,
};
Some(newnode)
}
+29 -57
View File
@@ -1,19 +1,25 @@
#[macro_use]
extern crate clap;
extern crate crossbeam_channel as channel;
extern crate ignore;
extern crate rayon;
extern crate unicode_width;
extern crate walkdir;
use std::collections::HashSet;
use self::display::draw_it;
use crate::utils::is_a_parent_of;
use clap::{App, AppSettings, Arg};
use dirwalker::walk_it;
use filter::{get_biggest, get_by_depth};
use std::cmp::max;
use std::path::PathBuf;
use terminal_size::{terminal_size, Height, Width};
use utils::{find_big_ones, get_dir_tree, simplify_dir_names, sort, Node};
use utils::simplify_dir_names;
mod dirwalker;
mod display;
mod display_node;
mod filter;
mod node;
mod platform;
mod utils;
static DEFAULT_NUMBER_OF_LINES: usize = 30;
@@ -101,12 +107,6 @@ fn main() {
.multiple(true)
.help("Exclude any file or directory with this name"),
)
.arg(
Arg::with_name("limit_filesystem")
.short("x")
.long("limit-filesystem")
.help("Only count the files and directories on the same filesystem as the supplied directory"),
)
.arg(
Arg::with_name("display_apparent_size")
.short("s")
@@ -184,31 +184,35 @@ fn main() {
let no_colors = init_color(options.is_present("no_colors"));
let use_apparent_size = options.is_present("display_apparent_size");
let limit_filesystem = options.is_present("limit_filesystem");
let ignore_directories = options
let ignore_directories: Vec<PathBuf> = options
.values_of("ignore_directory")
.map(|i| i.map(PathBuf::from).collect());
.map(|i| i.map(PathBuf::from).collect())
.unwrap_or_default();
let by_filecount = options.is_present("by_filecount");
let show_hidden = !options.is_present("ignore_hidden");
let ignore_hidden = options.is_present("ignore_hidden");
let simplified_dirs = simplify_dir_names(target_dirs);
let (errors, nodes) = get_dir_tree(
&simplified_dirs,
&ignore_directories,
let ignored_full_path: HashSet<PathBuf> = ignore_directories
.into_iter()
.flat_map(|x| simplified_dirs.iter().map(move |d| d.join(x.clone())))
.collect();
let (nodes, errors) = walk_it(
simplified_dirs,
ignored_full_path,
use_apparent_size,
limit_filesystem,
by_filecount,
show_hidden,
ignore_hidden,
);
let sorted_data = sort(nodes);
let biggest_ones = {
let tree = {
match depth {
None => find_big_ones(sorted_data, number_of_lines),
Some(_) => sorted_data,
None => get_biggest(nodes, number_of_lines),
Some(depth) => get_by_depth(nodes, depth),
}
};
let tree = build_tree(biggest_ones, depth);
draw_it(
errors,
@@ -221,35 +225,3 @@ fn main() {
tree,
);
}
fn build_tree(biggest_ones: Vec<(PathBuf, u64)>, depth: Option<usize>) -> Node {
let mut top_parent = Node::default();
// assume sorted order
for b in biggest_ones {
let n = Node {
name: b.0,
size: b.1,
children: Vec::default(),
};
recursively_build_tree(&mut top_parent, n, depth);
}
top_parent
}
fn recursively_build_tree(parent_node: &mut Node, new_node: Node, depth: Option<usize>) {
let new_depth = match depth {
None => None,
Some(0) => return,
Some(d) => Some(d - 1),
};
if let Some(c) = parent_node
.children
.iter_mut()
.find(|c| is_a_parent_of(&c.name, &new_node.name))
{
recursively_build_tree(c, new_node, new_depth);
} else {
parent_node.children.push(new_node);
}
}
+54
View File
@@ -0,0 +1,54 @@
use crate::platform::get_metadata;
use std::cmp::Ordering;
use std::path::PathBuf;
#[derive(Debug, Eq, Clone)]
pub struct Node {
pub name: PathBuf,
pub size: u64,
pub children: Vec<Node>,
pub inode_device: Option<(u64, u64)>,
}
pub fn build_node(
dir: PathBuf,
children: Vec<Node>,
use_apparent_size: bool,
by_filecount: bool,
) -> Option<Node> {
match get_metadata(&dir, use_apparent_size) {
Some(data) => {
let (size, inode_device) = if by_filecount { (1, data.1) } else { data };
Some(Node {
name: dir,
size,
children,
inode_device,
})
}
None => None,
}
}
impl PartialEq for Node {
fn eq(&self, other: &Self) -> bool {
self.name == other.name && self.size == other.size && self.children == other.children
}
}
impl Ord for Node {
fn cmp(&self, other: &Self) -> Ordering {
if self.size == other.size {
self.name.cmp(&other.name)
} else {
self.size.cmp(&other.size)
}
}
}
impl PartialOrd for Node {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
+6 -6
View File
@@ -1,7 +1,8 @@
use ignore::DirEntry;
#[allow(unused_imports)]
use std::fs;
use std::path::Path;
#[cfg(target_family = "unix")]
fn get_block_size() -> u64 {
// All os specific implementations of MetatdataExt seem to define a block as 512 bytes
@@ -10,7 +11,7 @@ fn get_block_size() -> u64 {
}
#[cfg(target_family = "unix")]
pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> {
pub fn get_metadata(d: &Path, use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> {
use std::os::unix::fs::MetadataExt;
match d.metadata() {
Ok(md) => {
@@ -25,7 +26,7 @@ pub fn get_metadata(d: &DirEntry, use_apparent_size: bool) -> Option<(u64, Optio
}
#[cfg(target_family = "windows")]
pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> {
pub fn get_metadata(d: &Path, _use_apparent_size: bool) -> Option<(u64, Option<(u64, u64)>)> {
// On windows opening the file to get size, file ID and volume can be very
// expensive because 1) it causes a few system calls, and more importantly 2) it can cause
// windows defender to scan the file.
@@ -63,7 +64,6 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti
// With this optimization: 8 sec.
use std::io;
use std::path::Path;
use winapi_util::Handle;
fn handle_from_path_limited<P: AsRef<Path>>(path: P) -> io::Result<Handle> {
use std::fs::OpenOptions;
@@ -90,10 +90,10 @@ pub fn get_metadata(d: &DirEntry, _use_apparent_size: bool) -> Option<(u64, Opti
Ok(Handle::from_file(file))
}
fn get_metadata_expensive(d: &DirEntry) -> Option<(u64, Option<(u64, u64)>)> {
fn get_metadata_expensive(d: &Path) -> Option<(u64, Option<(u64, u64)>)> {
use winapi_util::file::information;
let h = handle_from_path_limited(d.path()).ok()?;
let h = handle_from_path_limited(d).ok()?;
let info = information(&h).ok()?;
Some((
+126
View File
@@ -0,0 +1,126 @@
use std::collections::HashSet;
use std::path::{Path, PathBuf};
fn is_a_parent_of<P: AsRef<Path>>(parent: P, child: P) -> bool {
let parent = parent.as_ref();
let child = child.as_ref();
child.starts_with(parent) && !parent.starts_with(child)
}
pub fn simplify_dir_names<P: AsRef<Path>>(filenames: Vec<P>) -> HashSet<PathBuf> {
let mut top_level_names: HashSet<PathBuf> = HashSet::with_capacity(filenames.len());
let mut to_remove: Vec<PathBuf> = Vec::with_capacity(filenames.len());
for t in filenames {
let top_level_name = normalize_path(t);
let mut can_add = true;
for tt in top_level_names.iter() {
if is_a_parent_of(&top_level_name, tt) {
to_remove.push(tt.to_path_buf());
} else if is_a_parent_of(tt, &top_level_name) {
can_add = false;
}
}
to_remove.sort_unstable();
top_level_names.retain(|tr| to_remove.binary_search(tr).is_err());
to_remove.clear();
if can_add {
top_level_names.insert(top_level_name);
}
}
top_level_names
}
pub fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
// normalize path ...
// 1. removing repeated separators
// 2. removing interior '.' ("current directory") path segments
// 3. removing trailing extra separators and '.' ("current directory") path segments
// * `Path.components()` does all the above work; ref: <https://doc.rust-lang.org/std/path/struct.Path.html#method.components>
// 4. changing to os preferred separator (automatically done by recollecting components back into a PathBuf)
path.as_ref().components().collect::<PathBuf>()
}
mod tests {
#[allow(unused_imports)]
use super::*;
#[test]
fn test_simplify_dir() {
let mut correct = HashSet::new();
correct.insert(PathBuf::from("a"));
assert_eq!(simplify_dir_names(vec!["a"]), correct);
}
#[test]
fn test_simplify_dir_rm_subdir() {
let mut correct = HashSet::new();
correct.insert(["a", "b"].iter().collect::<PathBuf>());
assert_eq!(simplify_dir_names(vec!["a/b", "a/b/c", "a/b/d/f"]), correct);
}
#[test]
fn test_simplify_dir_duplicates() {
let mut correct = HashSet::new();
correct.insert(["a", "b"].iter().collect::<PathBuf>());
correct.insert(PathBuf::from("c"));
assert_eq!(
simplify_dir_names(vec![
"a/b",
"a/b//",
"a/././b///",
"c",
"c/",
"c/.",
"c/././",
"c/././."
]),
correct
);
}
#[test]
fn test_simplify_dir_rm_subdir_and_not_substrings() {
let mut correct = HashSet::new();
correct.insert(PathBuf::from("b"));
correct.insert(["c", "a", "b"].iter().collect::<PathBuf>());
correct.insert(["a", "b"].iter().collect::<PathBuf>());
assert_eq!(simplify_dir_names(vec!["a/b", "c/a/b/", "b"]), correct);
}
#[test]
fn test_simplify_dir_dots() {
let mut correct = HashSet::new();
correct.insert(PathBuf::from("src"));
assert_eq!(simplify_dir_names(vec!["src/."]), correct);
}
#[test]
fn test_simplify_dir_substring_names() {
let mut correct = HashSet::new();
correct.insert(PathBuf::from("src"));
correct.insert(PathBuf::from("src_v2"));
assert_eq!(simplify_dir_names(vec!["src/", "src_v2"]), correct);
}
#[test]
fn test_is_a_parent_of() {
assert!(is_a_parent_of("/usr", "/usr/andy"));
assert!(is_a_parent_of("/usr", "/usr/andy/i/am/descendant"));
assert!(!is_a_parent_of("/usr", "/usr/."));
assert!(!is_a_parent_of("/usr", "/usr/"));
assert!(!is_a_parent_of("/usr", "/usr"));
assert!(!is_a_parent_of("/usr/", "/usr"));
assert!(!is_a_parent_of("/usr/andy", "/usr"));
assert!(!is_a_parent_of("/usr/andy", "/usr/sibling"));
assert!(!is_a_parent_of("/usr/folder", "/usr/folder_not_a_child"));
}
#[test]
fn test_is_a_parent_of_root() {
assert!(is_a_parent_of("/", "/usr/andy"));
assert!(is_a_parent_of("/", "/usr"));
assert!(!is_a_parent_of("/", "/"));
}
}
-402
View File
@@ -1,402 +0,0 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicBool;
use channel::Receiver;
use std::thread::JoinHandle;
use ignore::{WalkBuilder, WalkState};
use std::sync::atomic;
use std::thread;
mod platform;
use self::platform::*;
type PathData = (PathBuf, u64, Option<(u64, u64)>);
#[derive(Debug, Default, Eq, Clone)]
pub struct Node {
pub name: PathBuf,
pub size: u64,
pub children: Vec<Node>,
}
impl Ord for Node {
fn cmp(&self, other: &Self) -> Ordering {
if self.size == other.size {
self.name.cmp(&other.name)
} else {
self.size.cmp(&other.size)
}
}
}
impl PartialOrd for Node {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for Node {
fn eq(&self, other: &Self) -> bool {
self.name == other.name && self.size == other.size && self.children == other.children
}
}
impl Node {
pub fn num_siblings(&self) -> u64 {
self.children.len() as u64
}
pub fn get_children_from_node(&self, is_reversed: bool) -> impl Iterator<Item = Node> {
if is_reversed {
let children: Vec<Node> = self.children.clone().into_iter().rev().collect();
children.into_iter()
} else {
self.children.clone().into_iter()
}
}
}
pub struct Errors {
pub permissions: bool,
pub not_found: bool,
}
pub fn is_a_parent_of<P: AsRef<Path>>(parent: P, child: P) -> bool {
let parent = parent.as_ref();
let child = child.as_ref();
child.starts_with(parent) && !parent.starts_with(child)
}
pub fn simplify_dir_names<P: AsRef<Path>>(filenames: Vec<P>) -> HashSet<PathBuf> {
let mut top_level_names: HashSet<PathBuf> = HashSet::with_capacity(filenames.len());
let mut to_remove: Vec<PathBuf> = Vec::with_capacity(filenames.len());
for t in filenames {
let top_level_name = normalize_path(t);
let mut can_add = true;
for tt in top_level_names.iter() {
if is_a_parent_of(&top_level_name, tt) {
to_remove.push(tt.to_path_buf());
} else if is_a_parent_of(tt, &top_level_name) {
can_add = false;
}
}
to_remove.sort_unstable();
top_level_names.retain(|tr| to_remove.binary_search(tr).is_err());
to_remove.clear();
if can_add {
top_level_names.insert(top_level_name);
}
}
top_level_names
}
fn prepare_walk_dir_builder<P: AsRef<Path>>(
top_level_names: &HashSet<P>,
limit_filesystem: bool,
show_hidden: bool,
) -> WalkBuilder {
let mut it = top_level_names.iter();
let mut builder = WalkBuilder::new(it.next().unwrap());
builder.follow_links(false);
if show_hidden {
builder.hidden(false);
builder.ignore(false);
builder.git_global(false);
builder.git_ignore(false);
builder.git_exclude(false);
}
if limit_filesystem {
builder.same_file_system(true);
}
for b in it {
builder.add(b);
}
builder
}
fn is_not_found(e: &ignore::Error) -> bool {
use ignore::Error;
if let Error::WithPath { err, .. } = e {
if let Error::Io(e) = &**err {
if e.kind() == std::io::ErrorKind::NotFound {
return true;
}
}
}
false
}
pub fn get_dir_tree<P: AsRef<Path>>(
top_level_names: &HashSet<P>,
ignore_directories: &Option<Vec<PathBuf>>,
apparent_size: bool,
limit_filesystem: bool,
by_filecount: bool,
show_hidden: bool,
) -> (Errors, HashMap<PathBuf, u64>) {
let (tx, rx) = channel::bounded::<PathData>(1000);
let permissions_flag = AtomicBool::new(false);
let not_found_flag = AtomicBool::new(false);
let t2 = top_level_names
.iter()
.map(|p| p.as_ref().to_path_buf())
.collect();
let t = create_reader_thread(rx, t2, apparent_size);
let walk_dir_builder = prepare_walk_dir_builder(top_level_names, limit_filesystem, show_hidden);
walk_dir_builder.build_parallel().run(|| {
let txc = tx.clone();
let pf = &permissions_flag;
let nf = &not_found_flag;
Box::new(move |path| {
match path {
Ok(p) => {
if let Some(dirs) = ignore_directories {
let path = p.path();
let parts = path.components().collect::<Vec<std::path::Component>>();
for d in dirs {
if parts
.windows(d.components().count())
.any(|window| window.iter().collect::<PathBuf>() == *d)
{
return WalkState::Continue;
}
}
}
let maybe_size_and_inode = get_metadata(&p, apparent_size);
match maybe_size_and_inode {
Some(data) => {
let (size, inode_device) =
if by_filecount { (1, data.1) } else { data };
txc.send((p.into_path(), size, inode_device)).unwrap();
}
None => {
pf.store(true, atomic::Ordering::Relaxed);
}
}
}
Err(e) => {
if is_not_found(&e) {
nf.store(true, atomic::Ordering::Relaxed);
} else {
pf.store(true, atomic::Ordering::Relaxed);
}
}
};
WalkState::Continue
})
});
drop(tx);
let data = t.join().unwrap();
let errors = Errors {
permissions: permissions_flag.load(atomic::Ordering::SeqCst),
not_found: not_found_flag.load(atomic::Ordering::SeqCst),
};
(errors, data)
}
fn create_reader_thread(
rx: Receiver<PathData>,
top_level_names: HashSet<PathBuf>,
apparent_size: bool,
) -> JoinHandle<HashMap<PathBuf, u64>> {
// Receiver thread
thread::spawn(move || {
let mut hash: HashMap<PathBuf, u64> = HashMap::new();
let mut inodes: HashSet<(u64, u64)> = HashSet::new();
for dent in rx {
let (path, size, maybe_inode_device) = dent;
if should_ignore_file(apparent_size, &mut inodes, maybe_inode_device) {
continue;
} else {
for p in path.ancestors() {
let s = hash.entry(p.to_path_buf()).or_insert(0);
*s += size;
if top_level_names.contains(p) {
break;
}
}
}
}
hash
})
}
pub fn normalize_path<P: AsRef<Path>>(path: P) -> PathBuf {
// normalize path ...
// 1. removing repeated separators
// 2. removing interior '.' ("current directory") path segments
// 3. removing trailing extra separators and '.' ("current directory") path segments
// * `Path.components()` does all the above work; ref: <https://doc.rust-lang.org/std/path/struct.Path.html#method.components>
// 4. changing to os preferred separator (automatically done by recollecting components back into a PathBuf)
path.as_ref().components().collect::<PathBuf>()
}
fn should_ignore_file(
apparent_size: bool,
inodes: &mut HashSet<(u64, u64)>,
maybe_inode_device: Option<(u64, u64)>,
) -> bool {
match maybe_inode_device {
None => false,
Some(data) => {
let (inode, device) = data;
if !apparent_size {
// Ignore files already visited or symlinked
if inodes.contains(&(inode, device)) {
return true;
}
inodes.insert((inode, device));
}
false
}
}
}
pub fn sort_by_size_first_name_second(a: &(PathBuf, u64), b: &(PathBuf, u64)) -> Ordering {
let result = b.1.cmp(&a.1);
if result == Ordering::Equal {
a.0.cmp(&b.0)
} else {
result
}
}
pub fn sort(data: HashMap<PathBuf, u64>) -> Vec<(PathBuf, u64)> {
let mut new_l: Vec<(PathBuf, u64)> = data.iter().map(|(a, b)| (a.clone(), *b)).collect();
new_l.sort_unstable_by(sort_by_size_first_name_second);
new_l
}
pub fn find_big_ones(new_l: Vec<(PathBuf, u64)>, max_to_show: usize) -> Vec<(PathBuf, u64)> {
if max_to_show > 0 && new_l.len() > max_to_show {
new_l[0..max_to_show].to_vec()
} else {
new_l
}
}
mod tests {
#[allow(unused_imports)]
use super::*;
#[test]
fn test_simplify_dir() {
let mut correct = HashSet::new();
correct.insert(PathBuf::from("a"));
assert_eq!(simplify_dir_names(vec!["a"]), correct);
}
#[test]
fn test_simplify_dir_rm_subdir() {
let mut correct = HashSet::new();
correct.insert(["a", "b"].iter().collect::<PathBuf>());
assert_eq!(simplify_dir_names(vec!["a/b", "a/b/c", "a/b/d/f"]), correct);
}
#[test]
fn test_simplify_dir_duplicates() {
let mut correct = HashSet::new();
correct.insert(["a", "b"].iter().collect::<PathBuf>());
correct.insert(PathBuf::from("c"));
assert_eq!(
simplify_dir_names(vec![
"a/b",
"a/b//",
"a/././b///",
"c",
"c/",
"c/.",
"c/././",
"c/././."
]),
correct
);
}
#[test]
fn test_simplify_dir_rm_subdir_and_not_substrings() {
let mut correct = HashSet::new();
correct.insert(PathBuf::from("b"));
correct.insert(["c", "a", "b"].iter().collect::<PathBuf>());
correct.insert(["a", "b"].iter().collect::<PathBuf>());
assert_eq!(simplify_dir_names(vec!["a/b", "c/a/b/", "b"]), correct);
}
#[test]
fn test_simplify_dir_dots() {
let mut correct = HashSet::new();
correct.insert(PathBuf::from("src"));
assert_eq!(simplify_dir_names(vec!["src/."]), correct);
}
#[test]
fn test_simplify_dir_substring_names() {
let mut correct = HashSet::new();
correct.insert(PathBuf::from("src"));
correct.insert(PathBuf::from("src_v2"));
assert_eq!(simplify_dir_names(vec!["src/", "src_v2"]), correct);
}
#[test]
fn test_is_a_parent_of() {
assert!(is_a_parent_of("/usr", "/usr/andy"));
assert!(is_a_parent_of("/usr", "/usr/andy/i/am/descendant"));
assert!(!is_a_parent_of("/usr", "/usr/."));
assert!(!is_a_parent_of("/usr", "/usr/"));
assert!(!is_a_parent_of("/usr", "/usr"));
assert!(!is_a_parent_of("/usr/", "/usr"));
assert!(!is_a_parent_of("/usr/andy", "/usr"));
assert!(!is_a_parent_of("/usr/andy", "/usr/sibling"));
assert!(!is_a_parent_of("/usr/folder", "/usr/folder_not_a_child"));
}
#[test]
fn test_is_a_parent_of_root() {
assert!(is_a_parent_of("/", "/usr/andy"));
assert!(is_a_parent_of("/", "/usr"));
assert!(!is_a_parent_of("/", "/"));
}
#[test]
fn test_should_ignore_file() {
let mut files = HashSet::new();
files.insert((10, 20));
assert!(!should_ignore_file(true, &mut files, Some((0, 0))));
// New file is not known it will be inserted to the hashmp and should not be ignored
assert!(!should_ignore_file(false, &mut files, Some((11, 12))));
assert!(files.contains(&(11, 12)));
// The same file will be ignored the second time
assert!(should_ignore_file(false, &mut files, Some((11, 12))));
}
#[test]
fn test_should_ignore_file_on_different_device() {
let mut files = HashSet::new();
files.insert((10, 20));
// We do not ignore files on the same device
assert!(!should_ignore_file(false, &mut files, Some((2, 99))));
assert!(!should_ignore_file(true, &mut files, Some((2, 99))));
}
}