From 137e366eca7f875bc6e3f4999ce813d06310598b Mon Sep 17 00:00:00 2001 From: "andy.boot" Date: Wed, 5 Feb 2025 20:28:17 +0000 Subject: [PATCH] feat: Handle duplicate dir names better If we run `dust /usr/*/Trash` We see several 'Trash' directories in the output but do not know which user they belong to. This fix means if we see duplicate names in a directory we will display the parent directory name as well --- src/display.rs | 2 +- src/filter.rs | 66 ++++++++++++++++++++- src/main.rs | 1 + tests/test_dir_matching/andy/dup_name/hello | 0 tests/test_dir_matching/dave/dup_name/hello | 0 tests/test_flags.rs | 16 +++++ 6 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 tests/test_dir_matching/andy/dup_name/hello create mode 100644 tests/test_dir_matching/dave/dup_name/hello diff --git a/src/display.rs b/src/display.rs index 68a13d8..f63edea 100644 --- a/src/display.rs +++ b/src/display.rs @@ -273,7 +273,7 @@ fn clean_indentation_string(s: &str) -> String { is } -fn get_printable_name<P: AsRef<Path>>(dir_name: &P, short_paths: bool) -> String { +pub fn get_printable_name<P: AsRef<Path>>(dir_name: &P, short_paths: bool) -> String { let dir_name = dir_name.as_ref(); let printable_name = { if short_paths { diff --git a/src/filter.rs b/src/filter.rs index c7629bb..d597482 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -1,3 +1,6 @@ +use stfu8::encode_u8; + +use crate::display::get_printable_name; use crate::display_node::DisplayNode; use crate::node::FileTime; use crate::node::Node; @@ -14,6 +17,7 @@ pub struct AggregateData { pub number_of_lines: usize, pub depth: usize, pub using_a_filter: bool, + pub short_paths: bool, } pub fn get_biggest( @@ -40,13 +44,17 @@ pub fn get_biggest( } else { top_level_nodes.iter().map(|node| node.size).sum() }; + + let nodes = handle_duplicate_top_level_names(top_level_nodes, display_data.short_paths); + root = Node { name: PathBuf::from("(total)"), size, - children: top_level_nodes, + children: nodes, inode_device: None, depth: 0, }; + // Always 
include the base nodes if we add a 'parent' (total) node heap = always_add_children(&display_data, &root, heap); } else { @@ -74,6 +82,8 @@ pub fn fill_remaining_lines<'a>( let line = heap.pop(); match line { Some(line) => { + // If we are not doing only_file OR if we are doing + // only_file and it has no children (ie is a file not a dir) if !display_data.only_file || line.children.is_empty() { allowed_nodes.insert(line.name.as_path(), line); } @@ -161,3 +171,57 @@ fn build_display_node(mut new_children: Vec<DisplayNode>, current: &Node) -> Dis children: new_children, } } + +fn names_have_dup(top_level_nodes: &Vec<Node>) -> bool { + let mut stored = HashSet::new(); + for node in top_level_nodes { + let name = get_printable_name(&node.name, true); + if stored.contains(&name) { + return true; + } + stored.insert(name); + } + false +} + +fn handle_duplicate_top_level_names(top_level_nodes: Vec<Node>, short_paths: bool) -> Vec<Node> { + // If we have top level names that are the same - we need to tweak them: + if short_paths && names_have_dup(&top_level_nodes) { + let mut new_top_nodes = top_level_nodes.clone(); + let mut dir_walk_up_count = 0; + + while names_have_dup(&new_top_nodes) && dir_walk_up_count < 10 { + dir_walk_up_count += 1; + let mut newer = vec![]; + + for node in new_top_nodes.iter() { + let mut folders = node.name.iter().rev(); + // Get parent folder (if second time round get grandparent and so on) + for _ in 0..dir_walk_up_count { + folders.next(); + } + match folders.next() { + // Add (parent_name) to path of Node + Some(data) => { + let parent = encode_u8(data.as_encoded_bytes()); + let current_node = node.name.display(); + let n = Node { + name: PathBuf::from(format!("{current_node}({parent})")), + size: node.size, + children: node.children.clone(), + inode_device: node.inode_device, + depth: node.depth, + }; + newer.push(n) + } + // Node does not have a parent + None => newer.push(node.clone()), + } + } + new_top_nodes = newer; + } + new_top_nodes + } else { + top_level_nodes 
+ } +} diff --git a/src/main.rs b/src/main.rs index 66b9ac4..3bfc8d7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -293,6 +293,7 @@ fn main() { number_of_lines, depth, using_a_filter: !filter_regexs.is_empty() || !invert_filter_regexs.is_empty(), + short_paths: !config.get_full_paths(&options), }; get_biggest(top_level_nodes, agg_data, &by_filetime, keep_collapsed) } diff --git a/tests/test_dir_matching/andy/dup_name/hello b/tests/test_dir_matching/andy/dup_name/hello new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_dir_matching/dave/dup_name/hello b/tests/test_dir_matching/dave/dup_name/hello new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_flags.rs b/tests/test_flags.rs index eed8c70..d6e9cb9 100644 --- a/tests/test_flags.rs +++ b/tests/test_flags.rs @@ -261,3 +261,19 @@ pub fn test_collapse() { assert!(output.contains("many")); assert!(!output.contains("hello_file")); } + +#[test] +pub fn test_handle_duplicate_names() { + // Check that even if we run on multiple directories with the same name + // we still show the distinct parent dir in the output + let output = build_command(vec![ + "tests/test_dir_matching/dave/dup_name", + "tests/test_dir_matching/andy/dup_name", + "ci", + ]); + assert!(output.contains("andy")); + assert!(output.contains("dave")); + assert!(output.contains("ci")); + assert!(output.contains("dup_name")); + assert!(!output.contains("test_dir_matching")); +}