Skip to content

Commit 958fc6e

Browse files
committed
feat: Add --path-mode and change default download behavior to only keep paths after glob patterns
Downloads previously always reproduced the full key path under the destination; that behavior can be restored with the new --path-mode=abs argument.
1 parent 988fd8a commit 958fc6e

File tree

4 files changed

+195
-12
lines changed

4 files changed

+195
-12
lines changed

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## v0.3.0 - 2025-02-20
6+
7+
### 🚀 Features
8+
9+
- Add --path-mode and change default download behavior to only keep paths after glob patterns
10+
511
## v0.2.7 - 2025-02-19
612

713
### 🚀 Features

src/glob_matcher.rs

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
1-
#![allow(dead_code)]
2-
31
//! A pattern is a glob that knows how to split itself into a prefix and join with a partial prefix
4-
5-
const GLOB_CHARS: &[char] = &['*', '?', '[', '{'];
2+
#![allow(dead_code)]
63

74
use std::collections::BTreeSet;
85

@@ -18,6 +15,8 @@ pub use engine::{Engine, S3Engine};
1815

1916
mod glob;
2017

18+
pub(crate) const GLOB_CHARS: &[char] = &['*', '?', '[', '{'];
19+
2120
/// The maximum number of prefixes that can be generated by the glob matcher
2221
///
2322
/// Checking that constructed prefixes exist is significantly slower than

src/main.rs

+126-6
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ use anyhow::{anyhow, bail, Context as _, Result};
88
use aws_config::meta::region::RegionProviderChain;
99
use aws_sdk_s3::types::Object;
1010
use aws_sdk_s3::{config::BehaviorVersion, config::Region, Client};
11-
use clap::{ArgAction, Parser, Subcommand};
12-
use glob_matcher::{S3Engine, S3GlobMatcher};
11+
use clap::{ArgAction, Parser, Subcommand, ValueEnum};
12+
use glob_matcher::{S3Engine, S3GlobMatcher, GLOB_CHARS};
1313
use humansize::{FormatSizeOptions, SizeFormatter, DECIMAL};
1414
use num_format::{Locale, ToFormattedString};
1515
use regex::Regex;
@@ -77,6 +77,16 @@ enum Command {
7777
/// The full key name will be reproduced in the directory, so multiple
7878
/// folders may be created.
7979
dest: String,
80+
81+
/// Control how S3 object keys are mapped to local file paths
82+
///
83+
/// - absolute | abs: the full key path will be reproduced in the
84+
/// destination
85+
/// - from-first-glob | g: the key path relative to the first path part
86+
/// containing a glob in the pattern will be reproduced in the
87+
/// destination
88+
#[clap(short, long, verbatim_doc_comment, default_value = "from-first-glob")]
89+
path_mode: PathType,
8090
},
8191

8292
/// Learn how to tune s3glob's parallelism for better performance
@@ -120,6 +130,42 @@ enum Command {
120130
},
121131
}
122132

133+
/// Selects how an S3 object key is mapped to a path under the download
/// destination.
///
/// Every mode has a long and a short spelling, and each spelling is its own
/// variant, so consumers should treat `Abs`/`Absolute` and
/// `G`/`FromFirstGlob` as equivalent pairs.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum PathType {
    /// Short spelling (`abs`) of [`PathType::Absolute`].
    Abs,
    /// `absolute`: reproduce the full key path in the destination.
    Absolute,
    /// Short spelling (`g`) of [`PathType::FromFirstGlob`].
    G,
    /// `from-first-glob`: reproduce the key path relative to the first path
    /// part of the pattern that contains a glob.
    FromFirstGlob,
}
140+
141+
impl ValueEnum for PathType {
142+
fn value_variants<'a>() -> &'a [Self] {
143+
&[
144+
PathType::Absolute,
145+
PathType::Abs,
146+
PathType::FromFirstGlob,
147+
PathType::G,
148+
]
149+
}
150+
151+
fn from_str(s: &str, _ignore_case: bool) -> Result<Self, String> {
152+
match s {
153+
"absolute" | "abs" => Ok(PathType::Absolute),
154+
"from-first-glob" | "g" => Ok(PathType::FromFirstGlob),
155+
_ => Err(format!("invalid path type: {}", s)),
156+
}
157+
}
158+
159+
fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
160+
match self {
161+
PathType::Abs => Some(clap::builder::PossibleValue::new("abs")),
162+
PathType::Absolute => Some(clap::builder::PossibleValue::new("absolute")),
163+
PathType::FromFirstGlob => Some(clap::builder::PossibleValue::new("from-first-glob")),
164+
PathType::G => Some(clap::builder::PossibleValue::new("g")),
165+
}
166+
}
167+
}
168+
123169
#[derive(Debug, Parser)]
124170
#[command(version, author, about, max_term_width = 80)]
125171
/// A fast aws s3 ls and downloader that supports glob patterns
@@ -228,11 +274,11 @@ async fn run(opts: Opts) -> Result<()> {
228274
let client = create_s3_client(&opts, &bucket).await?;
229275

230276
let prefix = raw_pattern
231-
.find(['*', '?', '[', '{'])
277+
.find(GLOB_CHARS)
232278
.map_or(raw_pattern.clone(), |i| raw_pattern[..i].to_owned());
233279

234280
let engine = S3Engine::new(client.clone(), bucket.clone(), opts.delimiter.to_string());
235-
let matcher = S3GlobMatcher::parse(raw_pattern, &opts.delimiter.to_string())?;
281+
let matcher = S3GlobMatcher::parse(raw_pattern.clone(), &opts.delimiter.to_string())?;
236282
let mut prefixes = match matcher.find_prefixes(engine).await {
237283
Ok(prefixes) => prefixes,
238284
Err(err) => {
@@ -331,7 +377,9 @@ async fn run(opts: Opts) -> Result<()> {
331377
Duration::from_millis(start.elapsed().as_millis() as u64)
332378
);
333379
}
334-
Command::Download { dest, .. } => {
380+
Command::Download {
381+
dest, path_mode, ..
382+
} => {
335383
let mut matching_objects = Vec::new();
336384
let mut match_count = 0;
337385
while let Some(PrefixResult {
@@ -356,9 +404,13 @@ async fn run(opts: Opts) -> Result<()> {
356404
let obj_count = objects.len();
357405
let base_path = Path::new(&dest);
358406
let mut total_bytes = 0_usize;
407+
let prefix_to_strip = extract_prefix_to_strip(&raw_pattern, path_mode);
359408
for (i, obj) in objects.iter().enumerate() {
360409
let key = obj.key.as_ref().unwrap();
361-
let path = base_path.join(key);
410+
let key_suffix = key
411+
.strip_prefix(&prefix_to_strip)
412+
.expect("all found objects will include the prefix");
413+
let path = base_path.join(key_suffix);
362414
let dir = path.parent().unwrap();
363415
std::fs::create_dir_all(dir)
364416
.with_context(|| format!("Creating directory: {}", dir.display()))?;
@@ -407,6 +459,23 @@ async fn run(opts: Opts) -> Result<()> {
407459
Ok(())
408460
}
409461

462+
fn extract_prefix_to_strip(raw_pattern: &str, path_mode: PathType) -> String {
463+
match path_mode {
464+
PathType::Abs | PathType::Absolute => String::new(),
465+
PathType::FromFirstGlob | PathType::G => {
466+
let up_to_glob: String = raw_pattern
467+
.chars()
468+
.take_while(|c| !GLOB_CHARS.contains(c))
469+
.collect();
470+
// find the last slash in the prefix and only include that
471+
match up_to_glob.rfind('/') {
472+
Some(slash_idx) => up_to_glob[..slash_idx + 1].to_string(),
473+
None => up_to_glob,
474+
}
475+
}
476+
}
477+
}
478+
410479
#[derive(Debug)]
411480
struct PrefixResult {
412481
#[allow(dead_code)]
@@ -607,6 +676,7 @@ pub(crate) fn setup_logging(directive: Option<&str>) {
607676

608677
#[cfg(test)]
609678
mod tests {
679+
#![allow(clippy::comparison_to_empty)]
610680
use aws_sdk_s3::types::Object;
611681
use rstest::rstest;
612682

@@ -630,4 +700,54 @@ mod tests {
630700
fn test_format_invalid_variable() {
631701
assert!(compile_format("{invalid_var}").is_err());
632702
}
703+
704+
// Test helper: calls `extract_prefix_to_strip($pattern, $path_mode)` and
// checks the result against `$expected`, reporting the pattern and path mode
// in the failure message.
//
// The expansion is a bare sequence of statements (not wrapped in a block), so
// it lands directly in the calling scope.
// NOTE(review): `assert2::check!` is understood to defer its failure until the
// enclosing scope ends, which is why this is not wrapped in `{ ... }` — confirm
// against the assert2 documentation before restructuring.
macro_rules! assert_extract_prefix_to_strip {
705+
($pattern:expr, $path_mode:expr, $expected:expr) => {
706+
let actual = extract_prefix_to_strip($pattern, $path_mode);
707+
assert2::check!(
708+
actual == $expected,
709+
"input: {} path_mode: {:?}",
710+
$pattern,
711+
$path_mode,
712+
);
713+
};
714+
}
715+
716+
// Unit test for `extract_prefix_to_strip`: both absolute spellings must strip
// nothing, and the from-first-glob mode must return the pattern up to and
// including the last `/` before the first glob character (`*`, `?`, `[`, `{`).
#[test]
717+
fn test_extract_prefix_to_strip() {
718+
// Test absolute path mode
719+
assert_extract_prefix_to_strip!("prefix/path/to/*.txt", PathType::Absolute, "");
720+
assert_extract_prefix_to_strip!("bucket/deep/path/*.txt", PathType::Abs, "");
721+
722+
// Test from-first-glob path mode
723+
assert_extract_prefix_to_strip!(
724+
"prefix/path/to/*.txt",
725+
PathType::FromFirstGlob,
726+
"prefix/path/to/"
727+
);
728+
// Only the first glob matters: later globs do not extend the prefix
assert_extract_prefix_to_strip!(
729+
"prefix/path/*/more/*.txt",
730+
PathType::FromFirstGlob,
731+
"prefix/path/"
732+
);
733+
assert_extract_prefix_to_strip!("prefix/*.txt", PathType::FromFirstGlob, "prefix/");
734+
// A glob in the very first path part leaves nothing to strip
assert_extract_prefix_to_strip!("*.txt", PathType::FromFirstGlob, "");
735+
// A pattern without any glob is cut after its last slash
assert_extract_prefix_to_strip!("prefix/a.txt", PathType::FromFirstGlob, "prefix/");
736+
// Test with different glob characters
737+
assert_extract_prefix_to_strip!(
738+
"prefix/path/to/[abc]/*.txt",
739+
PathType::FromFirstGlob,
740+
"prefix/path/to/"
741+
);
742+
assert_extract_prefix_to_strip!(
743+
"prefix/path/to/?/*.txt",
744+
PathType::FromFirstGlob,
745+
"prefix/path/to/"
746+
);
747+
assert_extract_prefix_to_strip!(
748+
"prefix/path/{a,b}/*.txt",
749+
PathType::FromFirstGlob,
750+
"prefix/path/"
751+
);
752+
}
633753
}

tests/integration.rs

+60-2
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,12 @@ async fn test_s3glob_pattern_matching(
8080
let tempdir = TempDir::new()?;
8181
let mut cmd = run_s3glob(
8282
port,
83-
&[command, uri.as_str(), tempdir.path().to_str().unwrap()],
83+
&[
84+
command,
85+
"-pabs",
86+
uri.as_str(),
87+
tempdir.path().to_str().unwrap(),
88+
],
8489
)?;
8590
let _ = cmd.assert().success();
8691
for object in &test_objects {
@@ -95,6 +100,59 @@ async fn test_s3glob_pattern_matching(
95100
Ok(())
96101
}
97102

103+
#[rstest]
104+
#[case("prefix/2024-01/file1.txt", &["file1.txt"])]
105+
#[case("prefix/2024-01/file*.txt", &["file1.txt", "file2.txt"])]
106+
#[case("prefix/2024-*/file1.txt", &["2024-02/file1.txt"])]
107+
#[case("prefix/2024-*/nested/*3*", &["2024-02/nested/file3.txt"])]
108+
#[case("prefix/2024-0{1,3}/*", &["2024-01/file1.txt", "2024-03/file5.txt"])]
109+
#[tokio::test]
110+
async fn test_download_prefix_from_first_glob(
111+
#[case] glob: &str,
112+
#[case] expected: &[&str],
113+
) -> anyhow::Result<()> {
114+
let (_node, port, client) = minio_and_client().await;
115+
116+
let bucket = "test-bucket";
117+
client.create_bucket().bucket(bucket).send().await?;
118+
119+
let test_objects = vec![
120+
"prefix/2024-01/file1.txt",
121+
"prefix/2024-01/file2.txt",
122+
"prefix/2024-02/nested/file3.txt",
123+
"prefix/2024-02/nested/file4.txt",
124+
"prefix/2024-03/file5.txt",
125+
];
126+
for key in &test_objects {
127+
create_object(&client, bucket, key).await?;
128+
}
129+
130+
let tempdir = TempDir::new()?;
131+
132+
let mut cmd = run_s3glob(
133+
port,
134+
&[
135+
"dl",
136+
"-p",
137+
"from-first-glob",
138+
format!("s3://{}/{}", bucket, glob).as_str(),
139+
tempdir.path().to_str().unwrap(),
140+
],
141+
)?;
142+
143+
let _ = cmd.assert().success();
144+
145+
for object in test_objects {
146+
if expected.contains(&object) {
147+
tempdir.child(object).assert(predicate::path::exists());
148+
} else {
149+
tempdir.child(object).assert(predicate::path::missing());
150+
}
151+
}
152+
153+
Ok(())
154+
}
155+
98156
#[rstest]
99157
#[case("{key}", "test/file.txt")]
100158
#[case("{size_bytes}", "1234")]
@@ -179,7 +237,7 @@ async fn test_patterns_in_file_not_path_component(
179237
} else {
180238
let tempdir = TempDir::new()?;
181239
let out_path = tempdir.path().to_str().unwrap();
182-
let mut cmd = run_s3glob(port, &[command, needle.as_str(), out_path])?;
240+
let mut cmd = run_s3glob(port, &[command, "-pabs", needle.as_str(), out_path])?;
183241
let _ = cmd.assert().success();
184242
for object in &test_objects {
185243
if expected.contains(object) {

0 commit comments

Comments
 (0)