diff --git a/data/src/clean.rs b/data/src/clean.rs
index d1d870d..bc285ce 100644
--- a/data/src/clean.rs
+++ b/data/src/clean.rs
@@ -39,12 +39,22 @@ use crate::remote_shard_interface::RemoteShardInterface;
 use crate::repo_salt::RepoSalt;
 use crate::PointerFile;
 
-// Chunking is the bottleneck, changing batch size doesn't have a big impact.
+// Tradeoff is the memory size of the buffer vs. the following benefits:
+// 1. global dedup query -- when a chunk hash satisfies the condition for global dedup, we query
+// the global dedup server in parallel with chunking the rest of the file.  We process the block of
+// chunks while that query is executed, but then wait until it finishes before either reprocessing
+// the chunks or exiting.  A larger batch size means more work is done in the round trip here in the case
+// of not hitting the global dedup, and more is reprocessed if the global dedup was successful.
+//
+// 2. Dedup from a single match proceeds as far as possible through the chunks while they still match,
+// which saves a lot of time when there are many shards and hmac keys to work through.
+//
+// 256 is chosen as a decent balance between memory and the above benefits.
 lazy_static! {
     pub static ref DEDUP_CHUNK_BATCH_SIZE: usize = std::env::var("XET_DEDUP_BATCHSIZE")
         .ok()
         .and_then(|s| s.parse().ok())
-        .unwrap_or(1);
+        .unwrap_or(256);
 }
 
 lazy_static! {