From bbcf1962dc54eee8c60c4a1e256490420667881f Mon Sep 17 00:00:00 2001
From: Yucheng Low <2740522+ylow@users.noreply.github.com>
Date: Fri, 24 Jan 2025 13:47:26 -0800
Subject: [PATCH] Remove small file passthrough (#149)

Removing all the old code that passes through small text file through to
git. Since we are not using that anymore.
---
 data/src/bin/example.rs                       |  3 +-
 data/src/clean.rs                             | 59 ++-----------------
 data/src/configurations.rs                    |  1 -
 data/src/constants.rs                         |  9 ---
 data/src/data_client.rs                       |  4 --
 data/src/data_processing.rs                   |  1 -
 data/src/lib.rs                               |  2 -
 data/src/small_file_determination.rs          | 19 ------
 .../test_basic_clean_smudge.sh                |  4 +-
 9 files changed, 8 insertions(+), 94 deletions(-)
 delete mode 100644 data/src/small_file_determination.rs
diff --git a/data/src/bin/example.rs b/data/src/bin/example.rs
index 90ef47dc..a0cd77ff 100644
--- a/data/src/bin/example.rs
+++ b/data/src/bin/example.rs
@@ -9,7 +9,7 @@ use anyhow::Result;
 use cas_client::CacheConfig;
 use clap::{Args, Parser, Subcommand};
 use data::configurations::*;
-use data::{PointerFile, PointerFileTranslator, SMALL_FILE_THRESHOLD};
+use data::{PointerFile, PointerFileTranslator};
 use utils::ThreadPool;
 
 #[derive(Parser)]
@@ -101,7 +101,6 @@ fn default_clean_config() -> Result<TranslatorConfig> {
         },
         dedup_config: Some(DedupConfig {
             repo_salt: None,
-            small_file_threshold: SMALL_FILE_THRESHOLD,
             global_dedup_policy: Default::default(),
         }),
         repo_info: Some(RepoInfo {
diff --git a/data/src/clean.rs b/data/src/clean.rs
index ce39f6ac..f8b6fac4 100644
--- a/data/src/clean.rs
+++ b/data/src/clean.rs
@@ -34,7 +34,6 @@ use crate::metrics::FILTER_BYTES_CLEANED;
 use crate::parallel_xorb_uploader::XorbUpload;
 use crate::remote_shard_interface::RemoteShardInterface;
 use crate::repo_salt::RepoSalt;
-use crate::small_file_determination::{is_file_passthrough, is_possible_start_to_text_file};
 use crate::PointerFile;
 
 // Chunking is the bottleneck, changing batch size doesn't have a big impact.
@@ -108,7 +107,6 @@ impl ShaGenerator {
 
 pub struct Cleaner {
     // Configurations
-    small_file_threshold: usize,
     enable_global_dedup_queries: bool,
     cas_prefix: String,
     repo_salt: Option<RepoSalt>,
@@ -129,7 +127,6 @@ pub struct Cleaner {
 
     // Internal Data
     tracking_info: Mutex<DedupFileTrackingInfo>,
-    small_file_buffer: Mutex<Option<Vec<u8>>>,
 
     // Auxiliary info
     file_name: Option<PathBuf>,
@@ -145,7 +142,6 @@ pub struct Cleaner {
 impl Cleaner {
     #[allow(clippy::too_many_arguments)]
     pub(crate) async fn new(
-        small_file_threshold: usize,
         enable_global_dedup_queries: bool,
         cas_prefix: String,
         repo_salt: Option<RepoSalt>,
@@ -166,7 +162,6 @@ impl Cleaner {
         let chunker = chunk_target_default(data_c, chunk_p, threadpool.clone());
 
         let cleaner = Arc::new(Cleaner {
-            small_file_threshold,
             enable_global_dedup_queries,
             cas_prefix,
             repo_salt,
@@ -178,7 +173,6 @@ impl Cleaner {
             chunking_worker: Mutex::new(Some(chunker)),
             dedup_worker: Mutex::new(None),
             tracking_info: Mutex::new(Default::default()),
-            small_file_buffer: Mutex::new(Some(Vec::with_capacity(small_file_threshold))),
             file_name: file_name.map(|f| f.to_owned()),
             sha_generator: ShaGenerator::new(),
             metrics: CleanMetrics {
@@ -200,9 +194,8 @@ impl Cleaner {
         self.metrics.file_size.fetch_add(data.len() as u64, Ordering::Relaxed);
 
         self.sha_generator.update(&data)?;
-        if !self.check_passthrough_status(&data).await? {
-            self.add_data_to_chunking(BufferItem::Value(data)).await?
-        }
+
+        self.add_data_to_chunking(BufferItem::Value(data)).await?;
 
         Ok(())
     }
@@ -212,15 +205,8 @@ impl Cleaner {
 
         let file_size = self.metrics.file_size.load(Ordering::Relaxed);
 
-        // File is small, all data kept in the small file buffer.
-        let mut small_file_buffer = self.small_file_buffer.lock().await;
-        let (new_bytes, return_file) = if let Some(buffer) = small_file_buffer.take() {
-            let small_file = String::from_utf8(buffer)?;
-            (small_file.len() as u64, small_file)
-        } else {
-            let new_bytes = self.metrics.new_bytes_after_dedup.load(Ordering::Relaxed);
-            (new_bytes, self.to_pointer_file().await?)
-        };
+        let new_bytes = self.metrics.new_bytes_after_dedup.load(Ordering::Relaxed);
+        let return_file = self.to_pointer_file().await?;
 
         let current_time = SystemTime::now();
         let start: DateTime<Utc> = self.metrics.start_time.into();
@@ -308,32 +294,6 @@ impl Cleaner {
         Ok(())
     }
 
-    /// Check passthrough condition of data.
-    /// Return true if the incoming data is already processed inside,
-    /// otherwise return false and let the caller to handle the data.
-    async fn check_passthrough_status(&self, data: &[u8]) -> Result<bool> {
-        let mut small_file_buffer = self.small_file_buffer.lock().await;
-
-        if let Some(mut buffer) = small_file_buffer.take() {
-            buffer.extend_from_slice(data);
-
-            if !is_possible_start_to_text_file(&buffer) || buffer.len() >= self.small_file_threshold {
-                self.add_data_to_chunking(BufferItem::Value(buffer)).await?;
-
-                // not passthrough, but just sent all buffered data + incoming data to chunker
-                return Ok(true);
-            }
-
-            *small_file_buffer = Some(buffer);
-
-            // may be passthrough, keep accumulating
-            return Ok(true);
-        }
-
-        // not passthrough, already sent all buffered data to chunker
-        Ok(false)
-    }
-
     async fn dedup(&self, chunks: &[ChunkYieldType]) -> Result<()> {
         debug!("Dedup {} chunks", chunks.len());
         let mut tracking_info = self.tracking_info.lock().await;
@@ -574,17 +534,6 @@ impl Cleaner {
     async fn finish(&self) -> Result<()> {
         self.task_is_running().await?;
 
-        // check if there is remaining data in buffer
-        let mut small_file_buffer = self.small_file_buffer.lock().await;
-        if let Some(buffer) = small_file_buffer.take() {
-            if !is_file_passthrough(&buffer, self.small_file_threshold) {
-                self.add_data_to_chunking(BufferItem::Value(buffer)).await?;
-            } else {
-                // put back for return value
-                *small_file_buffer = Some(buffer);
-            }
-        }
-
         // signal finish
         self.add_data_to_chunking(BufferItem::Completed).await?;
 
diff --git a/data/src/configurations.rs b/data/src/configurations.rs
index a1f17624..89575a4b 100644
--- a/data/src/configurations.rs
+++ b/data/src/configurations.rs
@@ -25,7 +25,6 @@ pub struct StorageConfig {
 #[derive(Debug)]
 pub struct DedupConfig {
     pub repo_salt: Option<RepoSalt>,
-    pub small_file_threshold: usize,
     pub global_dedup_policy: GlobalDedupPolicy,
 }
 
diff --git a/data/src/constants.rs b/data/src/constants.rs
index 7b2f87a5..f7cb02d2 100644
--- a/data/src/constants.rs
+++ b/data/src/constants.rs
@@ -18,15 +18,6 @@ pub const GIT_MAX_PACKET_SIZE: usize = 65516;
 /// See gitxetcore::data::pointer_file for the explanation for this limit.
 pub const POINTER_FILE_LIMIT: usize = 150;
 
-/// If a file has size smaller than this threshold, AND if it "looks-like"
-/// text, we interpret this as a text file and passthrough the file, letting
-/// git handle it. See `small_file_determination.rs` for details.
-///
-/// We set this to be 1 less than a constant multiple of the GIT_MAX_PACKET_SIZE
-/// so we can read exactly up to that multiple of packets to determine if it
-/// is a small file.
-pub const SMALL_FILE_THRESHOLD: usize = 4 * GIT_MAX_PACKET_SIZE - 1;
-
 // Salt is 256-bit in length.
 pub const REPO_SALT_LEN: usize = 32;
 
diff --git a/data/src/data_client.rs b/data/src/data_client.rs
index e8e8b0fb..4dbe3051 100644
--- a/data/src/data_client.rs
+++ b/data/src/data_client.rs
@@ -28,9 +28,6 @@ lazy_static! {
 }
 const MAX_CONCURRENT_DOWNLOADS: usize = 8; // Download is not CPU-bound
 
-// We now process every file delegated from the Python library.
-const SMALL_FILE_THRESHOLD: usize = 1;
-
 const DEFAULT_CAS_ENDPOINT: &str = "http://localhost:8080";
 const READ_BLOCK_SIZE: usize = 1024 * 1024;
 
@@ -76,7 +73,6 @@ pub fn default_config(
         },
         dedup_config: Some(DedupConfig {
             repo_salt: None,
-            small_file_threshold: SMALL_FILE_THRESHOLD,
             global_dedup_policy: Default::default(),
         }),
         repo_info: Some(RepoInfo {
diff --git a/data/src/data_processing.rs b/data/src/data_processing.rs
index 4676f900..d1f0f547 100644
--- a/data/src/data_processing.rs
+++ b/data/src/data_processing.rs
@@ -172,7 +172,6 @@ impl PointerFileTranslator {
         };
 
         Cleaner::new(
-            dedup.small_file_threshold,
             matches!(dedup.global_dedup_policy, GlobalDedupPolicy::Always),
             self.config.cas_storage_config.prefix.clone(),
             dedup.repo_salt,
diff --git a/data/src/lib.rs b/data/src/lib.rs
index 400c6f27..ef10fd20 100644
--- a/data/src/lib.rs
+++ b/data/src/lib.rs
@@ -14,10 +14,8 @@ mod pointer_file;
 mod remote_shard_interface;
 mod repo_salt;
 mod shard_interface;
-mod small_file_determination;
 mod test_utils;
 
 pub use cas_client::CacheConfig;
-pub use constants::SMALL_FILE_THRESHOLD;
 pub use data_processing::PointerFileTranslator;
 pub use pointer_file::PointerFile;
diff --git a/data/src/small_file_determination.rs b/data/src/small_file_determination.rs
deleted file mode 100644
index 822168cf..00000000
--- a/data/src/small_file_determination.rs
+++ /dev/null
@@ -1,19 +0,0 @@
-/// Implements the small file heuristic to determine if this file
-/// should be stored in Git or MerkleDB.
-/// The current heuristic is simple:
-///  - if it is less than the SMALL_FILE_THRESHOLD AND if it decodes as utf-8, it is a small file
-pub fn is_file_passthrough(buf: &[u8], small_file_threshold: usize) -> bool {
-    buf.len() < small_file_threshold && std::str::from_utf8(buf).is_ok()
-}
-
-pub fn is_possible_start_to_text_file(buf: &[u8]) -> bool {
-    // In UTF-8 encoding, the maximum length of a character is 4 bytes. This means
-    // that a valid UTF-8 character can span up to 4 bytes. Therefore, when you have
-    // a sequence of bytes that could be part of a valid UTF-8 string but is truncated,
-    // the maximum difference between the end of the buffer and the position indicated
-    // by e.valid_up_to() from a Utf8Error is 3 bytes.
-    match std::str::from_utf8(buf) {
-        Ok(_) => true,
-        Err(e) => e.valid_up_to() != 0 && e.valid_up_to() >= buf.len().saturating_sub(3),
-    }
-}
diff --git a/data/tests/integration_tests/test_basic_clean_smudge.sh b/data/tests/integration_tests/test_basic_clean_smudge.sh
index 93aad16f..77336b11 100644
--- a/data/tests/integration_tests/test_basic_clean_smudge.sh
+++ b/data/tests/integration_tests/test_basic_clean_smudge.sh
@@ -29,4 +29,6 @@ assert_files_equal large.dat large.dat.2
 create_text_file small.txt key1 100 1
 
 x clean -d small.pft small.txt
-assert_files_equal small.pft small.txt # not converted to pointer file
+assert_is_pointer_file small.pft
+x smudge -f small.pft small.txt.2
+assert_files_equal small.txt small.txt.2