Skip to content

Commit

Permalink
Reducing hash calls
Browse files Browse the repository at this point in the history
  • Loading branch information
Whattabatt committed May 2, 2024
1 parent 46b7e64 commit 5c76691
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 24 deletions.
3 changes: 3 additions & 0 deletions .devcontainer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@ To get started with the devcontainer, follow these steps:

7. Open a terminal within VSCode for all CLI-related tasks (e.g., `make test`). The simplest option is to press `Ctrl+Shift+P` (or `Cmd+Shift+P` on macOS) and select `Terminal: Create New Terminal`.

8. To run a profiler in the devcontainer, add the line `"runArgs": ["--privileged","--cap-add=SYS_ADMIN"]` to `.devcontainer/devcontainer.json` before building the container.

## Additional Configuration

If you need to customize the devcontainer configuration, you can modify the `.devcontainer/devcontainer.json` file in this directory. Refer to the [VSCode Remote - Containers documentation](https://code.visualstudio.com/docs/remote/containers) for more information on configuring devcontainers. Just be selective about merging changes that might impact correctness across platforms/OS.

Happy hacking!

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ name = "dolma"
version = "1.0.3"
edition = "2021"
license = "Apache-2.0"
inherits = "release"
debug = true

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ requires-python = ">=3.8"
dependencies = [
"anyascii>=0.3.2",
"blingfire==0.1.8",
"boto3>=1.28",
"boto3[crt]>=1.28",
"cached-path>=1.5.1",
# "fasttext==0.9.2", # broken with new version of setuptools; using fasttext-wheel instead
"fasttext-wheel==0.9.2",
Expand Down
16 changes: 2 additions & 14 deletions src/bloom_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ use std::io::{BufReader, BufWriter, Write};
use std::mem::size_of;
use std::path::PathBuf;
use std::sync::atomic::{AtomicU32, Ordering};

mod bloom_test;
// A thread-safe bloom filter.
pub struct BloomFilter {
Expand Down Expand Up @@ -191,7 +190,7 @@ impl BloomFilter {
}

// No-op if read-only
pub fn insert_hashes(&self, hashes: &Vec<u64>) {
pub fn insert(&self, hashes: &Vec<u64>) {
if !self.read_only {
for hash in hashes {
let hash = *hash as usize;
Expand All @@ -202,13 +201,7 @@ impl BloomFilter {
}
}

// No-op if read-only
pub fn insert(&self, s: &VecDeque<&str>) {
let hashes = self.hashes(s);
self.insert_hashes(&hashes);
}

pub fn contains_hashes(&self, hashes: &Vec<u64>) -> bool {
pub fn contains(&self, hashes: &Vec<u64>) -> bool {
for hash in hashes {
let hash = *hash as usize;
let index = hash / 32 % self.bits.len();
Expand All @@ -220,11 +213,6 @@ impl BloomFilter {
true
}

pub fn contains(&self, s: &VecDeque<&str>) -> bool {
let hashes = self.hashes(s);
self.contains_hashes(&hashes)
}

pub fn initialize(config: &BloomFilterConfig) -> Result<BloomFilter, io::Error> {
let save_file = PathBuf::from(&config.file);
let bloom_filter = if save_file.exists() {
Expand Down
22 changes: 13 additions & 9 deletions src/deduper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ fn write_attributes(
);

std::fs::create_dir_all(local_output.parent().unwrap())?;

log::info!(
"Writing attributes for {} to {}",
docs_location,
Expand Down Expand Up @@ -184,7 +183,8 @@ fn write_attributes(
attributes[&cfg.attribute_name] = Value::Array(Vec::new());
} else {
let dedupe_key = VecDeque::from([document_key.as_str()]);
if bloom_filter.contains(&dedupe_key) {
let hashes = bloom_filter.hashes(&dedupe_key);
if bloom_filter.contains(&hashes) {
// attributes[&cfg.attribute_name] = Value::Bool(true);

let mut duplicate_docs_array = Vec::new();
Expand All @@ -196,7 +196,7 @@ fn write_attributes(
duplicate_docs_array.push(Value::Array(attr));
attributes[&cfg.attribute_name] = Value::Array(duplicate_docs_array);
} else if !bloom_filter.read_only {
bloom_filter.insert(&dedupe_key);
bloom_filter.insert(&hashes);
}
}
}
Expand Down Expand Up @@ -246,7 +246,8 @@ fn write_attributes(
{
// Dedupe the entire paragraph
let dedupe_key = VecDeque::from([p]);
if bloom_filter.contains(&dedupe_key) {
let hashes = bloom_filter.hashes(&dedupe_key);
if bloom_filter.contains(&hashes) {
let span = vec![
Value::Number(par_start.into()),
Value::Number(par_end.into()),
Expand All @@ -255,7 +256,7 @@ fn write_attributes(
// add span to duplicate_paragraph_spans
duplicate_paragraph_spans.push(Value::Array(span));
} else if !bloom_filter.read_only {
bloom_filter.insert(&dedupe_key);
bloom_filter.insert(&hashes);
}
} else {
// Dedupe by ngram overlap
Expand All @@ -278,10 +279,12 @@ fn write_attributes(
last_ngram_start = ngram_start;
ngram_count += 1;
let dedupe_key = VecDeque::from(ngram.clone());
if bloom_filter.contains(&dedupe_key) {
let hashes = bloom_filter.hashes(&dedupe_key);

if bloom_filter.contains(&hashes) {
duplicate_ngram_count += 1;
} else if !bloom_filter.read_only {
bloom_filter.insert(&dedupe_key);
bloom_filter.insert(&hashes);
}
}
ngram.pop_front();
Expand All @@ -293,14 +296,15 @@ fn write_attributes(
{
// Too few ngrams to dedupe by overlap. Just compare the whole thing
let dedupe_key = VecDeque::from([p]);
let hashes = bloom_filter.hashes(&dedupe_key);

let span_score = match bloom_filter.contains(&dedupe_key) {
let span_score = match bloom_filter.contains(&hashes) {
// we found a match! score is 1.0
true => 1.0,
false => {
// this is a new paragraph, push to bloom filter
if !bloom_filter.read_only {
bloom_filter.insert(&dedupe_key);
bloom_filter.insert(&hashes);
}
// score is 0.0 because it's not a duplicate
0.0
Expand Down
1 change: 1 addition & 0 deletions src/shard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ impl FileCache {
&path,
Some(3), // retry twice if fail
))?;
log::info!("Download complete.");
Ok(path.clone())
} else {
let path = Path::new(location);
Expand Down

0 comments on commit 5c76691

Please sign in to comment.