Skip to content

Commit

Permalink
Merge pull request #17 from lnx-search/custom-doc-structure
Browse files Browse the repository at this point in the history
add custom doc structure to allow for more lax semantics
  • Loading branch information
ChillFish8 authored Aug 28, 2021
2 parents e2b2ea8 + 8ba85c7 commit 0d16c72
Show file tree
Hide file tree
Showing 24 changed files with 389 additions and 238 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lnx"
version = "0.4.1"
version = "0.5.0"
authors = ["Harrison Burt <[email protected]>"]
edition = "2018"

Expand Down
3 changes: 2 additions & 1 deletion engine/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "engine"
version = "0.1.1"
version = "0.2.0"
authors = ["Harrison Burt <[email protected]>"]
edition = "2018"

Expand All @@ -13,6 +13,7 @@ hashbrown = { version = "0.11", features = ["serde"] }
uuid = { version = "0.8", features = ["v4", "serde"] }
symspell = { git = "https://github.com/ChillFish8/symspell", branch = "master" }

chrono = "0.4"
serde_json = "1"
num_cpus = "1.13"
rayon = "1.5"
Expand Down
3 changes: 1 addition & 2 deletions engine/build.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::fs;
use std::io::Write;
use std::path;
use std::{fs, path};

use anyhow::Result;
use flate2::write::GzEncoder;
Expand Down
5 changes: 2 additions & 3 deletions engine/src/correction.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
use once_cell::sync::OnceCell;
use std::fs;
use std::io::Write;
use std::sync::atomic::{AtomicBool, Ordering};

use symspell::{AsciiStringStrategy, SymSpell};

use flate2::write::GzDecoder;
use once_cell::sync::OnceCell;
use symspell::{AsciiStringStrategy, SymSpell};

static SYMSPELL: OnceCell<SymSpell<AsciiStringStrategy>> = OnceCell::new();
static ENABLED: AtomicBool = AtomicBool::new(false);
Expand Down
3 changes: 2 additions & 1 deletion engine/src/engine.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::sync::Arc;

use anyhow::{Error, Result};
use hashbrown::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

use crate::correction::enable_load_dictionaries;
Expand Down
30 changes: 23 additions & 7 deletions engine/src/helpers.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,46 @@
use ahash::AHasher;
use std::hash::{Hash, Hasher};

use tantivy::schema::{NamedFieldDocument, Value};
use ahash::AHasher;

use crate::correction::correct_sentence;
use crate::structures;
use crate::structures::{DocumentValue, DocumentItem};

/// Hashes any `Hash`-able value with a fresh `AHasher` and returns
/// the resulting 64-bit digest.
pub(crate) fn hash<T: Hash>(v: &T) -> u64 {
    let mut state = AHasher::default();
    v.hash(&mut state);
    state.finish()
}

pub fn correct_doc_fields(doc: &mut NamedFieldDocument, indexed_text_fields: &Vec<String>) {
pub fn correct_doc_fields(doc: &mut structures::Document, indexed_text_fields: &Vec<String>) {
let mut changes = vec![];

for target in indexed_text_fields {
let id = hash(target);

let maybe_values = doc.0.get(target);
if let Some(values) = maybe_values {
for val in values {
if let Value::Str(data) = val {
let corrected = correct_sentence(data, 1);
changes.push((format!("_{}", id), vec![Value::Str(corrected)]));
match values {
DocumentItem::Single(value) => {
if let DocumentValue::Text(ref data) = value {
let corrected = correct_sentence(data, 1);
changes.push((format!("_{}", id), DocumentItem::Single(DocumentValue::Text(corrected))));
}
},
DocumentItem::Multi(values) => {
let mut local_changes = vec![];
for val in values {
if let DocumentValue::Text(ref data) = val {
let corrected = correct_sentence(data, 1);
local_changes.push(DocumentValue::Text(corrected));
}
}
if local_changes.len() > 0 {
changes.push((format!("_{}", id), DocumentItem::Multi(local_changes)));
}
}
}

};
}

Expand Down
95 changes: 50 additions & 45 deletions engine/src/index/mod.rs
Original file line number Diff line number Diff line change
@@ -1,44 +1,25 @@
use anyhow::{Error, Result};
use parking_lot::Mutex;
use std::sync::Arc;
use tokio::fs;
use tokio::task::JoinHandle;

use anyhow::{Error, Result};
use parking_lot::Mutex;
use tantivy::directory::MmapDirectory;
use tantivy::query::QueryParser;
use tantivy::schema::{NamedFieldDocument, Schema, Value};
use tantivy::schema::{Schema, Value, FieldType};
use tantivy::{Document, Index, IndexBuilder, ReloadPolicy, Term};
use tokio::fs;
use tokio::task::JoinHandle;

use crate::correction;
use crate::helpers::{self, hash};
use crate::index::reader::QueryHit;
use crate::structures::{FieldValue, IndexStorageType, LoadedIndex, QueryPayload};
use crate::structures::{self, IndexStorageType, LoadedIndex, QueryPayload, DocumentValue};
use chrono::Utc;

pub(super) mod reader;
pub(super) mod writer;

static INDEX_DATA_PATH: &str = "./lnx/index-data";

/// Builds a `Term` from the *first* element of a slice of values.
///
/// Expands to an expression of type `Result<Term>`: an `Err` when the
/// slice is empty, otherwise `Ok` of the given constructor applied to
/// the first element. Any elements beyond the first are silently
/// ignored.
macro_rules! add_values_to_terms {
    // Arm for constructors that take the value by reference
    // (note the leading `&` in the pattern), e.g. text and date values.
    ($t:ident::$cb:ident, $field:expr, &$sv:expr) => {{
        if $sv.len() == 0 {
            Err(Error::msg("field must have one value"))
        } else {
            Ok($t::$cb($field, &$sv[0]))
        }
    }};

    // Arm for constructors that take the value by copy, e.g. i64/u64/f64.
    ($t:ident::$cb:ident, $field:expr, $sv:expr) => {{
        if $sv.len() == 0 {
            Err(Error::msg("field must have one value"))
        } else {
            Ok($t::$cb($field, $sv[0]))
        }
    }};
}

/// A search engine index.
///
/// Each index maintains a rayon thread pool which searches are executed
Expand Down Expand Up @@ -104,11 +85,11 @@ impl IndexHandler {
&loader.name
);
(index.create_from_tempdir()?, None)
}
},
IndexStorageType::Memory => {
info!("[ SETUP @ {} ] creating index in memory", &loader.name);
(index.create_in_ram()?, None)
}
},
IndexStorageType::FileSystem => {
info!("[ SETUP @ {} ] creating index in directory", &loader.name);

Expand All @@ -117,7 +98,7 @@ impl IndexHandler {

let dir = MmapDirectory::open(&path)?;
(index.open_or_create(dir)?, Some(path.clone()))
}
},
};

Ok(out)
Expand Down Expand Up @@ -159,15 +140,15 @@ impl IndexHandler {
} else {
search_fields.push((field, 0.0f32));
};
}
},
(Some(field), None) => {
if let Some(boost) = loader.boost_fields.get(&ref_field) {
debug!("boosting field for query parser {} {}", &ref_field, boost);
search_fields.push((field, *boost));
} else {
search_fields.push((field, 0.0f32));
};
}
},
(None, _) => {
let fields: Vec<String> = loader
.schema
Expand All @@ -180,7 +161,7 @@ impl IndexHandler {
and declared the a search_field {:?} but this does not exist in the defined fields.",
fields, &ref_field
)));
}
},
};
}

Expand Down Expand Up @@ -253,22 +234,47 @@ impl IndexHandler {
/// Builds a `Term` from a given field and value.
///
/// This assumes that the value type matches up with the field type.
pub fn get_term(&self, field: &str, value: FieldValue) -> Result<Term> {
pub fn get_term(&self, field: &str, value: DocumentValue) -> Result<Term> {
let field = self
.schema
.get_field(field)
.map(|v| Ok(v))
.unwrap_or_else(|| Err(Error::msg("unknown field")))?;

let v = match value {
FieldValue::I64(v) => add_values_to_terms!(Term::from_field_i64, field, v)?,
FieldValue::F64(v) => add_values_to_terms!(Term::from_field_f64, field, v)?,
FieldValue::U64(v) => add_values_to_terms!(Term::from_field_u64, field, v)?,
FieldValue::Datetime(v) => add_values_to_terms!(Term::from_field_date, field, &v)?,
FieldValue::Text(v) => add_values_to_terms!(Term::from_field_text, field, &v)?,
let entry = self.schema.get_field_entry(field);
let field_type = entry.field_type();

let term = match (value, field_type) {
(DocumentValue::I64(v), FieldType::I64(_)) => Term::from_field_i64(field, v),
(DocumentValue::U64(v), FieldType::U64(_)) => Term::from_field_u64(field, v),
(DocumentValue::F64(v), FieldType::F64(_)) => Term::from_field_f64(field, v),
(DocumentValue::Text(v), FieldType::Str(_)) => Term::from_field_text(field, &v),
(DocumentValue::Datetime(v), FieldType::Str(_)) => Term::from_field_text(field, &v.to_string()),
(DocumentValue::Datetime(v), FieldType::Date(_)) => Term::from_field_date(field, &v),
(DocumentValue::I64(v), FieldType::Date(_)) => {
match chrono::NaiveDateTime::from_timestamp_opt(v, 0) {
Some(dt) => {
let dt = chrono::DateTime::from_utc(dt, Utc);
Term::from_field_date(field, &dt)
},
None =>
return Err(Error::msg(format!("filed {:?} is type {:?} in schema but did not get a valid value (invalid timestamp)", &field, field_type))),
}
},
(DocumentValue::U64(v), FieldType::Date(_)) => {
match chrono::NaiveDateTime::from_timestamp_opt(v as i64, 0) {
Some(dt) => {
let dt = chrono::DateTime::from_utc(dt, Utc);
Term::from_field_date(field, &dt)
},
None =>
return Err(Error::msg(format!("filed {:?} is type {:?} in schema but did not get a valid value (invalid timestamp)", &field, field_type))),
}
},
_ => return Err(Error::msg(format!("filed {:?} is type {:?} in schema but did not get a valid value", &field, field_type)))
};

Ok(v)
Ok(term)
}

/// Gets a document with a given document address.
Expand Down Expand Up @@ -297,7 +303,7 @@ impl IndexHandler {
}

/// Submits a document to be processed by the index writer.
pub async fn add_document(&self, mut document: NamedFieldDocument) -> Result<()> {
pub async fn add_document(&self, mut document: structures::Document) -> Result<()> {
let field = self.schema.get_field("_id").ok_or_else(|| {
Error::msg(
"system has not correctly initialised this schema,\
Expand All @@ -309,7 +315,7 @@ impl IndexHandler {
helpers::correct_doc_fields(&mut document, self.indexed_fields());
}

let mut doc = self.schema.convert_named_doc(document)?;
let mut doc = document.parse_into_document(&self.schema)?;

let id = uuid::Uuid::new_v4();
doc.add_u64(field, hash(&id));
Expand All @@ -326,7 +332,7 @@ impl IndexHandler {
/// linear.
///
/// If fast fuzzy is not enabled however, this just calls add_docs in a loop.
pub async fn add_many_documents(&self, documents: Vec<NamedFieldDocument>) -> Result<()> {
pub async fn add_many_documents(&self, documents: Vec<structures::Document>) -> Result<()> {
let field = self.schema.get_field("_id").ok_or_else(|| {
Error::msg(
"system has not correctly initialised this schema,\
Expand Down Expand Up @@ -361,8 +367,7 @@ impl IndexHandler {
let mut processed_documents = vec![];
while let Ok(mut doc) = receiver.recv() {
helpers::correct_doc_fields(&mut doc, fields.as_ref());
let doc = schema.convert_named_doc(doc)?;

let doc = doc.parse_into_document(&schema)?;
processed_documents.push(doc);
}

Expand Down
35 changes: 19 additions & 16 deletions engine/src/index/reader.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
use std::sync::Arc;

use anyhow::{Error, Result};
use serde::Serialize;

use tokio::sync::oneshot;
use tokio::sync::Semaphore;

use crossbeam::queue::ArrayQueue;

use serde::Serialize;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{
BooleanQuery, EmptyQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery,
BooleanQuery,
BoostQuery,
EmptyQuery,
FuzzyTermQuery,
MoreLikeThisQuery,
Occur,
Query,
QueryParser,
TermQuery,
};
use tantivy::query::{BoostQuery, MoreLikeThisQuery};
use tantivy::schema::{Field, FieldType, IndexRecordOption, NamedFieldDocument, Schema, Value};
use tantivy::{DocAddress, Executor, IndexReader, LeasedItem, Score, Searcher, Term};
use tokio::sync::{oneshot, Semaphore};

use crate::correction::{self, correct_sentence};
use crate::structures::{QueryMode, QueryPayload};
Expand All @@ -32,7 +35,7 @@ macro_rules! try_get_doc {
Err(e) => {
let _ = $resolve.send(Err(Error::from(e)));
return;
}
},
Ok(res) => res,
};

Expand Down Expand Up @@ -268,7 +271,7 @@ impl IndexReaderHandler {
Some(doc) => {
let doc = try_get_doc!(resolve, searcher, doc);
Some(doc)
}
},
};

let query = match parse_query(
Expand All @@ -283,7 +286,7 @@ impl IndexReaderHandler {
Err(e) => {
let _ = resolve.send(Err(e));
return;
}
},
Ok(q) => q,
};

Expand Down Expand Up @@ -349,7 +352,7 @@ fn parse_query(
parse_fuzzy_query(query, search_fields)
};
Ok(qry)
}
},
(QueryMode::MoreLikeThis, _, None) => Err(Error::msg(
"query mode was `MoreLikeThis` but reference document is `None`",
)),
Expand Down Expand Up @@ -555,22 +558,22 @@ fn search(
let out: (Vec<(i64, DocAddress)>, usize) =
order_and_search!(searcher, collector, field, &query, executor)?;
(process_search!(searcher, schema, out.0), out.1)
}
},
FieldType::U64(_) => {
let out: (Vec<(u64, DocAddress)>, usize) =
order_and_search!(searcher, collector, field, &query, executor)?;
(process_search!(searcher, schema, out.0), out.1)
}
},
FieldType::F64(_) => {
let out: (Vec<(f64, DocAddress)>, usize) =
order_and_search!(searcher, collector, field, &query, executor)?;
(process_search!(searcher, schema, out.0), out.1)
}
},
FieldType::Date(_) => {
let out: (Vec<(i64, DocAddress)>, usize) =
order_and_search!(searcher, collector, field, &query, executor)?;
(process_search!(searcher, schema, out.0), out.1)
}
},
_ => return Err(Error::msg("field is not a fast field")),
}
} else {
Expand Down
Loading

0 comments on commit 0d16c72

Please sign in to comment.