Skip to content

Commit

Permalink
Merge pull request #17 from lnx-search/custom-doc-structure
Browse files Browse the repository at this point in the history
add custom doc structure to allow for more lax semantics
  • Loading branch information
ChillFish8 authored Aug 28, 2021
2 parents e2b2ea8 + 8ba85c7 commit 0d16c72
Show file tree
Hide file tree
Showing 24 changed files with 389 additions and 238 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lnx"
version = "0.4.1"
version = "0.5.0"
authors = ["Harrison Burt <[email protected]>"]
edition = "2018"

Expand Down
3 changes: 2 additions & 1 deletion engine/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "engine"
version = "0.1.1"
version = "0.2.0"
authors = ["Harrison Burt <[email protected]>"]
edition = "2018"

Expand All @@ -13,6 +13,7 @@ hashbrown = { version = "0.11", features = ["serde"] }
uuid = { version = "0.8", features = ["v4", "serde"] }
symspell = { git = "https://github.com/ChillFish8/symspell", branch = "master" }

chrono = "0.4"
serde_json = "1"
num_cpus = "1.13"
rayon = "1.5"
Expand Down
3 changes: 1 addition & 2 deletions engine/build.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::fs;
use std::io::Write;
use std::path;
use std::{fs, path};

use anyhow::Result;
use flate2::write::GzEncoder;
Expand Down
5 changes: 2 additions & 3 deletions engine/src/correction.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
use once_cell::sync::OnceCell;
use std::fs;
use std::io::Write;
use std::sync::atomic::{AtomicBool, Ordering};

use symspell::{AsciiStringStrategy, SymSpell};

use flate2::write::GzDecoder;
use once_cell::sync::OnceCell;
use symspell::{AsciiStringStrategy, SymSpell};

static SYMSPELL: OnceCell<SymSpell<AsciiStringStrategy>> = OnceCell::new();
static ENABLED: AtomicBool = AtomicBool::new(false);
Expand Down
3 changes: 2 additions & 1 deletion engine/src/engine.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::sync::Arc;

use anyhow::{Error, Result};
use hashbrown::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

use crate::correction::enable_load_dictionaries;
Expand Down
30 changes: 23 additions & 7 deletions engine/src/helpers.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,46 @@
use ahash::AHasher;
use std::hash::{Hash, Hasher};

use tantivy::schema::{NamedFieldDocument, Value};
use ahash::AHasher;

use crate::correction::correct_sentence;
use crate::structures;
use crate::structures::{DocumentValue, DocumentItem};

/// Hashes any `Hash`-able value with a fresh `AHasher` and returns
/// the resulting 64-bit digest.
pub(crate) fn hash<T: Hash>(v: &T) -> u64 {
    let mut state = AHasher::default();
    v.hash(&mut state);
    state.finish()
}

pub fn correct_doc_fields(doc: &mut NamedFieldDocument, indexed_text_fields: &Vec<String>) {
pub fn correct_doc_fields(doc: &mut structures::Document, indexed_text_fields: &Vec<String>) {
let mut changes = vec![];

for target in indexed_text_fields {
let id = hash(target);

let maybe_values = doc.0.get(target);
if let Some(values) = maybe_values {
for val in values {
if let Value::Str(data) = val {
let corrected = correct_sentence(data, 1);
changes.push((format!("_{}", id), vec![Value::Str(corrected)]));
match values {
DocumentItem::Single(value) => {
if let DocumentValue::Text(ref data) = value {
let corrected = correct_sentence(data, 1);
changes.push((format!("_{}", id), DocumentItem::Single(DocumentValue::Text(corrected))));
}
},
DocumentItem::Multi(values) => {
let mut local_changes = vec![];
for val in values {
if let DocumentValue::Text(ref data) = val {
let corrected = correct_sentence(data, 1);
local_changes.push(DocumentValue::Text(corrected));
}
}
if local_changes.len() > 0 {
changes.push((format!("_{}", id), DocumentItem::Multi(local_changes)));
}
}
}

};
}

Expand Down
95 changes: 50 additions & 45 deletions engine/src/index/mod.rs
Original file line number Diff line number Diff line change
@@ -1,44 +1,25 @@
use anyhow::{Error, Result};
use parking_lot::Mutex;
use std::sync::Arc;
use tokio::fs;
use tokio::task::JoinHandle;

use anyhow::{Error, Result};
use parking_lot::Mutex;
use tantivy::directory::MmapDirectory;
use tantivy::query::QueryParser;
use tantivy::schema::{NamedFieldDocument, Schema, Value};
use tantivy::schema::{Schema, Value, FieldType};
use tantivy::{Document, Index, IndexBuilder, ReloadPolicy, Term};
use tokio::fs;
use tokio::task::JoinHandle;

use crate::correction;
use crate::helpers::{self, hash};
use crate::index::reader::QueryHit;
use crate::structures::{FieldValue, IndexStorageType, LoadedIndex, QueryPayload};
use crate::structures::{self, IndexStorageType, LoadedIndex, QueryPayload, DocumentValue};
use chrono::Utc;

pub(super) mod reader;
pub(super) mod writer;

static INDEX_DATA_PATH: &str = "./lnx/index-data";

/// Builds a `Term` from the *first* element of a slice of values.
///
/// Expands to an expression of type `Result<Term>`: an `Err` when the
/// slice is empty, otherwise `Ok` of the given constructor applied to
/// the first element. Any elements beyond the first are silently
/// ignored.
macro_rules! add_values_to_terms {
    // Arm for constructors that take the value by reference
    // (note the leading `&` in the pattern), e.g. text and date values.
    ($t:ident::$cb:ident, $field:expr, &$sv:expr) => {{
        if $sv.len() == 0 {
            Err(Error::msg("field must have one value"))
        } else {
            Ok($t::$cb($field, &$sv[0]))
        }
    }};

    // Arm for constructors that take the value by copy, e.g. i64/u64/f64.
    ($t:ident::$cb:ident, $field:expr, $sv:expr) => {{
        if $sv.len() == 0 {
            Err(Error::msg("field must have one value"))
        } else {
            Ok($t::$cb($field, $sv[0]))
        }
    }};
}

/// A search engine index.
///
/// Each index maintains a rayon thread pool which searches are executed
Expand Down Expand Up @@ -104,11 +85,11 @@ impl IndexHandler {
&loader.name
);
(index.create_from_tempdir()?, None)
}
},
IndexStorageType::Memory => {
info!("[ SETUP @ {} ] creating index in memory", &loader.name);
(index.create_in_ram()?, None)
}
},
IndexStorageType::FileSystem => {
info!("[ SETUP @ {} ] creating index in directory", &loader.name);

Expand All @@ -117,7 +98,7 @@ impl IndexHandler {

let dir = MmapDirectory::open(&path)?;
(index.open_or_create(dir)?, Some(path.clone()))
}
},
};

Ok(out)
Expand Down Expand Up @@ -159,15 +140,15 @@ impl IndexHandler {
} else {
search_fields.push((field, 0.0f32));
};
}
},
(Some(field), None) => {
if let Some(boost) = loader.boost_fields.get(&ref_field) {
debug!("boosting field for query parser {} {}", &ref_field, boost);
search_fields.push((field, *boost));
} else {
search_fields.push((field, 0.0f32));
};
}
},
(None, _) => {
let fields: Vec<String> = loader
.schema
Expand All @@ -180,7 +161,7 @@ impl IndexHandler {
and declared the a search_field {:?} but this does not exist in the defined fields.",
fields, &ref_field
)));
}
},
};
}

Expand Down Expand Up @@ -253,22 +234,47 @@ impl IndexHandler {
/// Builds a `Term` from a given field and value.
///
/// This assumes that the value type matches up with the field type.
pub fn get_term(&self, field: &str, value: FieldValue) -> Result<Term> {
pub fn get_term(&self, field: &str, value: DocumentValue) -> Result<Term> {
let field = self
.schema
.get_field(field)
.map(|v| Ok(v))
.unwrap_or_else(|| Err(Error::msg("unknown field")))?;

let v = match value {
FieldValue::I64(v) => add_values_to_terms!(Term::from_field_i64, field, v)?,
FieldValue::F64(v) => add_values_to_terms!(Term::from_field_f64, field, v)?,
FieldValue::U64(v) => add_values_to_terms!(Term::from_field_u64, field, v)?,
FieldValue::Datetime(v) => add_values_to_terms!(Term::from_field_date, field, &v)?,
FieldValue::Text(v) => add_values_to_terms!(Term::from_field_text, field, &v)?,
let entry = self.schema.get_field_entry(field);
let field_type = entry.field_type();

let term = match (value, field_type) {
(DocumentValue::I64(v), FieldType::I64(_)) => Term::from_field_i64(field, v),
(DocumentValue::U64(v), FieldType::U64(_)) => Term::from_field_u64(field, v),
(DocumentValue::F64(v), FieldType::F64(_)) => Term::from_field_f64(field, v),
(DocumentValue::Text(v), FieldType::Str(_)) => Term::from_field_text(field, &v),
(DocumentValue::Datetime(v), FieldType::Str(_)) => Term::from_field_text(field, &v.to_string()),
(DocumentValue::Datetime(v), FieldType::Date(_)) => Term::from_field_date(field, &v),
(DocumentValue::I64(v), FieldType::Date(_)) => {
match chrono::NaiveDateTime::from_timestamp_opt(v, 0) {
Some(dt) => {
let dt = chrono::DateTime::from_utc(dt, Utc);
Term::from_field_date(field, &dt)
},
None =>
return Err(Error::msg(format!("filed {:?} is type {:?} in schema but did not get a valid value (invalid timestamp)", &field, field_type))),
}
},
(DocumentValue::U64(v), FieldType::Date(_)) => {
match chrono::NaiveDateTime::from_timestamp_opt(v as i64, 0) {
Some(dt) => {
let dt = chrono::DateTime::from_utc(dt, Utc);
Term::from_field_date(field, &dt)
},
None =>
return Err(Error::msg(format!("filed {:?} is type {:?} in schema but did not get a valid value (invalid timestamp)", &field, field_type))),
}
},
_ => return Err(Error::msg(format!("filed {:?} is type {:?} in schema but did not get a valid value", &field, field_type)))
};

Ok(v)
Ok(term)
}

/// Gets a document with a given document address.
Expand Down Expand Up @@ -297,7 +303,7 @@ impl IndexHandler {
}

/// Submits a document to be processed by the index writer.
pub async fn add_document(&self, mut document: NamedFieldDocument) -> Result<()> {
pub async fn add_document(&self, mut document: structures::Document) -> Result<()> {
let field = self.schema.get_field("_id").ok_or_else(|| {
Error::msg(
"system has not correctly initialised this schema,\
Expand All @@ -309,7 +315,7 @@ impl IndexHandler {
helpers::correct_doc_fields(&mut document, self.indexed_fields());
}

let mut doc = self.schema.convert_named_doc(document)?;
let mut doc = document.parse_into_document(&self.schema)?;

let id = uuid::Uuid::new_v4();
doc.add_u64(field, hash(&id));
Expand All @@ -326,7 +332,7 @@ impl IndexHandler {
/// linear.
///
/// If fast fuzzy is not enabled however, this just calls add_docs in a loop.
pub async fn add_many_documents(&self, documents: Vec<NamedFieldDocument>) -> Result<()> {
pub async fn add_many_documents(&self, documents: Vec<structures::Document>) -> Result<()> {
let field = self.schema.get_field("_id").ok_or_else(|| {
Error::msg(
"system has not correctly initialised this schema,\
Expand Down Expand Up @@ -361,8 +367,7 @@ impl IndexHandler {
let mut processed_documents = vec![];
while let Ok(mut doc) = receiver.recv() {
helpers::correct_doc_fields(&mut doc, fields.as_ref());
let doc = schema.convert_named_doc(doc)?;

let doc = doc.parse_into_document(&schema)?;
processed_documents.push(doc);
}

Expand Down
35 changes: 19 additions & 16 deletions engine/src/index/reader.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
use std::sync::Arc;

use anyhow::{Error, Result};
use serde::Serialize;

use tokio::sync::oneshot;
use tokio::sync::Semaphore;

use crossbeam::queue::ArrayQueue;

use serde::Serialize;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{
BooleanQuery, EmptyQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery,
BooleanQuery,
BoostQuery,
EmptyQuery,
FuzzyTermQuery,
MoreLikeThisQuery,
Occur,
Query,
QueryParser,
TermQuery,
};
use tantivy::query::{BoostQuery, MoreLikeThisQuery};
use tantivy::schema::{Field, FieldType, IndexRecordOption, NamedFieldDocument, Schema, Value};
use tantivy::{DocAddress, Executor, IndexReader, LeasedItem, Score, Searcher, Term};
use tokio::sync::{oneshot, Semaphore};

use crate::correction::{self, correct_sentence};
use crate::structures::{QueryMode, QueryPayload};
Expand All @@ -32,7 +35,7 @@ macro_rules! try_get_doc {
Err(e) => {
let _ = $resolve.send(Err(Error::from(e)));
return;
}
},
Ok(res) => res,
};

Expand Down Expand Up @@ -268,7 +271,7 @@ impl IndexReaderHandler {
Some(doc) => {
let doc = try_get_doc!(resolve, searcher, doc);
Some(doc)
}
},
};

let query = match parse_query(
Expand All @@ -283,7 +286,7 @@ impl IndexReaderHandler {
Err(e) => {
let _ = resolve.send(Err(e));
return;
}
},
Ok(q) => q,
};

Expand Down Expand Up @@ -349,7 +352,7 @@ fn parse_query(
parse_fuzzy_query(query, search_fields)
};
Ok(qry)
}
},
(QueryMode::MoreLikeThis, _, None) => Err(Error::msg(
"query mode was `MoreLikeThis` but reference document is `None`",
)),
Expand Down Expand Up @@ -555,22 +558,22 @@ fn search(
let out: (Vec<(i64, DocAddress)>, usize) =
order_and_search!(searcher, collector, field, &query, executor)?;
(process_search!(searcher, schema, out.0), out.1)
}
},
FieldType::U64(_) => {
let out: (Vec<(u64, DocAddress)>, usize) =
order_and_search!(searcher, collector, field, &query, executor)?;
(process_search!(searcher, schema, out.0), out.1)
}
},
FieldType::F64(_) => {
let out: (Vec<(f64, DocAddress)>, usize) =
order_and_search!(searcher, collector, field, &query, executor)?;
(process_search!(searcher, schema, out.0), out.1)
}
},
FieldType::Date(_) => {
let out: (Vec<(i64, DocAddress)>, usize) =
order_and_search!(searcher, collector, field, &query, executor)?;
(process_search!(searcher, schema, out.0), out.1)
}
},
_ => return Err(Error::msg("field is not a fast field")),
}
} else {
Expand Down
Loading

0 comments on commit 0d16c72

Please sign in to comment.