Skip to content

Commit

Permalink
Add .txt file support
Browse files Browse the repository at this point in the history
  • Loading branch information
ang-zeyu committed Jan 2, 2022
1 parent 91e6c9c commit 86780c2
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 0 deletions.
13 changes: 13 additions & 0 deletions docs/src/indexer/indexing.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,19 @@ Field mappings for csv files can be configured using one of the `field_map / fie
The `parse_options` key specifies options for parsing the csv file. In particular, note that the `has_headers` key is distinct from and does not influence the `use_headers` parameter.


**`TxtLoader`**

```json
"loader_configs": {
"TxtLoader": {
"field": "field_name"
}
}
```

This loader simply reads `.txt` files and indexes all of the content into a single `field`.


## Search Performance

**`pl_limit`**
Expand Down
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@
"buildSearch": "webpack --config ./webpack.prod.js",
"index1": "cargo run -p morsels_indexer -- ./test_files/1/source ./test_files/1/output --perf",
"index2": "cargo run -p morsels_indexer -- ./test_files/2/source ./test_files/2/output --perf",
"index3": "cargo run -p morsels_indexer -- ./test_files/3/source ./test_files/3/output --perf",
"index1R": "cargo run -p morsels_indexer --release ./test_files/1/source ./test_files/1/output --perf",
"index2R": "cargo run -p morsels_indexer --release ./test_files/2/source ./test_files/2/output --perf",
"index3R": "cargo run -p morsels_indexer --release ./test_files/3/source ./test_files/3/output --perf",
"devServer1": "http-server ./test_files/1 -p 3000 --cors -c-1",
"devServer2": "http-server ./test_files/2 -p 3000 --cors -c-1",
"devServer3": "http-server ./test_files/3 -p 3000 --cors -c-1",
"lint": "eslint packages/search-ui --ext .ts && eslint packages/search --ext .ts",
"lintFix": "eslint packages/search-ui --ext .ts --fix && eslint packages/search --ext .ts --fix",
"test": "cargo test && npm run e2e",
Expand Down
2 changes: 2 additions & 0 deletions packages/morsels_indexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use crate::fieldinfo::FieldsConfig;
use crate::loader::csv::CsvLoader;
use crate::loader::html::HtmlLoader;
use crate::loader::json::JsonLoader;
use crate::loader::txt::TxtLoader;
use crate::loader::Loader;
use crate::worker::miner::WorkerMiner;
use crate::worker::{MainToWorkerMessage, Worker, WorkerToMainMessage};
Expand Down Expand Up @@ -133,6 +134,7 @@ impl MorselsIndexingConfig {
"HtmlLoader" => loaders.push(HtmlLoader::get_new_html_loader(value)),
"CsvLoader" => loaders.push(CsvLoader::get_new_csv_loader(value)),
"JsonLoader" => loaders.push(JsonLoader::get_new_json_loader(value)),
"TxtLoader" => loaders.push(TxtLoader::get_new_txt_loader(value)),
_ => panic!("Unknown loader type encountered in config"),
}
}
Expand Down
1 change: 1 addition & 0 deletions packages/morsels_indexer/src/loader.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pub mod csv;
pub mod html;
pub mod json;
pub mod txt;

use std::path::Path;

Expand Down
79 changes: 79 additions & 0 deletions packages/morsels_indexer/src/loader/txt.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
use std::path::Path;

use path_slash::PathExt;
use serde::{Deserialize, Deserializer, Serialize, Serializer};

use crate::loader::BasicLoaderResult;
use crate::loader::Loader;
use crate::loader::LoaderResult;
use crate::loader::LoaderResultIterator;

/// User-supplied options for [`TxtLoader`], deserialized from the loader's JSON configuration.
#[derive(Serialize, Deserialize)]
pub struct TxtLoaderOptions {
/// Name of the field under which the full text of each `.txt` file is stored.
field: String,
}

/// Loader that turns each `.txt` file into a single loader result.
pub struct TxtLoader {
// Configuration this loader was constructed with.
options: TxtLoaderOptions,
}

impl TxtLoader {
    /// Builds a boxed `TxtLoader` from its JSON configuration value.
    ///
    /// # Panics
    ///
    /// Panics if `config` cannot be deserialized into [`TxtLoaderOptions`].
    pub fn get_new_txt_loader(config: serde_json::Value) -> Box<Self> {
        let options = serde_json::from_value::<TxtLoaderOptions>(config)
            .expect("TxtLoader options did not match schema!");

        Box::new(TxtLoader { options })
    }

    /// Packages a file's text and its link into a loader result.
    ///
    /// The result carries two field/text pairs, in this order: `_relative_fp`
    /// holding `link`, then the configured field holding `text`.
    fn get_txt_loader_result(&self, text: String, link: String) -> Box<dyn LoaderResult + Send> {
        let field_texts = vec![
            ("_relative_fp".to_owned(), link),
            (self.options.field.clone(), text),
        ];

        Box::new(BasicLoaderResult { field_texts })
    }
}

#[typetag::serde]
impl Loader for TxtLoader {
    /// Attempts to index `relative_path` as a plain-text file.
    ///
    /// Returns `None` when the path has no extension or its extension is not
    /// exactly `txt` (the comparison is case-sensitive). Otherwise the whole
    /// file at `absolute_path` is read into memory and a one-element iterator
    /// holding the resulting loader result is returned.
    ///
    /// # Panics
    ///
    /// Panics if the file cannot be read as a UTF-8 string, or if
    /// `relative_path` cannot be converted to a slash-separated string
    /// (presumably for non-UTF-8 paths; confirm against `path_slash`).
    fn try_index_file<'a>(
        &'a self,
        _input_folder_path: &Path,
        absolute_path: &Path,
        relative_path: &Path,
    ) -> Option<LoaderResultIterator<'a>> {
        // Guard clause: anything that is not a `.txt` file is left to other loaders.
        let extension = relative_path.extension()?;
        if extension != "txt" {
            return None;
        }

        // The panic message is only formatted on failure, so the success path
        // no longer allocates a message string or an owned copy of the path.
        let text = std::fs::read_to_string(absolute_path).unwrap_or_else(|err| {
            panic!(
                "Failed to read .txt file {}: {:?}",
                absolute_path.to_string_lossy(),
                err
            )
        });

        // The link stored with the document uses forward slashes on every platform.
        let link = relative_path.to_slash().unwrap_or_else(|| {
            panic!(
                "Failed to convert path {} to a slash-separated string",
                relative_path.to_string_lossy()
            )
        });

        Some(Box::new(std::iter::once(
            self.get_txt_loader_result(text, link),
        )))
    }

    /// Returns the name of this loader, `"TxtLoader"`.
    fn get_name(&self) -> String {
        "TxtLoader".to_owned()
    }
}

impl Serialize for TxtLoader {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.options.serialize(serializer)
}
}

impl<'de> Deserialize<'de> for TxtLoader {
fn deserialize<D>(_deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
panic!("Called deserialize for TxtLoader")
}
}
5 changes: 5 additions & 0 deletions packages/search-ui/src/searchResultTransform.ts
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,11 @@ async function singleResultRender(
);
resultTitle = newTitle || resultTitle;
resultHeadingsAndTexts = newHeadingsAndTexts;
} else if (fullLink.endsWith('.txt') && loaderConfigs.TxtLoader) {
const asText = await (await fetch(fullLink)).text();
resultHeadingsAndTexts = transformText(
[['body', asText]], query.searchedTerms, termRegexes, linkToAttach, options,
);
} else {
const fullLinkUrl = parseURL(fullLink);
if (fullLinkUrl.pathname.endsWith('.json') && loaderConfigs.JsonLoader) {
Expand Down

0 comments on commit 86780c2

Please sign in to comment.