Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stream dictionary entries #248

Merged
merged 7 commits into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/Dictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@

package com.worksap.nlp.sudachi;

import com.worksap.nlp.sudachi.dictionary.POS;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Stream;

import com.worksap.nlp.sudachi.dictionary.POS;

/**
* A lexicon and a grammar for morphological analysis.
Expand Down Expand Up @@ -56,6 +57,18 @@ public interface Dictionary extends AutoCloseable {
@Override
public void close() throws IOException;

/**
* Create a parallel stream of all words in the dictionary as morphemes.
*
* Corresponds to the lines in the lexicon csv, i.e. it includes entries that
* appear only when referred to from other words (e.g. as a constituent) during
* an analysis and excludes entries that are automatically added to store a
* normalization form of another word. Entries in the stream are not sorted.
*
* @return a parallel stream of morphemes.
*/
public Stream<Morpheme> entries();

/**
* Lookup entries in the dictionary without performing an analysis.
*
Expand Down
47 changes: 47 additions & 0 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,15 @@
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Predicate;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

public class JapaneseDictionary implements Dictionary, DictionaryAccess {

Expand Down Expand Up @@ -127,6 +133,45 @@ public void close() throws IOException {
}
}

/**
 * Iterator over the dictionary entries: walks the word ids supplied by the
 * lexicon and wraps each id in a {@link SingleMorphemeImpl}.
 */
private class EntryItr implements Iterator<Morpheme> {
    private final GrammarImpl grammar;
    private final LexiconSet lexicon;
    private Iterator<Integer> wordIdItr;

    /**
     * Captures the grammar and lexicon of the enclosing dictionary and opens
     * the word id iterator of that lexicon.
     */
    EntryItr() {
        grammar = getGrammar();
        lexicon = getLexicon();
        wordIdItr = lexicon.wordIds();
    }

    /**
     * @return true while the underlying word id iterator has more ids
     */
    @Override
    public boolean hasNext() {
        return wordIdItr.hasNext();
    }

    /**
     * @return a morpheme built from the next word id
     * @throws NoSuchElementException
     *             if no word id is left
     */
    @Override
    public Morpheme next() {
        if (hasNext()) {
            Integer nextWordId = wordIdItr.next();
            return new SingleMorphemeImpl(grammar, lexicon, nextWordId);
        }
        throw new NoSuchElementException();
    }
}

/**
 * Builds a parallel stream over all entries produced by {@link EntryItr}.
 *
 * The stream is backed by an iterator-based spliterator that reports the
 * DISTINCT, IMMUTABLE, NONNULL and SIZED characteristics, with the lexicon
 * size as its element count.
 *
 * NOTE(review): SIZED promises that the iterator yields exactly
 * {@code getLexicon().size()} elements -- confirm that the word id iterator
 * and the lexicon size always agree.
 *
 * @return a parallel stream of morphemes
 */
@Override
public Stream<Morpheme> entries() {
    Spliterator<Morpheme> source = Spliterators.spliterator(new EntryItr(), getLexicon().size(),
            Spliterator.DISTINCT | Spliterator.IMMUTABLE | Spliterator.NONNULL | Spliterator.SIZED);
    return StreamSupport.stream(source, true);
}

@Override
public List<Morpheme> lookup(CharSequence surface) {
UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
Expand Down Expand Up @@ -204,10 +249,12 @@ static String readAll(InputStream input) throws IOException {
}
}

/**
 * Returns the grammar held by this dictionary.
 *
 * @return the value of the {@code grammar} field
 */
@Override
public GrammarImpl getGrammar() {
    return this.grammar;
}

/**
 * Returns the lexicon set held by this dictionary.
 *
 * @return the value of the {@code lexicon} field
 */
@Override
public LexiconSet getLexicon() {
    return this.lexicon;
}
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/Morpheme.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -123,9 +123,10 @@ public interface Morpheme {
* The IDs change when the dictionaries are updated or the combination of
* dictionaries changes.
*
* If the morpheme is OOV, it returns an undefined value.
* If the morpheme is OOV, it returns an id consisting of the OOV flag and the POS id.
*
* @return the word ID
* @see WordId
*/
public int getWordId();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,13 @@ public enum WordRefMode {
grammar.setCharacterCategory(CharacterCategory.loadDefault());
textNormalizer = new TextNormalizer(grammar);

// in order to output dictionary entries in in-dictionary order we need to sort
// them. iterator over them will get them not in the sorted order, but grouped
// by index-form (and sorted in groups).
// In order to output dictionary entries in in-dictionary order we need to sort
// them. Iterating over them does not yield them in sorted order, but grouped
// by index-form. Here we assume DoubleArrayLexicon and use WordIdTable.wordIds
// for performance.
DoubleArrayLexicon targetLex = dic.getLexicon();
Ints allIds = new Ints(targetLex.size());
Iterator<Ints> ids = targetLex.wordIds(0);
Iterator<Ints> ids = targetLex.getWordIdTable().wordIds();
while (ids.hasNext()) {
allIds.appendAll(ids.next());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.util.Iterator;
import java.util.NoSuchElementException;

import com.worksap.nlp.dartsclone.DoubleArray;
import com.worksap.nlp.sudachi.MorphemeList;
Expand Down Expand Up @@ -103,7 +104,7 @@ public long parameters(int wordId) {

private class Itr implements Iterator<int[]> {
private final Iterator<int[]> iterator;
private Integer[] wordIds;
private int[] wordIds;
private int length;
private int index;

Expand Down Expand Up @@ -148,8 +149,39 @@ public int size() {
return description.getNumTotalEntries();
}

public Iterator<Ints> wordIds(int dic) {
return wordIdTable.wordIds();
public Iterator<Integer> wordIds() {
return new WordIdItr();
}

/**
 * Flattens the {@link Ints} chunks served by the word id table into a single
 * sequence of word ids.
 */
private class WordIdItr implements Iterator<Integer> {
    private final Iterator<Ints> chunks;
    private Ints chunk;
    private int cursor;

    /**
     * Opens the chunk iterator of the word id table; no chunk is loaded yet.
     */
    WordIdItr() {
        chunks = getWordIdTable().wordIds();
        cursor = 0;
    }

    /**
     * Advances to the next chunk that still has an unread id, skipping empty
     * chunks on the way.
     *
     * @return true if an id is available, false once all chunks are consumed
     */
    @Override
    public boolean hasNext() {
        for (;;) {
            if (chunk != null && cursor < chunk.length()) {
                return true;
            }
            if (!chunks.hasNext()) {
                return false;
            }
            // current chunk is missing or exhausted: move on and restart
            chunk = chunks.next();
            cursor = 0;
        }
    }

    /**
     * @return the next word id of the current chunk
     * @throws NoSuchElementException
     *             if every chunk has been consumed
     */
    @Override
    public Integer next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        int at = cursor++;
        return chunk.get(at);
    }
}

/**
Expand Down
18 changes: 14 additions & 4 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,6 +25,15 @@
*/
public interface Lexicon {

/**
* Lookup entries that match the text starting from the offset.
*
* @param text
* input text as bytes; it should be normalized
* @param offset
* offset in the text to start the lookup from
* @return iterator of (word id, length) pairs
*/
Iterator<int[]> lookup(byte[] text, int offset);

/**
Expand Down Expand Up @@ -73,8 +82,9 @@ public interface Lexicon {
WordInfoList wordInfos(int dic);

/**
* Iterates over all word ids in the specified dictionary. Returned word ids are
* not sorted.
* Iterates over all word ids in the dictionary.
*
* Returned word ids are not sorted.
*/
Iterator<Ints> wordIds(int dic);
Iterator<Integer> wordIds();
}
37 changes: 34 additions & 3 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/LexiconSet.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2022 Works Applications Co., Ltd.
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -171,7 +171,38 @@ public WordInfoList wordInfos(int dic) {
}

@Override
public Iterator<Ints> wordIds(int dic) {
return lexicons.get(dic).wordIds(dic);
public Iterator<Integer> wordIds() {
return new WordIdItr();
}

/**
 * Chains the word id iterators of the lexicons in this set, visiting the
 * lexicons in list order starting from index 0.
 */
private class WordIdItr implements Iterator<Integer> {
    private int position;
    private Iterator<Integer> current;

    /**
     * Starts with the word id iterator of the first lexicon.
     */
    WordIdItr() {
        position = 0;
        current = lexicons.get(position).wordIds();
    }

    /**
     * Moves on to the following lexicons while the current iterator is
     * exhausted.
     *
     * @return true if a word id is available, false once the last lexicon is
     *         exhausted
     */
    @Override
    public boolean hasNext() {
        for (;;) {
            if (current.hasNext()) {
                return true;
            }
            int following = position + 1;
            if (following >= lexicons.size()) {
                return false;
            }
            position = following;
            current = lexicons.get(following).wordIds();
        }
    }

    /**
     * @return the next word id
     * @throws NoSuchElementException
     *             if all lexicons are exhausted
     */
    @Override
    public Integer next() {
        if (hasNext()) {
            return current.next();
        }
        throw new NoSuchElementException();
    }
}
}
26 changes: 13 additions & 13 deletions src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,27 +23,27 @@
import java.util.NoSuchElementException;
import java.util.Iterator;

class WordIdTable {
/**
* Table which contains the lists of (internal) word ids that have the same
* index form.
*
* Automatically fills the dictionary part of each word id using the dictionary id that has been set.
*/
public class WordIdTable {
private final ByteBuffer bytes;
private int dicIdMask = 0;

/**
 * Creates a table over the given buffer. The buffer is stored as is (not
 * copied).
 *
 * @param bytes
 *            buffer holding the word id table data
 */
WordIdTable(ByteBuffer bytes) {
this.bytes = bytes;
}

Integer[] get(int index) {
int[] get(int index) {
ByteBuffer dup = bytes.duplicate();
dup.position(index);
BufReader reader = new BufReader(dup);
int length = reader.readVarint32();
Integer[] result = new Integer[length];
int mask = dicIdMask;
int sum = 0;
for (int i = 0; i < length; i++) {
int v = reader.readVarint32();
result[i] = WordId.applyMask(v + sum, mask);
sum += v;
}
int[] result = new int[length];
readDeltaCompressed(result, length, this.dicIdMask, reader);
return result;
}

Expand Down Expand Up @@ -75,8 +75,8 @@ private static void readDeltaCompressed(int[] result, int count, int mask, BufRe
}
}

void setDictionaryId(int id) {
dicIdMask = WordId.dicIdMask(id);
void setDictionaryId(int dictId) {
dicIdMask = WordId.dicIdMask(dictId);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import com.worksap.nlp.sudachi.dictionary.Block;
import com.worksap.nlp.sudachi.dictionary.DoubleArrayLexicon;
import com.worksap.nlp.sudachi.dictionary.Ints;
import com.worksap.nlp.sudachi.dictionary.Lexicon;
import com.worksap.nlp.sudachi.dictionary.WordInfoList;

import java.io.IOException;
Expand Down Expand Up @@ -63,17 +62,19 @@ public class RawLexicon {
* used to resolve wordref.
*
* @param lexicon
* lexicon of a system dictionary.
* @return number of entries read.
*/
public int preloadFrom(Lexicon lexicon, Progress progress) {
public int preloadFrom(DoubleArrayLexicon lexicon, Progress progress) {
this.isUser = true;

Ints allIds = new Ints(lexicon.size());
Iterator<Ints> ids = lexicon.wordIds(0);
Iterator<Ints> ids = lexicon.getWordIdTable().wordIds();
while (ids.hasNext()) {
allIds.appendAll(ids.next());
}
allIds.sort();

for (int i = 0; i < allIds.length(); i++) {
preloadedEntries.add(new CompiledWordEntry(lexicon, allIds.get(i)));
progress.progress(i, allIds.length());
Expand Down
Loading