Skip to content

Commit

Permalink
A Chinese tokenizer module (deeplearning4j#3484)
Browse files Browse the repository at this point in the history
* New Chinese NLP module created

* myself github

* Improved code
  • Loading branch information
wangfeng-skymind authored and Adam Gibson committed Jun 5, 2017
1 parent e026980 commit ec3aa06
Show file tree
Hide file tree
Showing 12 changed files with 22,208 additions and 0 deletions.
42 changes: 42 additions & 0 deletions deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the deeplearning4j-nlp-chinese module: a Chinese tokenizer
     built on the apdplat "word" dictionary-based segmentation library. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>deeplearning4j-nlp-parent</artifactId>
<groupId>org.deeplearning4j</groupId>
<version>0.8.1-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nlp-chinese</artifactId>
<version>0.8.1-SNAPSHOT</version>

<dependencies>
<!-- junit version is managed by the parent POM's dependencyManagement. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<!-- Core NLP abstractions (Tokenizer, TokenizerFactory, Word2Vec, ...). -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nlp</artifactId>
<version>${project.version}</version>
</dependency>
<!-- apdplat "word": dictionary-based Chinese word segmentation library. -->
<dependency>
<groupId>org.apdplat</groupId>
<artifactId>word</artifactId>
<version>1.3</version>
</dependency>
</dependencies>

<!-- Backend selection profiles used by the parent build to run tests
     against the native (CPU) or CUDA 8.0 ND4J backend. -->
<profiles>
<profile>
<id>test-nd4j-native</id>
</profile>
<profile>
<id>test-nd4j-cuda-8.0</id>
</profile>
</profiles>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package org.deeplearning4j.text.tokenization.tokenizer;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.Word;

/**
 * Chinese tokenizer backed by the open-source apdplat "word" library,
 * a dictionary-based segmentation algorithm.
 *
 * <p>The input text is segmented once in the constructor; tokens are then
 * consumed through the {@link Iterator} held in {@code tokenIter}.</p>
 *
 * @author wangfeng
 */
public class ChineseTokenizer implements Tokenizer {

    private TokenPreProcess tokenPreProcess;
    private List<Word> tokenList;
    private Iterator<Word> tokenIter;

    public ChineseTokenizer() {}

    public ChineseTokenizer(String toTokenize) {
        // Segment the whole input up front; iteration state lives in tokenIter.
        this.tokenList = WordSegmenter.seg(toTokenize);
        this.tokenIter = tokenList.iterator();
    }

    @Override
    public boolean hasMoreTokens() {
        // Null guard: the no-arg constructor leaves tokenIter unset, which
        // previously caused a NullPointerException here.
        return tokenIter != null && tokenIter.hasNext();
    }

    @Override
    public int countTokens() {
        // Total number of segmented tokens (not the number remaining).
        return tokenList != null ? tokenList.size() : 0;
    }

    /**
     * Returns the next token, applying the configured pre-processor if any.
     *
     * @return the next (possibly pre-processed) token
     * @throws NoSuchElementException if no tokens remain
     */
    @Override
    public String nextToken() {
        if (!hasMoreTokens()) {
            throw new NoSuchElementException();
        }
        // Advance the iterator exactly once, then optionally pre-process.
        String token = tokenIter.next().toString();
        return tokenPreProcess != null ? tokenPreProcess.preProcess(token) : token;
    }

    /**
     * Drains all remaining tokens into a list. Note this consumes the
     * iterator: a subsequent call returns an empty list.
     */
    @Override
    public List<String> getTokens() {
        // Generic list replaces the original raw ArrayList.
        List<String> tokens = new ArrayList<>();
        while (hasMoreTokens()) {
            tokens.add(nextToken());
        }
        return tokens;
    }

    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcessor) {
        this.tokenPreProcess = tokenPreProcessor;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package org.deeplearning4j.text.tokenization.tokenizerFactory;

import org.deeplearning4j.text.tokenization.tokenizer.ChineseTokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import java.io.InputStream;

/**
 * {@link TokenizerFactory} that produces {@link ChineseTokenizer} instances,
 * propagating the configured {@link TokenPreProcess} to each tokenizer.
 *
 * @author wangfeng
 */
public class ChineseTokenizerFactory implements TokenizerFactory {

    private TokenPreProcess tokenPreProcess;

    /**
     * Creates a tokenizer over the given text, carrying the configured
     * token pre-processor (may be null).
     */
    @Override
    public Tokenizer create(String toTokenize) {
        Tokenizer tokenizer = new ChineseTokenizer(toTokenize);
        tokenizer.setTokenPreProcessor(tokenPreProcess);
        return tokenizer;
    }

    /**
     * Stream-based tokenization is not implemented for Chinese text.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Tokenizer create(InputStream toTokenize) {
        throw new UnsupportedOperationException(
                        "InputStream tokenization is not supported by ChineseTokenizerFactory");
    }

    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcess) {
        this.tokenPreProcess = tokenPreProcess;
    }

    @Override
    public TokenPreProcess getTokenPreProcessor() {
        return tokenPreProcess;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package org.deeplearning4j.text.tokenization.tokenizer;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;

import lombok.extern.slf4j.Slf4j;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizerFactory.ChineseTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests for {@link ChineseTokenizer}: verifies dictionary-based segmentation
 * of a sample sentence, plus an ignored end-to-end Word2Vec training check.
 *
 * @author wangfeng
 */
@Slf4j
public class ChineseTokenizerTest {

    private final String toTokenize = "青山绿水和伟大的科学家让世界更美好";
    private final String[] expect = {"青山绿水","和","伟大","的","科学家","让","世界","更","美好"};

    @Test
    public void testChineseTokenizer() {
        TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
        Tokenizer tokenizer = tokenizerFactory.create(toTokenize);
        assertEquals(expect.length, tokenizer.countTokens());
        // JUnit convention is assertEquals(expected, actual); the original had
        // the arguments swapped, which produces misleading failure messages.
        for (int i = 0; i < expect.length; ++i) {
            assertEquals(expect[i], tokenizer.nextToken());
        }
    }

    // Trains a Word2Vec model on a dataset of Chinese names; intended as a
    // basis for finding names in unseen text. Ignored: slow, and it writes
    // model output into the test resources directory.
    @Ignore
    @Test
    public void testFindNamesFromText() throws IOException {
        SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");

        log.info("load is right!");
        TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
        //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

        // Generates word vectors from the dataset stored in the resources folder.
        Word2Vec vec = new Word2Vec.Builder()
                        .minWordFrequency(2)
                        .iterations(5)
                        .layerSize(100)
                        .seed(42)
                        .learningRate(0.1)
                        .windowSize(20)
                        .iterate(iter)
                        .tokenizerFactory(tokenizerFactory)
                        .build();
        vec.fit();
        WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));

        // TODO: train a model that extracts names from news text using the
        // word vectors generated above, then verify it on unseen input.
    }

}
Loading

0 comments on commit ec3aa06

Please sign in to comment.