Skip to content

Commit

Permalink
Merge pull request #64 from quadrama/release/2.0.0
Browse files Browse the repository at this point in the history
Release/2.0.0
  • Loading branch information
pagelj authored Apr 10, 2019
2 parents d76c36e + 743ac87 commit 4c2b780
Show file tree
Hide file tree
Showing 106 changed files with 26,920 additions and 27,705 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
**/target/
**/pom.xml.versionsBackup
**/src/test/resources/**/typesystem.xml

6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ This repository contains a number of UIMA components to process dramatic texts,
As an example, we'll work on the data from the GerDraCor collection (which is based on TextGrid). Download the files from [GitHub](https://github.com/quadrama/gerdracor) and store the XML files in a directory. We will call the directory `$TEIDIR` in the following examples. The directory `$OUTDIR` is used to store the output of the pipeline. You'll need the file `drama.Main.jar`.

Enter the following command in the command line interface:
`java -cp target/assembly/drama.Main.jar de.unistuttgart.ims.drama.main.TEI2XMI --input $TEIDIR --output $OUTDIR/xmi --csvOutput $OUTDIR/csv --skipSpeakerIdentifier --collectionId "gdc" --doCleanup --readerClassname "de.unistuttgart.quadrama.io.tei.textgrid.GerDraCorUrlReader"`
`java -cp target/assembly/drama.Main.jar de.unistuttgart.ims.drama.main.TEI2XMI --input $TEIDIR --output $OUTDIR/xmi --csvOutput $OUTDIR/csv --conllOutput $OUTDIR/conll --skipSpeakerIdentifier --corpus GERDRACOR --collectionId "gdc" --doCleanup`

After running, the directory `$OUTDIR` contains two subdirectories, `xmi` and `csv`, each holding the plays in a different file format.
After running, the directory `$OUTDIR` contains three subdirectories, `xmi`, `csv` and `conll`, each holding the plays in a different file format.


## TEI/XML dialects

This package supports the following drama corpora:
- TextGrid (German)
- GerDraCor (German)
- theatre classique (French)
- theatre classique (French)
2 changes: 1 addition & 1 deletion de.unistuttgart.ims.drama.api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<parent>
<groupId>de.unistuttgart.ims</groupId>
<artifactId>de.unistuttgart.ims.drama</artifactId>
<version>1.0.1</version>
<version>2.0.0</version>
</parent>
<artifactId>de.unistuttgart.ims.drama.api</artifactId>
<properties>
Expand Down
197 changes: 105 additions & 92 deletions de.unistuttgart.ims.drama.api/src/main/resources/dramatypes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
<description/>
<rangeTypeName>uima.cas.FSArray</rangeTypeName>
<elementType>de.unistuttgart.ims.drama.api.CastFigure</elementType>
<multipleReferencesAllowed>false</multipleReferencesAllowed>
</featureDescription>
</features>
</typeDescription>
Expand Down Expand Up @@ -76,6 +75,11 @@
<name>de.unistuttgart.ims.drama.api.StageDirection</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>CastFigure</name>
<description></description>
<rangeTypeName>de.unistuttgart.ims.drama.api.DiscourseEntity</rangeTypeName></featureDescription></features>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.Speech</name>
Expand Down Expand Up @@ -122,34 +126,34 @@
<description/>
<rangeTypeName>uima.cas.FSArray</rangeTypeName>
<elementType>de.unistuttgart.ims.drama.api.CastFigure</elementType>
<multipleReferencesAllowed>false</multipleReferencesAllowed>
<multipleReferencesAllowed>true</multipleReferencesAllowed>
</featureDescription>
<featureDescription>
<name>DatePrinted</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
<featureDescription>
<name>DateWritten</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
<featureDescription>
<name>DatePremiere</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
<featureDescription>
<name>Date</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
<featureDescription>
<name>DateTranslation</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
</features>
<featureDescription>
<name>DatePrinted</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
<featureDescription>
<name>DateWritten</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
<featureDescription>
<name>DatePremiere</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
<featureDescription>
<name>Date</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
<featureDescription>
<name>DateTranslation</name>
<description/>
<rangeTypeName>uima.cas.Integer</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.SpeechVerse</name>
Expand Down Expand Up @@ -219,16 +223,12 @@
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.FigureMention</name>
<description/>
<supertypeName>de.unistuttgart.ims.drama.api.Mention</supertypeName>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.SpeakerFigure</name>
<description>@Deprecated

This type can be used to add figures on the fly, that are not declared in the dramatis personae.</description>
This type can be used to add figures on the fly, that are not declared
in the dramatis personae.</description>
<supertypeName>de.unistuttgart.ims.drama.api.Figure</supertypeName>
</typeDescription>
<typeDescription>
Expand Down Expand Up @@ -368,59 +368,73 @@ This type can be used to add figures on the fly, that are not declared in the dr
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.CastFigure</name>
<description/>
<description></description>
<supertypeName>de.unistuttgart.ims.drama.api.DiscourseEntity</supertypeName>
<features>
<featureDescription>
<name>XmlId</name>
<name>Names</name>
<description/>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
<multipleReferencesAllowed>false</multipleReferencesAllowed>
<multipleReferencesAllowed>true</multipleReferencesAllowed>
</featureDescription>
<featureDescription>
<name>Names</name>
<name>Gender</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>Age</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.DiscourseEntity</name>
<description/>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>DisplayName</name>
<description>String representation of the entity</description>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>Id</name>
<description>Internal and numerical Id</description>
<rangeTypeName>uima.cas.Integer</rangeTypeName></featureDescription>
<featureDescription>
<name>XmlId</name><description>The original xml id</description>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
<multipleReferencesAllowed>true</multipleReferencesAllowed>
</featureDescription>
<featureDescription>
<name>EntityGroup</name>
<description></description>
<rangeTypeName>uima.cas.FSArray</rangeTypeName>
<elementType>de.unistuttgart.ims.drama.api.DiscourseEntity</elementType></featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.Mention</name>
<description>Representation of a concrete referential mention</description>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>Entity</name>
<description>Each mention points to exactly one entity</description>
<rangeTypeName>de.unistuttgart.ims.drama.api.DiscourseEntity</rangeTypeName>
<multipleReferencesAllowed>true</multipleReferencesAllowed>
</featureDescription>
<featureDescription>
<name>Gender</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>Age</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
<name>SurfaceString</name>
<description>The surface string of the mention in the text</description>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
<multipleReferencesAllowed>true</multipleReferencesAllowed>
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.DiscourseEntity</name>
<description/>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>DisplayName</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.ims.drama.api.Mention</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>Entity</name>
<description/>
<rangeTypeName>uima.cas.FSArray</rangeTypeName>
<elementType>de.unistuttgart.ims.drama.api.DiscourseEntity</elementType>
<multipleReferencesAllowed>false</multipleReferencesAllowed>
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<typeDescription>
<name>de.unistuttgart.quadrama.io.core.type.HTMLAnnotation</name>
<description/>
<supertypeName>de.unistuttgart.quadrama.io.core.type.XMLElement</supertypeName>
Expand Down Expand Up @@ -457,23 +471,22 @@ This type can be used to add figures on the fly, that are not declared in the dr
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>de.unistuttgart.quadrama.io.core.type.XMLParsingDescription</name>
<description/>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>Encoding</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>XmlDeclarations</name>
<description/>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
<multipleReferencesAllowed>false</multipleReferencesAllowed>
</featureDescription>
</features>
</typeDescription>
</types>
<typeDescription>
<name>de.unistuttgart.quadrama.io.core.type.XMLParsingDescription</name>
<description/>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>Encoding</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>XmlDeclarations</name>
<description/>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
6 changes: 3 additions & 3 deletions de.unistuttgart.ims.drama.core.cr/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
<parent>
<groupId>de.unistuttgart.ims</groupId>
<artifactId>de.unistuttgart.ims.drama</artifactId>
<version>1.0.1</version>
<version>2.0.0</version>
</parent>
<artifactId>de.unistuttgart.ims.drama.core.cr</artifactId>
<dependencies>
<dependency>
<groupId>de.unistuttgart.ims</groupId>
<artifactId>de.unistuttgart.ims.drama.api</artifactId>
<version>1.0.1</version>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.cleartk</groupId>
Expand Down Expand Up @@ -79,4 +79,4 @@
</plugin>
</plugins>
</build>
</project>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.unistuttgart.ims.drama.api.FigureMention;
import de.unistuttgart.ims.drama.api.Mention;
import de.unistuttgart.ims.entitydetection.api.TrainingArea;

public class ClearTkMentionAnnotator extends CleartkSequenceAnnotator<String> {
FeatureExtractor1<Token> extractor;

CleartkExtractor<Token, Token> contextExtractor;

BioChunking<Token, FigureMention> chunking;
BioChunking<Token, Mention> chunking;

@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
Expand All @@ -58,7 +58,7 @@ public void initialize(UimaContext context) throws ResourceInitializationExcepti
// NamedEntityMentions, with labels
// from the "mentionType" attribute so that we get B-location, I-person,
// etc.
this.chunking = new BioChunking<Token, FigureMention>(Token.class, FigureMention.class);
this.chunking = new BioChunking<Token, Mention>(Token.class, Mention.class);
}

@Override
Expand Down Expand Up @@ -86,7 +86,7 @@ public void process(JCas jCas) throws AnalysisEngineProcessException {

// extract the gold (human annotated) NamedEntityMention
// annotations
List<FigureMention> namedEntityMentions = JCasUtil.selectCovered(jCas, FigureMention.class, sentence);
List<Mention> namedEntityMentions = JCasUtil.selectCovered(jCas, Mention.class, sentence);

// convert the NamedEntityMention annotations into token-level
// BIO outcome labels
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiReader;
import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter;
import de.unistuttgart.ims.drama.api.FigureMention;
import de.unistuttgart.ims.drama.api.Mention;
import de.unistuttgart.ims.drama.api.Speech;
import de.unistuttgart.ims.entitydetection.api.TrainingArea;
import de.unistuttgart.ims.uimautil.ClearAnnotation;
Expand Down Expand Up @@ -85,7 +85,7 @@ public void train(CollectionReader collectionReader, File outputDirectory) throw

aggregate
.add(createEngineDescription(ContextWindowAnnotator.class, ContextWindowAnnotator.PARAM_BASE_ANNOTATION,
FigureMention.class, ContextWindowAnnotator.PARAM_CONTEXT_CLASS, Speech.class,
Mention.class, ContextWindowAnnotator.PARAM_CONTEXT_CLASS, Speech.class,
ContextWindowAnnotator.PARAM_TARGET_ANNOTATION, TrainingArea.class));
// our NamedEntityChunker annotator, configured to write Mallet CRF
// training data
Expand Down Expand Up @@ -152,9 +152,9 @@ protected AnnotationStatistics<String> test(CollectionReader collectionReader, F
JCas systemView = jCas.getView(defaultViewName);

// extract the named entity mentions from both gold and system views
Collection<FigureMention> goldMentions, systemMentions;
goldMentions = JCasUtil.select(goldView, FigureMention.class);
systemMentions = JCasUtil.select(systemView, FigureMention.class);
Collection<Mention> goldMentions, systemMentions;
goldMentions = JCasUtil.select(goldView, Mention.class);
systemMentions = JCasUtil.select(systemView, Mention.class);

// compare the system mentions to the gold mentions
stats.add(goldMentions, systemMentions);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiReader;
import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter;
import de.unistuttgart.ims.drama.api.FigureMention;
import de.unistuttgart.ims.drama.api.Mention;
import de.unistuttgart.ims.drama.api.Speech;
import de.unistuttgart.ims.entitydetection.api.TrainingArea;
import de.unistuttgart.ims.uimautil.ContextWindowAnnotator;
Expand All @@ -35,7 +35,7 @@ public static void main(String[] args) throws Exception {
// run the pipeline over the training corpus
SimplePipeline.runPipeline(reader,
createEngineDescription(ContextWindowAnnotator.class, ContextWindowAnnotator.PARAM_BASE_ANNOTATION,
FigureMention.class, ContextWindowAnnotator.PARAM_CONTEXT_CLASS, Speech.class,
Mention.class, ContextWindowAnnotator.PARAM_CONTEXT_CLASS, Speech.class,
ContextWindowAnnotator.PARAM_TARGET_ANNOTATION, TrainingArea.class),
createEngineDescription(ClearTkMentionAnnotator.class, CleartkSequenceAnnotator.PARAM_IS_TRAINING, true,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, options.getModelDirectory(),
Expand Down
Loading

0 comments on commit 4c2b780

Please sign in to comment.