diff --git a/NOTICE b/NOTICE
index 0591ebb..3c4fdad 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,4 +1,4 @@
- Copyright 2012-2019 OpenSextant.org
+ Copyright 2012-2021 MITRE Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
- * (c) 2012-2014 The MITRE Corporation. All Rights Reserved.
+ * (c) 2012-2021 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
diff --git a/pom.xml b/pom.xml
index 68886e2..e8eee3d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,598 +1,603 @@
- 4.0.0
- org.opensextant
- 3.4.0
- XText
- opensextant-xponents-xtext
- Content extraction simplified! Retrieve text, data and metadata from binary documents using Tika and
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ 4.0.0
+ org.opensextant
+ 3.5.0-SNAPSHOT
+ XText
+ opensextant-xponents-xtext
+ Content extraction simplified! Retrieve text, data and metadata from binary documents using Tika and
similar toolkits
- 2013
- https://opensextant.github.io/XText
- scm:git:https://github.com/OpenSextant/XText.git
- scm:git:https://github.com/OpenSextant/XText.git
- https://github.com/OpenSextant/XText.git
- Apache 2
- http://www.apache.org/licenses/LICENSE-2.0.txt
- Marc Ubaldino
- ubaldino@mitre.org
- Lead
- 1.7.30
- org.opensextant
- opensextant-xponents-core
- 3.4.0
- org.opensextant
- giscore
- org.opensextant
- geodesy
- com.norconex.language
- langdetect
- org.locationtech.spatial4j
- spatial4j
- gnu.getopt
- java-getopt
- 1.0.13
- test
- org.apache.commons
- commons-lang3
- 3.12.0
- commons-lang
- commons-lang
- 2.6
- runtime
- com.ibm.icu
- icu4j
- 65.1
- org.apache.commons
- commons-text
- 1.9
- commons-codec
- commons-codec
- 1.15
- commons-io
- commons-io
- 2.8.0
- commons-logging
- commons-logging
- 1.2
- runtime
- org.apache.commons
- commons-compress
- 1.20
- com.pff
- java-libpst
- 0.9.3
- org.apache.tika
- tika-core
- 1.24.1
- org.apache.tika
- tika-parsers
- 1.24.1
- org.apache.sis.storage
- sis-netcdf
- pdfbox
- org.apache.pdfbox
- commons-logging
- commons-logging
- httpservices
- edu.ucar
- junrar
- com.github.junrar
- netcdf4
- edu.ucar
- grib
- edu.ucar
- cdm
- edu.ucar
- cxf-rt-rs-client
- org.apache.cxf
- vorbis-java-core
- org.gagravarr
- vorbis-java-tika
- org.gagravarr
- sis-metadata
- org.apache.sis.core
- sis-utility
- org.apache.sis.core
- jmatio
- net.sourceforge.jmatio
- opennlp-tools
- org.apache.opennlp
- org.json
- json
- edu.usc.ir
- sentiment-analysis-parser
- org.tallison
- jmatio
- com.rometools
- rome
- org.apache.uima
- uimafit-core
- org.apache.uima
- uimaj-core
- net.htmlparser.jericho
- jericho-html
- 3.4
- xml-apis
- xml-apis
- 1.4.01
- org.slf4j
- slf4j-api
- junit
- junit
- test
- com.sun.mail
- javax.mail
- 1.5.1
- org.apache.pdfbox
- pdfbox
- 2.0.22
- commons-logging
- commons-logging
- ch.qos.logback
- logback-classic
- org.apache.httpcomponents
- fluent-hc
- 4.5.13
- runtime
- commons-logging
- commons-logging
- org.apache.httpcomponents
- httpclient-cache
- 4.5.13
- runtime
- commons-logging
- commons-logging
- org.apache.httpcomponents
- httpclient
- 4.5.13
- commons-logging
- commons-logging
- org.apache.httpcomponents
- httpcore
- 4.4.14
- org.apache.httpcomponents
- httpmime
- 4.5.13
- joda-time
- joda-time
- 2.10.6
- de.l3s.boilerpipe
- boilerpipe
- 1.1.0
- javax.activation
- activation
- 1.1
- org.jodd
- jodd-json
- 5.1.5
+ 2013
+ https://opensextant.github.io/XText
+ scm:git:https://github.com/OpenSextant/XText.git
+ scm:git:https://github.com/OpenSextant/XText.git
+ https://github.com/OpenSextant/XText.git
+ Apache 2
+ http://www.apache.org/licenses/LICENSE-2.0.txt
+ Marc Ubaldino
+ ubaldino@mitre.org
+ Lead
+ 1.7.30
- junit
- junit
- 4.13.1
- test
- org.slf4j
- slf4j-api
- ${slf4j.version}
- ch.qos.logback
- logback-classic
- 1.2.3
+ org.opensextant
+ opensextant-xponents-core
+ 3.5.0-SNAPSHOT
+ org.opensextant
+ giscore
+ org.opensextant
+ geodesy
+ com.norconex.language
+ langdetect
+ org.locationtech.spatial4j
+ spatial4j
+ gnu.getopt
+ java-getopt
+ 1.0.13
+ test
+ org.apache.commons
+ commons-lang3
+ 3.12.0
+ commons-lang
+ commons-lang
+ 2.6
+ runtime
+ com.ibm.icu
+ icu4j
+ 70.1
+ org.apache.commons
+ commons-text
+ 1.9
+ commons-codec
+ commons-codec
+ 1.15
+ commons-io
+ commons-io
+ 2.10.0
+ commons-logging
+ commons-logging
+ 1.2
+ runtime
+ org.apache.commons
+ commons-compress
+ 1.21
+ com.pff
+ java-libpst
+ 0.9.3
+ org.apache.tika
+ tika-core
+ 1.27
+ org.apache.tika
+ tika-parsers
+ 1.27
+ org.apache.sis.storage
+ sis-netcdf
+ pdfbox
+ org.apache.pdfbox
+ commons-logging
+ commons-logging
+ httpservices
+ edu.ucar
+ junrar
+ com.github.junrar
+ netcdf4
+ edu.ucar
+ grib
+ edu.ucar
+ cdm
+ edu.ucar
+ cxf-rt-rs-client
+ org.apache.cxf
+ vorbis-java-core
+ org.gagravarr
+ vorbis-java-tika
+ org.gagravarr
+ sis-metadata
+ org.apache.sis.core
+ sis-utility
+ org.apache.sis.core
+ jmatio
+ net.sourceforge.jmatio
+ opennlp-tools
+ org.apache.opennlp
+ org.json
+ json
+ edu.usc.ir
+ sentiment-analysis-parser
+ org.tallison
+ jmatio
+ com.rometools
+ rome
+ org.apache.uima
+ uimafit-core
+ org.apache.uima
+ uimaj-core
+ net.htmlparser.jericho
+ jericho-html
+ 3.4
+ xml-apis
+ xml-apis
+ 1.4.01
+ org.slf4j
+ slf4j-api
+ junit
+ junit
+ test
+ com.sun.mail
+ javax.mail
+ 1.5.1
+ org.apache.pdfbox
+ pdfbox
+ 2.0.24
+ commons-logging
+ commons-logging
+ ch.qos.logback
+ logback-classic
+ org.apache.httpcomponents
+ fluent-hc
+ 4.5.13
+ runtime
+ commons-logging
+ commons-logging
+ org.apache.httpcomponents
+ httpclient-cache
+ 4.5.13
+ runtime
+ commons-logging
+ commons-logging
+ org.apache.httpcomponents
+ httpclient
+ 4.5.13
+ commons-logging
+ commons-logging
+ org.apache.httpcomponents
+ httpcore
+ 4.4.14
+ org.apache.httpcomponents
+ httpmime
+ 4.5.13
+ joda-time
+ joda-time
+ 2.10.13
+ de.l3s.boilerpipe
+ boilerpipe
+ 1.1.0
+ javax.activation
+ activation
+ 1.1
+ org.jodd
+ jodd-json
+ 5.1.5
- org.apache.maven.plugins
- maven-dependency-plugin
- 3.1.2
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.8.1
- org.apache.maven.plugins
- maven-javadoc-plugin
- 3.2.0
- org.apache.maven.plugins
- maven-source-plugin
- 3.2.1
- org.apache.maven.plugins
- maven-deploy-plugin
- 3.0.0-M1
- org.apache.maven.plugins
- maven-jar-plugin
- 3.2.0
- attach-tests
- test-jar
- org.apache.maven.plugins
- maven-checkstyle-plugin
- 3.1.1
- org.apache.maven.plugins
- maven-surefire-plugin
- 3.0.0-M4
- org.codehaus.mojo
- findbugs-maven-plugin
- 3.0.5
- org.sonarsource.scanner.maven
- sonar-maven-plugin
- maven-javadoc-plugin
- <img alt='[OpenSextant Logo]' height='36'
- width='36'
- src='doc-files/opensextant-manual-logo.png'/><br>copyright
- OpenSextant.org, 2013-2020
- true
- XText - Content Extraction Simplified
- false
- attach-javadoc
- jar
- org.apache.maven.plugins
- maven-release-plugin
- true
- true
- release
- deploy
- maven-compiler-plugin
- 1.8
- 1.8
- 1.8
- -Xlint:all,-path
- true
- true
- maven-source-plugin
- attach-sources
- jar
- maven-checkstyle-plugin
- checkstyle.xml
- checkstyle.indentChars=4
- checkstyle-suppressions.xml
- true
- false
- org.codehaus.mojo
- findbugs-maven-plugin
- true
- maven-dependency-plugin
- default-cli
- copy-dependencies
- lib
- runtime
- test
- dependency-analysis
- analyze-only
- verify
- maven-surefire-plugin
- ${basedir}/src/test/resources
- org.apache.maven.plugins
- maven-project-info-reports-plugin
- 2.7
- false
- maven-javadoc-plugin
- javadoc
- release
+ junit
+ junit
+ 4.13.1
+ test
+ org.slf4j
+ slf4j-api
+ ${slf4j.version}
+ ch.qos.logback
+ logback-classic
+ 1.2.3
+ org.apache.maven.plugins
+ maven-dependency-plugin
+ 3.1.2
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.1
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.2.0
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.2.1
+ org.apache.maven.plugins
+ maven-deploy-plugin
+ 3.0.0-M1
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 3.2.0
+ attach-tests
+ test-jar
+ org.apache.maven.plugins
+ maven-checkstyle-plugin
+ 3.1.1
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M4
+ org.codehaus.mojo
+ findbugs-maven-plugin
+ 3.0.5
+ org.sonarsource.scanner.maven
+ sonar-maven-plugin
+ org.apache.maven.plugins
+ maven-release-plugin
+ 3.0.0-M4
- maven-source-plugin
- attach-sources
- jar-no-fork
- org.apache.maven.plugins
- maven-gpg-plugin
- 1.6
- sign-artifacts
- verify
- sign
- org.sonatype.plugins
- nexus-staging-maven-plugin
- 1.6.7
- true
- ossrh
- https://oss.sonatype.org/
- true
+ maven-javadoc-plugin
+ 1.8
+ <img alt='[OpenSextant Logo]' height='36'
+ width='36'
+ src='doc-files/opensextant-manual-logo.png'/><br>copyright
+ OpenSextant.org, 2013-2021
+ true
+ XText - Content Extraction Simplified
+ false
+ attach-javadoc
+ jar
+ maven-release-plugin
+ true
+ true
+ release
+ deploy
+ maven-compiler-plugin
+ 1.8
+ 1.8
+ 1.8
+ -Xlint:all,-path
+ true
+ true
+ maven-source-plugin
+ attach-sources
+ jar
+ maven-checkstyle-plugin
+ checkstyle.xml
+ checkstyle.indentChars=4
+ checkstyle-suppressions.xml
+ true
+ false
+ org.codehaus.mojo
+ findbugs-maven-plugin
+ true
+ maven-dependency-plugin
+ default-cli
+ copy-dependencies
+ lib
+ runtime
+ test
+ dependency-analysis
+ analyze-only
+ verify
+ maven-surefire-plugin
+ ${basedir}/src/test/resources
- ossrh
- https://oss.sonatype.org/content/repositories/snapshots
- ossrh
- https://oss.sonatype.org/service/local/staging/deploy/maven2
+ org.apache.maven.plugins
+ maven-project-info-reports-plugin
+ 2.7
+ false
+ maven-javadoc-plugin
+ javadoc
+ release
+ maven-source-plugin
+ attach-sources
+ jar-no-fork
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 1.6
+ sign-artifacts
+ verify
+ sign
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.6.7
+ true
+ ossrh
+ https://oss.sonatype.org/
+ true
+ ossrh
+ https://oss.sonatype.org/content/repositories/snapshots
+ ossrh
+ https://oss.sonatype.org/service/local/staging/deploy/maven2
diff --git a/src/main/java/org/opensextant/xtext/XText.java b/src/main/java/org/opensextant/xtext/XText.java
index 4f04ecb..f0f29c8 100644
--- a/src/main/java/org/opensextant/xtext/XText.java
+++ b/src/main/java/org/opensextant/xtext/XText.java
@@ -1,20 +1,3 @@
- *
- * Copyright 2012-2013 The MITRE Corporation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
/// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
// _____ ____ __ __
@@ -28,6 +11,7 @@
// \/_/
// OpenSextant XText
+// Copyright 2012-2021 MITRE
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
package org.opensextant.xtext;
@@ -63,6 +47,7 @@
import org.opensextant.xtext.converters.TextTranscodingConverter;
import org.opensextant.xtext.converters.TikaHTMLConverter;
import org.opensextant.xtext.converters.WebArchiveConverter;
+import org.opensextant.xtext.converters.OfficeConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -126,16 +111,16 @@ public PathManager getPathManager() {
private final int maxHTMLBuffer = 5 * maxBuffer;
private long maxFileSize = FILE_SIZE_LIMIT;
- protected Set archiveFileTypes = new HashSet();
+ protected Set archiveFileTypes = new HashSet<>();
- public static Map converters = new HashMap();
+ public static Map converters = new HashMap<>();
private Converter defaultConversion;
private Converter embeddedConversion;
- private final Set requestedFileTypes = new HashSet();
- private final Set ignoreFileTypes = new HashSet();
+ private final Set requestedFileTypes = new HashSet<>();
+ private final Set ignoreFileTypes = new HashSet<>();
private boolean allowNoExtension = false;
@@ -682,14 +667,8 @@ public ConvertedDocument convertFile(File input, ConvertedDocument parent) throw
long t2 = System.currentTimeMillis();
int duration = (int) (t2 - t1);
if (textDoc != null) {
- // Buffer can be null. If you got this far, you are interested
- // in the file, as it passed
- // all filters above. Return the document with whatever metadata
- // it found.
- // if (textDoc.buffer == null) {
- // throw new
- // IOException("Engineering error: Doc converted, but converter failed to setText()");
- // }
+ // Buffer can be null. If you got this far, you are interested in the file, as it passed
+ // all filters above. Return the document with whatever metadata it found.
if (paths.isSaving() && textDoc.is_converted) {
// Get Parent info in there.
if (parent != null) {
@@ -935,7 +914,9 @@ public void setup() throws IOException {
- MessageConverter emailParser = new MessageConverter();
+ boolean useMSOffice = false;
+ Converter emailParser = useMSOffice ? new OfficeConverter() : new MessageConverter();
mimetype = "eml";
if (requestedFileTypes.contains(mimetype)) {
converters.put(mimetype, emailParser);
diff --git a/src/main/java/org/opensextant/xtext/converters/OfficeConverter.java b/src/main/java/org/opensextant/xtext/converters/OfficeConverter.java
new file mode 100644
index 0000000..5b5445e
--- /dev/null
+++ b/src/main/java/org/opensextant/xtext/converters/OfficeConverter.java
@@ -0,0 +1,64 @@
+package org.opensextant.xtext.converters;
+import java.io.IOException;
+import java.io.InputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.opensextant.xtext.ConvertedDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+public class OfficeConverter extends ConverterAdapter {
+ /** Logger named after the runtime class of this converter instance. */
+ protected Logger logger = LoggerFactory.getLogger(getClass());
+ /** Tika OfficeParser that conversionImplementation() delegates the parsing to. */
+ private OfficeParser parser = new OfficeParser();
+ /** Creates a converter; no additional configuration is performed here. */
+ public OfficeConverter() { }
+ /**
+  * Parses the input with Tika's OfficeParser and returns the extracted body text
+  * together with the essential metadata (title, encoding, creation date, author).
+  * The input stream is always closed by this method, whether parsing succeeds or fails.
+  *
+  * @param input input stream of the document content; closed before this method returns
+  * @param doc   file the content came from; used to construct the ConvertedDocument
+  * @return ConvertedDocument carrying the extracted text and metadata, marked as converted
+  * @throws IOException on IO failure with stream or conversion of content
+  */
+ @Override
+ protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc)
+         throws IOException {
+     Metadata metadata = new Metadata();
+     ParseContext ctx = new ParseContext();
+     BodyContentHandler handler = new BodyContentHandler();
+     try {
+         parser.parse(input, handler, metadata, ctx);
+     } catch (NoClassDefFoundError classErr) {
+         // A missing parser dependency surfaces as an Error, not an Exception; report it as IO failure.
+         throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
+     } catch (Exception xerr) {
+         throw new IOException("Unable to parse content", xerr);
+     } finally {
+         input.close();
+     }
+     /* Construct a response */
+     ConvertedDocument textdoc = new ConvertedDocument(doc);
+     /* Add essential metadata */
+     textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE));
+     textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
+     textdoc.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED));
+     textdoc.addAuthor(metadata.get(TikaCoreProperties.CREATOR));
+     /* Attach the body text collected by the handler; without this the parsed content is discarded */
+     textdoc.setText(handler.toString());
+     /* Mark the document as converted */
+     textdoc.is_converted = true;
+     return textdoc;
+ }
diff --git a/src/test/java/OfficeParserTool.java b/src/test/java/OfficeParserTool.java
new file mode 100644
index 0000000..34d4721
--- /dev/null
+++ b/src/test/java/OfficeParserTool.java
@@ -0,0 +1,44 @@
+import org.opensextant.xtext.ConvertedDocument;
+import org.opensextant.xtext.converters.MessageConverter;
+import org.opensextant.xtext.converters.OfficeConverter;
+public class OfficeParserTool {
+ /*
+  * TODO: Both MessageConverter and OfficeConverter are failing on basic .EML and .MSG files.
+  * no standards....
+  */
+ /**
+  * Runs OfficeConverter and MessageConverter against the same file and prints the
+  * outcome of each, so the two parsers can be compared side by side.
+  *
+  * @param args args[0] is the path of the message file to convert
+  */
+ public static void main(String[] args) {
+     // Without this check a missing argument is reported as a conversion failure by both converters.
+     if (args == null || args.length < 1) {
+         System.err.println("Usage: OfficeParserTool <message-file>");
+         return;
+     }
+     OfficeConverter converter = new OfficeConverter();
+     String msMsg, mimeMsg = null;
+     ConvertedDocument msdoc = null, mimedoc = null;
+     try {
+         msdoc = converter.convert(args[0]);
+         msMsg = "success - " + msdoc.getProperty("title");
+     } catch (Exception err) {
+         // Failure is expected for some inputs; keep the message so both results can be printed.
+         msMsg = err.getMessage();
+     }
+     try {
+         mimedoc = new MessageConverter().convert(args[0]);
+         mimeMsg = "success - " + mimedoc.getProperty("title");
+     } catch (Exception err) {
+         mimeMsg = err.getMessage();
+     }
+     System.out.println("MS OfficeConverter\n\tResult:" + msMsg);
+     if (msdoc != null) {
+         System.out.println("\tDoc " + msdoc.toString());
+     }
+     System.out.println("MIME MessageConverter\n\tResult:" + mimeMsg);
+     if (mimedoc != null) {
+         System.out.println("\tDoc " + mimedoc.toString());
+     }
+ }
diff --git a/src/test/java/org/opensextant/xtext/converters/test/TestOfficeMail.java b/src/test/java/org/opensextant/xtext/converters/test/TestOfficeMail.java
new file mode 100644
index 0000000..f091991
--- /dev/null
+++ b/src/test/java/org/opensextant/xtext/converters/test/TestOfficeMail.java
@@ -0,0 +1,40 @@
+package org.opensextant.xtext.converters.test;
+import static org.junit.Assert.*;
+import java.io.File;
+import java.io.IOException;
+import org.apache.commons.io.FileUtils;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.opensextant.xtext.ConvertedDocument;
+import org.opensextant.xtext.converters.OfficeConverter;
+public class TestOfficeMail {
+ /** Class-level temporary folder in which the test message file is created. */
+ @ClassRule
+ public static final TemporaryFolder TEMP_DIR = new TemporaryFolder();
+ /** Temporary copy of the .eml test resource; assigned in setupTemporaryFolder(). */
+ private static File TEST_FILE = null;
+ /**
+  * Copies the .eml test resource into the temporary folder so the converter can be
+  * exercised against a real file.
+  *
+  * @throws IOException if the resource is missing from the classpath or cannot be copied
+  */
+ @BeforeClass
+ public static void setupTemporaryFolder() throws IOException {
+     final String resource = "mimeEmailWithAttachmentsTest.eml";
+     TEST_FILE = TEMP_DIR.newFile(resource);
+     // getResourceAsStream returns null for a missing resource; fail with a clear message instead of an NPE.
+     java.io.InputStream content = MessageConverterTest.class.getResourceAsStream(resource);
+     if (content == null) {
+         throw new IOException("Test resource not found on classpath: " + resource);
+     }
+     FileUtils.copyInputStreamToFile(content, TEST_FILE);
+ }
+ /**
+  * Converts the temporary .eml file with OfficeConverter and checks that a document is returned.
+  * NOTE(review): the test annotation below is commented out, presumably because OfficeConverter
+  * currently fails on .eml input (see the TODO in OfficeParserTool) -- confirm before re-enabling.
+  */
+ // @Test
+ public void testMailMessageParser() {
+     OfficeConverter converter = new OfficeConverter();
+     ConvertedDocument doc;
+     try {
+         doc = converter.convert(TEST_FILE);
+     } catch (Exception err) {
+         // Keep the cause attached so the test report shows why conversion failed.
+         throw new AssertionError("EML conversion failed", err);
+     }
+     assertNotNull("EML conversion returned no document", doc);
+ }