From 7cd7b35d760a9e0d00ffd05927f9acf99c12cad3 Mon Sep 17 00:00:00 2001 From: Marc Ubaldino Date: Tue, 16 Nov 2021 14:24:53 -0500 Subject: [PATCH] CONVERTERS: EML office or mime parsers are wonky, but test okay -- they produce results or errors and need more work -- or replacement. --- NOTICE | 4 +- pom.xml | 1183 +++++++++-------- .../java/org/opensextant/xtext/XText.java | 41 +- .../xtext/converters/OfficeConverter.java | 64 + src/test/java/OfficeParserTool.java | 44 + .../xtext/converters/test/TestOfficeMail.java | 40 + 6 files changed, 755 insertions(+), 621 deletions(-) create mode 100644 src/main/java/org/opensextant/xtext/converters/OfficeConverter.java create mode 100644 src/test/java/OfficeParserTool.java create mode 100644 src/test/java/org/opensextant/xtext/converters/test/TestOfficeMail.java diff --git a/NOTICE b/NOTICE index 0591ebb..3c4fdad 100644 --- a/NOTICE +++ b/NOTICE @@ -1,4 +1,4 @@ - Copyright 2012-2019 OpenSextant.org + Copyright 2012-2021 MITRE Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * - * (c) 2012-2014 The MITRE Corporation. All Rights Reserved. + * (c) 2012-2021 The MITRE Corporation. All Rights Reserved. * ************************************************************************** diff --git a/pom.xml b/pom.xml index 68886e2..e8eee3d 100644 --- a/pom.xml +++ b/pom.xml @@ -1,598 +1,603 @@ - 4.0.0 - org.opensextant - 3.4.0 - XText - opensextant-xponents-xtext - Content extraction simplified! Retrieve text, data and metadata from binary documents using Tika and + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 + org.opensextant + 3.5.0-SNAPSHOT + XText + opensextant-xponents-xtext + Content extraction simplified! Retrieve text, data and metadata from binary documents using Tika and similar toolkits - 2013 - https://opensextant.github.io/XText - - scm:git:https://github.com/OpenSextant/XText.git - scm:git:https://github.com/OpenSextant/XText.git - https://github.com/OpenSextant/XText.git - - - - Apache 2 - http://www.apache.org/licenses/LICENSE-2.0.txt - - - - - Marc Ubaldino - ubaldino@mitre.org - MITRE - - Lead - - - - - 1.7.30 - - - - org.opensextant - opensextant-xponents-core - 3.4.0 - - - org.opensextant - giscore - - - org.opensextant - geodesy - - - com.norconex.language - langdetect - - - org.locationtech.spatial4j - spatial4j - - - - - gnu.getopt - java-getopt - 1.0.13 - test - - - org.apache.commons - commons-lang3 - 3.12.0 - - - - commons-lang - commons-lang - 2.6 - runtime - - - com.ibm.icu - icu4j - 65.1 - - - org.apache.commons - commons-text - 1.9 - - - commons-codec - commons-codec - 1.15 - - - commons-io - commons-io - 2.8.0 - - - commons-logging - commons-logging - 1.2 - runtime - - - org.apache.commons - commons-compress - 1.20 - - - - com.pff - java-libpst - 0.9.3 - - - org.apache.tika - tika-core - 1.24.1 - - - org.apache.tika - tika-parsers - 1.24.1 - - - org.apache.sis.storage - sis-netcdf - - - pdfbox - org.apache.pdfbox - - - commons-logging - commons-logging - - - httpservices - edu.ucar - - - junrar - com.github.junrar - - - netcdf4 - edu.ucar - - - grib - edu.ucar - - - cdm - edu.ucar - - - cxf-rt-rs-client - org.apache.cxf - - - vorbis-java-core - org.gagravarr - - - vorbis-java-tika - org.gagravarr - - - sis-metadata - org.apache.sis.core - - - sis-utility - org.apache.sis.core - - - jmatio - net.sourceforge.jmatio - - - opennlp-tools - org.apache.opennlp - - - org.json - json - - - edu.usc.ir - sentiment-analysis-parser - - - org.tallison - jmatio - - - com.rometools - rome - - - org.apache.uima - uimafit-core - - - org.apache.uima - uimaj-core - - - - - net.htmlparser.jericho - jericho-html - 3.4 - - - xml-apis - xml-apis - 1.4.01 - - - org.slf4j - slf4j-api - - - junit - junit - test - - - com.sun.mail - javax.mail - 1.5.1 - - - org.apache.pdfbox - pdfbox - 2.0.22 - - - commons-logging - commons-logging - - - - - - ch.qos.logback - logback-classic - - - - org.apache.httpcomponents - fluent-hc - 4.5.13 - runtime - - - commons-logging - commons-logging - - - - - org.apache.httpcomponents - httpclient-cache - 4.5.13 - runtime - - - commons-logging - commons-logging - - - - - org.apache.httpcomponents - httpclient - 4.5.13 - - - commons-logging - commons-logging - - - - - org.apache.httpcomponents - httpcore - 4.4.14 - - - org.apache.httpcomponents - httpmime - 4.5.13 - - - joda-time - joda-time - 2.10.6 - - - de.l3s.boilerpipe - boilerpipe - 1.1.0 - - - javax.activation - activation - 1.1 - - - org.jodd - jodd-json - 5.1.5 - - - + 2013 + https://opensextant.github.io/XText + + scm:git:https://github.com/OpenSextant/XText.git + scm:git:https://github.com/OpenSextant/XText.git + https://github.com/OpenSextant/XText.git + + + + Apache 2 + http://www.apache.org/licenses/LICENSE-2.0.txt + + + + + Marc Ubaldino + ubaldino@mitre.org + MITRE + + Lead + + + + + 1.7.30 + - - - junit - junit - 4.13.1 - test - - - - org.slf4j - slf4j-api - ${slf4j.version} - - - ch.qos.logback - logback-classic - 1.2.3 - + + org.opensextant + opensextant-xponents-core + 3.5.0-SNAPSHOT + + + org.opensextant + giscore + + + org.opensextant + geodesy + + + com.norconex.language + langdetect + + + org.locationtech.spatial4j + spatial4j + + + + + gnu.getopt + java-getopt + 1.0.13 + test + + + org.apache.commons + commons-lang3 + 3.12.0 + + + + commons-lang + commons-lang + 2.6 + runtime + + + com.ibm.icu + icu4j + 70.1 + + + org.apache.commons + commons-text + 1.9 + + + commons-codec + commons-codec + 1.15 + + + commons-io + commons-io + 2.10.0 + + + commons-logging + commons-logging + 1.2 + runtime + + + org.apache.commons + commons-compress + 1.21 + + + + com.pff + java-libpst + 0.9.3 + + + org.apache.tika + tika-core + 1.27 + + + org.apache.tika + tika-parsers + 1.27 + + + org.apache.sis.storage + sis-netcdf + + + pdfbox + org.apache.pdfbox + + + commons-logging + commons-logging + + + httpservices + edu.ucar + + + junrar + com.github.junrar + + + netcdf4 + edu.ucar + + + grib + edu.ucar + + + cdm + edu.ucar + + + cxf-rt-rs-client + org.apache.cxf + + + vorbis-java-core + org.gagravarr + + + vorbis-java-tika + org.gagravarr + + + sis-metadata + org.apache.sis.core + + + sis-utility + org.apache.sis.core + + + jmatio + net.sourceforge.jmatio + + + opennlp-tools + org.apache.opennlp + + + org.json + json + + + edu.usc.ir + sentiment-analysis-parser + + + org.tallison + jmatio + + + com.rometools + rome + + + org.apache.uima + uimafit-core + + + org.apache.uima + uimaj-core + + + + + net.htmlparser.jericho + jericho-html + 3.4 + + + xml-apis + xml-apis + 1.4.01 + + + org.slf4j + slf4j-api + + + junit + junit + test + + + com.sun.mail + javax.mail + 1.5.1 + + + org.apache.pdfbox + pdfbox + 2.0.24 + + + commons-logging + commons-logging + + + + + + ch.qos.logback + logback-classic + + + + org.apache.httpcomponents + fluent-hc + 4.5.13 + runtime + + + commons-logging + commons-logging + + + + + org.apache.httpcomponents + httpclient-cache + 4.5.13 + runtime + + + commons-logging + commons-logging + + + + + org.apache.httpcomponents + httpclient + 4.5.13 + + + commons-logging + commons-logging + + + + + org.apache.httpcomponents + httpcore + 4.4.14 + + + org.apache.httpcomponents + httpmime + 4.5.13 + + + joda-time + joda-time + 2.10.13 + + + de.l3s.boilerpipe + boilerpipe + 1.1.0 + + + javax.activation + activation + 1.1 + + + org.jodd + jodd-json + 5.1.5 + - - - - - - org.apache.maven.plugins - maven-dependency-plugin - 3.1.2 - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.1 - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.2.0 - - - org.apache.maven.plugins - maven-source-plugin - 3.2.1 - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - - org.apache.maven.plugins - maven-jar-plugin - 3.2.0 - - - attach-tests - - test-jar - - - - - - org.apache.maven.plugins - maven-checkstyle-plugin - 3.1.1 - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M4 - - - org.codehaus.mojo - findbugs-maven-plugin - 3.0.5 - - - org.sonarsource.scanner.maven - sonar-maven-plugin - 3.8.0.2131 - - - - - - maven-javadoc-plugin - -
<img alt='[OpenSextant Logo]' height='36' - width='36' - src='doc-files/opensextant-manual-logo.png'/><br>copyright - OpenSextant.org, 2013-2020 -
- true - XText - Content Extraction Simplified - false -
- - - attach-javadoc - - jar - - - -
- - org.apache.maven.plugins - maven-release-plugin - - true - true - release - deploy - - - - maven-compiler-plugin - - 1.8 - 1.8 - 1.8 - -Xlint:all,-path - true - true - - - - maven-source-plugin - - - attach-sources - - jar - - - - - - - maven-checkstyle-plugin - - checkstyle.xml - checkstyle.indentChars=4 - checkstyle-suppressions.xml - true - false - - - - - org.codehaus.mojo - findbugs-maven-plugin - - true - - - - - maven-dependency-plugin - - - default-cli - - copy-dependencies - - - - lib - runtime - test - - - - dependency-analysis - - analyze-only - - verify - - - - - - - maven-surefire-plugin - - - ${basedir}/src/test/resources - - - -
-
- - - - org.apache.maven.plugins - maven-project-info-reports-plugin - 2.7 - - - false - - - - maven-javadoc-plugin - - - - javadoc - - - - - - - - - - release - + + + + + junit + junit + 4.13.1 + test + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + ch.qos.logback + logback-classic + 1.2.3 + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.1.2 + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.2.0 + + + org.apache.maven.plugins + maven-source-plugin + 3.2.1 + + + org.apache.maven.plugins + maven-deploy-plugin + 3.0.0-M1 + + + org.apache.maven.plugins + maven-jar-plugin + 3.2.0 + + + attach-tests + + test-jar + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + 3.1.1 + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M4 + + + org.codehaus.mojo + findbugs-maven-plugin + 3.0.5 + + + org.sonarsource.scanner.maven + sonar-maven-plugin + 3.8.0.2131 + + + org.apache.maven.plugins + maven-release-plugin + 3.0.0-M4 + + + - - maven-source-plugin - - - attach-sources - - jar-no-fork - - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - sign-artifacts - verify - - sign - - - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.7 - true - - ossrh - https://oss.sonatype.org/ - true - - + + maven-javadoc-plugin + + 1.8 +
<img alt='[OpenSextant Logo]' height='36' + width='36' + src='doc-files/opensextant-manual-logo.png'/><br>copyright + OpenSextant.org, 2013-2021 +
+ true + XText - Content Extraction Simplified + false +
+ + + attach-javadoc + + jar + + + +
+ + maven-release-plugin + + true + true + release + deploy + + + + maven-compiler-plugin + + 1.8 + 1.8 + 1.8 + -Xlint:all,-path + true + true + + + + maven-source-plugin + + + attach-sources + + jar + + + + + + + maven-checkstyle-plugin + + checkstyle.xml + checkstyle.indentChars=4 + checkstyle-suppressions.xml + true + false + + + + + org.codehaus.mojo + findbugs-maven-plugin + + true + + + + + maven-dependency-plugin + + + default-cli + + copy-dependencies + + + + lib + runtime + test + + + + dependency-analysis + + analyze-only + + verify + + + + + + + maven-surefire-plugin + + + ${basedir}/src/test/resources + + +
-
-
-
- - - - ossrh - https://oss.sonatype.org/content/repositories/snapshots - - - ossrh - https://oss.sonatype.org/service/local/staging/deploy/maven2 - - + + + + + org.apache.maven.plugins + maven-project-info-reports-plugin + 2.7 + + + false + + + + maven-javadoc-plugin + + + + javadoc + + + + + + + + + + release + + + + maven-source-plugin + + + attach-sources + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + verify + + sign + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.7 + true + + ossrh + https://oss.sonatype.org/ + true + + + + + + + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2 + +
diff --git a/src/main/java/org/opensextant/xtext/XText.java b/src/main/java/org/opensextant/xtext/XText.java index 4f04ecb..f0f29c8 100644 --- a/src/main/java/org/opensextant/xtext/XText.java +++ b/src/main/java/org/opensextant/xtext/XText.java @@ -1,20 +1,3 @@ -/* - * - * Copyright 2012-2013 The MITRE Corporation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ /// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| // // _____ ____ __ __ @@ -28,6 +11,7 @@ // \/_/ // // OpenSextant XText +// Copyright 2012-2021 MITRE // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| // package org.opensextant.xtext; @@ -63,6 +47,7 @@ import org.opensextant.xtext.converters.TextTranscodingConverter; import org.opensextant.xtext.converters.TikaHTMLConverter; import org.opensextant.xtext.converters.WebArchiveConverter; +import org.opensextant.xtext.converters.OfficeConverter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -126,16 +111,16 @@ public PathManager getPathManager() { private final int maxHTMLBuffer = 5 * maxBuffer; private long maxFileSize = FILE_SIZE_LIMIT; - protected Set archiveFileTypes = new HashSet(); + protected Set archiveFileTypes = new HashSet<>(); /** * */ - public static Map converters = new HashMap(); + public static Map converters = new HashMap<>(); private Converter defaultConversion; private Converter embeddedConversion; - private final Set requestedFileTypes = new HashSet(); - private final Set ignoreFileTypes = new HashSet(); + private final Set requestedFileTypes = new HashSet<>(); + private final Set ignoreFileTypes = new HashSet<>(); private boolean allowNoExtension = false; /** @@ -682,14 +667,8 @@ public ConvertedDocument convertFile(File input, ConvertedDocument parent) throw long t2 = System.currentTimeMillis(); int duration = (int) (t2 - t1); if (textDoc != null) { - // Buffer can be null. If you got this far, you are interested - // in the file, as it passed - // all filters above. Return the document with whatever metadata - // it found. - // if (textDoc.buffer == null) { - // throw new - // IOException("Engineering error: Doc converted, but converter failed to setText()"); - // } + // Buffer can be null. If you got this far, you are interested in the file, as it passed + // all filters above. Return the document with whatever metadata it found. if (paths.isSaving() && textDoc.is_converted) { // Get Parent info in there. if (parent != null) { @@ -935,7 +914,9 @@ public void setup() throws IOException { requestedFileTypes.add("xhtml"); } - MessageConverter emailParser = new MessageConverter(); + boolean useMSOffice = false; + Converter emailParser = useMSOffice ? new OfficeConverter() : new MessageConverter(); + mimetype = "eml"; if (requestedFileTypes.contains(mimetype)) { converters.put(mimetype, emailParser); diff --git a/src/main/java/org/opensextant/xtext/converters/OfficeConverter.java b/src/main/java/org/opensextant/xtext/converters/OfficeConverter.java new file mode 100644 index 0000000..5b5445e --- /dev/null +++ b/src/main/java/org/opensextant/xtext/converters/OfficeConverter.java @@ -0,0 +1,64 @@ +package org.opensextant.xtext.converters; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.OfficeParser; +import org.apache.tika.sax.BodyContentHandler; +import org.opensextant.xtext.ConvertedDocument; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +public class OfficeConverter extends ConverterAdapter { + + + protected Logger logger = LoggerFactory.getLogger(getClass()); + private OfficeParser parser = new OfficeParser(); + + public OfficeConverter() { } + + + /** + * + * @param input input stream + * @param doc File + * @return ConvertedDocument + * @throws IOException on IO failure with stream or conversion of content + */ + @Override + protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) + throws IOException { + Metadata metadata = new Metadata(); + ParseContext ctx = new ParseContext(); + BodyContentHandler handler = new BodyContentHandler(); + + try { + parser.parse(input, handler, metadata, ctx); + } catch (NoClassDefFoundError classErr){ + throw new IOException("Unable to parse content due to Tika misconfiguration", classErr); + } catch (Exception xerr) { + throw new IOException("Unable to parse content", xerr); + } finally { + input.close(); + } + + /* Construct a response */ + ConvertedDocument textdoc = new ConvertedDocument(doc); + + /* Add essential metadata */ + textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE)); + textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING)); + textdoc.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED)); + textdoc.addAuthor(metadata.get(TikaCoreProperties.CREATOR)); + + /* Mark the document as converted */ + textdoc.is_converted = true; + return textdoc; + } + + +} diff --git a/src/test/java/OfficeParserTool.java b/src/test/java/OfficeParserTool.java new file mode 100644 index 0000000..34d4721 --- /dev/null +++ b/src/test/java/OfficeParserTool.java @@ -0,0 +1,44 @@ + +import org.opensextant.xtext.ConvertedDocument; +import org.opensextant.xtext.converters.MessageConverter; +import org.opensextant.xtext.converters.OfficeConverter; + +public class OfficeParserTool { + + /* + * TODO: Both MessageConverter and OfficeConverter are failing in basic .EML and .MSG files. + * no standards.... + */ + + + public static void main(String[] args) { + OfficeConverter converter = new OfficeConverter(); + + String msMsg, mimeMsg = null; + ConvertedDocument msdoc = null, mimedoc = null; + try { + msdoc = converter.convert(args[0]); + msMsg = "success - " + msdoc.getProperty("title"); + } catch (Exception err) { + //err.printStackTrace(); + msMsg = err.getMessage(); + } + + try { + mimedoc = new MessageConverter().convert(args[0]); + mimeMsg = "success - " + mimedoc.getProperty("title"); + } catch (Exception err) { + mimeMsg = err.getMessage(); + } + + System.out.println("MS OfficeConverter\n\tResult:" + msMsg); + if (msdoc != null) { + System.out.println("\tDoc " + msdoc.toString()); + } + + System.out.println("MIME MessageConverter\n\tResult:" + mimeMsg); + if (mimedoc != null) { + System.out.println("\tDoc " + mimedoc.toString()); + } + } +} diff --git a/src/test/java/org/opensextant/xtext/converters/test/TestOfficeMail.java b/src/test/java/org/opensextant/xtext/converters/test/TestOfficeMail.java new file mode 100644 index 0000000..f091991 --- /dev/null +++ b/src/test/java/org/opensextant/xtext/converters/test/TestOfficeMail.java @@ -0,0 +1,40 @@ +package org.opensextant.xtext.converters.test; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.IOException; + +import org.apache.commons.io.FileUtils; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.opensextant.xtext.ConvertedDocument; +import org.opensextant.xtext.converters.OfficeConverter; + +public class TestOfficeMail { + + @ClassRule + public static final TemporaryFolder TEMP_DIR = new TemporaryFolder(); + private static File TEST_FILE = null; + + @BeforeClass + public static void setupTemporaryFolder() throws IOException { + TEST_FILE = TEMP_DIR.newFile("mimeEmailWithAttachmentsTest.eml"); + FileUtils.copyInputStreamToFile( + MessageConverterTest.class.getResourceAsStream("mimeEmailWithAttachmentsTest.eml"), TEST_FILE); + } + + // @Test + public void testMailMessageParser() { + OfficeConverter converter = new OfficeConverter(); + + try { + ConvertedDocument doc = converter.convert(TEST_FILE); + } catch (Exception err) { + err.printStackTrace(); + fail("EML conversion failed"); + } + } +}