diff --git a/etc/tika-config.xml b/etc/tika-config.xml
index 92406b3..2357d52 100644
--- a/etc/tika-config.xml
+++ b/etc/tika-config.xml
@@ -12,6 +12,7 @@
+
diff --git a/pom.xml b/pom.xml
index 6ad95c4..e423644 100644
--- a/pom.xml
+++ b/pom.xml
@@ -34,9 +34,9 @@
- 2.0.3
- 2.5.0
- 2.19.0
+ 2.0.6
+ 2.7.0
+ 2.20.0
@@ -277,6 +277,11 @@
tika-parser-pdf-module
${tika.version}
+
+ org.apache.tika
+ tika-parser-image-module
+ ${tika.version}
+
diff --git a/src/main/java/org/opensextant/xtext/ConvertedDocument.java b/src/main/java/org/opensextant/xtext/ConvertedDocument.java
index b34af59..093487c 100644
--- a/src/main/java/org/opensextant/xtext/ConvertedDocument.java
+++ b/src/main/java/org/opensextant/xtext/ConvertedDocument.java
@@ -482,9 +482,8 @@ public void setDefaultID() throws IOException, NoSuchAlgorithmException {
* The whole point of this mess: get the text from the original. It is set here and line endings normalized to unix line endings, \n
*
* @param buf textual data for this document object
- * @throws UnsupportedEncodingException on err
*/
- public void setText(String buf) throws UnsupportedEncodingException {
+ public void setText(String buf) {
this.buffer = buf;
if (StringUtils.isBlank(buffer)) {
@@ -503,7 +502,7 @@ public void setText(String buf) throws UnsupportedEncodingException {
textpath = this.filepath;
}
- addNumberProperty("textsize", (long) buffer.length());
+ addNumberProperty("textsize", buffer.length());
}
/**
diff --git a/src/main/java/org/opensextant/xtext/PathManager.java b/src/main/java/org/opensextant/xtext/PathManager.java
index 160fd8a..66c80ab 100644
--- a/src/main/java/org/opensextant/xtext/PathManager.java
+++ b/src/main/java/org/opensextant/xtext/PathManager.java
@@ -610,7 +610,7 @@ public static String getEmbeddedPath(String container, String item) {
* @param filepath path to test
* @return true if file parent is "/xtext/" or "\xtext\, case sensitive is found anywhere in path.
*/
- public static final boolean isXTextCache(String filepath) {
+ public static boolean isXTextCache(String filepath) {
if (filepath.contains(DEFAULT_EMBED_FOLDER_IN_PATH)) {
return true;
}
@@ -627,7 +627,7 @@ public static final boolean isXTextCache(String filepath) {
* @param obj path to test.
* @return true if file parent is "xtext", case sensitive.
*/
- public static final boolean isXTextCache(File obj) {
+ public static boolean isXTextCache(File obj) {
return DEFAULT_EMBED_FOLDER.equals(obj.getParentFile().getName());
}
diff --git a/src/main/java/org/opensextant/xtext/XText.java b/src/main/java/org/opensextant/xtext/XText.java
index ab423fd..049833d 100644
--- a/src/main/java/org/opensextant/xtext/XText.java
+++ b/src/main/java/org/opensextant/xtext/XText.java
@@ -687,9 +687,8 @@ public ConvertedDocument convertFile(File input, ConvertedDocument parent) throw
* is trapped in loop
*
* @param input the input
- * @throws IOException on err
*/
- public void convertFolder(File input) throws IOException {
+ public void convertFolder(File input) {
java.util.Collection files = FileUtils.listFiles(input,
new SuffixFileFilter(fileFilters, IOCase.INSENSITIVE), FileFilterUtils.trueFileFilter());
for (File f : files) {
diff --git a/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java b/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java
index d9c5805..81548e1 100644
--- a/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java
+++ b/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java
@@ -22,6 +22,7 @@
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
@@ -31,6 +32,9 @@
import org.opensextant.util.FileUtility;
import org.opensextant.util.TextUtils;
import org.opensextant.xtext.ConvertedDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
/**
* Default conversion is almost a pass through from Tika's auto parser and BodyContentHandler.
@@ -41,6 +45,8 @@
*/
public class DefaultConverter extends ConverterAdapter {
+ private final Logger log = LoggerFactory.getLogger(getClass());
+
/* 1 MB of text from a given document */
public final static int MAX_TEXT_SIZE = 0x100000;
private final Detector detector = new DefaultDetector();
@@ -77,8 +83,10 @@ protected ConvertedDocument conversionImplementation(InputStream input, java.io.
parser.parse(input, handler, metadata, ctx);
} catch (NoClassDefFoundError classErr) {
throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
- } catch (Exception xerr) {
- throw new IOException("Unable to parse content", xerr);
+ } catch (TikaException e1) {
+ throw new IOException("Tika: Unable to parse content", e1);
+ } catch (SAXException e2) {
+ throw new IOException("SAX: Unable to parse content", e2);
}
ConvertedDocument textdoc = new ConvertedDocument(doc);
diff --git a/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java b/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java
index d019855..fddab6f 100644
--- a/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java
+++ b/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java
@@ -23,9 +23,7 @@
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
-import java.util.HashSet;
import java.util.List;
-import java.util.Set;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
@@ -58,12 +56,7 @@ public class ImageMetadataConverter extends ConverterAdapter {
private final Logger logger = LoggerFactory.getLogger(getClass());
private boolean emitMinimalText = true;
- public final static String[] usefulFields = { "geo", "gps", "creation", "date", "model" };
-
- private final static Set usefulFieldsSet = new HashSet<>();
- static {
- usefulFieldsSet.addAll(Arrays.asList(usefulFields));
- }
+ public final static String[] usefulFields = {"geo", "gps", "creation", "date", "model"};
public ImageMetadataConverter() {
ctx.set(Parser.class, parser);
@@ -131,6 +124,7 @@ protected ConvertedDocument conversionImplementation(InputStream in, File doc)
StringBuilder buf = new StringBuilder();
BodyContentHandler handler = new BodyContentHandler();
+ //
String type = "Image";
String objName = null;
if (doc != null) {
@@ -188,7 +182,7 @@ protected ConvertedDocument conversionImplementation(InputStream in, File doc)
// Location if available.
if (lat != null && lon != null) {
- logger.info("Found a location LAT={} LON={}", lat, lon);
+ logger.debug("Found a location LAT={} LON={}", lat, lon);
// imgDoc.addProperty("location", String.format("%2.8f,%3.8f", ));
imgDoc.addUserProperty("location", String.format("%s, %s", lat, lon));
try {
diff --git a/src/main/java/org/opensextant/xtext/converters/MessageConverter.java b/src/main/java/org/opensextant/xtext/converters/MessageConverter.java
index 44dbd51..8cceb0e 100644
--- a/src/main/java/org/opensextant/xtext/converters/MessageConverter.java
+++ b/src/main/java/org/opensextant/xtext/converters/MessageConverter.java
@@ -79,7 +79,6 @@ public class MessageConverter extends ConverterAdapter {
protected Logger logger = LoggerFactory.getLogger(getClass());
private final Session noSession = Session.getDefaultInstance(new Properties());
private int attachmentNumber = 0;
- private final List textEncodings = new LinkedList<>();
private Converter payloadConverter = new TikaHTMLConverter(false);
/**
@@ -91,7 +90,6 @@ public class MessageConverter extends ConverterAdapter {
protected ConvertedDocument conversionImplementation(InputStream in, File doc)
throws IOException {
attachmentNumber = 0;
- textEncodings.clear();
try {
// Connect to the message file
MimeMessage msg = new MimeMessage(noSession, in);
@@ -268,8 +266,6 @@ public void parseMessage(Part bodyPart, ConvertedDocument parent, StringBuilder
try {
PartMetadata meta = new PartMetadata(bodyPart);
- // String charset = (meta.charset == null ? "UTF-8" : meta.charset);
- textEncodings.add(meta.charset);
String filename = bodyPart.getFileName();
String fileext = meta.getPossibleFileExtension();