diff --git a/etc/tika-config.xml b/etc/tika-config.xml index 92406b3..2357d52 100644 --- a/etc/tika-config.xml +++ b/etc/tika-config.xml @@ -12,6 +12,7 @@ + diff --git a/pom.xml b/pom.xml index 6ad95c4..e423644 100644 --- a/pom.xml +++ b/pom.xml @@ -34,9 +34,9 @@ - 2.0.3 - 2.5.0 - 2.19.0 + 2.0.6 + 2.7.0 + 2.20.0 @@ -277,6 +277,11 @@ tika-parser-pdf-module ${tika.version} + + org.apache.tika + tika-parser-image-module + ${tika.version} + diff --git a/src/main/java/org/opensextant/xtext/ConvertedDocument.java b/src/main/java/org/opensextant/xtext/ConvertedDocument.java index b34af59..093487c 100644 --- a/src/main/java/org/opensextant/xtext/ConvertedDocument.java +++ b/src/main/java/org/opensextant/xtext/ConvertedDocument.java @@ -482,9 +482,8 @@ public void setDefaultID() throws IOException, NoSuchAlgorithmException { * The whole point of this mess: get the text from the original. It is set here and line endings normalized to unix line endings, \n * * @param buf textual data for this document object - * @throws UnsupportedEncodingException on err */ - public void setText(String buf) throws UnsupportedEncodingException { + public void setText(String buf) { this.buffer = buf; if (StringUtils.isBlank(buffer)) { @@ -503,7 +502,7 @@ public void setText(String buf) throws UnsupportedEncodingException { textpath = this.filepath; } - addNumberProperty("textsize", (long) buffer.length()); + addNumberProperty("textsize", buffer.length()); } /** diff --git a/src/main/java/org/opensextant/xtext/PathManager.java b/src/main/java/org/opensextant/xtext/PathManager.java index 160fd8a..66c80ab 100644 --- a/src/main/java/org/opensextant/xtext/PathManager.java +++ b/src/main/java/org/opensextant/xtext/PathManager.java @@ -610,7 +610,7 @@ public static String getEmbeddedPath(String container, String item) { * @param filepath path to test * @return true if file parent is "/xtext/" or "\xtext\, case sensitive is found anywhere in path. */ - public static final boolean isXTextCache(String filepath) { + public static boolean isXTextCache(String filepath) { if (filepath.contains(DEFAULT_EMBED_FOLDER_IN_PATH)) { return true; } @@ -627,7 +627,7 @@ public static final boolean isXTextCache(String filepath) { * @param obj path to test. * @return true if file parent is "xtext", case sensitive. */ - public static final boolean isXTextCache(File obj) { + public static boolean isXTextCache(File obj) { return DEFAULT_EMBED_FOLDER.equals(obj.getParentFile().getName()); } diff --git a/src/main/java/org/opensextant/xtext/XText.java b/src/main/java/org/opensextant/xtext/XText.java index ab423fd..049833d 100644 --- a/src/main/java/org/opensextant/xtext/XText.java +++ b/src/main/java/org/opensextant/xtext/XText.java @@ -687,9 +687,8 @@ public ConvertedDocument convertFile(File input, ConvertedDocument parent) throw * is trapped in loop * * @param input the input - * @throws IOException on err */ - public void convertFolder(File input) throws IOException { + public void convertFolder(File input) { java.util.Collection files = FileUtils.listFiles(input, new SuffixFileFilter(fileFilters, IOCase.INSENSITIVE), FileFilterUtils.trueFileFilter()); for (File f : files) { diff --git a/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java b/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java index d9c5805..81548e1 100644 --- a/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java +++ b/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java @@ -22,6 +22,7 @@ import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; @@ -31,6 +32,9 @@ import org.opensextant.util.FileUtility; import org.opensextant.util.TextUtils; import org.opensextant.xtext.ConvertedDocument; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; /** * Default conversion is almost a pass through from Tika's auto parser and BodyContentHandler. @@ -41,6 +45,8 @@ */ public class DefaultConverter extends ConverterAdapter { + private final Logger log = LoggerFactory.getLogger(getClass()); + /* 1 MB of text from a given document */ public final static int MAX_TEXT_SIZE = 0x100000; private final Detector detector = new DefaultDetector(); @@ -77,8 +83,10 @@ protected ConvertedDocument conversionImplementation(InputStream input, java.io. parser.parse(input, handler, metadata, ctx); } catch (NoClassDefFoundError classErr) { throw new IOException("Unable to parse content due to Tika misconfiguration", classErr); - } catch (Exception xerr) { - throw new IOException("Unable to parse content", xerr); + } catch (TikaException e1) { + throw new IOException("Tika: Unable to parse content", e1); + } catch (SAXException e2) { + throw new IOException("SAX: Unable to parse content", e2); } ConvertedDocument textdoc = new ConvertedDocument(doc); diff --git a/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java b/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java index d019855..fddab6f 100644 --- a/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java +++ b/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java @@ -23,9 +23,7 @@ import java.text.ParseException; import java.util.Arrays; import java.util.Collections; -import java.util.HashSet; import java.util.List; -import java.util.Set; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; @@ -58,12 +56,7 @@ public class ImageMetadataConverter extends ConverterAdapter { private final Logger logger = LoggerFactory.getLogger(getClass()); private boolean emitMinimalText = true; - public final static String[] usefulFields = { "geo", "gps", "creation", "date", "model" }; - - private final static Set usefulFieldsSet = new HashSet<>(); - static { - usefulFieldsSet.addAll(Arrays.asList(usefulFields)); - } + public final static String[] usefulFields = {"geo", "gps", "creation", "date", "model"}; public ImageMetadataConverter() { ctx.set(Parser.class, parser); @@ -131,6 +124,7 @@ protected ConvertedDocument conversionImplementation(InputStream in, File doc) StringBuilder buf = new StringBuilder(); BodyContentHandler handler = new BodyContentHandler(); + // String type = "Image"; String objName = null; if (doc != null) { @@ -188,7 +182,7 @@ protected ConvertedDocument conversionImplementation(InputStream in, File doc) // Location if available. if (lat != null && lon != null) { - logger.info("Found a location LAT={} LON={}", lat, lon); + logger.debug("Found a location LAT={} LON={}", lat, lon); // imgDoc.addProperty("location", String.format("%2.8f,%3.8f", )); imgDoc.addUserProperty("location", String.format("%s, %s", lat, lon)); try { diff --git a/src/main/java/org/opensextant/xtext/converters/MessageConverter.java b/src/main/java/org/opensextant/xtext/converters/MessageConverter.java index 44dbd51..8cceb0e 100644 --- a/src/main/java/org/opensextant/xtext/converters/MessageConverter.java +++ b/src/main/java/org/opensextant/xtext/converters/MessageConverter.java @@ -79,7 +79,6 @@ public class MessageConverter extends ConverterAdapter { protected Logger logger = LoggerFactory.getLogger(getClass()); private final Session noSession = Session.getDefaultInstance(new Properties()); private int attachmentNumber = 0; - private final List textEncodings = new LinkedList<>(); private Converter payloadConverter = new TikaHTMLConverter(false); /** @@ -91,7 +90,6 @@ public class MessageConverter extends ConverterAdapter { protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException { attachmentNumber = 0; - textEncodings.clear(); try { // Connect to the message file MimeMessage msg = new MimeMessage(noSession, in); @@ -268,8 +266,6 @@ public void parseMessage(Part bodyPart, ConvertedDocument parent, StringBuilder try { PartMetadata meta = new PartMetadata(bodyPart); - // String charset = (meta.charset == null ? "UTF-8" : meta.charset); - textEncodings.add(meta.charset); String filename = bodyPart.getFileName(); String fileext = meta.getPossibleFileExtension();