IMAGES: configure image parser support, without it, no EXIF header parsing happens. PyLint: simple code cleanup
OpenSextant · Mar 2, 2023 · 82d84a1 · 82d84a1
1 parent 115a860
commit 82d84a1
Show file tree

Hide file tree

Showing 8 changed files with 27 additions and 25 deletions.
diff --git a/etc/tika-config.xml b/etc/tika-config.xml
@@ -12,6 +12,7 @@
     <parser class="org.apache.tika.parser.microsoft.rtf.RTFParser"></parser>
     <parser class="org.apache.tika.parser.odf.OpenDocumentParser"></parser>
     <parser class="org.apache.tika.parser.pdf.PDFParser"></parser>
+    <parser class="org.apache.tika.parser.image.JpegParser"></parser>
 
   </parsers>
 </properties>
diff --git a/pom.xml b/pom.xml
@@ -34,9 +34,9 @@
     </developer>
   </developers>
   <properties>
-    <slf4j.version>2.0.3</slf4j.version>
-    <tika.version>2.5.0</tika.version>
-    <log4j.version>2.19.0</log4j.version>
+    <slf4j.version>2.0.6</slf4j.version>
+    <tika.version>2.7.0</tika.version>
+    <log4j.version>2.20.0</log4j.version>
   </properties>
   <dependencyManagement>
     <dependencies>
@@ -277,6 +277,11 @@
       <artifactId>tika-parser-pdf-module</artifactId>
       <version>${tika.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parser-image-module</artifactId>
+      <version>${tika.version}</version>
+    </dependency>
   </dependencies>
   <build>
     <pluginManagement>

diff --git a/src/main/java/org/opensextant/xtext/ConvertedDocument.java b/src/main/java/org/opensextant/xtext/ConvertedDocument.java
@@ -482,9 +482,8 @@ public void setDefaultID() throws IOException, NoSuchAlgorithmException {
      * The whole point of this mess:  get the text from the original. It is set here and line endings normalized to unix line endings, \n
      *
      * @param buf textual data for this document object
-     * @throws UnsupportedEncodingException on err
      */
-    public void setText(String buf) throws UnsupportedEncodingException {
+    public void setText(String buf) {
         this.buffer = buf;
 
         if (StringUtils.isBlank(buffer)) {
@@ -503,7 +502,7 @@ public void setText(String buf) throws UnsupportedEncodingException {
             textpath = this.filepath;
         }
 
-        addNumberProperty("textsize", (long) buffer.length());
+        addNumberProperty("textsize", buffer.length());
     }
 
     /**

diff --git a/src/main/java/org/opensextant/xtext/PathManager.java b/src/main/java/org/opensextant/xtext/PathManager.java
@@ -610,7 +610,7 @@ public static String getEmbeddedPath(String container, String item) {
      * @param filepath path to test
      * @return true if file parent is "/xtext/" or "\xtext\, case sensitive is found anywhere in path.
      */
-    public  static final boolean isXTextCache(String filepath) {
+    public  static boolean isXTextCache(String filepath) {
         if (filepath.contains(DEFAULT_EMBED_FOLDER_IN_PATH)) {
             return true;
         }
@@ -627,7 +627,7 @@ public  static final boolean isXTextCache(String filepath) {
      * @param obj path to test.
      * @return  true if file parent is "xtext", case sensitive.
      */
-    public  static final boolean isXTextCache(File obj) {
+    public  static boolean isXTextCache(File obj) {
         return DEFAULT_EMBED_FOLDER.equals(obj.getParentFile().getName());
     }
 

diff --git a/src/main/java/org/opensextant/xtext/XText.java b/src/main/java/org/opensextant/xtext/XText.java
@@ -687,9 +687,8 @@ public ConvertedDocument convertFile(File input, ConvertedDocument parent) throw
      * is trapped in loop
      *
      * @param input the input
-     * @throws IOException on err
      */
-    public void convertFolder(File input) throws IOException {
+    public void convertFolder(File input) {
         java.util.Collection<File> files = FileUtils.listFiles(input,
                 new SuffixFileFilter(fileFilters, IOCase.INSENSITIVE), FileFilterUtils.trueFileFilter());
         for (File f : files) {

diff --git a/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java b/src/main/java/org/opensextant/xtext/converters/DefaultConverter.java
@@ -22,6 +22,7 @@
 
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
@@ -31,6 +32,9 @@
 import org.opensextant.util.FileUtility;
 import org.opensextant.util.TextUtils;
 import org.opensextant.xtext.ConvertedDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
 
 /**
  * Default conversion is almost a pass through from Tika's auto parser and BodyContentHandler.
@@ -41,6 +45,8 @@
  */
 public class DefaultConverter extends ConverterAdapter {
 
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
     /* 1 MB of text from a given document */
     public final static int MAX_TEXT_SIZE = 0x100000;
     private final Detector detector = new DefaultDetector();
@@ -77,8 +83,10 @@ protected ConvertedDocument conversionImplementation(InputStream input, java.io.
             parser.parse(input, handler, metadata, ctx);
         } catch (NoClassDefFoundError classErr) {
             throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
-        } catch (Exception xerr) {
-            throw new IOException("Unable to parse content", xerr);
+        } catch (TikaException e1) {
+            throw new IOException("Tika: Unable to parse content", e1);
+        } catch (SAXException e2) {
+            throw new IOException("SAX: Unable to parse content", e2);
         }
         ConvertedDocument textdoc = new ConvertedDocument(doc);
 

diff --git a/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java b/src/main/java/org/opensextant/xtext/converters/ImageMetadataConverter.java
@@ -23,9 +23,7 @@
 import java.text.ParseException;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Set;
 
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.lang3.StringUtils;
@@ -58,12 +56,7 @@ public class ImageMetadataConverter extends ConverterAdapter {
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private boolean emitMinimalText = true;
 
-    public final static String[] usefulFields = { "geo", "gps", "creation", "date", "model" };
-
-    private final static Set<String> usefulFieldsSet = new HashSet<>();
-    static {
-        usefulFieldsSet.addAll(Arrays.asList(usefulFields));
-    }
+    public final static String[] usefulFields = {"geo", "gps", "creation", "date", "model"};
 
     public ImageMetadataConverter() {
         ctx.set(Parser.class, parser);
@@ -131,6 +124,7 @@ protected ConvertedDocument conversionImplementation(InputStream in, File doc)
         StringBuilder buf = new StringBuilder();
         BodyContentHandler handler = new BodyContentHandler();
 
+        //
         String type = "Image";
         String objName = null;
         if (doc != null) {
@@ -188,7 +182,7 @@ protected ConvertedDocument conversionImplementation(InputStream in, File doc)
 
             // Location if available.
             if (lat != null && lon != null) {
-                logger.info("Found a location LAT={} LON={}", lat, lon);
+                logger.debug("Found a location LAT={} LON={}", lat, lon);
                 // imgDoc.addProperty("location", String.format("%2.8f,%3.8f", ));
                 imgDoc.addUserProperty("location", String.format("%s, %s", lat, lon));
                 try {

diff --git a/src/main/java/org/opensextant/xtext/converters/MessageConverter.java b/src/main/java/org/opensextant/xtext/converters/MessageConverter.java
@@ -79,7 +79,6 @@ public class MessageConverter extends ConverterAdapter {
     protected Logger logger = LoggerFactory.getLogger(getClass());
     private final Session noSession = Session.getDefaultInstance(new Properties());
     private int attachmentNumber = 0;
-    private final List<String> textEncodings = new LinkedList<>();
     private Converter payloadConverter = new TikaHTMLConverter(false);
 
     /**
@@ -91,7 +90,6 @@ public class MessageConverter extends ConverterAdapter {
     protected ConvertedDocument conversionImplementation(InputStream in, File doc)
             throws IOException {
         attachmentNumber = 0;
-        textEncodings.clear();
         try {
             // Connect to the message file
             MimeMessage msg = new MimeMessage(noSession, in);
@@ -268,8 +266,6 @@ public void parseMessage(Part bodyPart, ConvertedDocument parent, StringBuilder
         try {
 
             PartMetadata meta = new PartMetadata(bodyPart);
-            // String charset = (meta.charset == null ? "UTF-8" : meta.charset);
-            textEncodings.add(meta.charset);
 
             String filename = bodyPart.getFileName();
             String fileext = meta.getPossibleFileExtension();