o19s · epugh · Sep 14, 2020 · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024
diff --git a/README.md b/README.md
@@ -54,22 +54,34 @@ From the `./ocr/` directory, there are some Powershell ( ;-) ) scripts to recrea
 
 1. `cd ./ocr`
 
-1. Make sure you have Tesseract installed.  `brew install tesseract` on OSX.
+1. There are three places where the OCR can run: on the deployed demo site pdf-discovery-demo.dev.o19s.com, on your locally deployed Docker service, or directly on your computer.
 
-1. Check the `./tika-properties/.../TesseractOCRConfig.properties` file, make sure it points to your Tesseract setup.
+1. Look at the file `./extract.ps1` to see which of the three options above is actually being used for the extraction.
+
+1. If you are running on your local computer, first make sure you have Tesseract installed.  `brew install tesseract` on OSX.
+
+1. Then check the `./tika-properties/.../TesseractOCRConfig.properties` file, make sure it points to your Tesseract setup.
 
 1. Run the extraction process, creating the working docs in the `/extracts` directory from the PDF's in `/files`.
 
 ```
-pwsh extract-directory.ps1 ./files
+pwsh extract-directory.ps1 ./files3 ./extracts3
 ```
 
 1. Create Solr documents.
 
 ```
-pwsh create-solr-docs.ps1 ./extracts ./files ./docs_for_solr/
+pwsh create-solr-docs.ps1 ./extracts3 ./files3 ./docs_for_solr3/
 ```
 
+1. Load the Solr documents into Solr.
+
+Run the load script:
+
+```
+./init/load_sample_files.sh ./docs_for_solr3 http://localhost:8983/solr/documents/update
+```
+
 ### Interested in manually extracting content from Tika Server?
 
 From the `./ocr/` directory run:

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -26,7 +26,7 @@ services:
       - ./volumes/solr_backup:/solr_backup
     links:
       - zookeeper
-    command: "bash -c '/opt/solr/bin/solr start -f -z zookeeper:2181 -Dbootstrap_confdir=/solr-config/index/conf'"
+    command: "bash -c './post-hooks.sh & /opt/solr/bin/solr start -f -z zookeeper:2181 -Denable.packages=true -Dbootstrap_confdir=/solr-config/index/conf'"
 
   solr-proxy:
     build: ./solr-proxy

diff --git a/ocr/extract-directory.ps1 b/ocr/extract-directory.ps1
@@ -17,6 +17,6 @@ $pdf_files = Get-ChildItem -Path $source_directory –Recurse | Where-Object {$_
 foreach ($pdf_file in $pdf_files) {
 
   Write-Host $pdf_file
-  Invoke-Expression "./extract.ps1 $pdf_file $extracts_directory"
+  Invoke-Expression "./extract2.ps1 $pdf_file $extracts_directory"
 
 }
diff --git a/ocr/extract.ps1 b/ocr/extract.ps1
@@ -28,8 +28,9 @@ if(!(Test-Path($extract_file_json))){
 
   Write-Host "About to Tika Extract PDF file $pdf_file"
 
-  $result = curl -T $pdf_file http://pdf-discovery-demo.dev.o19s.com:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr"
-  #$result = curl -T $pdf_file http://localhost:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr"
+  # Specify where we are OCR'ing the data
+  #$result = curl -T $pdf_file http://pdf-discovery-demo.dev.o19s.com:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr"
+  $result = curl -T $pdf_file http://localhost:9998/rmeta --header "X-Tika-OCRLanguage: eng" --header "X-Tika-PDFOcrStrategy: ocr_and_text_extraction" --header "X-Tika-OCRoutputType: hocr"
   #$result = java -cp tika-app-1.24.1.jar org.apache.tika.cli.TikaCLI --config=tika-config.xml --xmp --jsonRecursive --extract --pretty-print -x $pdf_file
 
   Set-Content -Path $extract_file_json -Value $result

diff --git a/ocr/init/init.sh b/ocr/init/init.sh
@@ -5,10 +5,13 @@
 #	echo "Waiting on MySQL init..."
 #	sleep 5
 #done
-#echo "Sleeping 15"
-#sleep 15
+echo "Sleeping 15"
+sleep 15
 ./wait-for-solr.sh --max-attempts 10 --wait-seconds 4 --solr-url http://solr:8983
 
+echo "Sleeping 30 more"
+sleep 30
+
 echo "Uploading security.json to ZK"
 
 java -jar ./jackhanna-0.0.4-SNAPSHOT.jar zookeeper:2181 putfile --file security.json --zkFile /security.json

diff --git a/solr/Dockerfile b/solr/Dockerfile
@@ -1,4 +1,4 @@
-FROM solr:8.11.1
+FROM solr:8.11.4
 
 # Add Tesseract
 USER root
@@ -12,8 +12,9 @@ RUN mkdir -p /home/solr/ # Cache dir for fonts from Tika.
 RUN chown -R 8983:8983 /home/solr
 
 # Add Solr customizations
-ADD lib/*.jar /opt/solr/server/solr-webapp/webapp/WEB-INF/lib/
 ADD web.xml /opt/solr/server/solr-webapp/webapp/WEB-INF
+ADD post-hooks.sh /opt/solr
+RUN chmod +x /opt/solr/post-hooks.sh
 
 COPY solr-home /solr-config
 ADD set_heap.sh /docker-entrypoint-initdb.db

diff --git a/solr/lib/offset-hl-formatter-1.0.1-solr7.1.0-SNAPSHOT.jar b/solr/lib/offset-hl-formatter-1.0.1-solr7.1.0-SNAPSHOT.jar
diff --git a/solr/lib/solr-payloads-1.0.3-solr7.1.0-SNAPSHOT.jar b/solr/lib/solr-payloads-1.0.3-solr7.1.0-SNAPSHOT.jar
diff --git a/solr/post-hooks.sh b/solr/post-hooks.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#
+# Post-start hook for the Solr container: once Solr answers, register the
+# osc package repository and install the solr-payloads package.
+# The package commands use the relative path bin/solr, so this script must
+# be started from the Solr install directory.
+
+# Stop here if Solr never becomes reachable instead of running the package commands anyway.
+wait-for-solr.sh --max-attempts 25 --wait-seconds 4 --solr-url http://solr:8983 || exit 1
+
+bin/solr package add-repo osc https://raw.githubusercontent.com/o19s/payload-component/master/repo
+bin/solr package install solr-payloads:1.1.4
diff --git a/solr/solr-home/index/conf/schema.xml b/solr/solr-home/index/conf/schema.xml
@@ -67,13 +67,13 @@
     <fieldType name="ocr" stored="false" indexed="true" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="com.o19s.payloads.Base64Encoder" />
+        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="solr-payloads:com.o19s.payloads.Base64Encoder" />
         <!-- The payload buffer works around WDF removing payloads from tokens by copying and replacing later in the chain -->
-        <filter class="com.o19s.payloads.filter.PayloadBufferFilterFactory" />
+        <filter class="solr-payloads:com.o19s.payloads.filter.PayloadBufferFilterFactory" />
         <filter class="solr.LowerCaseFilterFactory" />
         <filter class="solr.WordDelimiterFilterFactory" />
         <filter class="solr.KStemFilterFactory" />
-        <filter class="com.o19s.payloads.filter.PayloadBufferFilterFactory" />
+        <filter class="solr-payloads:com.o19s.payloads.filter.PayloadBufferFilterFactory" />
       </analyzer>
     </fieldType>
 

diff --git a/solr/solr-home/index/conf/solrconfig.xml b/solr/solr-home/index/conf/solrconfig.xml
@@ -134,7 +134,7 @@
   </admin>
 
 
-  <searchComponent class="com.o19s.payloads.component.PayloadComponent" name="payload" />
+  <searchComponent class="solr-payloads:com.o19s.payloads.component.PayloadComponent" name="payload" />
     <!-- Highlighting Component
 
        http://wiki.apache.org/solr/HighlightingParameters
@@ -169,7 +169,7 @@
       <!-- Configure the standard formatter -->
       <formatter name="html"
                  default="true"
-                 class="com.o19s.labs.OffsetFormatter">
+                 class="solr-payloads:com.o19s.hl.OffsetFormatter">
       </formatter>
 
       <!-- Configure the standard encoder -->