diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f23f11e --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +cmake-build* +.idea/ +**/build/ +**/dist/ +**/nbproject/ +/pgdumps/ +*~ +/tpctl.config diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..524e015 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,86 @@ +cmake_minimum_required(VERSION 3.5) +project(tpctools) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) #...is required... +set(CMAKE_CXX_FLAGS "-DBOOST_NO_CXX11_SCOPED_ENUMS -w") + +set(CMAKE_STATIC_LIBRARY_PREFIX "") # avoid the prefix "lib" +set(CMAKE_SHARED_LIBRARY_PREFIX "") + +SET(CMAKE_INSTALL_PREFIX /usr/local) +SET(CMAKE_SKIP_BUILD_RPATH FALSE) +SET(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + +find_package(APR REQUIRED) +include_directories(${APR_INCLUDE_DIR}) + +find_package(Lucene++ REQUIRED) +include_directories(${Lucene++_INCLUDE_DIRS}) + +find_package(Textpresso REQUIRED) +include_directories(${Textpresso_INCLUDE_DIR}) + +include_directories("/home/valerio/workspace/caltech/libtpc") + +#### Main projects #### + +add_executable(printbibfromnxmlorcasfile printbibfromnxmlorcasfile/main.cpp printbibfromnxmlorcasfile/cmdline.h) +target_link_libraries(printbibfromnxmlorcasfile uima xerces-c boost_system boost_filesystem boost_iostreams + boost_regex) + +#### Subprojects #### + +add_executable(ppm2jpg ppm2jpg/main.cpp) +target_link_libraries(ppm2jpg boost_system boost_filesystem boost_program_options pthread ${CImg_SYSTEM_LIBS}) + +add_executable(cas2index cas2index/cas2index.cpp) +target_link_libraries(cas2index ${Textpresso_LIBRARY} boost_filesystem boost_system boost_program_options lucene++) + +add_executable(updatecorpuscounter cas2index/update_corpus_counter.cpp) +target_link_libraries(updatecorpuscounter ${Textpresso_LIBRARY} boost_filesystem boost_system boost_program_options + lucene++) + +add_executable(indexmerger cas2index/index_merger.cpp lucene/CaseSensitiveAnalyzer.cpp) +target_link_libraries(indexmerger uima boost_filesystem boost_system lucene++) + + +add_executable(articles2cas articles2cas/articles2cas.cpp articles2cas/Utils.h articles2cas/Utils.cpp) +target_link_libraries(articles2cas ${Textpresso_LIBRARY} boost_filesystem boost_system boost_program_options boost_iostreams) + +add_executable(getbib getbib/getbib.cpp getbib/getbibUtils.h getbib/getbibUtils.cpp + TextpressoCentralGlobalDefinitions.h TextpressoCentralGlobals.h) +target_link_libraries(getbib lucene++ xerces-c icuuc boost_system uima boost_filesystem boost_iostreams) + +add_executable(getbib4nxml getbib/getbib4nxml.cpp getbib/getbib4nxmlUtils.h getbib/getbib4nxmlUtils.cpp + TextpressoCentralGlobalDefinitions.h TextpressoCentralGlobals.h) +target_link_libraries(getbib4nxml lucene++ xerces-c icuuc boost_system uima boost_filesystem boost_iostreams) + +add_executable(saveidstodb cas2index/saveidstodb.cpp) +target_link_libraries(saveidstodb lucene++ boost_filesystem boost_system boost_program_options ${Textpresso_LIBRARY} + db_cxx db_stl) + +#### INSTALL #### + +install(TARGETS getbib getbib4nxml + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib) + +install(TARGETS cas2index RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(TARGETS saveidstodb RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(TARGETS updatecorpuscounter RUNTIME DESTINATION 
${CMAKE_INSTALL_PREFIX}/bin) +install(PROGRAMS cas2index/create_index_multi_literatures.sh cas2index/create_single_index.sh + ppm2jpg/ppm2jpgCas.sh run_tpc_pipeline_incremental.sh + getpdfs/getpdfs.py getbibinfoforpdffromserver/download_pdfinfo.pl + getbibinfoforpdffromserver/extract_pdfbibinfo.pl + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) + +install(TARGETS articles2cas RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(PROGRAMS articles2cas/convertallarticles2cas.sh + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) + + + diff --git a/TextpressoCentralGlobalDefinitions.h b/TextpressoCentralGlobalDefinitions.h new file mode 100644 index 0000000..faf8689 --- /dev/null +++ b/TextpressoCentralGlobalDefinitions.h @@ -0,0 +1,52 @@ +// Global file containing all global definitions. + +#ifndef TEXTPRESSOCENTRALGLOBALDEFINITIONS_H +#define TEXTPRESSOCENTRALGLOBALDEFINITIONS_H + +// Are these definitions really global? Otherwise move them back to their local project. +#define PDF2TPCASDESCRIPTOR "/usr/local/uima_descriptors/TpTokenizer.xml" +#define XML2TPCASDESCRIPTOR "/usr/local/uima_descriptors/TxTokenizer.xml" +#define TPCAS2LINDEXDESCRIPTOR "/usr/local/uima_descriptors/Tpcas2Lindex.xml" + +#define AVAILABLELITERATUREFILE "/usr/local/textpresso/luceneindex/subindex.config" +#define USERUPLOADROOTDIR "/usr/local/textpresso/useruploads" + +#define PGONTOLOGYBROWSER "dbname=www-data" +#define PGONTOLOGYBROWSWERCOLUMNS "ontologybrowsercolumnnames" +#define PGONTOLOGY "dbname=www-data" +#define PGONTOLOGYTABLENAME "tpontology" +#define PGONTOLOGYTMPTABLENAME "tmptpontology" +#define PGCURATION "dbname=www-data" +#define PGCURATIONTABLENAME "tpcuration" +#define PCRELATIONSTABLENAME "pcrelations" +#define PCRELATIONSTMPTABLENAME "tmppcrelations" +#define PADCRELATIONSTABLENAME "padcrelations" +#define PADCRELATIONSTMPTABLENAME "tmppadcrelations" +#define STOPWORDTABLENAME "stopwords" +#define PGLITERATURE "dbname=www-data" +#define PGLITPREFTABLENAME "literaturepreference" +#define PGPRELOADEDCATEGORIES "dbname=www-data" +#define PGPRELOADEDCATTABLENAME "preloadedcategories" +#define PGTIPOFDAY "dbname=www-data" +#define PGTIPOFDAYTABLENAME "tipoftheday" +#define PGCURATIONFIELDS "dbname=www-data" +#define PGCURATIONFIELDSTABLENAME "curationfields" +#define PGCURATIONFORMS "dbname=www-data" +#define PGCURATIONFORMSTABLENAME "curationforms" +#define PGCURATIONDATAFROMVIEWER "dbname=www-data" +#define PGCURATIONDATAFROMVIEWERTABLENAME "curationdatafromviewer" +#define PGLISTOFONTOLOGIES "dbname=www-data" +#define PGLISTOFONTOLOGIESTABLENAME "listofontologies" +#define PGPREPOPULATION "dbname=www-data" +#define PGPREPOPULATIONTABLENAME "prepopulation" +#define AUTHIDENTITIES "dbname=www-data" +#define AUTHIDENTITIESTABLENAME "auth_identity" +#define PGLITERATUREPERMISSION "dbname=www-data" +#define PGLITERATUREPERMISSIONTABLENAME "literaturepermissions" +#define PGCUSTOMCOLORS "dbname=www-data" +#define PGCUSTOMCOLORSTABLENAME "customcolor" +#define PGDIALOGPREFERENCES "dbname=www-data" +#define PGDIALOGPREFERENCESTABLENAME "dialogpreferencestable" +#define SENTENCE_SEARCH_MAX_NUM_DISPLAY_WORDS 50 + +#endif diff --git a/TextpressoCentralGlobals.h b/TextpressoCentralGlobals.h new file mode 100644 index 0000000..66f0184 --- /dev/null +++ b/TextpressoCentralGlobals.h @@ -0,0 +1,41 @@ +// Global file containing all global definitions. + +#ifndef TEXTPRESSOCENTRALGLOBALS_H +#define TEXTPRESSOCENTRALGLOBALS_H + + +// Are these definitions really global? Otherwise move them back to their local project. 
+ +#include "TextpressoCentralGlobalDefinitions.h" + +#include + +// If a composite delimiter exists, then there cannot be another delimiter +// that is a subset of that composite token delimiter. Decompose it accordingly. +// This applies to token and sentence delimiter +UnicodeString G_initT[] = { + " ", "\n", "\t", "'", "\"", + "/", "—", "(", ")", "[", + "]", "{", "}", ":", ". ", + "; ", ", ", "! ", "? " +}; + +const int G_initT_No = 19; +UnicodeString G_initS[] = { + ".\n", "!\n", "?\n", ". ", "! ", "? ", + ".\t", "!\t", "?\t", ".<", "!<", "?<" +}; +const int G_initS_No = 12; +UnicodeString G_initP[] = {"<_pdf _image", "<_pdf _sbr", "<_pdf _hbr", + "<_pdf _fsc", "<_pdf _fnc", "<_pdf _ydiff", "<_pdf _cr", "<_pdf _page"}; +const int G_initP_No = 8; +const std::string ServerNames[] = {"http://goldturtle.caltech.edu/cgi-bin/ReceivePost.cgi", + "http://go-genkisugi.rhcloud.com/capella", "http://localhost/cgi-bin/ReceivePost.cgi"}; +const int ServerNames_No = 3; + +//const std::string G_pdftagstart("<_pdf "); +//const std::string G_pdftagend("/>"); +const UnicodeString usG_pdftagstart("<_pdf "); +const UnicodeString usG_pdftagend("/>"); + +#endif diff --git a/articles2cas/Utils.cpp b/articles2cas/Utils.cpp new file mode 100644 index 0000000..e5cbde2 --- /dev/null +++ b/articles2cas/Utils.cpp @@ -0,0 +1,230 @@ +/** + Project: libtpc + File name: Utils.cpp + + @author valerio + @version 1.0 7/26/17. +*/ + +#include "Utils.h" +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace boost::posix_time; + +string Utils::get_temp_dir_path() +{ + ptime now = boost::posix_time::microsec_clock::local_time(); + int month = static_cast (now.date().month()); + int year = static_cast (now.date().year()); + int day = static_cast (now.date().day()); + time_duration duration(now.time_of_day()); + long microseconds = duration.total_microseconds(); + long pid = getpid(); + long random = pid + microseconds; + stringstream ss; + ss << year << month << day << random; + return "/run/shm/" + ss.str(); +} + +string Utils::decompress_gzip(const string& gz_file, const string& tmp_dir) { + std::ifstream filein(gz_file.c_str(), std::ios_base::in | std::ios_base::binary); + boost::iostreams::filtering_streambuf in; + in.push(boost::iostreams::gzip_decompressor()); + in.push(filein); + int lastdot = gz_file.find_last_of("."); + int lastslash = gz_file.find_last_of("/"); + string tpFile = gz_file.substr(lastslash + 1, lastdot - lastslash - 1); + string tempFile = tmp_dir + "/" + tpFile; + std::ofstream out(tempFile.c_str()); + boost::iostreams::copy(in, out); + out.close(); + return tempFile; +} + +void Utils::write_index_descriptor(const std::string& index_path, const std::string& descriptor_path, + const std::string& tmp_conf_files_path) +{ + ofstream output(descriptor_path.c_str()); + output << "" << endl; + output << "" << endl; + output << " org.apache.uima.cpp" << endl; + output << " true " << endl; + //output << " Tpcas2Lpp" << endl; + output << " Tpcas2SingleIndex" << endl; + output << " " << endl; + //output << " Tpcas2Lpp" << endl; + output << " Tpcas2SingeIndex" << endl; + output << " Writes an XCAS to a Lucene index. " << endl; + output << " 1.0 " << endl; + output << " Textpresso " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextLuceneIndexDirectory" << endl; + output << " Directory path of Lucene index for fulltext. 
" << endl; + output << " String " << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextCaseSensitiveLuceneIndexDirectory" << endl; + output << " Directory path of case sensitive Lucene index for fulltext. " << endl; + output << " String " << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for tokens. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenCaseSensitiveLuceneIndexDirectory " << endl; + output << " Directory path of case sensitive Lucene index for tokens. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for sentences." << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceCaseSensitiveLuceneIndexDirectory " << endl; + output << " Directory path of case sensitive Lucene index for sentences." << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for lexical annotations. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalCaseSensitiveLuceneIndexDirectory " << endl; + output << " Directory path of case sensitive Lucene index for lexical annotations. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for bibliography annotations. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyCaseSensitiveLuceneIndexDirectory " << endl; + output << " Directory path of case sensitive Lucene index for bibliography annotations. 
" << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TempDirectory " << endl; + output << " temporary directory under /run/shm/ to store newindexflag " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << index_path << "/fulltext" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << index_path << "/token" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/sentence" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/lexical" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/bibliography" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextCaseSensitiveLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << index_path << "/fulltext_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenCaseSensitiveLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << index_path << "/token_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceCaseSensitiveLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/sentence_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalCaseSensitiveLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/lexical_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyCaseSensitiveLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/bibliography_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TempDirectory " << endl; + output << " " << endl; + output << " " << tmp_conf_files_path << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " x-unspecified" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output.close(); +} \ No newline at end of file diff --git a/articles2cas/Utils.h b/articles2cas/Utils.h new file mode 100644 index 0000000..2b73b94 --- /dev/null +++ b/articles2cas/Utils.h @@ -0,0 +1,39 @@ +/** + Project: libtpc + File name: Utils.h + + @author 
valerio + @version 1.0 7/26/17. +*/ + +#ifndef LIBTPC_UTILS_H +#define LIBTPC_UTILS_H + +#include + +class Utils { +public: + /*! + * generate a random path name for a tmp directory + */ + static std::string get_temp_dir_path(); + + /*! + * write a uima descriptor for an index to file + * @param index_path the path of the index + * @param descriptor_path the path of the descriptor to be created + * @param tmp_conf_files_path the path of the directory containing the temp files for the index + */ + static void write_index_descriptor(const std::string& index_path, const std::string& descriptor_path, + const std::string& tmp_conf_files_path); + + /*! + * decompress file to a new file and return file path of the latter + * @param gz_file the gx file to decompress + * @return the file path of the decompressed file + */ + static std::string decompress_gzip(const std::string & gz_file, const std::string& tmp_dir); +}; + + +#endif //LIBTPC_UTILS_H diff --git a/articles2cas/articles2cas.cpp b/articles2cas/articles2cas.cpp new file mode 100644 index 0000000..35bd7e5 --- /dev/null +++ b/articles2cas/articles2cas.cpp @@ -0,0 +1,118 @@ +/** + Project: textpressocentral + File name: articles2cas.cpp + + @author valerio + @version 1.0 7/30/17. +*/ + +#include "CASManager.h" +#include +#include +#include +#include +#include +#include "Utils.h" + +using namespace std; +using namespace boost::filesystem; +namespace po = boost::program_options; +using namespace tpc::cas; + + +void convert_dir_recursively(const string& inputDir, const string& outputDir, const string& literature, + const set& filelist_set, const set& dirlist_set, FileType fileType, + bool use_parent_dir_as_outname) { + for (directory_iterator dit(inputDir); dit != directory_iterator(); ++dit) { + if ((is_regular_file(*dit) && (dit->path().filename().string().find(".nxml.gz") != string::npos || + dit->path().filename().string().find(".pdf") != string::npos)) && ((filelist_set.empty() || + filelist_set.find(dit->path().filename().string()) != filelist_set.end()) && + (dirlist_set.empty() || dirlist_set.find(dit->path().parent_path().filename().string()) != + dirlist_set.end()))) { + if (fileType == FileType::xml) { + string decomp_file = Utils::decompress_gzip(dit->path().string(), + dit->path().parent_path().string()); + CASManager::convert_raw_file_to_cas1(decomp_file, fileType, outputDir, use_parent_dir_as_outname); + boost::filesystem::remove(decomp_file); + } else { + CASManager::convert_raw_file_to_cas1(dit->path().string(), fileType, outputDir, + use_parent_dir_as_outname); + } + } else if (is_directory(*dit) && dit->path().filename().string() != "images"){ + convert_dir_recursively(dit->path().string(), outputDir, literature, filelist_set, dirlist_set, fileType, + use_parent_dir_as_outname); + } + } +} + + +int main(int argc, const char* argv[]) { + po::options_description desc("options"); + po::positional_options_description p; + po::variables_map vm; + + // arguments + string inputDir; + string outputDir; + int fileType; + string filelist; + string dirlist; + + try { + desc.add_options() + ("help,h", "produce help message") + ("articles-input-directory,i", po::value(&inputDir)->required(), + "input directory containing articles") + ("cas-output-directory,o", po::value(&outputDir)->required(), + "directory where to write cas files") + ("input-files-type,t", po::value(&fileType)->default_value(1), + "type of files to process. 
+                 "type of files to process. 1 for pdf, 2 for xml")
+                ("dir-list,l", po::value<string>(&dirlist)->default_value(""),
+                 "optional list of directory names containing the final files to be processed. Other "
+                 "directories are ignored")
+                ("file-list,L", po::value<string>(&filelist)->default_value(""),
+                 "optional list of file names to be processed. Other files are ignored")
+                ("use_parent_dir_as_outname,p", po::bool_switch()->default_value(false),
+                 "Use parent dir name instead of file name as output name for the cas file");
+        p.add("articles-input-directory", 1);
+        p.add("cas-output-directory", 1);
+        po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
+        po::notify(vm);
+
+        if (vm.count("help")) {
+            cout << desc << endl;
+            return 1;
+        }
+    } catch (std::exception &e) {
+        if (vm.count("help")) {
+            cout << desc << endl;
+            return (EXIT_SUCCESS);
+        }
+        std::cerr << "Error: " << e.what() << "\n";
+        return (EXIT_FAILURE);
+    }
+
+    FileType ft = fileType == 1 ? FileType::pdf : FileType::xml;
+    if (is_directory(inputDir)) {
+        path p(inputDir);
+        string literature = p.filename().string();
+        create_directories(outputDir);
+        std::fstream f;
+        f.open(dirlist, std::fstream::in);
+        string line;
+        set<string> dirlist_set;
+        while (f >> line) {
+            dirlist_set.insert(line);
+        }
+        f.close();
+        f.open(filelist, std::fstream::in);
+        set<string> filelist_set;
+        while (f >> line) {
+            filelist_set.insert(line);
+        }
+        f.close();
+        convert_dir_recursively(inputDir, outputDir, literature, filelist_set, dirlist_set, ft,
+                                vm["use_parent_dir_as_outname"].as<bool>());
+    }
+}
diff --git a/articles2cas/convertallarticles2cas.sh b/articles2cas/convertallarticles2cas.sh
new file mode 100755
index 0000000..5f689b8
--- /dev/null
+++ b/articles2cas/convertallarticles2cas.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+##### convert the articles of all literatures to cas files in parallel with a maximum number of parallel processes
+
+function usage {
+    echo "usage: $(basename $0) [-pth] <articles_root_dir> <cas_output_dir>"
+    echo "  -p --num-processes     number of parallel processes"
+    echo "  -t --file-type         type of input files. 1 for pdf, 2 for xml"
+    echo "  -h --help              display help"
+    exit 1
+}
+
+if [[ ${#} -lt 1 ]]
+then
+    usage
+fi
+
+TYPE=1
+NUM_PROCESSES=1
+ARTICLES_ROOT_DIR=""
+CAS_OUT_DIR=""
+
+while [[ $# -gt 1 ]]
+do
+key="$1"
+
+case $key in
+    -t|--file-type)
+    shift # past argument
+    TYPE="$1"
+    shift
+    ;;
+    -p|--num-processes)
+    shift # past argument
+    NUM_PROCESSES="$1"
+    shift
+    ;;
+    -h|--help)
+    usage
+    ;;
+    *)
+    if [ -d $key ]
+    then
+        ARTICLES_ROOT_DIR=$key
+    else
+        usage
+    fi
+    shift
+    if [ -d $1 ]
+    then
+        CAS_OUT_DIR=$1
+    else
+        usage
+    fi
+    shift
+    ;;
+esac
+done
+
+# check for the required arguments
+if [[ ${ARTICLES_ROOT_DIR} == "" || ${CAS_OUT_DIR} == "" ]]
+then
+    usage
+fi
+
+find -L ${ARTICLES_ROOT_DIR} -maxdepth 1 -mindepth 1 -type d | xargs -n 1 -P ${NUM_PROCESSES} -I {} sh -c "basename \"{}\" | xargs -I % articles2cas -t ${TYPE} \"{}\" ${CAS_OUT_DIR}/%"

+exit 0
\ No newline at end of file
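A typical invocation of the wrapper script above might look like this (directory names are illustrative):

# convert every literature under /data/articles with 8 parallel workers, inputs are pdf
convertallarticles2cas.sh -t 1 -p 8 /data/articles /data/tpcas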
diff --git a/cas2index/cas2index.cpp b/cas2index/cas2index.cpp
new file mode 100644
index 0000000..0d98292
--- /dev/null
+++ b/cas2index/cas2index.cpp
@@ -0,0 +1,93 @@
+/**
+    Project: textpressocentral
+    File name: cas2index.cpp
+
+    @author valerio
+    @version 1.0 7/30/17.
+*/
+
+#include "IndexManager.h"
+#include <iostream>
+#include <fstream>
+#include <set>
+#include <vector>
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+
+using namespace std;
+using namespace boost::filesystem;
+namespace po = boost::program_options;
+
+int main(int argc, const char* argv[]) {
+    po::options_description desc("options");
+    po::positional_options_description p;
+    po::variables_map vm;
+
+    // arguments
+    string inputDir;
+    path inputdir;
+    string indexpath;
+    string fileList;
+    string onlyFilesList;
+    int numPapersPerIndex;
+
+    try {
+        desc.add_options()
+                ("help,h", "produce help message")
+                ("cas-input-directory,i", po::value<string>(&inputDir)->required(),
+                 "input directory containing cas files")
+                ("index-output-directory,o", po::value<string>(&indexpath)->required(),
+                 "directory where to write index")
+                ("subindex-size,s", po::value<int>(&numPapersPerIndex)->default_value(50000),
+                 "maximum number of papers per sub-index")
+                ("add-files,a", po::value<string>(&fileList),
+                 "add files listed in the provided file to the existing indices. File names must be in the form "
+                 "<literature>/<file_name>")
+                ("file-list,f", po::value<string>(&onlyFilesList),
+                 "create index using only the files provided in the list")
+                ("external,e", po::bool_switch()->default_value(false), "Create external index");
+        p.add("cas-input-directory", 1);
+        p.add("index-output-directory", 1);
+        po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
+        po::notify(vm);
+
+        if (vm.count("help")) {
+            cout << desc << endl;
+            return 1;
+        }
+        if (vm.count("index-output-directory")) {
+            inputdir = path(inputDir);
+        }
+    } catch (std::exception &e) {
+        if (vm.count("help")) {
+            cout << desc << endl;
+            return (EXIT_SUCCESS);
+        }
+        std::cerr << "Error: " << e.what() << "\n";
+        return (EXIT_FAILURE);
+    }
+    tpc::index::IndexManager indexManager(indexpath, false, true);
+    if (!fileList.empty()) {
+        std::hash<string> string_hash;
+        std::ifstream infile(fileList);
+        string filename;
+        string lit;
+        string cas_dirname;
+        vector<string> filename_arr;
+        while (std::getline(infile, filename))
+        {
+            boost::split(filename_arr, filename, boost::is_any_of("/"));
+            lit = filename_arr[0];
+            cas_dirname = filename_arr[1];
+            indexManager.remove_file_from_index(filename);
+            indexManager.add_file_to_index(inputDir + "/" + filename, numPapersPerIndex);
+        }
+    } else {
+        std::fstream f;
+        f.open(onlyFilesList, std::fstream::in);
+        string line;
+        set<string> filelist_set;
+        while (getline(f, line)) {
+            filelist_set.insert(line);
+        }
+        indexManager.create_index_from_existing_cas_dir(inputDir, filelist_set, numPapersPerIndex);
+    }
+}
diff --git a/cas2index/create_index_multi_literatures.sh b/cas2index/create_index_multi_literatures.sh
new file mode 100755
index 0000000..6cf6efb
--- /dev/null
+++ b/cas2index/create_index_multi_literatures.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+##### create indexes for all literatures in parallel with a maximum number of parallel processes
+
+function usage {
+    echo "usage: $(basename $0) [-mph] <cas_root_dir> <index_output_dir>"
+    echo "  -m --max-num-papers    maximum number of papers per index.
Sub-indexes are created when the threshold is exceeded" + echo " -p --num-processes number of parallel processes" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 1 ]] +then + usage +fi + +NUM_PAPERS=50000 +NUM_PROCESSES=1 +CAS_ROOT_DIR="" +INDEX_OUT_DIR="" + +while [[ $# -gt 1 ]] +do +key="$1" + +case $key in + -m|--max-num-papers) + shift + NUM_PAPERS="$1" + shift # past argument + ;; + -p|--num-processes) + shift # past argument + NUM_PROCESSES="$1" + shift + ;; + -h|--help) + usage + ;; + *) + if [ -d $key ] + then + CAS_ROOT_DIR=$key + else + usage + fi + shift + if [ -d $1 ] + then + INDEX_OUT_DIR=$1 + else + usage + fi + shift + ;; +esac +done + +# check for the required argument ROOT_DIR +if [[ $CAS_ROOT_DIR == "" || $INDEX_OUT_DIR == "" ]] +then + usage +fi + +find -L ${CAS_ROOT_DIR} -maxdepth 1 -mindepth 1 -type d | xargs -n 1 -P ${NUM_PROCESSES} -I {} sh -c "basename \"{}\" | xargs -I % cas2index \"{}\" $INDEX_OUT_DIR/" + +exit 0 \ No newline at end of file diff --git a/cas2index/create_single_index.sh b/cas2index/create_single_index.sh new file mode 100755 index 0000000..9a30c95 --- /dev/null +++ b/cas2index/create_single_index.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +##### create sub-indexes for single index + +function usage { + echo "usage: $(basename $0) [-mph] " + echo " -m --max-num-papers maximum number of papers per index. Sub-indexes are created with this maximum size, in parallel" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 1 ]] +then + usage +fi + +NUM_PAPERS=50000 +CAS_ROOT_DIR="" +INDEX_OUT_DIR="" + +while [[ $# -gt 1 ]] +do +key="$1" + +case $key in + -m|--max-num-papers) + shift + NUM_PAPERS="$1" + shift # past argument + ;; + -h|--help) + usage + ;; + *) + if [ -d $key ] + then + CAS_ROOT_DIR=$key + else + usage + fi + shift + if [ -d $1 ] + then + INDEX_OUT_DIR=$1 + else + usage + fi + shift + ;; +esac +done + +# check for the required argument ROOT_DIR +if [[ $CAS_ROOT_DIR == "" || $INDEX_OUT_DIR == "" ]] +then + usage +fi + +tempdir=$(mktemp -d) +for dir in ${CAS_ROOT_DIR}/*; do for subdir in "$dir"/*; do echo "$subdir"; done; done | tac | awk -F"/" '!x[$NF]++' | tac | awk 'BEGIN{FS="/"; OFS="/"}{print $(NF-1), $NF}' | split -l ${NUM_PAPERS} - ${tempdir}/file_to_index- +i=0 +for file_list in $(ls ${tempdir}) +do + mkdir ${INDEX_OUT_DIR}/tmpindex${i} + + counter=$(($i * ${NUM_PAPERS})) + echo "22 serialization::archive 12 "${counter} > ${INDEX_OUT_DIR}/tmpindex${i}/counter.dat + (export INDEX_PATH=${INDEX_OUT_DIR}/tmpindex${i}; cas2index -i ${CAS_ROOT_DIR} -o ${INDEX_OUT_DIR}/tmpindex${i} -s ${NUM_PAPERS} -f ${tempdir}/${file_list}) & + let i=$(($i + 1)) +done +wait +echo "22 serialization::archive 12 "$(cat ${tempdir}/file_to_index-* | wc -l | awk '{print $1}') > ${INDEX_OUT_DIR}/counter.dat +find ${INDEX_OUT_DIR} -type d -name tmpindex* | while read line +do + tmpnum=$(basename ${line} | sed 's/tmpindex//g') + cp -r ${line}/subindex_0 ${INDEX_OUT_DIR}/subindex_${tmpnum} +done +rm -rf ${INDEX_OUT_DIR}/tmpindex* +rm -rf ${tempdir} +updatecorpuscounter -i ${INDEX_OUT_DIR} +exit 0 \ No newline at end of file diff --git a/cas2index/index_merger.cpp b/cas2index/index_merger.cpp new file mode 100644 index 0000000..ba30397 --- /dev/null +++ b/cas2index/index_merger.cpp @@ -0,0 +1,115 @@ +/* + * File: main.cpp + * Author: liyuling + * + * Created on Dec, 2013 + */ + +#include "../../TextpressoCentralGlobalDefinitions.h" +#include "xercesc/util/XMLString.hpp" +#include "../../TpC/lucene/CaseSensitiveAnalyzer.h" +#include +#include 
+#include <lucene++/LuceneHeaders.h>
+#include <lucene++/FileUtils.h>
+#include <boost/filesystem.hpp>
+#include <iostream>
+#include <string>
+
+//#define TPCAS_2_LINDEX_VERSION "0.9.0"
+using namespace std;
+using namespace boost::filesystem;
+using namespace Lucene;
+
+void print_who() {
+    std::cout << std::endl << "Lucene index merger" << std::endl;
+    std::cout << "Build Date: " << __DATE__ << std::endl;
+}
+
+void print_help() {
+    std::cout << std::endl;
+    std::cout << "Usage: indexmerger [index1] [index2] [optimization yes|no]" << std::endl;
+    std::cout << std::endl;
+    std::cout << "it merges [index2] into [index1]; after merging, index1 is optimized if [optimization] = yes";
+    std::cout << std::endl;
+    std::cout << "both index1 and index2 need to be valid (non-empty) TextpressoCentral index structures";
+    std::cout << std::endl;
+}
+
+void mergeIndex(const string& indexpath1, const string& indexpath2, const string& optimization, bool caseSensitive) {
+    String IndexDir1 = StringUtils::toString(indexpath1.c_str());
+    IndexWriterPtr writer;
+    if (caseSensitive) {
+        writer = newLucene<IndexWriter>(FSDirectory::open(IndexDir1),
+                                        newLucene<CaseSensitiveAnalyzer>(LuceneVersion::LUCENE_30), false, // append
+                                        IndexWriter::MaxFieldLengthUNLIMITED);
+    } else {
+        writer = newLucene<IndexWriter>(FSDirectory::open(IndexDir1),
+                                        newLucene<StandardAnalyzer>(LuceneVersion::LUCENE_30), false, // append
+                                        IndexWriter::MaxFieldLengthUNLIMITED);
+    }
+    cout << "maxDoc(): " << writer->maxDoc() << endl;
+    String IndexDir2 = StringUtils::toString(indexpath2.c_str());
+    FSDirectoryPtr dir2 = FSDirectory::open(IndexDir2);
+    Collection<DirectoryPtr> indexes = Collection<DirectoryPtr>::newInstance(0);
+    indexes.add(dir2);
+    writer->addIndexesNoOptimize(indexes);
+    if (optimization == "yes") {
+        writer->optimize();
+    }
+    writer->close();
+}
+
+int main(int argc, char* argv[]) {
+    if (argc < 4) {
+        print_who();
+        print_help();
+        return (-1);
+    }
+
+    string indexpath1(argv[1]); // target index, receives the merged documents
+    string indexpath2(argv[2]); // source index to merge in
+    string optimization(argv[3]); // optimize after merge or not
+
+    if (optimization != "yes" && optimization != "no") {
+        cout << "optimization flag error" << endl;
+        return (-1);
+    }
+
+    // lowercase indexes
+    if (exists(indexpath1 + "/fulltext") && exists(indexpath2 + "/fulltext")) {
+        mergeIndex(indexpath1 + "/fulltext", indexpath2 + "/fulltext", optimization, false);
+    }
+    if (exists(indexpath1 + "/sentence") && exists(indexpath2 + "/sentence")) {
+        mergeIndex(indexpath1 + "/sentence", indexpath2 + "/sentence", optimization, false);
+    }
+    if (exists(indexpath1 + "/lexical") && exists(indexpath2 + "/lexical")) {
+        mergeIndex(indexpath1 + "/lexical", indexpath2 + "/lexical", optimization, false);
+    }
+    if (exists(indexpath1 + "/bibliography") && exists(indexpath2 + "/bibliography")) {
+        mergeIndex(indexpath1 + "/bibliography", indexpath2 + "/bibliography", optimization, false);
+    }
+
+    // case sensitive indexes
+    if (exists(indexpath1 + "/fulltext_cs") && exists(indexpath2 + "/fulltext_cs")) {
+        mergeIndex(indexpath1 + "/fulltext_cs", indexpath2 + "/fulltext_cs", optimization, true);
+    }
+    if (exists(indexpath1 + "/sentence_cs") && exists(indexpath2 + "/sentence_cs")) {
+        mergeIndex(indexpath1 + "/sentence_cs", indexpath2 + "/sentence_cs", optimization, true);
+    }
+    if (exists(indexpath1 + "/lexical_cs") && exists(indexpath2 + "/lexical_cs")) {
+        mergeIndex(indexpath1 + "/lexical_cs", indexpath2 + "/lexical_cs", optimization, true);
+    }
+    if
(exists(indexpath1 + "/bibliography_cs") && exists(indexpath2 + "/bibliography_cs")) { + mergeIndex(indexpath1 + "/bibliography_cs", indexpath2 + "/bibliography_cs", optimization, true); + } +} + diff --git a/cas2index/saveidstodb.cpp b/cas2index/saveidstodb.cpp new file mode 100644 index 0000000..536375e --- /dev/null +++ b/cas2index/saveidstodb.cpp @@ -0,0 +1,50 @@ +/** + Project: textpressocentral + File name: saveidstodb.cpp + + @author valerio + @version 1.0 10/9/17. +*/ + +#include "IndexManager.h" +#include +#include +#include +#include + +using namespace std; +using namespace boost::filesystem; +namespace po = boost::program_options; + +int main(int argc, const char* argv[]) { + po::options_description desc("options"); + po::positional_options_description p; + po::variables_map vm; + + // arguments + string inputDir; + + try { + desc.add_options() + ("help,h", "produce help message") + ("index_dir,i", po::value(&inputDir)->required(), + "index directory where to read the data and store the db file"); + p.add("cas-input-directory", 1); + po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << endl; + return 1; + } + } catch (std::exception &e) { + if (vm.count("help")) { + cout << desc << endl; + return (EXIT_SUCCESS); + } + std::cerr << "Error: " << e.what() << "\n"; + return (EXIT_FAILURE); + } + tpc::index::IndexManager indexManager(inputDir, false, false); + indexManager.save_all_doc_ids_for_sentences_to_db(); +} diff --git a/cas2index/update_corpus_counter.cpp b/cas2index/update_corpus_counter.cpp new file mode 100644 index 0000000..e3e6369 --- /dev/null +++ b/cas2index/update_corpus_counter.cpp @@ -0,0 +1,50 @@ +/** + Project: textpressocentral + File name: update_corpus_counter.cpp + + @author valerio + @version 1.0 10/06/17. +*/ + +#include "IndexManager.h" +#include +#include +#include +#include + +using namespace std; +using namespace boost::filesystem; +namespace po = boost::program_options; + +int main(int argc, const char* argv[]) { + po::options_description desc("options"); + po::positional_options_description p; + po::variables_map vm; + + // arguments + string inputDir; + + try { + desc.add_options() + ("help,h", "produce help message") + ("index_dir,i", po::value(&inputDir)->required(), + "index directory where to read the data and store the counter file"); + p.add("cas-input-directory", 1); + po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << endl; + return 1; + } + } catch (std::exception &e) { + if (vm.count("help")) { + cout << desc << endl; + return (EXIT_SUCCESS); + } + std::cerr << "Error: " << e.what() << "\n"; + return (EXIT_FAILURE); + } + tpc::index::IndexManager indexManager(inputDir, false, false); + indexManager.calculate_and_save_corpus_counter(); +} diff --git a/cmake/Modules/FindAPR.cmake b/cmake/Modules/FindAPR.cmake new file mode 100644 index 0000000..8e01ec1 --- /dev/null +++ b/cmake/Modules/FindAPR.cmake @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# - Find Apache Portable Runtime +# Find the APR includes and libraries +# This module defines +# APR_INCLUDE_DIR and APRUTIL_INCLUDE_DIR, where to find apr.h, etc. +# APR_LIBRARIES and APRUTIL_LIBRARIES, the libraries needed to use APR. +# APR_FOUND and APRUTIL_FOUND, If false, do not try to use APR. +# also defined, but not for general use are +# APR_LIBRARY and APRUTIL_LIBRARY, where to find the APR library. + +# APR first. + +FIND_PATH(APR_INCLUDE_DIR apr.h + /opt/homebrew/opt/apr/include/apr-1 + /usr/local/include/apr-1 + /usr/local/include/apr-1.0 + /usr/include/apr-1 + /usr/include/apr-1.0 + /usr/local/apr/include/apr-1 +) + +SET(APR_NAMES ${APR_NAMES} apr-1) +FIND_LIBRARY(APR_LIBRARY + NAMES ${APR_NAMES} + HINTS + /opt/homebrew/opt/apr/lib + PATHS + /usr/lib + /usr/local/lib + /usr/local/apr-1/lib + /usr/local/apr/lib + ) + +IF (APR_LIBRARY AND APR_INCLUDE_DIR) + SET(APR_LIBRARIES ${APR_LIBRARY}) + SET(APR_FOUND "YES") +ELSE (APR_LIBRARY AND APR_INCLUDE_DIR) + SET(APR_FOUND "NO") +ENDIF (APR_LIBRARY AND APR_INCLUDE_DIR) + + +IF (APR_FOUND) + IF (NOT APR_FIND_QUIETLY) + MESSAGE(STATUS "Found APR headers: ${APR_INCLUDE_DIR}") + MESSAGE(STATUS "Found APR library: ${APR_LIBRARIES}") + ENDIF (NOT APR_FIND_QUIETLY) +ELSE (APR_FOUND) + IF (APR_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find APR library") + ENDIF (APR_FIND_REQUIRED) +ENDIF (APR_FOUND) + +# Deprecated declarations. +SET (NATIVE_APR_INCLUDE_PATH ${APR_INCLUDE_DIR} ) +GET_FILENAME_COMPONENT (NATIVE_APR_LIB_PATH ${APR_LIBRARY} PATH) + +MARK_AS_ADVANCED( + APR_LIBRARY + APR_INCLUDE_DIR + ) + +# Next, APRUTIL. + +FIND_PATH(APRUTIL_INCLUDE_DIR apu.h + /opt/homebrew/opt/apr-util/include/apr-1 + /usr/local/include/apr-1 + /usr/local/include/apr-1.0 + /usr/include/apr-1 + /usr/include/apr-1.0 + /usr/local/apr/include/apr-1 +) + +SET(APRUTIL_NAMES ${APRUTIL_NAMES} aprutil-1) +FIND_LIBRARY(APRUTIL_LIBRARY + NAMES ${APRUTIL_NAMES} + HINTS + /opt/homebrew/opt/apr-util/lib + PATHS + /usr/lib + /usr/local/lib + /usr/local/apr/lib + ) + +IF (APRUTIL_LIBRARY AND APRUTIL_INCLUDE_DIR) + SET(APRUTIL_LIBRARIES ${APRUTIL_LIBRARY}) + SET(APRUTIL_FOUND "YES") +ELSE (APRUTIL_LIBRARY AND APRUTIL_INCLUDE_DIR) + SET(APRUTIL_FOUND "NO") +ENDIF (APRUTIL_LIBRARY AND APRUTIL_INCLUDE_DIR) + + +IF (APRUTIL_FOUND) + IF (NOT APRUTIL_FIND_QUIETLY) + MESSAGE(STATUS "Found APRUTIL headers: ${APRUTIL_INCLUDE_DIR}") + MESSAGE(STATUS "Found APRUTIL library: ${APRUTIL_LIBRARIES}") + ENDIF (NOT APRUTIL_FIND_QUIETLY) +ELSE (APRUTIL_FOUND) + IF (APRUTIL_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find APRUTIL library") + ENDIF (APRUTIL_FIND_REQUIRED) +ENDIF (APRUTIL_FOUND) + +# Deprecated declarations. 
+SET (NATIVE_APRUTIL_INCLUDE_PATH ${APRUTIL_INCLUDE_DIR} ) +GET_FILENAME_COMPONENT (NATIVE_APRUTIL_LIB_PATH ${APRUTIL_LIBRARY} PATH) + +MARK_AS_ADVANCED( + APRUTIL_LIBRARY + APRUTIL_INCLUDE_DIR + ) diff --git a/cmake/Modules/FindCImg.cmake b/cmake/Modules/FindCImg.cmake new file mode 100644 index 0000000..de34032 --- /dev/null +++ b/cmake/Modules/FindCImg.cmake @@ -0,0 +1,341 @@ +# - Try to find CImg lib +# +# The following variables are defined +# +# CImg_FOUND - system has CImg lib +# CImg_INCLUDE_DIRS - the CImg include directory +# CImg_SYSTEM_LIBS - external libraries that CImg uses +# CImg_SYSTEM_LIBS_DIR - external library directories +# CImg_CFLAGS - compilation flags + + +if (CImg_INCLUDE_DIR) + set(CImg_FOUND TRUE) +else (CImg_INCLUDE_DIR) + find_path(CImg_INCLUDE_DIR + NAMES CImg.h + PATHS + ${CMAKE_INSTALL_PREFIX}/include + /usr/include + ) + mark_as_advanced(CImg_INCLUDE_DIR) +endif(CImg_INCLUDE_DIR) +list(APPEND CImg_INCLUDE_DIRS + ${CImg_INCLUDE_DIR} +) + +# To use PKG_CHECK_MODULES to find some optional packages +find_package(PkgConfig) + + +# ### CIMG related stuff +# Flags to enable fast image display, using the XSHM library. +SET(CIMG_XSHM_CCFLAGS -Dcimg_use_xshm) + +# Flags to enable screen mode switching, using the XRandr library. +SET(CIMG_XRANDR_CCFLAGS -Dcimg_use_xrandr) + +# Flags to enable native support for JPEG image files, using the JPEG library. +# ( http://www.ijg.org/ ) +SET(CIMG_JPEG_CCFLAGS -Dcimg_use_jpeg) + +# Flags to enable native support for TIFF image files, using the TIFF library. +# ( http://www.libtiff.org/ ) +SET(CIMG_TIFF_CCFLAGS -Dcimg_use_tiff) + +# Flags to enable native support for PNG image files, using the PNG library. +# ( http://www.libpng.org/ ) +SET(CIMG_PNG_CCFLAGS -Dcimg_use_png) + +#Flags to enable OPENCV support (Camera) +# ( http://www.opencv.org/ ) +SET(CIMG_OPENCV_CCFLAGS -Dcimg_use_opencv) + +# Flags to enable native support for EXR image files, using the OpenEXR library. +# ( http://www.openexr.com/ ) +SET(CIMG_OPENEXR_CCFLAGS -Dcimg_use_openexr) + +# Flags to enable native support for various video files, using the FFMPEG library. +# ( http://www.ffmpeg.org/ ) +SET(CIMG_FFMPEG_CCFLAGS -Dcimg_use_ffmpeg) + +# Flags to enable native support of most classical image file formats, using the Magick++ library. +# ( http://www.imagemagick.org/Magick++/ ) +SET(CIMG_MAGICK_CCFLAGS -Dcimg_use_magick) + +# Flags to enable faster Discrete Fourier Transform computation, using the FFTW3 library +# ( http://www.fftw.org/ ) +SET(CIMG_FFTW3_CCFLAGS -Dcimg_use_fftw3) + +# Flags to enable zlib. 
+# ( http://www.zlib.net/ ) +SET(CIMG_ZLIB_CCFLAGS -Dcimg_use_zlib) + +# ### Search Additional Libraries ########## +FIND_PACKAGE(OpenCV) +FIND_PACKAGE(JPEG) +FIND_PACKAGE(TIFF) +FIND_PACKAGE(PNG) +FIND_PACKAGE(ZLIB) +FIND_PACKAGE(LAPACK) +FIND_PACKAGE(BLAS) + +PKG_CHECK_MODULES(FFTW3 fftw3) +PKG_CHECK_MODULES(OPENEXR OpenEXR) +PKG_CHECK_MODULES(MAGICK Magick++) + +# PKG_CHECK_MODULES(LIBAVCODEC libavcodec) +# PKG_CHECK_MODULES(LIBAVFORMAT libavformat) +# PKG_CHECK_MODULES(LIBSWSCALE libswscale) +# PKG_CHECK_MODULES(LIBAVUTIL libavutil) + +if(NOT WIN32) + FIND_PACKAGE(X11) + FIND_PACKAGE(Threads REQUIRED) +endif() + +# #### End of additional libraries search ########## + +### Configure Paths according to detected packages +if(TIFF_FOUND) + get_filename_component(TIFF_LIB_DIRS ${TIFF_LIBRARIES} PATH) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_TIFF_CCFLAGS}") +# link_directories(${TIFF_LIB_DIRS}) +# include_directories(${TIFF_INCLUDE_DIR}) +# SET(CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${TIFF_LIBRARIES}) + list(APPEND CImg_INCLUDE_DIRS + ${TIFF_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${TIFF_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${TIFF_LIBRARIES} + ) +endif() + +if(JPEG_FOUND) + get_filename_component(JPEG_LIB_DIRS ${JPEG_LIBRARIES} PATH) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_JPEG_CCFLAGS}") +# link_directories(${JPEG_LIB_DIRS}) +# include_directories(${JPEG_INCLUDE_DIR}) +# SET(CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${JPEG_LIBRARIES}) + list(APPEND CImg_INCLUDE_DIRS + ${JPEG_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${JPEG_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${JPEG_LIBRARIES} + ) +endif() + +if (ZLIB_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_ZLIB_CCFLAGS}") +# link_directories(${ZLIB_LIB_DIRS}) +# include_directories(${ZLIB_INCLUDE_DIR}) +# SET(CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${ZLIB_LIBRARIES}) + list(APPEND CImg_INCLUDE_DIRS + ${ZLIB_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${ZLIB_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${ZLIB_LIBRARIES} + ) + # PNG requires ZLIB + if(PNG_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_PNG_CCFLAGS}") + # link_directories(${PNG_LIB_DIRS}) + # include_directories(${PNG_INCLUDE_DIR} ) + # SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${PNG_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${PNG_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${PNG_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${PNG_LIBRARIES} + ) + endif() +endif() + +if(FFTW3_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_FFTW3_CCFLAGS}") + #link_directories( ${FFTW3_LIBRARY_DIRS} ) + #include_directories( ${FFTW3_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${FFTW3_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${FFTW3_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${FFTW3_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${FFTW3_LIBRARIES} + ) +endif() + +if(OPENEXR_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_OPENEXR_CCFLAGS}") + #link_directories( ${OPENEXR_LIBRARY_DIRS} ) + #include_directories( ${OPENEXR_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${OPENEXR_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${OPENEXR_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${OPENEXR_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${OPENEXR_LIBRARIES} + ) +endif() + +if(MAGICK_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_MAGICK_CCFLAGS}") + #link_directories( ${MAGICK_LIBRARY_DIRS} ) + #include_directories( ${MAGICK_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS 
${CImg_SYSTEM_LIBS} ${MAGICK_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${MAGICK_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${MAGICK_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${MAGICK_LIBRARIES} + ) +endif() + +if( LIBAVCODEC_FOUND AND LIBAVFORMAT_FOUND AND LIBSWSCALE_FOUND AND LIBAVUTIL_FOUND ) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_FFMPEG_CCFLAGS}") + #link_directories( ${LIBAVFORMAT_LIBRARY_DIRS} ) + #link_directories( ${LIBAVCODEC_LIBRARY_DIRS} ) + #link_directories( ${LIBSWSCALE_LIBRARY_DIRS} ) + #link_directories( ${LIBAVUTIL_LIBRARY_DIRS} ) + #include_directories( ${LIBAVFORMAT_INCLUDE_DIRS} ${LIBAVFORMAT_INCLUDE_DIRS}/libavformat) + #include_directories( ${LIBAVCODEC_INCLUDE_DIRS} ${LIBAVCODEC_INCLUDE_DIRS}/libavcodec ) + #include_directories( ${LIBSWSCALE_INCLUDE_DIRS} ${LIBSWSCALE_INCLUDE_DIRS}/libswscale) + #include_directories( ${LIBAVUTIL_INCLUDE_DIRS} ${LIBAVUTIL_INCLUDE_DIRS}/libavutil ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LIBAVFORMAT_LIBRARIES} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LIBAVCODEC_LIBRARIES} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LIBSWSCALE_LIBRARIES} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LIBAVUTIL_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${LIBAVFORMAT_INCLUDE_DIRS} ${LIBAVFORMAT_INCLUDE_DIRS}/libavformat + ${LIBAVCODEC_INCLUDE_DIRS} ${LIBAVCODEC_INCLUDE_DIRS}/libavcodec + ${LIBSWSCALE_INCLUDE_DIRS} ${LIBSWSCALE_INCLUDE_DIRS}/libswscale + ${LIBAVUTIL_INCLUDE_DIRS} ${LIBAVUTIL_INCLUDE_DIRS}/libavutil + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${LIBAVFORMAT_LIBRARY_DIRS} + ${LIBAVCODEC_LIBRARY_DIRS} + ${LIBSWSCALE_LIBRARY_DIRS} + ${LIBAVUTIL_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${LIBAVFORMAT_LIBRARIES} + ${LIBAVCODEC_LIBRARIES} + ${LIBSWSCALE_LIBRARIES} + ${LIBAVUTIL_LIBRARIES} + ) +endif() + +if(NOT APPLE) + if(NOT WIN32) + if(X11_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_XSHM_CCFLAGS} ${CIMG_XRANDR_CCFLAGS}") + SET(CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} Xext Xrandr) + endif() + endif(NOT WIN32) +endif(NOT APPLE) + +if(X11_FOUND) + #link_directories(${X11_LIB_DIRS}) + #include_directories(${X11_INCLUDE_DIR}) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${X11_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${X11_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${X11_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${X11_LIBRARIES} + ) +endif() + +if (NOT WIN32) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${CMAKE_THREAD_LIBS_INIT} ) + list(APPEND CImg_SYSTEM_LIBS + ${CMAKE_THREAD_LIBS_INIT} + ) +endif() + +if( WIN32) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} gdi32 ) + list(APPEND CImg_SYSTEM_LIBS + gdi32 + ) +endif() + +if (OpenCV_FOUND) + message("OpenCV Found") + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_OPENCV_CCFLAGS}") + #include_directories(${OpenCV_INCLUDE_DIRS}) + #link_directories(${OpenCV_LIB_DIRS}) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${OpenCV_LIBS} ) + list(APPEND CImg_INCLUDE_DIRS + ${OpenCV_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${OpenCV_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${OpenCV_LIBS} + ) +endif() + +if(LAPACK_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_LAPACK_CCFLAGS}") + #link_directories( ${LAPACK_LIBRARY_DIRS} ) + #include_directories( ${LAPACK_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LAPACK_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${LAPACK_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${LAPACK_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${LAPACK_LIBRARIES} + ) +endif() 
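+
+# Illustrative consumer of this module (commentary only): this mirrors how
+# ppm2jpg is linked in the top-level CMakeLists.txt. CIMG_CFLAGS is appended
+# to CMAKE_CXX_FLAGS at the bottom of this file, so a consumer mainly needs
+# the include dirs and system libs collected above:
+#
+#   set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+#   find_package(CImg REQUIRED)
+#   include_directories(${CImg_INCLUDE_DIRS})
+#   link_directories(${CImg_SYSTEM_LIBS_DIR})
+#   add_executable(ppm2jpg ppm2jpg/main.cpp)
+#   target_link_libraries(ppm2jpg ${CImg_SYSTEM_LIBS})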
+ +if(BLAS_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_BLAS_CCFLAGS}") + #link_directories( ${BLAS_LIBRARY_DIRS} ) + #include_directories( ${BLAS_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${BLAS_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${BLAS_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${BLAS_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${BLAS_LIBRARIES} + ) +endif() + +# Add CIMG Flags to Compilation Flags +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CIMG_CFLAGS}") + +foreach(program ${CIMG_FILES}) + add_executable(${program} ${program}.cpp) + target_link_libraries(${program} ${CImg_SYSTEM_LIBS} ) +endforeach(program) diff --git a/cmake/Modules/FindLucene++.cmake b/cmake/Modules/FindLucene++.cmake new file mode 100644 index 0000000..6bcb6b6 --- /dev/null +++ b/cmake/Modules/FindLucene++.cmake @@ -0,0 +1,103 @@ +# +# This module looks for lucene++ support +# It will define the following values +# +# LUCENEPP_INCLUDE_DIRS = LUCENEPP_INCLUDE_DIR + LUCENEPP_LIBRARY_DIR +# LUCENEPP_INCLUDE_DIR = where lucene++/Lucene.h can be found +# LUCENEPP_LIBRARY_DIR = where liblucene++.so can be found +# LUCENEPP_LIBRARIES = the libraries to link against lucene++ +# LUCENEPP_VERSION = The lucene++ version string +# LUCENEPP_FOUND = set to 1 if lucene++ is found +# + +INCLUDE(CheckSymbolExists) +#INCLUDE(FindLibraryWithDebug) + +IF(LUCENEPP_FIND_VERSION) + SET(LUCENEPP_MIN_VERSION ${LUCENEPP_FIND_VERSION}) +ELSEIF() + SET(LUCENEPP_MIN_VERSION "3.0.0") +ENDIF(LUCENEPP_FIND_VERSION) + +SET(TRIAL_LIBRARY_PATHS + $ENV{LUCENEPP_HOME}/lib${LIB_SUFFIX} + ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX} + ${CMAKE_INSTALL_PREFIX}/lib + /usr/local/lib${LIB_SUFFIX} + /usr/local/lib/${CMAKE_LIBRARY_ARCHITECTURE} + /usr/lib${LIB_SUFFIX} + /sw/lib${LIB_SUFFIX} + /usr/pkg/lib${LIB_SUFFIX} + /usr/lib64 + /usr/lib/${CMAKE_LIBRARY_ARCHITECTURE} + ) +SET(TRIAL_INCLUDE_PATHS + $ENV{LUCENEPP_HOME}/include + ${CMAKE_INSTALL_PREFIX}/include + /usr/local/include + /usr/include + /sw/include + /usr/pkg/include + ) +#FIND_LIBRARY_WITH_DEBUG(LUCENEPP_CORE_LIBRARY +# WIN32_DEBUG_POSTFIX d +# NAMES lucene++ +# PATHS ${TRIAL_LIBRARY_PATHS}) +IF (LUCENEPP_CORE_LIBRARY) + MESSAGE(STATUS "Found Lucene++ core library: ${LUCENEPP_CORE_LIBRARY}") +ENDIF (LUCENEPP_CORE_LIBRARY) +#FIND_LIBRARY_WITH_DEBUG(LUCENEPP_SHARED_LIBRARY +# WIN32_DEBUG_POSTFIX d +# NAMES lucene++-contrib +# PATHS ${TRIAL_LIBRARY_PATHS}) +IF (LUCENEPP_SHARED_LIBRARY) + MESSAGE(STATUS "Found Lucene++ contrib library: ${LUCENEPP_SHARED_LIBRARY}") +ENDIF (LUCENEPP_SHARED_LIBRARY) + +IF(LUCENEPP_CORE_LIBRARY AND LUCENEPP_SHARED_LIBRARY) + SET(LUCENEPP_LIBRARIES ${LUCENEPP_CORE_LIBRARY} ${LUCENEPP_SHARED_LIBRARY} ${Boost_SYSTEM_LIBRARY}) +ENDIF(LUCENEPP_CORE_LIBRARY AND LUCENEPP_SHARED_LIBRARY) + +FIND_PATH(LUCENEPP_INCLUDE_DIR + NAMES lucene++/Lucene.h + PATHS ${TRIAL_INCLUDE_PATHS}) + +IF (LUCENEPP_INCLUDE_DIR) + MESSAGE(STATUS "Found Lucene++ include dir: ${LUCENEPP_INCLUDE_DIR}") +ENDIF (LUCENEPP_INCLUDE_DIR) + +SET(LUCENEPP_GOOD_VERSION TRUE) + +FIND_PATH(LUCENEPP_LIBRARY_DIR + NAMES liblucene++.dylib liblucene++.so liblucene++.dll.a lucene++ + PATHS ${TRIAL_LIBRARY_PATHS} ${TRIAL_INCLUDE_PATHS} NO_DEFAULT_PATH) +IF (LUCENEPP_LIBRARY_DIR) + MESSAGE(STATUS "Found Lucene++ library dir: ${LUCENEPP_LIBRARY_DIR}") + + IF (LUCENEPP_VERSION STRLESS "${LUCENEPP_MIN_VERSION}") + MESSAGE(ERROR " Lucene++ version ${LUCENEPP_VERSION} is less than the required minimum ${LUCENEPP_MIN_VERSION}") + SET(LUCENEPP_GOOD_VERSION FALSE) + ENDIF 
(LUCENEPP_VERSION STRLESS "${LUCENEPP_MIN_VERSION}") +ENDIF (LUCENEPP_LIBRARY_DIR) + +IF(LUCENEPP_INCLUDE_DIR AND LUCENEPP_LIBRARIES AND LUCENEPP_LIBRARY_DIR AND LUCENEPP_GOOD_VERSION) + SET(LUCENEPP_FOUND TRUE) + SET(LUCENEPP_INCLUDE_DIRS ${LUCENEPP_LIBRARY_DIR} ${LUCENEPP_INCLUDE_DIR}) +ENDIF(LUCENEPP_INCLUDE_DIR AND LUCENEPP_LIBRARIES AND LUCENEPP_LIBRARY_DIR AND LUCENEPP_GOOD_VERSION) + +IF(LUCENEPP_FOUND) + IF(NOT LUCENEPP_FIND_QUIETLY) + MESSAGE(STATUS "Found Lucene++: ${LUCENEPP_LIBRARIES} version ${LUCENEPP_VERSION}") + ENDIF(NOT LUCENEPP_FIND_QUIETLY) +ELSE(LUCENEPP_FOUND) + IF(LUCENEPP_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find Lucene++.") + ENDIF(LUCENEPP_FIND_REQUIRED) +ENDIF(LUCENEPP_FOUND) + +MARK_AS_ADVANCED( + LUCENEPP_INCLUDE_DIRS + LUCENEPP_INCLUDE_DIR + LUCENEPP_LIBRARY_DIR + LUCENEPP_LIBRARIES + ) diff --git a/cmake/Modules/FindTextpresso.cmake b/cmake/Modules/FindTextpresso.cmake new file mode 100644 index 0000000..7b1e25b --- /dev/null +++ b/cmake/Modules/FindTextpresso.cmake @@ -0,0 +1,22 @@ +# find Textpresso core library + +FIND_PATH( Textpresso_INCLUDE_DIR NAMES CASManager.h IndexManager.h PATHS ENV PATH PATH_SUFFIXES + include textpresso) + +FIND_LIBRARY( Textpresso_LIBRARY NAMES textpresso PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) + +IF( Textpresso_LIBRARY ) + SET( Textpresso_FOUND TRUE ) + SET( Textpresso_LIBRARIES Textpresso_LIBRARY ) +ENDIF( Textpresso_LIBRARY) + +IF( Textpresso_FOUND ) + IF (NOT Textpresso_FIND_QUIETLY) + MESSAGE(STATUS "Found the Textpresso libraries at ${Textpresso_LIBRARY}") + MESSAGE(STATUS "Found the Textpresso headers at ${Textpresso_INCLUDE_DIR}") + ENDIF (NOT Textpresso_FIND_QUIETLY) +ELSE( Textpresso_FOUND ) + IF(Textpresso_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could NOT find Textpresso") + ENDIF(Textpresso_FIND_REQUIRED) +ENDIF(Textpresso_FOUND) \ No newline at end of file diff --git a/cmake/Modules/FindWt.cmake b/cmake/Modules/FindWt.cmake new file mode 100644 index 0000000..2928d32 --- /dev/null +++ b/cmake/Modules/FindWt.cmake @@ -0,0 +1,163 @@ +# Find Wt includes and libraries +# +# This script sets the following variables: +# +# Wt_INCLUDE_DIR +# Wt_LIBRARIES - Release libraries +# Wt_FOUND - True if release libraries found +# Wt_DEBUG_LIBRARIES - Debug libraries +# Wt_DEBUG_FOUND - True if debug libraries found +# +# To direct the script to a particular Wt installation, use the +# standard cmake variables CMAKE_INCLUDE_PATH and CMAKE_LIBRARY_PATH +# +# To use this script to find Wt, when using the new style for include files: +# #include +# #include +# #include +# +# include the following CMake snippet in your project: +# +# FIND_PACKAGE( Wt REQUIRED ) +# INCLUDE_DIRECTORIES( ${Wt_INCLUDE_DIR} ) +# TARGET_LINK_LIBRARIES( yourexe +# ${Wt_DEBUG_LIBRARY} # or {Wt_LIBRARY} +# ${Wt_HTTP_DEBUG_LIBRARY} # or {Wt_HTTP_LIBRARY} +# ${Wt_EXT_DEBUG_LIBRARY} # or {Wt_EXT_LIBRARY} +# ) +# +# To use this script to find Wt, when using the old include style: +# #include +# #include +# #include +# style of include files, change the INCLUDE_DIRECTORIES statement to: +# INCLUDE_DIRECTORIES( ${Wt_INCLUDE_DIR} ${Wt_INCLUDE_DIR}/Wt ) +# +# +# +# +# Copyright (c) 2007, Pau Garcia i Quiles, +# +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+ +FIND_PATH( Wt_INCLUDE_DIR NAMES Wt/WObject PATHS ENV PATH PATH_SUFFIXES include wt ) + +SET( Wt_FIND_COMPONENTS Release Debug ) + +FIND_LIBRARY( Wt_LIBRARY NAMES wt PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_EXT_LIBRARY NAMES wtext PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_HTTP_LIBRARY NAMES wthttp PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_TEST_LIBRARY NAMES wttest PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_FCGI_LIBRARY NAMES wtfcgi PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBO_LIBRARY NAMES wtdbo PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBOSQLITE3_LIBRARY NAMES wtdbosqlite3 PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBOPOSTGRES_LIBRARY NAMES wtdbopostgres PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBOMYSQL_LIBRARY NAMES wtdbomysql PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBOFIREBIRD_LIBRARY NAMES wtdbofirebird PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) + +FIND_LIBRARY( Wt_DEBUG_LIBRARY NAMES wtd wt PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_EXT_DEBUG_LIBRARY NAMES wtextd wtext PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_HTTP_DEBUG_LIBRARY NAMES wthttpd wthttp PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_TEST_DEBUG_LIBRARY NAMES wttestd wttest PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_FCGI_DEBUG_LIBRARY NAMES wtfcgid wtfcgi PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBO_DEBUG_LIBRARY NAMES wtdbod wtdbo PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBOSQLITE3_DEBUG_LIBRARY NAMES wtdbosqlite3d wtdbosqlite3 PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBOPOSTGRES_DEBUG_LIBRARY NAMES wtdbopostgresd wtdbopostgres PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBOMYSQL_DEBUG_LIBRARY NAMES wtdbomysqld wtdbomysql PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBOFIREBIRD_DEBUG_LIBRARY NAMES wtdbofirebirdd wtdbofirebird PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) + +IF( Wt_LIBRARY ) + IF( Wt_HTTP_LIBRARY ) + SET( Wt_FOUND TRUE ) + SET( Wt_FIND_REQUIRED_Release TRUE ) + SET( Wt_LIBRARIES ${Wt_LIBRARY} ) + + IF( Wt_FCGI_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ) + ENDIF( Wt_FCGI_LIBRARY ) + ELSE( Wt_HTTP_LIBRARY ) + IF( Wt_FCGI_LIBRARY ) + SET( Wt_FOUND TRUE ) + SET( Wt_FIND_REQUIRED_Release TRUE ) + SET( Wt_LIBRARIES ${Wt_LIBRARY} ) + ENDIF( Wt_FCGI_LIBRARY ) + ENDIF( Wt_HTTP_LIBRARY ) +ENDIF( Wt_LIBRARY ) + +IF( Wt_EXT_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_EXT_LIBRARY} ) +ENDIF( Wt_EXT_LIBRARY ) + +IF( Wt_DBO_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_DBO_LIBRARY} ) + IF( Wt_DBOSQLITE3_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_DBOSQLITE3_LIBRARY} ) + ENDIF( Wt_DBOSQLITE3_LIBRARY ) + IF( Wt_DBOPOSTGRES_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_DBOPOSTGRES_LIBRARY} ) + ENDIF( Wt_DBOPOSTGRES_LIBRARY ) + IF( Wt_DBOMYSQL_LIBRARY ) + SET( Wt_LIBRARIES 
${Wt_LIBRARIES} ${Wt_DBOMYSQL_LIBRARY} ) + ENDIF( Wt_DBOMYSQL_LIBRARY ) + IF ( Wt_DBOFIREBIRD_LIBRARY ) + SET ( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_DBOFIREBIRD_LIBRARY} ) + ENDIF ( Wt_DBOFIREBIRD_LIBRARY ) +ENDIF( Wt_DBO_LIBRARY ) + +IF( Wt_DEBUG_LIBRARY ) + IF ( Wt_HTTP_DEBUG_LIBRARY) + SET( Wt_DEBUG_FOUND TRUE ) + SET( Wt_FIND_REQUIRED_Debug TRUE ) + SET( Wt_DEBUG_LIBRARIES ${Wt_HTTP_DEBUG_LIBRARY} ${Wt_DEBUG_LIBRARY} ) + + IF( Wt_FCGI_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_FCGI_DEBUG_LIBRARY} ) + ENDIF( Wt_FCGI_DEBUG_LIBRARY ) + ELSE( Wt_HTTP_DEBUG_LIBRARY ) + IF( Wt_FCGI_DEBUG_LIBRARY ) + SET( Wt_DEBUG_FOUND TRUE ) + SET( Wt_FIND_REQUIRED_Debug TRUE ) + SET( Wt_DEBUG_LIBRARIES ${Wt_FCGI_DEBUG_LIBRARY} ${Wt_DEBUG_LIBRARY} ) + ENDIF( Wt_FCGI_DEBUG_LIBRARY ) + ENDIF( Wt_HTTP_DEBUG_LIBRARY) +ENDIF( Wt_DEBUG_LIBRARY ) + +IF( Wt_DBO_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBO_DEBUG_LIBRARY} ) + IF( Wt_DBOSQLITE3_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBOSQLITE3_DEBUG_LIBRARY} ) + ENDIF( Wt_DBOSQLITE3_DEBUG_LIBRARY ) + IF( Wt_DBOPOSTGRES_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBOPOSTGRES_DEBUG_LIBRARY} ) + ENDIF( Wt_DBOPOSTGRES_DEBUG_LIBRARY ) + IF( Wt_DBOMYSQL_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBOMYSQL_DEBUG_LIBRARY} ) + ENDIF ( Wt_DBOMYSQL_DEBUG_LIBRARY ) + IF ( Wt_DBOFIREBIRD_DEBUG_LIBRARY ) + SET (Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBOFIREBIRD_DEBUG_LIBRARY} ) + ENDIF ( Wt_DBOFIREBIRD_DEBUG_LIBRARY ) +ENDIF( Wt_DBO_DEBUG_LIBRARY ) + +IF(Wt_FOUND) + IF (NOT Wt_FIND_QUIETLY) + MESSAGE(STATUS "Found the Wt libraries at ${Wt_LIBRARIES}") + MESSAGE(STATUS "Found the Wt headers at ${Wt_INCLUDE_DIR}") + ENDIF (NOT Wt_FIND_QUIETLY) +ELSE(Wt_FOUND) + IF(Wt_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could NOT find Wt") + ENDIF(Wt_FIND_REQUIRED) +ENDIF(Wt_FOUND) + +IF(Wt_DEBUG_FOUND) + IF (NOT Wt_FIND_QUIETLY) + MESSAGE(STATUS "Found the Wt debug libraries at ${Wt_DEBUG_LIBRARIES}") + MESSAGE(STATUS "Found the Wt debug headers at ${Wt_INCLUDE_DIR}") + ENDIF (NOT Wt_FIND_QUIETLY) +ELSE(Wt_DEBUG_FOUND) + IF(Wt_FIND_REQUIRED_Debug) + MESSAGE(FATAL_ERROR "Could NOT find Wt debug libraries") + ENDIF(Wt_FIND_REQUIRED_Debug) +ENDIF(Wt_DEBUG_FOUND) + diff --git a/getbib/getallbibfiles.sh b/getbib/getallbibfiles.sh new file mode 100644 index 0000000..d8108db --- /dev/null +++ b/getbib/getallbibfiles.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +function usage { + echo "usage: $(basename $0) [p] " + echo " -p --num-proc maximum number of parallel processes" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 1 ]] +then + usage +fi + +ROOT_DIR="" +N_PROC=1 + +while [[ $# -gt 0 ]] +do +key=$1 + +case $key in + -p|--num-proc) + shift + N_PROC=$1 + shift + ;; + -h|--help) + usage + ;; + *) + if [[ -d $key ]] + then + ROOT_DIR="$key" + shift + else + usage + fi + ;; +esac +done + +# check for the required argument ROOT_DIR +if [[ ${ROOT_DIR} == "" ]] +then + usage +fi + +for corpus in ${ROOT_DIR}/* +do + if [[ "${corpus}" == "C. 
elegans" || "${corpus}" == "C.elegans Supplementals" ]] + then + getbib ${ROOT_DIR}/${corpus} + else + getbib4nxml ${ROOT_DIR}/${corpus} + fi +done \ No newline at end of file diff --git a/getbib/getbib.cpp b/getbib/getbib.cpp new file mode 100644 index 0000000..a0430ae --- /dev/null +++ b/getbib/getbib.cpp @@ -0,0 +1,235 @@ +/* + * File: main.cpp + * Author: liyuling + * + * Created on November, 2013 + */ + +//#include "../TextpressoCentralGlobals.h" +#include "../../TextpressoCentralGlobalDefinitions.h" +#include "xercesc/util/XMLString.hpp" +#include +#include +#include +#include +#include "getbibUtils.h" +#include +#include + + + +#define TPCAS_2_LINDEX_VERSION "0.9.0" + + +using namespace boost::filesystem; + +void print_who() { + std::cout << std::endl << "CAS file bib extracter" << std::endl; + std::cout << "Build Date: " << __DATE__ << std::endl; + std::cout << "Version: " << TPCAS_2_LINDEX_VERSION << std::endl; +} + +void print_help() { + std::cout << std::endl; + std::cout << "Usage: getbib [tpcas_file_directory]" << std::endl; + std::cout << std::endl; + // std::cout << " CASconcumer reads in a directory of tpcas files and adds them to the lucene index(index_location specified by user), if index_location does not exist, it will create one. "; + // std::cout << std::endl; + // std::cout << " as defined in annotator that is referenced in"; + // std::cout << std::endl; + // //std::cout << " " << TPCAS2LINDEXDESCRIPTOR; + // std::cout << " " << TPCAS2SINGLEINDEXDESCRIPTOR; + // std::cout << std::endl; +} + +void addCasFile(const char* pszInput, string indexdescriptor) { + + + std::string gzfile(pszInput); + std::cout << gzfile << std::endl; + if(gzfile.find("tpcas") == std::string::npos) + return; + + + + // string doneflagpath = "/tmp/indexerdone/"+ gzfile; + // cout << "done flag is " << doneflagpath << endl; + // if(exists(doneflagpath)) + // { + // return; + // } + // + // std::ofstream doneflag(doneflagpath.c_str()); + // doneflag << "" << endl; + // doneflag.close(); + + cout << "L43 addcas file " << pszInput << endl; + //const char * descriptor = TPCAS2LINDEXDESCRIPTOR; + const char * descriptor = indexdescriptor.c_str(); + + + + string tpcasfile = uncompressGzip(gzfile); + std::cout << "L52 tpcasfile " << tpcasfile << std::endl; + + try { + + /* Create/link up to a UIMACPP resource manager instance (singleton) */ + (void) uima::ResourceManager::createInstance("TPCAS2LINDEXAE"); + + uima::ErrorInfo errorInfo; + + uima::AnalysisEngine * pEngine + = uima::Framework::createAnalysisEngine(descriptor, errorInfo); + + + + if (errorInfo.getErrorId() != UIMA_ERR_NONE) { + std::cerr << std::endl + << " Error string : " + << uima::AnalysisEngine::getErrorIdAsCString(errorInfo.getErrorId()) + << std::endl + << " UIMACPP Error info:" << std::endl + << errorInfo << std::endl; + exit((int) errorInfo.getErrorId()); + } + uima::TyErrorId utErrorId; // Variable to store UIMACPP return codes + /* Get a new CAS */ + uima::CAS* cas = pEngine->newCAS(); + if (cas == NULL) { + std::cerr << "pEngine->newCAS() failed." 
<< std::endl; + exit(1); + } + /* process input / cas */ + try { + /* initialize from an xmicas */ + + std::cout << "L69 consumer" << tpcasfile << std::endl; + XMLCh* native = XMLString::transcode(tpcasfile.c_str()); + LocalFileInputSource fileIS(native); + XMLString::release(&native); + + std::cout << "L71 tpcas " << tpcasfile.c_str() << std::endl; + uima::XmiDeserializer::deserialize(fileIS, *cas, true); + + std::string filename(tpcasfile); + std::cout << "L77 " << filename << std::endl; + + /* process the CAS */ + + // ((uima::AnalysisEngine*) pEngine)->processAndOutputNewCASes(*cas); + + ((uima::AnalysisEngine*) pEngine)->process(*cas); + + + + } catch (uima::Exception e) { + uima::ErrorInfo errInfo = e.getErrorInfo(); + std::cerr << "Error " << errInfo.getErrorId() << " " << errInfo.getMessage() << std::endl; + std::cerr << errInfo << std::endl; + } + + + + /* call collectionProcessComplete */ + utErrorId = pEngine->collectionProcessComplete(); + /* Free annotator */ + utErrorId = pEngine->destroy(); + delete cas; + delete pEngine; + + std::remove(tpcasfile.c_str()); //delete uncompressed temp casfile + + } catch (uima::Exception e) { + std::cerr << "Exception: " << e << std::endl; + } +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + print_who(); + print_help(); + return (-1); + } + + //const char * descriptor = TPCAS2LINDEXDESCRIPTOR; + + path inputdir(argv[1]); //tpcas file dir + //string indexpath(argv[2]); //index location + string indexpath(""); + // string newOradd(argv[3]); // new/add index option + + // string indexpath("/home/lyl/Dropbox/work/lucene/cas_index"); + // string indexdescriptor("/home/lyl/Dropbox/work/textpressocentral/trunk/LuceneIndexing/descriptors/Tpcas2Lindex.xml"); + + + + // if (!exists(indexpath)) { + // cout << "creating index directory " << endl; + // create_directories(indexpath); + // create_directories(indexpath + "/fulltext"); + // // create_directories(indexpath + "/token"); + // create_directories(indexpath + "/sentence"); + // // create_directories(indexpath + "/lexical"); + // create_directories(indexpath + "/bibliography"); + // } + +// string inputpath(argv[1]); +// string donedir = "/tmp/indexerdone/" + inputpath; +// if (!exists(donedir)) { +// create_directories(donedir); +// } + + + + std::string tempDir = getTempDir(); + // newindexflag = tempDir + "/newindexflag"; + bool dir_created = false; + while (dir_created != true) { + cout << "dir not created" << endl; + tempDir = getTempDir(); + + dir_created = boost::filesystem::create_directories(tempDir); + } + + + //string indexdescriptor(TPCAS2LINDEXDESCRIPTOR); + //string indexdescriptor(tempDir + "/Tpcas2Lindex.xml"); + string indexdescriptor(tempDir + "/Tpcas2Bib.xml"); + writeToIndexDescriptor(indexpath, indexdescriptor, tempDir); ///write to /run/shm/[tempDir]/Tpcas2Lindex.xml + + directory_iterator end_itr; + for (directory_iterator dit(inputdir); dit != end_itr; dit++) { + if (is_regular_file(dit->status())) { + + // cout << "extension " << dit->symlink_status() << endl; + + // addCasFile(dit->path()); + cout << "file path is " << dit->path() << endl; + //addCasFile(dit->path().string().c_str() ); + addCasFile(dit->path().string().c_str(), indexdescriptor); + + + + } else if (is_directory(dit->status())) { + path subdir(dit->path().string().c_str()); + for (directory_iterator dit2(subdir); dit2 != end_itr; dit2++) { + + if (is_regular_file(dit2->status())) { + + + addCasFile(dit2->path().string().c_str(), indexdescriptor); + + } + + } + + } + + + } + + 
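+    // clean up: remove the generated UIMA descriptor and the scratch
+    // directory under /run/shm before exiting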
boost::filesystem::remove(indexdescriptor); + boost::filesystem::remove(tempDir); + +} diff --git a/getbib/getbib4nxml.cpp b/getbib/getbib4nxml.cpp new file mode 100644 index 0000000..9e7c366 --- /dev/null +++ b/getbib/getbib4nxml.cpp @@ -0,0 +1,126 @@ +/* + * File: main.cpp + * Author: mueller + * + * Created on October 26, 2016, 12:31 PM + */ + +//#include "../TextpressoCentralGlobals.h" +#include "../../TextpressoCentralGlobalDefinitions.h" +#include "xercesc/util/XMLString.hpp" +#include +#include +#include +#include +#include "getbib4nxmlUtils.h" +#include +#include + +#define TPCAS_2_LINDEX_VERSION "0.9.0" + +//using namespace boost::filesystem; + +void print_who() { + std::cout << std::endl << "CAS file bib extracter" << std::endl; + std::cout << "Build Date: " << __DATE__ << std::endl; + std::cout << "Version: " << TPCAS_2_LINDEX_VERSION << std::endl; +} + +void print_help() { + std::cout << std::endl; + std::cout << "Usage: getbib [tpcas_file_directory]" << std::endl; + std::cout << std::endl; +} + +void addCasFile(const char* pszInput, std::string indexdescriptor) { + std::string gzfile(pszInput); + std::cout << gzfile << std::endl; + if (boost::filesystem::path(gzfile).filename().string().find("tpcas") == std::string::npos) + return; + const char * descriptor = indexdescriptor.c_str(); + std::string tpcasfile = uncompressGzip(gzfile); + try { + /* Create/link up to a UIMACPP resource manager instance (singleton) */ + (void) uima::ResourceManager::createInstance("TPCAS2LINDEXAE"); + uima::ErrorInfo errorInfo; + uima::AnalysisEngine * pEngine + = uima::Framework::createAnalysisEngine(descriptor, errorInfo); + if (errorInfo.getErrorId() != UIMA_ERR_NONE) { + std::cerr << std::endl + << " Error string : " + << uima::AnalysisEngine::getErrorIdAsCString(errorInfo.getErrorId()) + << std::endl + << " UIMACPP Error info:" << std::endl + << errorInfo << std::endl; + exit((int) errorInfo.getErrorId()); + } + uima::TyErrorId utErrorId; // Variable to store UIMACPP return codes + /* Get a new CAS */ + uima::CAS* cas = pEngine->newCAS(); + if (cas == NULL) { + std::cerr << "pEngine->newCAS() failed." 
<< std::endl; + exit(1); + } + /* process input / cas */ + try { + /* initialize from an xmicas */ + XMLCh* native = XMLString::transcode(tpcasfile.c_str()); + LocalFileInputSource fileIS(native); + XMLString::release(&native); + uima::XmiDeserializer::deserialize(fileIS, *cas, true); + std::string filename(tpcasfile); + /* process the CAS */ + ((uima::AnalysisEngine*) pEngine)->process(*cas); + } catch (uima::Exception e) { + uima::ErrorInfo errInfo = e.getErrorInfo(); + std::cerr << "Error " << errInfo.getErrorId() << " " << errInfo.getMessage() << std::endl; + std::cerr << errInfo << std::endl; + std::cerr << "Writing default bib file"; + writeDefaultBibFile(gzfile.replace(gzfile.end()-8, gzfile.end(), "bib")); + } + /* call collectionProcessComplete */ + utErrorId = pEngine->collectionProcessComplete(); + /* Free annotator */ + utErrorId = pEngine->destroy(); + delete cas; + delete pEngine; + std::remove(tpcasfile.c_str()); //delete uncompressed temp casfile + } catch (uima::Exception e) { + std::cerr << "Exception: " << e << std::endl; + } +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + print_who(); + print_help(); + return (-1); + } + boost::filesystem::path inputdir(argv[1]); //tpcas file dir + std::string indexpath(""); + std::string tempDir = getTempDir(); + bool dir_created = false; + while (dir_created != true) { + std::cout << "dir not created" << std::endl; + tempDir = getTempDir(); + dir_created = boost::filesystem::create_directories(tempDir); + } + std::string indexdescriptor(tempDir + "/Tpcas2Bib.xml"); + writeToIndexDescriptor(indexpath, indexdescriptor, tempDir); ///write to /run/shm/[tempDir]/Tpcas2Lindex.xml + boost::filesystem::directory_iterator end_itr; + for (boost::filesystem::directory_iterator dit(inputdir); dit != end_itr; dit++) { + if (boost::filesystem::is_regular_file(dit->status())) { + std::cout << "file path is " << dit->path() << std::endl; + addCasFile(dit->path().string().c_str(), indexdescriptor); + } else if (boost::filesystem::is_directory(dit->status())) { + boost::filesystem::path subdir(dit->path().string().c_str()); + for (boost::filesystem::directory_iterator dit2(subdir); dit2 != end_itr; dit2++) { + if (boost::filesystem::is_regular_file(dit2->status())) { + addCasFile(dit2->path().string().c_str(), indexdescriptor); + } + } + } + } + boost::filesystem::remove(indexdescriptor); + boost::filesystem::remove(tempDir); +} diff --git a/getbib/getbib4nxmlUtils.cpp b/getbib/getbib4nxmlUtils.cpp new file mode 100644 index 0000000..d22507c --- /dev/null +++ b/getbib/getbib4nxmlUtils.cpp @@ -0,0 +1,164 @@ +/* + * CAS file utils + * author: liyuling + * Date: Nov, 2013 + */ + +#include "getbib4nxmlUtils.h" + +std::string uncompressGzip(std::string gzFile) { + std::ifstream filein(gzFile.c_str(), std::ios_base::in | std::ios_base::binary); + boost::iostreams::filtering_streambuf in; + in.push(boost::iostreams::gzip_decompressor()); + in.push(filein); + int lastdot = gzFile.find_last_of("."); + int lastslash = gzFile.find_last_of("/"); + std::string tpFile = gzFile.substr(lastslash + 1, lastdot - lastslash - 1); + std::string shm("/run/shm/"); + std::string tempFile = shm + tpFile; + std::ofstream out(tempFile.c_str()); + boost::iostreams::copy(in, out); + out.close(); + return tempFile; +} + +std::string getTempDir() { + boost::posix_time::ptime now = boost::posix_time::microsec_clock::local_time(); + int month = static_cast (now.date().month()); + int year = static_cast (now.date().year()); + int day = static_cast (now.date().day()); + 
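+    // mixing the process id into the time-of-day microsecond count makes the
+    // generated directory name unique enough for concurrent runs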
boost::posix_time::time_duration duration(now.time_of_day()); + long microseconds = duration.total_microseconds(); + long pid = getpid(); + long random = pid + microseconds; + std::stringstream ss; + ss << year << month << day << random; + std::string tempDir = "/run/shm/" + ss.str(); + return tempDir; +} + +void writeToIndexDescriptor(std::string indexpath, std::string descriptor, std::string tempDir) { + std::ofstream output(descriptor.c_str()); + output << "" << std::endl; + output << "" << std::endl; + output << " org.apache.uima.cpp" << std::endl; + output << " true " << std::endl; + output << " Tpcas2Bib4Nxml" << std::endl; + output << " " << std::endl; + output << " Tpcas2Bib4Nxml" << std::endl; + output << " Writes an XCAS to a Lucene index. " << std::endl; + output << " 1.0 " << std::endl; + output << " Textpresso " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " FulltextLuceneIndexDirectory" << std::endl; + output << " Directory path of Lucene index for fulltext. " << std::endl; + output << " String " << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " TokenLuceneIndexDirectory " << std::endl; + output << " Directory path of Lucene index for tokens. " << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " SentenceLuceneIndexDirectory " << std::endl; + output << " Directory path of Lucene index for sentences." << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " LexicalLuceneIndexDirectory " << std::endl; + output << " Directory path of Lucene index for lexical annotations. " << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " BibliographyLuceneIndexDirectory " << std::endl; + output << " Directory path of Lucene index for bibliography annotations. 
" << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " TempDirectory " << std::endl; + output << " temporary directory under /run/shm/ to store newindexflag " << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " FulltextLuceneIndexDirectory" << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/fulltext" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " TokenLuceneIndexDirectory" << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/token" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " SentenceLuceneIndexDirectory " << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/sentence" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " LexicalLuceneIndexDirectory " << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/lexical" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " BibliographyLuceneIndexDirectory " << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/bibliography" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " TempDirectory " << std::endl; + output << " " << std::endl; + output << " " << tempDir << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " x-unspecified" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output.close(); +} + +void writeDefaultBibFile(const std::string &file_path) { + std::ofstream output(file_path); + output << "author|" << std::endl; + output << "accession|" << std::endl; + output << "type|" << std::endl; + output << "title|" << std::endl; + output << "journal|" << std::endl; + output << "citation|" << std::endl; + output << "year|" << std::endl; + output << "abstract|" << std::endl; + output.close(); +} diff --git a/getbib/getbib4nxmlUtils.h b/getbib/getbib4nxmlUtils.h new file mode 100644 index 0000000..92f09c4 --- /dev/null +++ b/getbib/getbib4nxmlUtils.h @@ -0,0 +1,25 @@ +/* + * File: Utils.h + * Author: lyl + * + * Created on November 15, 2013, 3:48 PM + */ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include +#include +#include +#include + +extern std::string uncompressGzip(std::string gzFile); // uncompress gz file +extern std::string getTempDir(); // generate a temp dir under /run/shm to store all temp files for each run. 
using year+month+day+min +extern void writeToIndexDescriptor(std::string indexpath, std::string descriptor, std::string tempDir); //write to Tpcas2index.xml descriptor +void writeDefaultBibFile(const std::string& file_path); + +#endif /* UTILS_H */ + diff --git a/getbib/getbibUtils.cpp b/getbib/getbibUtils.cpp new file mode 100644 index 0000000..2d8b0a8 --- /dev/null +++ b/getbib/getbibUtils.cpp @@ -0,0 +1,196 @@ +/* + * CAS file utils + * author: liyuling + * Date: Nov, 2013 + */ + +#include "getbibUtils.h" + +//const char* newindexflag = "/run/shm/newindexflag"; + +string uncompressGzip(string gzFile) { + // std::cout << "166" << endl; + std::ifstream filein(gzFile.c_str(), std::ios_base::in | std::ios_base::binary); + boost::iostreams::filtering_streambuf in; + in.push(boost::iostreams::gzip_decompressor()); + in.push(filein); + + int lastdot = gzFile.find_last_of("."); + int lastslash = gzFile.find_last_of("/"); + string tpFile = gzFile.substr(lastslash + 1, lastdot - lastslash - 1); + + string shm("/run/shm/"); + string tempFile = shm + tpFile; + // string tempFile = getTempDir() + "/" +tpFile; + // std::cout << "177 " << tempFile << endl; + std::ofstream out(tempFile.c_str()); + boost::iostreams::copy(in, out); + out.close(); + + return tempFile; +} + +string getTempDir() { + // boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); + + boost::posix_time::ptime now = boost::posix_time::microsec_clock::local_time(); + + int month = static_cast (now.date().month()); + int year = static_cast (now.date().year()); + int day = static_cast (now.date().day()); + + + boost::posix_time::time_duration duration(now.time_of_day()); + + long microseconds = duration.total_microseconds(); + + + + long pid = getpid(); + //int second = time(0); + + + + //int random = pid + second; + + long random = pid + microseconds; + + //cout << "r: " << random << endl; + std::stringstream ss; + //ss << year << month << day << minutes; + ss << year << month << day << random; + std::string tempDir = "/run/shm/" + ss.str(); + //cout <<"hello" <" << endl; + output << "" << endl; + output << " org.apache.uima.cpp" << endl; + output << " true " << endl; + //output << " Tpcas2Lpp" << endl; + output << " Tpcas2Bib" << endl; + output << " " << endl; + //output << " Tpcas2Lpp" << endl; + output << " Tpcas2Bib" << endl; + output << " Writes an XCAS to a Lucene index. " << endl; + output << " 1.0 " << endl; + output << " Textpresso " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextLuceneIndexDirectory" << endl; + output << " Directory path of Lucene index for fulltext. " << endl; + output << " String " << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for tokens. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for sentences." << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for lexical annotations. 
" << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for bibliography annotations. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TempDirectory " << endl; + output << " temporary directory under /run/shm/ to store newindexflag " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << indexpath << "/fulltext" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << indexpath << "/token" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << indexpath << "/sentence" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << indexpath << "/lexical" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << indexpath << "/bibliography" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TempDirectory " << endl; + output << " " << endl; + output << " " << tempDir << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " x-unspecified" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + + output.close(); + +} + +/* +void optimizeIndex(string indexpath) +{ + String TokenIndexDir = StringUtils::toString(indexpath.c_str()); + IndexWriterPtr tokenwriter = newLucene (FSDirectory::open(TokenIndexDir), + newLucene (LuceneVersion::LUCENE_CURRENT), false, + IndexWriter::MaxFieldLengthLIMITED); +} + */ \ No newline at end of file diff --git a/getbib/getbibUtils.h b/getbib/getbibUtils.h new file mode 100644 index 0000000..942330f --- /dev/null +++ b/getbib/getbibUtils.h @@ -0,0 +1,32 @@ +/* + * File: Utils.h + * Author: lyl + * + * Created on November 15, 2013, 3:48 PM + */ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace uima; +using namespace Lucene; + + +//extern const char* newindexflag; //new index lock flag +#endif /* UTILS_H */ + + +extern string uncompressGzip(string gzFile); // uncompress gz file +extern string getTempDir(); // generate a temp dir under /run/shm to store all temp files for each run. 
using year+month+day+min +extern void writeToIndexDescriptor(string indexpath, string descriptor, string tempDir); //write to Tpcas2index.xml descriptor +//extern void optimizeIndex(string indexpath); diff --git a/getbibinfoforpdffromserver/download_pdfinfo.pl b/getbibinfoforpdffromserver/download_pdfinfo.pl new file mode 100755 index 0000000..6a62804 --- /dev/null +++ b/getbibinfoforpdffromserver/download_pdfinfo.pl @@ -0,0 +1,100 @@ +#!/usr/bin/perl +# Script downloads the acedumps from postgres and +# deposits them into local files +# +# USAGE: ./01.pl +# +# +# BEGIN PROGRAM +# + +### modules + +use strict; +use HTTP::Request; +use LWP::UserAgent; + +### variables + +# path to outfile + +my $outpath = "$ARGV[0]"; + +my ($dateShort); +$|=1; # forces output buffer to flush after every print statement! + +# backs up previous data files + +&getDate(); + +print "\n\nBacking up last dumps ...."; +my @files = ("$outpath/Paper.dump", "$outpath/LongText.dump"); +for (@files){ + if (-e $_){ + my @args = ("mv", "$_", "$_.$dateShort"); + system(@args) == 0 + or die "system @args failed: $?"; + } +} +print "done.\n"; + +my $outfile1 = "$outpath"."Paper.dump"; +my $outfile2 = "$outpath"."LongText.dump"; + +print "Downloading now .......\n"; +open (OUT1, ">$outfile1") or die "Cannot create $outfile1 : $!"; +open (OUT2, ">$outfile2") or die "Cannot create $outfile2 : $!"; + +# fetch all Paper objects & abstracts +# This has been changed on 2010-06-28 +my $data = getwebpage("http://tazendra.caltech.edu/~postgres/michael/papers.ace"); +my @alllines = split /\n/, $data; +my $flag = 0; +foreach my $line (@alllines) { + if ($line =~ /Longtext \:/) { + $flag = 1; + } + if ($flag) { + # print longtext object + print OUT2 $line, "\n"; + } else { + # print out Paper objects + print OUT1 $line, "\n"; + } +} + +my @aux = $data =~ /Paper \:/g; +print scalar @aux , " paper objects downloaded.\n"; +@aux = $data =~ /\*\*\*LongTextEnd\*\*\*/g; +print scalar @aux , " abstracts downloaded.\n"; +close (OUT1) or die "Cannot close $outfile1 : $!"; +close (OUT2) or die "Cannot close $outfile2 : $!"; + +print "done.\n\n"; + +sub getDate { + + my $time_zone = 0; + my $time = time() + ($time_zone * 3600); + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time); + $year += ($year < 90) ? 2000 : 1900; + $dateShort = sprintf("%04d%02d%02d",$year,$mon+1,$mday); + return $dateShort; + +} + + + +sub getwebpage { + + my $u = shift; + my $page = ""; + + my $ua = LWP::UserAgent->new(timeout => 30); # instantiates a new user agent + my $request = HTTP::Request->new(GET => $u); # grabs url + my $response = $ua->request($request); # checks url, dies if not valid. + print "Error while getting ", $response->request->uri," -- ", $response->status_line, "\nAborting" unless $response-> is_success; + $page = $response->content; #splits by line + return $page; + +} diff --git a/getbibinfoforpdffromserver/extract_pdfbibinfo.pl b/getbibinfoforpdffromserver/extract_pdfbibinfo.pl new file mode 100755 index 0000000..88ad6d7 --- /dev/null +++ b/getbibinfoforpdffromserver/extract_pdfbibinfo.pl @@ -0,0 +1,208 @@ +#!/usr/bin/perl + +use diagnostics; +use strict; +use DBI; + +my $infile = "$ARGV[0]/Paper.dump"; +my $infile2 = "$ARGV[0]/LongText.dump"; +my $outpath = "$ARGV[0]/"; + +my $countentries = 0; +my $dateShort; +my @directories = qw ( + accession + author + abstract + title + journal + citation + year + type + ); + +# back up old and make new directories. 
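+# (note: the tar backup below is commented out, so in practice the old
+# per-field files are deleted rather than archived)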
+ +&getDate(); + +foreach (@directories){ + print "Making $outpath/$_ directory ..... "; + if (-d "$outpath/$_"){ +# my @args = ("tar", "zcf", "$outpath/$_.$dateShort.tgz", "$outpath/$_"); +# system(@args) == 0 or die "system @args failed: $?"; + my @temp = <$outpath/$_/*>; + for (@temp){ + unlink "$_" or warn "Cannot delete $_: $!"; + } + } else { + mkdir "$outpath/$_"; + } + print "done.\n"; +} + +# extracts Title, Author, Citation, Year, Type, Journal from Paper.dump + +open (FILE, "<$infile") || die "Cannot open $infile : $!"; +print "loading $infile ...."; +undef $/; +my $wholefile = ; +$/ = "\n"; +close (FILE) or die "Cannot close $infile : $!"; +print "done.\n"; +my @sections = split (/\n\n/, $wholefile); +my $count = scalar(@sections); + +# extracts abstracts from LongText.dump + +my $filename2 = ""; +open (FILE2, "<$infile2") || die "Cannot open $infile2 : $!"; +print "loading $infile2 ...."; +undef $/; +my $wholefile2 = ; +$/ = "\n"; +close (FILE2) or die "Cannot close $infile2 : $!"; +print "done.\n"; + +(my @array2) = split(/\*\*\*LongTextEnd\*\*\*\n/, $wholefile2); +my $count2 = scalar(@array2); +my %abstracttexts = (); +foreach my $entry (@array2) { + (my $id) = $entry =~ /Longtext[ \t]\:[ \t]\"(.+?)\"/; + (my $text) = $entry =~ /\"$id\"\n\n(.+?)\n\n/; + $abstracttexts{$id} = $text; +} + + + +my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb;host=131.215.52.76", "acedb", "") or die "Cannot connect to database!\n"; +my $result = $dbh->prepare( "SELECT * FROM pap_curation_flags WHERE pap_curation_flags = 'non_nematode'"); +$result->execute() or die "Cannot prepare statement: $DBI::errstr\n"; +my %non_nematode = (); +while (my @row = $result->fetchrow) { + if ($row[0]) { + my $jk = shift (@row); + $non_nematode{"WBPaper$jk"} = 1; + } +} +$dbh->disconnect; + +foreach my $s (@sections){ + +# + next if (($s !~ /(^|\n)Paper/) || ($s !~ /\nAuthor/) || ($s !~ /\nTitle/)); +# + + (my $filename) = $s =~ /Paper \:[ \t]+\"(WBPaper\d{8})\"/; +# + next if ($non_nematode{$filename}); +# + (my @other_names) = $s =~ /\nName[ \t]+\"(.+?)\"/g; + (my $pmid_name) = $s =~ /\nDatabase[ \t]+\"MEDLINE\"[ \t]+\"PMID\"[ \t]+\"(\d+)\"/; + my @authors = $s =~ /\nAuthor[ \t]+\"(.+?)\"/g; + (my $aux) = $s =~ /\nVolume[ \t]+(.+)\n/; + my @volumes = $aux =~ /\"(.+?)\"/g; + ($aux) = $s =~ /\nPage[ \t]+(.+)\n/; + my @pages = $aux =~ /\"(.+?)\"/g; + my $journal = ''; + ($aux) = $s =~ /\nJournal[ \t]+\"(.+?)\"/; + $journal .= $aux; + ($aux) = $s =~ /\nTitle[ \t]+(\".+?\n)/; + (my $title) = $aux =~ /^\"(.+)\"/; + (my $type) = $s =~ /\nType[ \t]+\"(.+?)\"/; + (my $year) = $s =~ /\nPublication_date[ \t]+\"([\-\d]+)\"/; + (my $absid) = $s =~ /\nAbstract[ \t]+\"(.+?)\"/; + + $countentries++; + + my $acc = ''; + $acc .= "Other:" . "@other_names" . "\n" if (@other_names); + $acc =~ s/(doi|DOI|Doi)/$1:/g; + $acc .= "PMID:$pmid_name\n" if ($pmid_name ne ''); + +# need to remove if-loop and add empty line +# so an empty accession file can be written. 
+# this is necessary so pdf without any accession +# can be downloaded + + $acc .= "\n" if ($acc eq ''); +# if ($acc ne '') { + open (OUT, ">$outpath/accession/$filename") or die "Cannot open $outpath/Accession/$filename : $!"; + print OUT "$acc"; + close (OUT) or die "Cannot close $outpath/Accession/$filename : $!"; +# } +# + if (@authors) { + # take care of new format in author section (repetition of author lines) + my %seen = (); + my @aux = (); + foreach (@authors) { + if (!$seen{$_}) { + $seen{$_} = 1; + push @aux, $_; + } + } + open (OUT, ">$outpath/author/$filename") or die "Cannot open $outpath/Author/$filename : $!"; + print OUT join(" ; \n", @aux); + close (OUT) or die "Cannot close $outpath/Author/$filename: $!"; + } +# + if ((@volumes) || (@pages)) { + open (OUT, ">$outpath/citation/$filename") or die "Cannot open $outpath/Citation/$filename : $!"; + print OUT "V: ", join(" ", @volumes), "\n" if (@volumes); + print OUT "P: ", join(" ", @pages), "\n" if (@pages); + close (OUT) or die "Cannot close $outpath/Citation/$filename : $!"; + } +# + if ($journal ne '') { + open (OUT, ">$outpath/journal/$filename") or die "Cannot open $\outpath/Journal/$filename : $!"; + print OUT "$journal\n"; + close (OUT) or die "Cannot close $outpath/Journal/$filename : $\!"; + } +# + if ($title ne '') { + open (OUT, ">$outpath/title/$filename") or die "Cannot open $outpath/Title/$filename : $!"; + print OUT "$title\n"; + close (OUT) or die "Cannot close $outpath/Title/$filename : $!"; + } +# + if ($type ne '') { + open (OUT, ">$outpath/type/$filename") or die "Cannot open $outpath/Type/$filename : $!"; + print OUT "$type\n"; + close (OUT) or die "Cannot close $outpath/Type/$filename : $!"; + } +# + if ($year ne '') { + open (OUT, ">$outpath/year/$filename") or die "Cannot open $outpath/Year/$filename : $!"; + print OUT "$year\n"; + close (OUT) or die "Cannot close $outpath/year/$filename : $!"; + } +# + if ($abstracttexts{$absid}) { + open (OUT, ">$outpath/abstract/$filename") or die "Cannot open $outpath/$filename : $!"; + print OUT "$abstracttexts{$absid}"; + close (OUT) or die "Cannot close $outpath/$filename : $!"; + } +# +} + +print "\n\n#########################################"; +print "\nThere are $count citations total and\n"; +print "$countentries were complete enough to be usable.\n"; +print "$count2 abstracts were extracted.\n"; +print "\n\n"; +for (@directories){ + my @cnt = <$outpath/$_/*>; + my $cnt = scalar(@cnt); + print "$_ has $cnt files\n"; +} + +print "\n\n##########################################\n"; + +sub getDate{ + my $time_zone = 0; + my $time = time() + ($time_zone * 3600); + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time); + $year += ($year < 90) ? 
2000 : 1900; + $dateShort = sprintf("%04d%02d%02d",$year,$mon+1,$mday); + return $dateShort; +} diff --git a/getpdfs/getpdfs.py b/getpdfs/getpdfs.py new file mode 100755 index 0000000..b2dc654 --- /dev/null +++ b/getpdfs/getpdfs.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 + +"""Copy pdf files from server and map file names""" +import hashlib +import logging +import shutil +import urllib.request +import urllib.error +import urllib.parse +import re +import os +import argparse +import glob +import psycopg2 + +__author__ = "Valerio Arnaboldi" + +__version__ = "1.0.1" + + +def main(): + parser = argparse.ArgumentParser(description="Download pdf files from Tazendra server and store them in a local " + "directory, after applying name conversion") + parser.add_argument("-d", "--delete-old", dest="delete_old", action="store_true", + help="delete old files before downloading the new ones") + parser.add_argument("-l", "--log-file", metavar="log_file", dest="log_file", default="info.log", type=str, + help="path to log file") + parser.add_argument("-L", "--log-level", metavar="log_level", dest="log_level", default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="log level") + parser.add_argument("out_dir", metavar="out_dir", type=str, help="output directory") + parser.add_argument("xml_dir", metavar="xml_dir", type=str, help="do not download pdfs whose PMID is already " + "present in the provided xml director") + args = parser.parse_args() + logging.basicConfig(filename=args.log_file, level=getattr(logging, args.log_level.upper())) + if args.delete_old: + shutil.rmtree(args.out_dir) + try: + os.makedirs(os.path.join(args.out_dir, "C. elegans")) + os.makedirs(os.path.join(args.out_dir, "C. elegans Supplementals")) + except FileExistsError: + logging.warning("Directories already exist") + non_nematode_papers = set() + conn = psycopg2.connect("dbname='testdb' user='acedb' host='131.215.52.76'") + cur = conn.cursor() + cur.execute("""SELECT * FROM pap_curation_flags WHERE pap_curation_flags = 'non_nematode'""") + rows = cur.fetchall() + for row in rows: + non_nematode_papers.add("WBPaper" + row[0]) + + # read papers mapping + id = None + papers_cgc_map = {} + papers_pubmed_map = {} + existing_xml_pmids = set([f for f in os.listdir(args.xml_dir) if os.path.isdir(os.path.join(args.xml_dir, f))]) + wb_2_pmid = {} + for line in urllib.request.urlopen("http://tazendra.caltech.edu/~postgres/michael/papers.ace"): + line = line.decode('utf-8') + linearr = line.strip().split() + if len(linearr) > 1: + if linearr[0] == "Paper": + id = linearr[2][1:len(linearr[2])-1] + elif linearr[0] == "Name" and linearr[1].startswith("\"cgc"): + papers_cgc_map[linearr[1][4:len(linearr[1])-1]] = id + elif len(linearr) >= 4 and linearr[0] == "Database" and linearr[2] == "\"PMID\"": + papers_pubmed_map[linearr[3][1:len(linearr[3])-1]] = id + wb_2_pmid[id] = linearr[3][1:len(linearr[3])-1] + + # read papers list and map them + p = re.compile('href="(.*)"') + + pdflink = "" + all_wbpapers = set() + files_to_download = {} + for line in urllib.request.urlopen("http://tazendra.caltech.edu/~azurebrd/cgi-bin/allpdfs.cgi?action=textpresso"): + try: + line = line.decode('utf-8') + linearr = line.strip().split() + if len(linearr) > 1: + filetype = linearr[0] + namescheme = linearr[1] + pdflink = p.findall(" ".join(linearr[3:]))[0] + if pdflink.lower().endswith(".pdf"): + if filetype == "supplemental": + pdfname = re.split("_|-", pdflink.split("/")[-2])[0] + else: + pdfname = re.split("_|-", pdflink.split("/")[-1])[0] 
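+                    # map the raw name to a WBPaper ID according to the naming
+                    # scheme (wb, cgc or pubmed); papers whose PMID already has
+                    # an XML version are skipped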
+ subdir = "C. elegans" + if namescheme == "wb": + pdfname = "WBPaper" + str(pdfname) + if pdfname in wb_2_pmid and wb_2_pmid[pdfname] in existing_xml_pmids \ + and filetype != "supplemental": + continue + elif namescheme == "cgc": + if str(pdfname).lstrip("0") in papers_cgc_map: + pdfname = papers_cgc_map[str(pdfname).lstrip("0")] + if pdfname in wb_2_pmid and wb_2_pmid[pdfname] in existing_xml_pmids \ + and filetype != "supplemental": + continue + else: + continue + elif namescheme == "pubmed": + if str(pdfname).lstrip("0") in papers_pubmed_map: + pdfname = papers_pubmed_map[str(pdfname).lstrip("0")] + if str(pdfname).lstrip("0") in existing_xml_pmids: + continue + else: + continue + subdir = "C. elegans" + if pdfname in non_nematode_papers: + continue + if filetype == "supplemental": + subdir = "C. elegans Supplementals" + pdfname += ".sup." + skip_file = False + simfiles = glob.glob(os.path.join(args.out_dir, subdir, pdfname, pdfname) + "*.pdf") + for simfile_name in simfiles: + if hashlib.md5(urllib.request.urlopen(pdflink).read()).digest() == \ + hashlib.md5(open(simfile_name, "rb").read()).digest(): + skip_file = True + break + if skip_file: + continue + sup_num = len(simfiles) + 1 + while pdfname + str(sup_num) in files_to_download: + all_wbpapers.add(pdfname + str(sup_num)) + sup_num += 1 + all_wbpapers.add(pdfname + str(sup_num)) + pdfname += str(sup_num) + else: + all_wbpapers.add(pdfname) + if pdflink.lower().endswith("_temp.pdf") and pdfname in files_to_download or \ + pdflink.lower().endswith("_ocr.pdf") and pdfname in files_to_download: + continue + logging.info("Downloading paper: " + pdflink + " to " + os.path.join(args.out_dir, subdir, + pdfname, pdfname + ".pdf")) + if pdfname in files_to_download: + link_re = re.search("[0-9]+[\_\-][^\d]+([0-9]+)", pdflink.replace(" ", "")) + link_num = 0 + if link_re is not None: + link_num = int(link_re.group(1)) + stored_re = re.search("[0-9]+[\_\-][^\d]+([0-9]+)", + files_to_download[pdfname][0].replace("%20", "")) + stored_num = 0 + if stored_re is not None: + stored_num = int(stored_re.group(1)) + if link_num <= stored_num: + continue + files_to_download[pdfname] = (pdflink.replace(" ", "%20"), os.path.join(args.out_dir, subdir, + pdfname, pdfname + ".pdf")) + else: + logging.warning("Skipping file: " + pdflink) + except UnicodeDecodeError: + pass + + for pdflink, file_path in files_to_download.values(): + try: + # check if best file selected for download is already present in the dest dir + if not args.delete_old and len(glob.glob(file_path)) > 0 and \ + hashlib.md5(urllib.request.urlopen(pdflink).read()).digest() == \ + hashlib.md5(open(file_path, "rb").read()).digest(): + logging.info("File already present in collection, skipping " + pdflink) + continue + os.makedirs(os.path.dirname(file_path)) + urllib.request.urlretrieve(pdflink, file_path) + except urllib.error.HTTPError: + logging.error("Paper not found: " + pdflink) + continue + + # delete local files that have been removed from server + local_files = set(os.listdir(os.path.join(args.out_dir, "C. elegans"))) + for file_to_remove in local_files.difference(all_wbpapers): + shutil.rmtree(os.path.join("C. elegans", file_to_remove)) + local_files = set(os.listdir(os.path.join(args.out_dir, "C. elegans Supplementals"))) + for file_to_remove in local_files.difference(all_wbpapers): + shutil.rmtree(os.path.join("C. 
elegans Supplementals", file_to_remove)) + +if __name__ == '__main__': + main() diff --git a/lucene/CaseSensitiveAnalyzer.cpp b/lucene/CaseSensitiveAnalyzer.cpp new file mode 100644 index 0000000..5a5cc0e --- /dev/null +++ b/lucene/CaseSensitiveAnalyzer.cpp @@ -0,0 +1,84 @@ +/** + Project: textpressocentral + File name: CaseSensitiveAnalyzer.cpp + + @author valerio + @version 1.0 6/9/17. +*/ + +#include "CaseSensitiveAnalyzer.h" +#include +#include + +using namespace Lucene; + +DECLARE_SHARED_PTR(CaseSensitiveAnalyzer); + +/// Construct an analyzer with the given stop words. +const int32_t CaseSensitiveAnalyzer::DEFAULT_MAX_TOKEN_LENGTH = 255; + +CaseSensitiveAnalyzer::CaseSensitiveAnalyzer(LuceneVersion::Version matchVersion) { + ConstructAnalyser(matchVersion, StopAnalyzer::ENGLISH_STOP_WORDS_SET()); +} + +CaseSensitiveAnalyzer::CaseSensitiveAnalyzer(LuceneVersion::Version matchVersion, HashSet stopWords) { + ConstructAnalyser(matchVersion, stopWords); +} + +CaseSensitiveAnalyzer::CaseSensitiveAnalyzer(LuceneVersion::Version matchVersion, const String& stopwords) { + ConstructAnalyser(matchVersion, WordlistLoader::getWordSet(stopwords)); +} + +CaseSensitiveAnalyzer::CaseSensitiveAnalyzer(LuceneVersion::Version matchVersion, const ReaderPtr& stopwords) { + ConstructAnalyser(matchVersion, WordlistLoader::getWordSet(stopwords)); +} + +CaseSensitiveAnalyzer::~CaseSensitiveAnalyzer() { +} + +void CaseSensitiveAnalyzer::ConstructAnalyser(LuceneVersion::Version matchVersion, HashSet stopWords) { + stopSet = stopWords; + enableStopPositionIncrements = StopFilter::getEnablePositionIncrementsVersionDefault(matchVersion); + replaceInvalidAcronym = LuceneVersion::onOrAfter(matchVersion, LuceneVersion::LUCENE_24); + this->matchVersion = matchVersion; + this->maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; +} + +TokenStreamPtr CaseSensitiveAnalyzer::tokenStream(const String& fieldName, const ReaderPtr& reader) { + StandardTokenizerPtr tokenStream(newLucene(matchVersion, reader)); + tokenStream->setMaxTokenLength(maxTokenLength); + TokenStreamPtr result(newLucene(tokenStream)); + //result = newLucene(result); + result = newLucene(enableStopPositionIncrements, result, stopSet); + return result; +} + +void CaseSensitiveAnalyzer::setMaxTokenLength(int32_t length) { + maxTokenLength = length; +} + +int32_t CaseSensitiveAnalyzer::getMaxTokenLength() { + return maxTokenLength; +} +DECLARE_SHARED_PTR(CaseSensitiveAnalyzerSavedStreams); +TokenStreamPtr CaseSensitiveAnalyzer::reusableTokenStream(const String& fieldName, const ReaderPtr& reader) { + CaseSensitiveAnalyzerSavedStreamsPtr streams = boost::dynamic_pointer_cast(getPreviousTokenStream()); + if (!streams) { + streams = newLucene(); + setPreviousTokenStream(streams); + streams->tokenStream = newLucene(matchVersion, reader); + streams->filteredTokenStream = newLucene(streams->tokenStream); + //streams->filteredTokenStream = newLucene(streams->filteredTokenStream); + streams->filteredTokenStream = newLucene(enableStopPositionIncrements, streams->filteredTokenStream, stopSet); + } else { + streams->tokenStream->reset(reader); + } + streams->tokenStream->setMaxTokenLength(maxTokenLength); + + streams->tokenStream->setReplaceInvalidAcronym(replaceInvalidAcronym); + + return streams->filteredTokenStream; +} + +CaseSensitiveAnalyzerSavedStreams::~CaseSensitiveAnalyzerSavedStreams() { +} \ No newline at end of file diff --git a/lucene/CaseSensitiveAnalyzer.h b/lucene/CaseSensitiveAnalyzer.h new file mode 100644 index 0000000..532d457 --- /dev/null +++ 
b/lucene/CaseSensitiveAnalyzer.h @@ -0,0 +1,58 @@ +/** + Project: textpressocentral + File name: CaseSensitiveAnalyzer.h + + @author valerio + @version 1.0 6/9/17. +*/ + +#ifndef TEXTPRESSOCENTRAL_CASESENSITIVEANALYZER_H +#define TEXTPRESSOCENTRAL_CASESENSITIVEANALYZER_H + +//#include +#include + +using namespace Lucene; + +class CaseSensitiveAnalyzerSavedStreams : public LuceneObject { +public: + virtual ~CaseSensitiveAnalyzerSavedStreams(); + LUCENE_CLASS(CaseSensitiveAnalyzerSavedStreams); + +public: + StandardTokenizerPtr tokenStream; + TokenStreamPtr filteredTokenStream; +}; + +class CaseSensitiveAnalyzer: public Analyzer { + +public: + CaseSensitiveAnalyzer(Lucene::LuceneVersion::Version matchVersion); + CaseSensitiveAnalyzer(Lucene::LuceneVersion::Version matchVersion, Lucene::HashSet stopWords); + CaseSensitiveAnalyzer(Lucene::LuceneVersion::Version matchVersion, const Lucene::String &stopwords); + CaseSensitiveAnalyzer(Lucene::LuceneVersion::Version matchVersion, const Lucene::ReaderPtr &stopwords); + virtual ~CaseSensitiveAnalyzer(); + + LUCENE_CLASS(CaseSensitiveAnalyzer); + +public: + static const int32_t DEFAULT_MAX_TOKEN_LENGTH; + +protected: + Lucene::HashSet stopSet; + + bool replaceInvalidAcronym; + bool enableStopPositionIncrements; + Lucene::LuceneVersion::Version matchVersion; + int32_t maxTokenLength; + +public: + virtual Lucene::TokenStreamPtr tokenStream(const Lucene::String &fieldName, const Lucene::ReaderPtr &reader); + void setMaxTokenLength(int32_t length); + int32_t getMaxTokenLength(); + virtual Lucene::TokenStreamPtr reusableTokenStream(const Lucene::String &fieldName, const Lucene::ReaderPtr &reader); + void ConstructAnalyser(Lucene::LuceneVersion::Version matchVersion, Lucene::HashSet stopWords); +}; + + +#endif //TEXTPRESSOCENTRAL_CASESENSITIVEANALYZER_H diff --git a/lucene/LazySelector.h b/lucene/LazySelector.h new file mode 100644 index 0000000..9eb6181 --- /dev/null +++ b/lucene/LazySelector.h @@ -0,0 +1,36 @@ +/** + Project: textpressocentral + File name: LazySelector.h + + @author valerio + @version 1.0 6/10/17. +*/ + +#ifndef TEXTPRESSOCENTRAL_LAZYSELECTOR_H +#define TEXTPRESSOCENTRAL_LAZYSELECTOR_H + +#include + +DECLARE_SHARED_PTR(LazySelector); +class LazySelector : public FieldSelector { +public: + LazySelector(const String& magicField) { + this->magicField = magicField; + } + virtual ~LazySelector() { + } + LUCENE_CLASS(LazySelector); +protected: + String magicField; + +public: + virtual FieldSelectorResult accept(const String& fieldName) { + if (fieldName == magicField) { + return FieldSelector::SELECTOR_LOAD; + } else { + return FieldSelector::SELECTOR_NO_LOAD; + } + } +}; + +#endif //TEXTPRESSOCENTRAL_LAZYSELECTOR_H diff --git a/ppm2jpg/main.cpp b/ppm2jpg/main.cpp new file mode 100644 index 0000000..ff34c0d --- /dev/null +++ b/ppm2jpg/main.cpp @@ -0,0 +1,97 @@ +/** + Project: textpressocentral + File name: main.cpp + + @author valerio + @version 1.0 6/5/17. 
+*/ + +#include +#include +#include +#include +#include + +namespace po = boost::program_options; +namespace fs = boost::filesystem; +using namespace std; +using namespace cimg_library; + +void convertFile(const string& inputFileName, bool remove) { + fs::path inputFilePath(inputFileName); + fs::path outputFilePath = inputFilePath; + outputFilePath.replace_extension(fs::path("jpg")); + try { + CImg image(inputFilePath.string().c_str()); + image.save(outputFilePath.string().c_str()); + if (remove) { + fs::remove(inputFilePath); + } + } catch (Magick::ErrorCorruptImage) { + cout << "cannot convert corrupted file " << inputFileName << endl; + } +} + +int main(int argc, char* argv[]) { + + po::options_description desc("options"); + po::positional_options_description p; + po::variables_map vm; + + bool remove = false; + bool recursive = false; + string startDir; + string inputFileName; + + try { + + desc.add_options() + ("help,h", "produce help message") + ("input-file,i", po::value(&inputFileName)->required(), "input file or directory") + ("delete,d", "delete original ppm files") + ("recursive,r", "apply conversion recursively"); + + + p.add("input-file", -1); + po::store(po::command_line_parser(argc, argv). + options(desc).positional(p).run(), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << endl; + return 1; + } + + if (vm.count("delete")) { + remove = true; + } + + if (vm.count("recursive")) { + recursive = true; + startDir = inputFileName; + } + } catch(std::exception& e) { + if (vm.count("help")) { + cout << desc << endl; + return (EXIT_SUCCESS); + } + std::cerr << "Error: " << e.what() << "\n"; + return (EXIT_FAILURE); + } + + if (recursive) { + fs::recursive_directory_iterator dir_end; + fs::recursive_directory_iterator dir(startDir); + while (dir != dir_end) { + fs::path _path(*dir); + ++dir; + if (!fs::is_directory(_path) && _path.extension().string() == ".ppm") { + convertFile(_path.string(), remove); + } + } + } else { + convertFile(inputFileName, remove); + } + + return (EXIT_SUCCESS); +} \ No newline at end of file diff --git a/ppm2jpg/ppm2jpgCas.sh b/ppm2jpg/ppm2jpgCas.sh new file mode 100755 index 0000000..8d57e56 --- /dev/null +++ b/ppm2jpg/ppm2jpgCas.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +##### simple script to change suffix .ppm to .jpg in compressed cas files and delete original .ppm images ##### + +function fixcas() { + root_dir=$1 + find "${root_dir}" -name *.tpcas.gz | xargs -n 1 -P $2 -I {} sudo bash -c "zcat '{}' | sed 's/\.ppm/.jpg/g' | gzip > '{}'.tmp; mv '{}'.tmp '{}'" +} + +function remppm() { + root_dir=$1 + find "${root_dir}" -name *.ppm | xargs -I {} sudo bash -c "rm '{}'" +} + +function usage { + echo "usage: $(basename $0) [-fdh] " + echo " -f --fix-cas fix cas files in recursively by changing substituting .ppm by .jpg suffix" + echo " -d --delete-ppm delete .ppm images in recursively" + echo " -p --num-proc maximum number of parallel processes" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 2 ]] +then + usage +fi + +FIX_CAS=false +REM_PPM=false +ROOT_DIR="" +N_PROC=1 + +while [[ $# -gt 0 ]] +do +key=$1 + +case $key in + -f|--fix-cas) + FIX_CAS=true + shift # past argument + ;; + -d|--delete-ppm) + REM_PPM=true + shift # past argument + ;; + -p|--num-proc) + shift + N_PROC=$1 + shift + ;; + -h|--help) + usage + ;; + *) + if [[ -d $key ]] + then + ROOT_DIR="$key" + shift + else + usage + fi + ;; +esac +done + +# check for the required argument ROOT_DIR +if [[ ${ROOT_DIR} == "" ]] +then + usage +fi + +if [ ${FIX_CAS} = 
true ] +then + fixcas "${ROOT_DIR}" ${N_PROC} +fi +if [ ${REM_PPM} = true ] +then + remppm "${ROOT_DIR}" +fi + +exit 0 \ No newline at end of file diff --git a/printbibfromnxmlorcasfile/.dep.inc b/printbibfromnxmlorcasfile/.dep.inc new file mode 100644 index 0000000..4560e55 --- /dev/null +++ b/printbibfromnxmlorcasfile/.dep.inc @@ -0,0 +1,5 @@ +# This code depends on make tool being used +DEPFILES=$(wildcard $(addsuffix .d, ${OBJECTFILES})) +ifneq (${DEPFILES},) +include ${DEPFILES} +endif diff --git a/printbibfromnxmlorcasfile/Makefile b/printbibfromnxmlorcasfile/Makefile new file mode 100644 index 0000000..05de621 --- /dev/null +++ b/printbibfromnxmlorcasfile/Makefile @@ -0,0 +1,128 @@ +# +# There exist several targets which are by default empty and which can be +# used for execution of your targets. These targets are usually executed +# before and after some main targets. They are: +# +# .build-pre: called before 'build' target +# .build-post: called after 'build' target +# .clean-pre: called before 'clean' target +# .clean-post: called after 'clean' target +# .clobber-pre: called before 'clobber' target +# .clobber-post: called after 'clobber' target +# .all-pre: called before 'all' target +# .all-post: called after 'all' target +# .help-pre: called before 'help' target +# .help-post: called after 'help' target +# +# Targets beginning with '.' are not intended to be called on their own. +# +# Main targets can be executed directly, and they are: +# +# build build a specific configuration +# clean remove built files from a configuration +# clobber remove all built files +# all build all configurations +# help print help mesage +# +# Targets .build-impl, .clean-impl, .clobber-impl, .all-impl, and +# .help-impl are implemented in nbproject/makefile-impl.mk. +# +# Available make variables: +# +# CND_BASEDIR base directory for relative paths +# CND_DISTDIR default top distribution directory (build artifacts) +# CND_BUILDDIR default top build directory (object files, ...) +# CONF name of current configuration +# CND_PLATFORM_${CONF} platform name (current configuration) +# CND_ARTIFACT_DIR_${CONF} directory of build artifact (current configuration) +# CND_ARTIFACT_NAME_${CONF} name of build artifact (current configuration) +# CND_ARTIFACT_PATH_${CONF} path to build artifact (current configuration) +# CND_PACKAGE_DIR_${CONF} directory of package (current configuration) +# CND_PACKAGE_NAME_${CONF} name of package (current configuration) +# CND_PACKAGE_PATH_${CONF} path to package (current configuration) +# +# NOCDDL + + +# Environment +MKDIR=mkdir +CP=cp +CCADMIN=CCadmin + + +# build +build: .build-post + +.build-pre: +# Add your pre 'build' code here... + +.build-post: .build-impl +# Add your post 'build' code here... + + +# clean +clean: .clean-post + +.clean-pre: +# Add your pre 'clean' code here... + +.clean-post: .clean-impl +# Add your post 'clean' code here... + + +# clobber +clobber: .clobber-post + +.clobber-pre: +# Add your pre 'clobber' code here... + +.clobber-post: .clobber-impl +# Add your post 'clobber' code here... + + +# all +all: .all-post + +.all-pre: +# Add your pre 'all' code here... + +.all-post: .all-impl +# Add your post 'all' code here... + + +# build tests +build-tests: .build-tests-post + +.build-tests-pre: +# Add your pre 'build-tests' code here... + +.build-tests-post: .build-tests-impl +# Add your post 'build-tests' code here... + + +# run tests +test: .test-post + +.test-pre: build-tests +# Add your pre 'test' code here... 
+
+.test-post: .test-impl
+# Add your post 'test' code here...
+
+
+# help
+help: .help-post
+
+.help-pre:
+# Add your pre 'help' code here...
+
+.help-post: .help-impl
+# Add your post 'help' code here...
+
+
+
+# include project implementation makefile
+include nbproject/Makefile-impl.mk
+
+# include project make variables
+include nbproject/Makefile-variables.mk
diff --git a/printbibfromnxmlorcasfile/cmdline.h b/printbibfromnxmlorcasfile/cmdline.h
new file mode 100644
index 0000000..f142850
--- /dev/null
+++ b/printbibfromnxmlorcasfile/cmdline.h
@@ -0,0 +1,773 @@
+/*
+Copyright (c) 2009, Hideyuki Tanaka
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY <copyright holder> ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <string>
+#include <stdexcept>
+#include <typeinfo>
+#include <cstring>
+#include <algorithm>
+#include <cxxabi.h>
+#include <cstdlib>
+
+namespace cmdline{
+
+namespace detail{
+
+template <typename Target, typename Source, bool Same>
+class lexical_cast_t{
+public:
+  static Target cast(const Source &arg){
+    Target ret;
+    std::stringstream ss;
+    if (!(ss<<arg && ss>>ret && ss.eof()))
+      throw std::bad_cast();
+
+    return ret;
+  }
+};
+
+template <typename Target, typename Source>
+class lexical_cast_t<Target, Source, true>{
+public:
+  static Target cast(const Source &arg){
+    return arg;
+  }
+};
+
+template <typename Source>
+class lexical_cast_t<std::string, Source, false>{
+public:
+  static std::string cast(const Source &arg){
+    std::ostringstream ss;
+    ss<<arg;
+    return ss.str();
+  }
+};
+
+template <typename Target>
+class lexical_cast_t<Target, std::string, false>{
+public:
+  static Target cast(const std::string &arg){
+    Target ret;
+    std::istringstream ss(arg);
+    if (!(ss>>ret && ss.eof()))
+      throw std::bad_cast();
+    return ret;
+  }
+};
+
+template <typename T1, typename T2>
+struct is_same {
+  static const bool value = false;
+};
+
+template <typename T>
+struct is_same<T, T>{
+  static const bool value = true;
+};
+
+template<typename Target, typename Source>
+Target lexical_cast(const Source &arg)
+{
+  return lexical_cast_t<Target, Source, detail::is_same<Target, Source>::value>::cast(arg);
+}
+
+static inline std::string demangle(const std::string &name)
+{
+  int status=0;
+  char *p=abi::__cxa_demangle(name.c_str(), 0, 0, &status);
+  std::string ret(p);
+  free(p);
+  return ret;
+}
+
+template <class T>
+std::string readable_typename()
+{
+  return demangle(typeid(T).name());
+}
+
+template <>
+std::string readable_typename<std::string>()
+{
+  return "string";
+}
+
+} // detail
+
+//-----
+
+class cmdline_error : public std::exception {
+public:
+  cmdline_error(const std::string &msg): msg(msg){}
+  ~cmdline_error() throw() {}
+  const char *what() const throw() { return msg.c_str(); }
+private:
+  std::string msg;
+};
+
+template <class T>
+struct default_reader{
+  T operator()(const std::string &str){
+    return detail::lexical_cast<T>(str);
+  }
+};
+
+template <class T>
+struct range_reader{
+  range_reader(const T &low, const T &high): low(low), high(high) {}
+  T operator()(const std::string &s) const {
+    T ret=default_reader<T>()(s);
+    if (!(ret>=low && ret<=high)) throw cmdline::cmdline_error("range_error");
+    return ret;
+  }
+private:
+  T low, high;
+};
+
+template <class T>
+range_reader<T> range(const T &low, const T &high)
+{
+  return range_reader<T>(low, high);
+}
+
+template <class T>
+struct oneof_reader{
+  T operator()(const std::string &s){
+    T ret=default_reader<T>()(s);
+    if (std::find(alt.begin(), alt.end(), s)==alt.end())
+      throw cmdline_error("");
+    return ret;
+  }
+  void add(const T &v){ alt.push_back(v); }
+private:
+  std::vector<T> alt;
+};
+
+template <class T>
+oneof_reader<T> oneof(T a1)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  ret.add(a9);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  ret.add(a9);
+  ret.add(a10);
+  return ret;
+}
+
+//-----
+
+class parser{
+public:
+  parser(){
+  }
+  ~parser(){
+    for (std::map<std::string, option_base*>::iterator p=options.begin();
+         p!=options.end(); p++)
+      delete p->second;
+  }
+
+  void add(const std::string &name,
+           char short_name=0,
+           const std::string &desc=""){
+    if (options.count(name)) throw cmdline_error("multiple definition: "+name);
+    options[name]=new option_without_value(name, short_name, desc);
+    ordered.push_back(options[name]);
+  }
+
+  template <class T>
+  void add(const std::string &name,
+           char short_name=0,
+           const std::string &desc="",
+           bool need=true,
+           const T def=T()){
+    add(name, short_name, desc, need, def, default_reader<T>());
+  }
+
+  template <class T, class F>
+  void add(const std::string &name,
+           char short_name=0,
+           const std::string &desc="",
+           bool need=true,
+           const T def=T(),
+           F reader=F()){
+    if (options.count(name)) throw cmdline_error("multiple definition: "+name);
+    options[name]=new option_with_value_with_reader<T, F>(name, short_name, need, def, desc, reader);
+    ordered.push_back(options[name]);
+  }
+
+  void footer(const std::string &f){
+    ftr=f;
+  }
+
+  void set_program_name(const std::string &name){
+    prog_name=name;
+  }
+
+  bool exist(const std::string &name) const {
+    if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name);
+    return options.find(name)->second->has_set();
+  }
+
+  template <class T>
+  const T &get(const std::string &name) const {
+    if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name);
+    const option_with_value<T> *p=dynamic_cast<const option_with_value<T>*>(options.find(name)->second);
+    if (p==NULL) throw cmdline_error("type mismatch flag '"+name+"'");
+    return p->get();
+  }
+
+  const std::vector<std::string> &rest() const {
+    return others;
+  }
+
+  bool parse(const std::string &arg){
+    std::vector<std::string> args;
+
+    std::string buf;
+    bool in_quote=false;
+    for (std::string::size_type i=0; i<arg.length(); i++){
+      if (arg[i]=='\"'){
+        in_quote=!in_quote;
+        continue;
+      }
+
+      if (arg[i]==' ' && !in_quote){
+        args.push_back(buf);
+        buf="";
+        continue;
+      }
+
+      if (arg[i]=='\\'){
+        i++;
+        if (i>=arg.length()){
+          errors.push_back("unexpected occurrence of '\\' at end of string");
+          return false;
+        }
+      }
+
+      buf+=arg[i];
+    }
+
+    if (in_quote){
+      errors.push_back("quote is not closed");
+      return false;
+    }
+
+    if (buf.length()>0)
+      args.push_back(buf);
+
+    for (size_t i=0; i<args.size(); i++)
+      std::cout<<"\""<<args[i]<<"\""<<std::endl;
+
+    return parse(args);
+  }
+
+  bool parse(const std::vector<std::string> &args){
+    int argc=static_cast<int>(args.size());
+    std::vector<const char*> argv(argc);
+
+    for (int i=0; i<argc; i++)
+      argv[i]=args[i].c_str();
+
+    return parse(argc, &argv[0]);
+  }
+
+  bool parse(int argc, const char * const argv[]){
+    errors.clear();
+    others.clear();
+
+    if (argc<1){
+      errors.push_back("argument number must be longer than 0");
+      return false;
+    }
+    if (prog_name=="")
+      prog_name=argv[0];
+
+    std::map<char, std::string> lookup;
+    for (std::map<std::string, option_base*>::iterator p=options.begin();
+         p!=options.end(); p++){
+      if (p->first.length()==0) continue;
+      char initial=p->second->short_name();
+      if (initial){
+        if (lookup.count(initial)>0){
+          lookup[initial]="";
+          errors.push_back(std::string("short option '")+initial+"' is ambiguous");
+          return false;
+        }
+        else lookup[initial]=p->first;
+      }
+    }
+
+    for (int i=1; i<argc; i++){
+      if (strncmp(argv[i], "--", 2)==0){
+        const char *p=strchr(argv[i]+2, '=');
+        if (p){
+          std::string name(argv[i]+2, p);
+          std::string val(p+1);
+          set_option(name, val);
+        }
+        else{
+          std::string name(argv[i]+2);
+          if (options.count(name)==0){
+            errors.push_back("undefined option: --"+name);
+            continue;
+          }
+          if (options[name]->has_value()){
+            if (i+1>=argc){
+              errors.push_back("option needs value: --"+name);
+              continue;
+            }
+            else{
+              i++;
+              set_option(name, argv[i]);
+            }
+          }
+          else{
+            set_option(name);
+          }
+        }
+      }
+      else if (strncmp(argv[i], "-", 1)==0){
+        if (!argv[i][1]) continue;
+        char last=argv[i][1];
+        for (int j=2; argv[i][j]; j++){
+          last=argv[i][j];
+          if (lookup.count(argv[i][j-1])==0){
+            errors.push_back(std::string("undefined short option: -")+argv[i][j-1]);
+            continue;
+          }
+          if (lookup[argv[i][j-1]]==""){
+            errors.push_back(std::string("ambiguous short option: -")+argv[i][j-1]);
+            continue;
+          }
+          set_option(lookup[argv[i][j-1]]);
+        }
+
+        if (lookup.count(last)==0){
+          errors.push_back(std::string("undefined short option: -")+last);
+          continue;
+        }
+        if (lookup[last]==""){
+          errors.push_back(std::string("ambiguous short option: -")+last);
+          continue;
+        }
+
+        if (i+1<argc && options[lookup[last]]->has_value()){
+          set_option(lookup[last], argv[i+1]);
+          i++;
+        }
+        else{
+          set_option(lookup[last]);
+        }
+      }
+      else{
+        others.push_back(argv[i]);
+      }
+    }
+
+    for (std::map<std::string, option_base*>::iterator p=options.begin();
+         p!=options.end(); p++)
+      if (!p->second->valid())
+        errors.push_back("need option: --"+std::string(p->first));
+
+    return errors.size()==0;
+  }
+
+  void parse_check(const std::string &arg){
+    if (!options.count("help"))
+      add("help", '?', "print this message");
+    check(0, parse(arg));
+  }
+
+  void parse_check(const std::vector<std::string> &args){
+    if (!options.count("help"))
+      add("help", '?', "print this message");
+    check(args.size(), parse(args));
+  }
+
+  void parse_check(int argc, char *argv[]){
+    if (!options.count("help"))
+      add("help", '?', "print this message");
+    check(argc, parse(argc, argv));
+  }
+
+  std::string error() const{
+    return errors.size()>0?errors[0]:"";
+  }
+
+  std::string error_full() const{
+    std::ostringstream oss;
+    for (size_t i=0; i<errors.size(); i++)
+      oss<<errors[i]<<std::endl;
+    return oss.str();
+  }
+
+  std::string usage() const {
+    std::ostringstream oss;
+    oss<<"usage: "<<prog_name<<" ";
+    for (size_t i=0; i<ordered.size(); i++){
+      if (ordered[i]->must())
+        oss<<ordered[i]->short_description()<<" ";
+    }
+
+    oss<<"[options] ... "<<ftr<<std::endl;
+    oss<<"options:"<<std::endl;
+
+    size_t max_width=0;
+    for (size_t i=0; i<ordered.size(); i++){
+      max_width=std::max(max_width, ordered[i]->name().length());
+    }
+    for (size_t i=0; i<ordered.size(); i++){
+      if (ordered[i]->short_name()){
+        oss<<"  -"<<ordered[i]->short_name()<<", ";
+      }
+      else{
+        oss<<"      ";
+      }
+
+      oss<<"--"<<ordered[i]->name();
+      for (size_t j=ordered[i]->name().length(); j<max_width+4; j++)
+        oss<<' ';
+      oss<<ordered[i]->description()<<std::endl;
+    }
+    return oss.str();
+  }
+
+private:
+
+  void check(int argc, bool ok){
+    if ((argc==1 && !ok) || exist("help")){
+      std::cerr<<usage();
+      exit(0);
+    }
+
+    if (!ok){
+      std::cerr<<error()<<std::endl<<usage();
+      exit(1);
+    }
+  }
+
+  void set_option(const std::string &name){
+    if (options.count(name)==0){
+      errors.push_back("undefined option: --"+name);
+      return;
+    }
+    if (!options[name]->set()){
+      errors.push_back("option needs value: --"+name);
+      return;
+    }
+  }
+
+  void set_option(const std::string &name, const std::string &value){
+    if (options.count(name)==0){
+      errors.push_back("undefined option: --"+name);
+      return;
+    }
+    if (!options[name]->set(value)){
+      errors.push_back("option value is invalid: --"+name+"="+value);
+      return;
+    }
+  }
+
+  class option_base{
+  public:
+    virtual ~option_base(){}
+
+    virtual bool has_value() const=0;
+    virtual bool set()=0;
+    virtual bool set(const std::string &value)=0;
+    virtual bool has_set() const=0;
+    virtual bool valid() const=0;
+    virtual bool must() const=0;
+
+    virtual const std::string &name() const=0;
+    virtual char short_name() const=0;
+    virtual const std::string &description() const=0;
+    virtual std::string short_description() const=0;
+  };
+
+  class option_without_value : public option_base {
+  public:
+    option_without_value(const std::string &name,
+                         char short_name,
+                         const std::string &desc)
+      :nam(name), snam(short_name), desc(desc), has(false){
+    }
+    ~option_without_value(){}
+
+    bool has_value() const { return false; }
+
+    bool set(){
+      has=true;
+      return true;
+    }
+
+    bool set(const std::string &){
+      return false;
+    }
+
+    bool has_set() const {
+      return has;
+    }
+
+    bool valid() const{
+      return true;
+    }
+
+    bool must() const{
+      return false;
+    }
+
+    const std::string &name() const{
+      return nam;
+    }
+
+    char short_name() const{
+      return snam;
+    }
+
+    const std::string &description() const {
+      return desc;
+    }
+
+    std::string short_description() const{
+      return "--"+nam;
+    }
+
+  private:
+    std::string nam;
+    char snam;
+    std::string desc;
+    bool has;
+  };
+
+  template <class T>
+  class option_with_value : public option_base {
+  public:
+    option_with_value(const std::string &name,
+                      char short_name,
+                      bool need,
+                      const T &def,
+                      const std::string &desc)
+      : nam(name), snam(short_name), need(need), has(false)
+      , def(def), actual(def) {
+      this->desc=full_description(desc);
+    }
+    ~option_with_value(){}
+
+    const T &get() const {
+      return actual;
+    }
+
+    bool has_value() const { return true; }
+
+    bool set(){
+      return false;
+    }
+
+    bool set(const std::string &value){
+      try{
+        actual=read(value);
+        has=true;
+      }
+      catch(const std::exception &e){
+        return false;
+      }
+      return true;
+    }
+
+    bool has_set() const{
+      return has;
+    }
+
+    bool valid() const{
+      if (need && !has) return false;
+      return true;
+    }
+
+    bool must() const{
+      return need;
+    }
+
+    const std::string &name() const{
+      return nam;
+    }
+
+    char short_name() const{
+      return snam;
+    }
+
+    const std::string &description() const {
+      return desc;
+    }
+
+    std::string short_description() const{
+      return "--"+nam+"="+detail::readable_typename<T>();
+    }
+
+  protected:
+    std::string full_description(const std::string &desc){
+      return
+        desc+" ("+detail::readable_typename<T>()+
+        (need?"":" [="+detail::lexical_cast<std::string>(def)+"]")
+        +")";
+    }
+
+    virtual T read(const std::string &s)=0;
+
+    std::string nam;
+    char snam;
+    bool need;
+    std::string desc;
+
+    bool has;
+    T def;
+    T actual;
+  };
+
+  template <class T, class F>
+  class option_with_value_with_reader : public option_with_value<T> {
+  public:
+    option_with_value_with_reader(const std::string &name,
+                                  char short_name,
+                                  bool need,
+                                  const T def,
+                                  const std::string &desc,
+                                  F reader)
+      : option_with_value<T>(name, short_name, need, def, desc), reader(reader){
+    }
+
+  private:
+    T read(const std::string &s){
+      return reader(s);
+    }
+
+    F reader;
+  };
+
+  std::map<std::string, option_base*> options;
+  std::vector<option_base*> ordered;
+  std::string ftr;
+
+  std::string prog_name;
+  std::vector<std::string> others;
+
+  std::vector<std::string> errors;
+};
+
+} // cmdline
diff --git a/printbibfromnxmlorcasfile/main.cpp b/printbibfromnxmlorcasfile/main.cpp
new file mode 100644
index 0000000..5c87813
--- /dev/null
+++ b/printbibfromnxmlorcasfile/main.cpp
@@ -0,0 +1,271 @@
+/*
+ * File:   main.cpp
+ * Author: mueller
+ *
+ * Created on July 18, 2014, 10:27 AM
+ */
+
+#define TPCAS2TPCENTRALDESCRIPTOR "/usr/local/uima_descriptors/Tpcas2TpCentral.xml"
+
+#include "xercesc/util/XMLString.hpp"
+#include <uima/api.hpp>
+#include "uima/xmideserializer.hpp"
+#include <boost/regex.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/iostreams/filtering_streambuf.hpp>
+#include <boost/iostreams/copy.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <fstream>
+#include <iostream>
+#include "cmdline.h"
+
+/*
+ * getXMLstring and GetBibFromXML were written by Yuling Li.
+ */
+
+namespace {
+
+    std::string getXMLstring(uima::CAS & tcas) {
+        uima::UnicodeStringRef usdocref = tcas.getDocumentText();
+        if (usdocref.length() > 0) {
+            std::string xmlstring = usdocref.asUTF8();
+            return xmlstring;
+        } else {
+            return "";
+        }
+    }
+
+    std::vector<std::string> GetBibFromXML(std::string xml_text) {
+        boost::regex nline("\\n");
+        xml_text = boost::regex_replace(xml_text, nline, "");
+        //find author
+        std::string t_xmltext = xml_text;
+        boost::regex authorregex("\<contrib-group\>(.+?)\<\/contrib-group\>");
+        boost::smatch author_matches;
+        std::string author = "";
+        while (boost::regex_search(t_xmltext, author_matches, authorregex)) {
+            int size = author_matches.size();
+            std::string hit_text = author_matches[1];
+            boost::smatch name_matches;
+            boost::regex nameregex("\<surname\>(.+?)\<\/surname\>\\s+\<given-names\>(.+?)\<\/given-names\>");
+            while (boost::regex_search(hit_text, name_matches, nameregex)) {
+                author = author + name_matches[1] + " " + name_matches[2] + ", ";
+                hit_text = name_matches.suffix().str();
+            }
+            t_xmltext = author_matches.suffix().str();
+        }
+        boost::regex comma("\\, $");
+        author = boost::regex_replace(author, comma, "");
+        //find subject
+        t_xmltext = xml_text;
+        boost::regex subjectregex("\<subject\>(.+?)\<\/subject>");
+        boost::smatch subject_matches;
+        std::string subject = "";
+        while (boost::regex_search(t_xmltext, subject_matches, subjectregex)) {
+            subject = subject + subject_matches[1] + ", ";
+            t_xmltext = subject_matches.suffix().str();
+        }
+        subject = boost::regex_replace(subject, comma, "");
+        //find accession
+        t_xmltext = xml_text;
+        std::string accession = "";
+        boost::regex pmidregex("\<article-id pub-id-type=\"pmid\"\>(\\d+?)\<\/article-id\>");
+        boost::regex pmcregex("\<article-id pub-id-type=\"pmc\"\>(\\d+?)\<\/article-id\>");
+        boost::smatch pmid_matches;
+        boost::smatch pmc_matches;
+        if (boost::regex_search(t_xmltext, pmid_matches, pmidregex)) {
+            accession = "PMID " + pmid_matches[1];
+        } else if (boost::regex_search(t_xmltext, pmc_matches, pmcregex)) {
+            accession = "PMC " + pmc_matches[1];
+        }
+        // find article type
+        t_xmltext = xml_text;
+        std::string type = "";
+        boost::regex typeregex("article-type=\"(.+?)\"");
+        boost::smatch type_matches;
+        if (boost::regex_search(t_xmltext, type_matches, typeregex)) {
+            type = type_matches[1];
+        }
+        // find journal
+        t_xmltext = xml_text;
+        std::string journal = "";
+        boost::regex journalregex("\<journal-title\>(.+?)\<\/journal-title\>");
+        boost::smatch journal_matches;
+        if (boost::regex_search(t_xmltext, journal_matches, journalregex)) {
+            journal = journal_matches[1];
+        }
+        // find article title
+        t_xmltext = xml_text;
+        std::string title = "";
+        boost::regex articleregex("\<article-title\>(.+?)\<\/article-title\>");
+
+        boost::smatch article_matches;
+        if (boost::regex_search(t_xmltext, article_matches, articleregex)) {
+            title = article_matches[1];
+        }
+        // find abstract
+        t_xmltext = xml_text;
+        std::string abstract = "";
+        boost::regex abstractregex("\<abstract\>(.+?)\<\/abstract\>");
+        boost::smatch abstract_matches;
+        if (boost::regex_search(t_xmltext, abstract_matches, abstractregex)) {
+            abstract = abstract_matches[1];
+        }
+        // find citation
+        t_xmltext = xml_text;
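+        // The citation string assembled below has the fixed layout
+        // "V : <volume> (<issue>) pp. <fpage>-<lpage>"; each component is
+        // optional and is simply skipped when its tag is absent from the XML.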
+ std::string citation = ""; + boost::regex volumeregex("\(\\d+)\<\/volume\>"); + boost::smatch volume_matches; + if (boost::regex_search(t_xmltext, volume_matches, volumeregex)) { + citation = citation + "V : " + volume_matches[1] + " "; + } + boost::regex issueregex("\(\\d+)\<\/issue\>"); + boost::smatch issue_matches; + if (boost::regex_search(t_xmltext, issue_matches, issueregex)) { + citation = citation + "(" + issue_matches[1] + ") "; + } + boost::regex pageregex("\(\\d+)\<\/fpage\>\\s+\(\\d+)\<\/lpage\>"); + boost::smatch page_matches; + if (boost::regex_search(t_xmltext, page_matches, pageregex)) { + citation = citation + "pp. " + page_matches[1] + "-" + page_matches[2]; + } + // find year + t_xmltext = xml_text; + std::string year = ""; + boost::regex yearregex("\.*?\(\\d+)\<\/year\>\\s+\<\/pub-date\>"); + boost::smatch year_matches; + if (boost::regex_search(t_xmltext, year_matches, yearregex)) { + year = year_matches[1]; + } + std::vector bibinfo; + bibinfo.push_back(author); + bibinfo.push_back(accession); + bibinfo.push_back(type); + bibinfo.push_back(title); + bibinfo.push_back(journal); + bibinfo.push_back(citation); + bibinfo.push_back(year); + bibinfo.push_back(abstract); + bibinfo.push_back(subject); + return bibinfo; + } + + std::string uncompressGzip2(std::string gzFile) { + std::ifstream filein(gzFile.c_str(), std::ios_base::in | std::ios_base::binary); + boost::iostreams::filtering_streambuf in; + in.push(boost::iostreams::gzip_decompressor()); + in.push(filein); + char tmpname[L_tmpnam]; + char * pDummy = tmpnam(tmpname); + std::string tmpfile(tmpname); + while (boost::filesystem::exists(tmpfile)) { + char * pDummy = tmpnam(tmpname); + tmpfile = std::string(tmpname); + } + std::ofstream out(tmpfile.c_str()); + boost::iostreams::copy(in, out); + out.close(); + return tmpfile; + } + + //[ Uima related + + uima::AnalysisEngine * CreateUimaEngine(const char * descriptor) { + uima::ErrorInfo errorInfo; + uima::AnalysisEngine * ret = uima::Framework::createAnalysisEngine(descriptor, errorInfo); + if (errorInfo.getErrorId() != UIMA_ERR_NONE) { + std::cerr << std::endl + << " Error string : " + << uima::AnalysisEngine::getErrorIdAsCString(errorInfo.getErrorId()) + << std::endl + << " UIMACPP Error info:" << std::endl + << errorInfo << std::endl; + } + return ret; + } + + uima::CAS * GetCas(const char * pszInputFile, uima::AnalysisEngine * pEngine) { + uima::CAS * ret = pEngine->newCAS(); + if (ret == NULL) { + std::cerr << "pEngine_->newCAS() failed." 
<< std::endl; + } else { + try { + /* initialize from an xmicas */ + XMLCh * native = XMLString::transcode(pszInputFile); + LocalFileInputSource fileIS(native); + XMLString::release(&native); + uima::XmiDeserializer::deserialize(fileIS, * ret, true); + } catch (uima::Exception e) { + uima::ErrorInfo errInfo = e.getErrorInfo(); + std::cerr << "Error " << errInfo.getErrorId() << " " << errInfo.getMessage() << std::endl; + std::cerr << errInfo << std::endl; + } + } + return ret; + } + //] Uima related +} + +int main(int argc, char * argv[]) { + + cmdline::parser p; + p.set_program_name("printbibfromcasfile"); + p.add("abstract", 'a', "print abstract"); + p.add("author", 'u', "print author"); + p.add("accession", 'c', "print citation"); + p.add("citation", 'i', "print citation"); + p.add("journal", 'j', "print journal"); + p.add("subject", 's', "print subject"); + p.add("title", 't', "print title"); + p.add("type", 'p', "print type"); + p.add("year", 'y', "print year"); + p.add("nxml", 'n', "file is in nxml format, not gzipped cas."); + p.footer(""); + if (p.parse(argc, argv) == 0) { + std::cerr << "Error:" << p.error() << std::endl + << p.usage() << std::endl; + return -1; + } + if (argc < 3) { + std::cerr << p.usage() << std::endl; + return -1; + } + std::string filename; + if (p.rest().size() > 0) filename = p.rest()[0]; + // + std::vector bib_info; + if (p.exist("nxml")) { + std::ifstream f(filename.c_str()); + std::string in; + std::string all; + while (getline(f, in)) all += in; + f.close(); + bib_info = GetBibFromXML(all); + } else { + (void) uima::ResourceManager::createInstance("TPCAS2TPCENTRALAE"); + uima::AnalysisEngine * pEngine = CreateUimaEngine(TPCAS2TPCENTRALDESCRIPTOR); + std::string tmpfl = uncompressGzip2(filename); + uima::CAS * pcas = GetCas(tmpfl.c_str(), pEngine); + boost::filesystem::remove(tmpfl); + bib_info = GetBibFromXML(getXMLstring(*pcas)); + } + std::string l_author = bib_info[0]; + std::string l_accession = bib_info[1]; + std::string l_type = bib_info[2]; + std::string l_title = bib_info[3]; + std::string l_journal = bib_info[4]; + std::string l_citation = bib_info[5]; + std::string l_year = bib_info[6]; + std::string l_abstract = bib_info[7]; + std::string l_subject = bib_info[8]; + std::cout << "Filename:" << filename << "\t"; + if (p.exist("title")) std::cout << "Title:" << l_title << "\t"; + if (p.exist("author")) std::cout << "Author:" << l_author << "\t"; + if (p.exist("accession")) std::cout << "Accession:" << l_accession << "\t"; + if (p.exist("type")) std::cout << "Type:" << l_type << "\t"; + if (p.exist("journal")) std::cout << "Journal:" << l_journal << "\t"; + if (p.exist("citation")) std::cout << "Citation:" << l_citation << "\t"; + if (p.exist("year")) std::cout << "Year:" << l_year << "\t"; + if (p.exist("abstract")) std::cout << "Abstract:" << l_abstract << "\t"; + if (p.exist("subject")) std::cout << "Subject:" << l_subject << "\t"; + std::cout << std::endl; + return 0; +} diff --git a/run_tpc_pipeline_incremental.sh b/run_tpc_pipeline_incremental.sh new file mode 100644 index 0000000..3dbb24a --- /dev/null +++ b/run_tpc_pipeline_incremental.sh @@ -0,0 +1,253 @@ +#!/usr/bin/env bash + +function usage { + echo "usage: $(basename $0) [-p]" + echo " -p --pdf-dir directory where raw pdf files will be stored" + echo " -x --xml-dir directory where raw xml files will be stored" + echo " -c --cas1-dir directory where generated cas1 files will be stored" + echo " -C --cas2-dir directory where generated cas2 files will be stored" + echo " -t --tmp-dir 
temp directory" + echo " -f --ftp-dir ftp mount point for pmcoa papers" + echo " -P --num-proc maximum number of parallel processes" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 2 ]] +then + usage +fi + +PDF_DIR="/data/textpresso/raw_files/pdf" +XML_DIR="/data/textpresso/raw_files/xml" +CAS2_DIR="/data/textpresso/tpcas" +CAS1_DIR="/data/textpresso/tpcas-1" +TMP_DIR="/data/textpresso/tmp" +FTP_MNTPNT="/mnt/pmc_ftp" +INDEX_DIR="/data/textpresso/luceneindex" +N_PROC=1 + +while [[ $# -gt 0 ]] +do +key=$1 + +case $key in + -p|--pdf-dir) + shift + if [[ -d $key ]] + then + PDF_DIR="$key" + fi + shift + ;; + -x|--xml-dir) + shift + if [[ -d $key ]] + then + XML_DIR="$key" + fi + shift + ;; + -c|--cas1-dir) + shift + if [[ -d $key ]] + then + CAS1_DIR="$key" + fi + shift + ;; + -C|--cas2-dir) + shift + if [[ -d $key ]] + then + CAS2_DIR="$key" + fi + shift + ;; + -t|--tmp-dir) + shift + if [[ -d $key ]] + then + TMP_DIR="$key" + fi + shift + ;; + -f|--ftp-dir) + shift + if [[ -d $key ]] + then + FTP_MNTPNT="$key" + fi + shift + ;; + -P|--num-proc) + shift + N_PROC=$1 + shift + ;; + -h|--help) + usage + ;; + *) + if [[ -d $key ]] + then + ROOT_DIR="$key" + shift + else + usage + fi + ;; +esac +done + +# temp files +logfile=$(mktemp) +newpdf_list=$(mktemp) +newxml_list=$(mktemp) +newxml_local_list=$(mktemp) + +# download new xml files from pmcoa +## create directory for unclassified xml files +mkdir -p ${XML_DIR} +## mount pmcoa ftp locally through curl +curlftpfs ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ ${FTP_MNTPNT} +#find ${FTP_MNTPNT} *.gz | xargs ls -d -l --time-style="full-iso" | awk '{if (substr($1,0,1) == "-") print $6, $7, $9}' > ${newxml_list} +# save list of tazendra files locally +for dir in ${FTP_MNTPNT}/*; do for subdir in ${dir}/*; do ls -d -l --time-style="full-iso" ${subdir}/* | awk '{print $6, $7, $9}' >> ${newxml_list}; done; done +if [[ -e ${XML_DIR}/current_filelist.txt ]] +then + # download diff files + diff ${newxml_list} ${XML_DIR}/current_filelist.txt | grep "^<" | awk '{print $3}' | xargs -I {} tar xfz {} --exclude="*.pdf" --exclude="*.PDF" --exclude="*.mp4" --exclude="*.webm" --exclude="*.flv" --exclude="*.avi" --exclude="*.zip" --exclude="*.mov" --exclude="*.csv" --exclude="*.xls*" --exclude="*.doc*" --exclude="*.ppt*" --exclude="*.rar" --exclude="*.txt" --exclude="*.TXT" --exclude="*.wmv" --exclude="*.DOC*" -C ${XML_DIR} + # save new current list + diff ${newxml_list} ${XML_DIR}/current_filelist.txt | grep "^<" | awk '{print $3}' >> ${XML_DIR}/current_filelist.txt +else + # download all files + awk '{print $3}' ${newxml_list} | xargs -I {} tar xfz {} --exclude="*.pdf" --exclude="*.PDF" --exclude="*.mp4" --exclude="*.webm" --exclude="*.flv" --exclude="*.avi" --exclude="*.zip" --exclude="*.mov" --exclude="*.csv" --exclude="*.xls*" --exclude="*.doc*" --exclude="*.ppt*" --exclude="*.rar" --exclude="*.txt" --exclude="*.TXT" --exclude="*.wmv" --exclude="*.DOC*" -C ${XML_DIR} + # save file list as current + cp ${newxml_list} ${XML_DIR}/current_filelist.txt +fi +umount ${FTP_MNTPNT} + +# save new xml local file list +cut -d " " -f 3 ${newxml_list} | sed "s/\/mnt\/pmc\_ftp\/.\{2\}\/.\{2\}\///g;s/\.tar\.gz//g" | xargs -I {} echo ${XML_DIR}/{} > ${newxml_local_list} + +# compress nxml and put images in a separate directory +cat ${newxml_local_list} | while read line +do + gzip $line/*.nxml; mkdir $line/images; ls -d $line/* | grep -v .nxml | grep -v $line/images | xargs -I [] mv [] $line/images +done + +# download new pdf files incrementally from tazendra 
+getpdfs.py -l ${logfile} -L INFO ${PDF_DIR} "${XML_DIR}/PMCOA C. elegans"
+grep -oP "Downloading paper: .* to \K.*\.pdf" ${logfile} > ${newpdf_list}
+
+# download bib info for pdfs
+mkdir -p /usr/local/textpresso/celegans_bib
+download_pdfinfo.pl /usr/local/textpresso/celegans_bib/
+extract_pdfbibinfo.pl /usr/local/textpresso/celegans_bib/
+
+# generate tpcas-1 from new pdf files
+mkdir -p ${CAS1_DIR}/C.\ elegans
+mkdir -p ${CAS1_DIR}/C.\ elegans\ Supplementals
+cd ${CAS1_DIR}
+articles2cas -i ${PDF_DIR}/C.\ elegans -l ${newpdf_list} -t 1 -o C.\ elegans -p
+articles2cas -i ${PDF_DIR}/C.\ elegans\ Supplementals -l ${newpdf_list} -t 1 -o C.\ elegans\ Supplementals -p
+
+# generate tpcas-1 from new nxml files
+mkdir -p ${CAS1_DIR}/PMCOA
+cd ${CAS1_DIR}
+articles2cas -i "${XML_DIR}" -l <(awk 'BEGIN{FS="/"}{print $NF}' ${newxml_local_list}) -t 2 -o PMCOA -p
+
+# add images to tpcas directory and gzip
+## xml
+cat ${newxml_local_list} | while read line
+do
+    dirname=$(echo ${line} | awk 'BEGIN{FS="/"}{print $NF}')
+    rm -rf "${CAS1_DIR}/PMCOA/${dirname}/images"
+    ln -fs "${XML_DIR}/${dirname}/images" "${CAS1_DIR}/PMCOA/${dirname}/images"
+    find "${CAS1_DIR}/PMCOA/${dirname}" -name "*.tpcas" | xargs -I {} gzip "{}"
+done
+## pdf
+cat ${newpdf_list} | while read line
+do
+    gzip "${CAS1_DIR}/$(echo "${line}" | awk 'BEGIN{FS="/"}{print $(NF-2)"/"$(NF-1)"/"$NF}')"
+done
+
+# generate cas2 files from cas1
+## copy files to temp directory
+rm -rf ${TMP_DIR}/tpcas-1
+## xml
+mkdir -p ${TMP_DIR}/tpcas-1/xml
+cat ${newxml_local_list} | while read line
+do
+    dirname=$(echo ${line} | awk 'BEGIN{FS="/"}{print $NF}')
+    find "${CAS1_DIR}/PMCOA/${dirname}" -name "*.tpcas.gz" | xargs -I {} cp "{}" ${TMP_DIR}/tpcas-1/xml/${dirname}.tpcas.gz
+done
+
+mkdir -p ${TMP_DIR}/tpcas-1/pdf_celegans
+mkdir -p ${TMP_DIR}/tpcas-1/pdf_celegans_sup
+# TODO check if "line" contains only the file name
+grep -v "Supplementals" ${newpdf_list} | while read line
+do
+    find "${CAS1_DIR}/C. elegans/${line}" -name "*.tpcas.gz" | xargs -I {} cp "{}" ${TMP_DIR}/tpcas-1/pdf_celegans/${line}.tpcas.gz
+done
+grep "Supplementals" ${newpdf_list} | while read line
+do
+    find "${CAS1_DIR}/C. elegans Supplementals/${line}" -name "*.tpcas.gz" | xargs -I {} cp "{}" ${TMP_DIR}/tpcas-1/pdf_celegans_sup/${line}.tpcas.gz
+done
+
+## apply uima analysis
+rm -rf "${TMP_DIR}/tpcas-2"
+mkdir -p "${TMP_DIR}/tpcas-2/xml"
+mkdir -p "${TMP_DIR}/tpcas-2/pdf_celegans"
+mkdir -p "${TMP_DIR}/tpcas-2/pdf_celegans_sup"
+find ${TMP_DIR}/tpcas-1 -name "*.tpcas.gz" | xargs -n 1 -P ${N_PROC} gunzip
+runAECpp /usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml -xmi ${TMP_DIR}/tpcas-1/xml ${TMP_DIR}/tpcas-2/xml
+runAECpp /usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml -xmi ${TMP_DIR}/tpcas-1/pdf_celegans ${TMP_DIR}/tpcas-2/pdf_celegans
+runAECpp /usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml -xmi ${TMP_DIR}/tpcas-1/pdf_celegans_sup ${TMP_DIR}/tpcas-2/pdf_celegans_sup
+find ${TMP_DIR}/tpcas-2 -name "*.tpcas" | xargs -n 1 -P ${N_PROC} gzip
+
+# copy tpcas1 dirs to tpcas2 and replace tpcas files with the new ones
+mkdir -p "${CAS2_DIR}/PMCOA"
+mkdir -p "${CAS2_DIR}/C. elegans"
+mkdir -p "${CAS2_DIR}/C. elegans Supplementals"
+## xml
+cat ${newxml_local_list} | while read line
+do
+    dirname=$(echo ${line} | awk 'BEGIN{FS="/"}{print $NF}')
+    tpcas_file_name=$(ls ${CAS1_DIR}/PMCOA/${dirname}/*.tpcas.gz | awk 'BEGIN{FS="/"}{print $NF}')
+    mkdir "${CAS2_DIR}/PMCOA/${dirname}"
+    ln -s "${CAS1_DIR}/PMCOA/${dirname}/images" "${CAS2_DIR}/PMCOA/${dirname}/images"
+    cp ${TMP_DIR}/tpcas-2/xml/${dirname}.tpcas.gz "${CAS2_DIR}/PMCOA/${dirname}/${tpcas_file_name}"
+done
+## pdf (the annotated cas2 files live under tpcas-2; copy those, not the tpcas-1 inputs)
+grep -v "Supplementals" ${newpdf_list} | while read line
+do
+    mkdir "${CAS2_DIR}/C. elegans/${line}"
+    ln -s "${CAS1_DIR}/C. elegans/${line}/images" "${CAS2_DIR}/C. elegans/${line}/images"
+    find "${CAS1_DIR}/C. elegans/${line}/" -name "*.tpcas.gz" | awk 'BEGIN{FS="/"}{print $NF}' | xargs -I {} cp ${TMP_DIR}/tpcas-2/pdf_celegans/"{}" "${CAS2_DIR}/C. elegans/${line}/"
+done
+grep "Supplementals" ${newpdf_list} | while read line
+do
+    mkdir "${CAS2_DIR}/C. elegans Supplementals/${line}"
+    ln -s "${CAS1_DIR}/C. elegans Supplementals/${line}/images" "${CAS2_DIR}/C. elegans Supplementals/${line}/images"
+    find "${CAS1_DIR}/C. elegans Supplementals/${line}/" -name "*.tpcas.gz" | awk 'BEGIN{FS="/"}{print $NF}' | xargs -I {} cp ${TMP_DIR}/tpcas-2/pdf_celegans_sup/"{}" "${CAS2_DIR}/C. elegans Supplementals/${line}/"
+done
+
+# generate bib files for cas files
+# TODO: check from here
+getallbibfiles.sh -p ${N_PROC} ${CAS2_DIR}
+
+if [[ ! -d ${INDEX_DIR} || $(ls ${INDEX_DIR} | grep -v "subindex.config" | wc -l) == "0" ]]
+then
+    mkdir -p ${INDEX_DIR}
+    createallindexes -p ${N_PROC} ${CAS2_DIR} ${INDEX_DIR}
+else
+    cas2index -i ${CAS2_DIR} -o ${INDEX_DIR}
+fi
+# cleanup tmp files
+rm -rf ${TMP_DIR}
+rm ${logfile}
+rm ${newpdf_list}
+rm ${newxml_list}
+rm ${newxml_local_list}
\ No newline at end of file
diff --git a/useruploads/run_userupload_pipeline_incremental.sh b/useruploads/run_userupload_pipeline_incremental.sh
new file mode 100755
index 0000000..7e90633
--- /dev/null
+++ b/useruploads/run_userupload_pipeline_incremental.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+USERUPLOADS_DIR="/usr/local/textpresso/useruploads"
+
+for user_dir in ${USERUPLOADS_DIR}/*
+do
+    cd ${user_dir}
+    username=${PWD##*/}
+    mkdir -p ${user_dir}/tpcas
+    mkdir -p ${user_dir}/tmp/cas1
+    mkdir -p ${user_dir}/tmp/cas2
+    mkdir -p ${user_dir}/useruploads/${username}
+    touch ${user_dir}/tpcas/processed_files.txt
+    touch ${user_dir}/tpcas/tokenized_files.txt
+    tmpfile=$(mktemp)
+    grep -vxf ${user_dir}/tpcas/processed_files.txt <(ls -1 ${user_dir}/uploadedfiles) > ${tmpfile}
+    if [[ $(grep ".pdf" ${tmpfile} | wc -l | awk '{print $1}') != "0" ]]
+    then
+        articles2cas -t 1 -i uploadedfiles -o useruploads/${username} -l <(grep ".pdf" ${tmpfile})
+    fi
+    if [[ $(grep ".nxml" ${tmpfile} | wc -l | awk '{print $1}') != "0" ]]
+    then
+        articles2cas -t 2 -i ${user_dir}/uploadedfiles -o useruploads/${username} -l <(grep ".nxml" ${tmpfile})
+    fi
+    # TODO process compressed archives
+    mv useruploads/${username}/* ${user_dir}/tpcas/
+    rm -rf useruploads
+    cat ${tmpfile} >> ${user_dir}/tpcas/tokenized_files.txt
+    grep -xf <(sed -e 's/\.[^.]*$//' ${tmpfile}) <(ls ${user_dir}/tpcas/) | xargs -I {} cp ${user_dir}/tpcas/{}/{}.tpcas ${user_dir}/tmp/cas1
+    if [[ $(ls ${user_dir}/tmp/cas1/ | wc -l | awk '{print $1}') != "0" ]]
+    then
+        runAECpp /usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml -xmi ${user_dir}/tmp/cas1 ${user_dir}/tmp/cas2
+    fi
+    for tpcas2_file in $(ls ${user_dir}/tmp/cas2/*)
+    do
+        mv ${tpcas2_file} ${user_dir}/tpcas/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//')
+        if [[ -f ${user_dir}/uploadedfiles/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//').bib ]]
+        then
+            cp ${user_dir}/uploadedfiles/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//').bib ${user_dir}/tpcas/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//')
+        fi
+        gzip ${user_dir}/tpcas/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//')/$(basename ${tpcas2_file})
+    done
+    rm -rf ${user_dir}/tmp/
+    cat ${tmpfile} >> ${user_dir}/tpcas/processed_files.txt
+    rm ${tmpfile}
+    mkdir -p /usr/local/textpresso/tpcas/useruploads/${username}
+    cd tpcas
+    find . -mindepth 1 -maxdepth 1 -type d | xargs -I {} ln -s ${user_dir}/tpcas/{} /usr/local/textpresso/tpcas/useruploads/${username}/{}
+    if [[ ! -d ${user_dir}/luceneindex ]]
+    then
+        mkdir -p ${user_dir}/luceneindex
+        cas2index -i ${user_dir}/tpcas -o ${user_dir}/luceneindex -s 300000 -e
+    fi
+done
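+
+# note: this script is meant to be run periodically over all user upload
+# directories; a minimal sketch of a cron entry (assuming the script is
+# installed in /usr/local/bin; path and schedule are illustrative only):
+#   0 2 * * * /usr/local/bin/run_userupload_pipeline_incremental.sh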