diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f23f11e --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +cmake-build* +.idea/ +**/build/ +**/dist/ +**/nbproject/ +/pgdumps/ +*~ +/tpctl.config diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..524e015 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,86 @@ +cmake_minimum_required(VERSION 3.5) +project(tpctools) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) #...is required... +set(CMAKE_CXX_FLAGS "-DBOOST_NO_CXX11_SCOPED_ENUMS -w") + +set(CMAKE_STATIC_LIBRARY_PREFIX "") # avoid the prefix "lib" +set(CMAKE_SHARED_LIBRARY_PREFIX "") + +SET(CMAKE_INSTALL_PREFIX /usr/local) +SET(CMAKE_SKIP_BUILD_RPATH FALSE) +SET(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) +set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") +set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + +find_package(APR REQUIRED) +include_directories(${APR_INCLUDE_DIR}) + +find_package(Lucene++ REQUIRED) +include_directories(${Lucene++_INCLUDE_DIRS}) + +find_package(Textpresso REQUIRED) +include_directories(${Textpresso_INCLUDE_DIR}) + +include_directories("/home/valerio/workspace/caltech/libtpc") + +#### Main projects #### + +add_executable(printbibfromnxmlorcasfile printbibfromnxmlorcasfile/main.cpp printbibfromnxmlorcasfile/cmdline.h) +target_link_libraries(printbibfromnxmlorcasfile uima xerces-c boost_system boost_filesystem boost_iostreams + boost_regex) + +#### Subprojects #### + +add_executable(ppm2jpg ppm2jpg/main.cpp) +target_link_libraries(ppm2jpg boost_system boost_filesystem boost_program_options pthread ${CImg_SYSTEM_LIBS}) + +add_executable(cas2index cas2index/cas2index.cpp) +target_link_libraries(cas2index ${Textpresso_LIBRARY} boost_filesystem boost_system boost_program_options lucene++) + +add_executable(updatecorpuscounter cas2index/update_corpus_counter.cpp) +target_link_libraries(updatecorpuscounter ${Textpresso_LIBRARY} boost_filesystem boost_system boost_program_options + lucene++) + +add_executable(indexmerger cas2index/index_merger.cpp lucene/CaseSensitiveAnalyzer.cpp) +target_link_libraries(indexmerger uima boost_filesystem boost_system lucene++) + + +add_executable(articles2cas articles2cas/articles2cas.cpp articles2cas/Utils.h articles2cas/Utils.cpp) +target_link_libraries(articles2cas ${Textpresso_LIBRARY} boost_filesystem boost_system boost_program_options boost_iostreams) + +add_executable(getbib getbib/getbib.cpp getbib/getbibUtils.h getbib/getbibUtils.cpp + TextpressoCentralGlobalDefinitions.h TextpressoCentralGlobals.h) +target_link_libraries(getbib lucene++ xerces-c icuuc boost_system uima boost_filesystem boost_iostreams) + +add_executable(getbib4nxml getbib/getbib4nxml.cpp getbib/getbib4nxmlUtils.h getbib/getbib4nxmlUtils.cpp + TextpressoCentralGlobalDefinitions.h TextpressoCentralGlobals.h) +target_link_libraries(getbib4nxml lucene++ xerces-c icuuc boost_system uima boost_filesystem boost_iostreams) + +add_executable(saveidstodb cas2index/saveidstodb.cpp) +target_link_libraries(saveidstodb lucene++ boost_filesystem boost_system boost_program_options ${Textpresso_LIBRARY} + db_cxx db_stl) + +#### INSTALL #### + +install(TARGETS getbib getbib4nxml + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib) + +install(TARGETS cas2index RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(TARGETS saveidstodb RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(TARGETS updatecorpuscounter RUNTIME DESTINATION 
${CMAKE_INSTALL_PREFIX}/bin) +install(PROGRAMS cas2index/create_index_multi_literatures.sh cas2index/create_single_index.sh + ppm2jpg/ppm2jpgCas.sh run_tpc_pipeline_incremental.sh + getpdfs/getpdfs.py getbibinfoforpdffromserver/download_pdfinfo.pl + getbibinfoforpdffromserver/extract_pdfbibinfo.pl + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) + +install(TARGETS articles2cas RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(PROGRAMS articles2cas/convertallarticles2cas.sh + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) + + + diff --git a/TextpressoCentralGlobalDefinitions.h b/TextpressoCentralGlobalDefinitions.h new file mode 100644 index 0000000..faf8689 --- /dev/null +++ b/TextpressoCentralGlobalDefinitions.h @@ -0,0 +1,52 @@ +// Global file containing all global definitions. + +#ifndef TEXTPRESSOCENTRALGLOBALDEFINITIONS_H +#define TEXTPRESSOCENTRALGLOBALDEFINITIONS_H + +// Are these definitions really global? Otherwise move them back to their local project. +#define PDF2TPCASDESCRIPTOR "/usr/local/uima_descriptors/TpTokenizer.xml" +#define XML2TPCASDESCRIPTOR "/usr/local/uima_descriptors/TxTokenizer.xml" +#define TPCAS2LINDEXDESCRIPTOR "/usr/local/uima_descriptors/Tpcas2Lindex.xml" + +#define AVAILABLELITERATUREFILE "/usr/local/textpresso/luceneindex/subindex.config" +#define USERUPLOADROOTDIR "/usr/local/textpresso/useruploads" + +#define PGONTOLOGYBROWSER "dbname=www-data" +#define PGONTOLOGYBROWSWERCOLUMNS "ontologybrowsercolumnnames" +#define PGONTOLOGY "dbname=www-data" +#define PGONTOLOGYTABLENAME "tpontology" +#define PGONTOLOGYTMPTABLENAME "tmptpontology" +#define PGCURATION "dbname=www-data" +#define PGCURATIONTABLENAME "tpcuration" +#define PCRELATIONSTABLENAME "pcrelations" +#define PCRELATIONSTMPTABLENAME "tmppcrelations" +#define PADCRELATIONSTABLENAME "padcrelations" +#define PADCRELATIONSTMPTABLENAME "tmppadcrelations" +#define STOPWORDTABLENAME "stopwords" +#define PGLITERATURE "dbname=www-data" +#define PGLITPREFTABLENAME "literaturepreference" +#define PGPRELOADEDCATEGORIES "dbname=www-data" +#define PGPRELOADEDCATTABLENAME "preloadedcategories" +#define PGTIPOFDAY "dbname=www-data" +#define PGTIPOFDAYTABLENAME "tipoftheday" +#define PGCURATIONFIELDS "dbname=www-data" +#define PGCURATIONFIELDSTABLENAME "curationfields" +#define PGCURATIONFORMS "dbname=www-data" +#define PGCURATIONFORMSTABLENAME "curationforms" +#define PGCURATIONDATAFROMVIEWER "dbname=www-data" +#define PGCURATIONDATAFROMVIEWERTABLENAME "curationdatafromviewer" +#define PGLISTOFONTOLOGIES "dbname=www-data" +#define PGLISTOFONTOLOGIESTABLENAME "listofontologies" +#define PGPREPOPULATION "dbname=www-data" +#define PGPREPOPULATIONTABLENAME "prepopulation" +#define AUTHIDENTITIES "dbname=www-data" +#define AUTHIDENTITIESTABLENAME "auth_identity" +#define PGLITERATUREPERMISSION "dbname=www-data" +#define PGLITERATUREPERMISSIONTABLENAME "literaturepermissions" +#define PGCUSTOMCOLORS "dbname=www-data" +#define PGCUSTOMCOLORSTABLENAME "customcolor" +#define PGDIALOGPREFERENCES "dbname=www-data" +#define PGDIALOGPREFERENCESTABLENAME "dialogpreferencestable" +#define SENTENCE_SEARCH_MAX_NUM_DISPLAY_WORDS 50 + +#endif diff --git a/TextpressoCentralGlobals.h b/TextpressoCentralGlobals.h new file mode 100644 index 0000000..66f0184 --- /dev/null +++ b/TextpressoCentralGlobals.h @@ -0,0 +1,41 @@ +// Global file containing all global definitions. + +#ifndef TEXTPRESSOCENTRALGLOBALS_H +#define TEXTPRESSOCENTRALGLOBALS_H + + +// Are these definitions really global? Otherwise move them back to their local project. 
+ +#include "TextpressoCentralGlobalDefinitions.h" + +#include + +// If a composite delimiter exists, then there cannot be another delimiter +// that is a subset of that composite token delimiter. Decompose it accordingly. +// This applies to token and sentence delimiter +UnicodeString G_initT[] = { + " ", "\n", "\t", "'", "\"", + "/", "—", "(", ")", "[", + "]", "{", "}", ":", ". ", + "; ", ", ", "! ", "? " +}; + +const int G_initT_No = 19; +UnicodeString G_initS[] = { + ".\n", "!\n", "?\n", ". ", "! ", "? ", + ".\t", "!\t", "?\t", ".<", "!<", "?<" +}; +const int G_initS_No = 12; +UnicodeString G_initP[] = {"<_pdf _image", "<_pdf _sbr", "<_pdf _hbr", + "<_pdf _fsc", "<_pdf _fnc", "<_pdf _ydiff", "<_pdf _cr", "<_pdf _page"}; +const int G_initP_No = 8; +const std::string ServerNames[] = {"http://goldturtle.caltech.edu/cgi-bin/ReceivePost.cgi", + "http://go-genkisugi.rhcloud.com/capella", "http://localhost/cgi-bin/ReceivePost.cgi"}; +const int ServerNames_No = 3; + +//const std::string G_pdftagstart("<_pdf "); +//const std::string G_pdftagend("/>"); +const UnicodeString usG_pdftagstart("<_pdf "); +const UnicodeString usG_pdftagend("/>"); + +#endif diff --git a/articles2cas/Utils.cpp b/articles2cas/Utils.cpp new file mode 100644 index 0000000..e5cbde2 --- /dev/null +++ b/articles2cas/Utils.cpp @@ -0,0 +1,230 @@ +/** + Project: libtpc + File name: Utils.cpp + + @author valerio + @version 1.0 7/26/17. +*/ + +#include "Utils.h" +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace boost::posix_time; + +string Utils::get_temp_dir_path() +{ + ptime now = boost::posix_time::microsec_clock::local_time(); + int month = static_cast (now.date().month()); + int year = static_cast (now.date().year()); + int day = static_cast (now.date().day()); + time_duration duration(now.time_of_day()); + long microseconds = duration.total_microseconds(); + long pid = getpid(); + long random = pid + microseconds; + stringstream ss; + ss << year << month << day << random; + return "/run/shm/" + ss.str(); +} + +string Utils::decompress_gzip(const string& gz_file, const string& tmp_dir) { + std::ifstream filein(gz_file.c_str(), std::ios_base::in | std::ios_base::binary); + boost::iostreams::filtering_streambuf in; + in.push(boost::iostreams::gzip_decompressor()); + in.push(filein); + int lastdot = gz_file.find_last_of("."); + int lastslash = gz_file.find_last_of("/"); + string tpFile = gz_file.substr(lastslash + 1, lastdot - lastslash - 1); + string tempFile = tmp_dir + "/" + tpFile; + std::ofstream out(tempFile.c_str()); + boost::iostreams::copy(in, out); + out.close(); + return tempFile; +} + +void Utils::write_index_descriptor(const std::string& index_path, const std::string& descriptor_path, + const std::string& tmp_conf_files_path) +{ + ofstream output(descriptor_path.c_str()); + output << "" << endl; + output << "" << endl; + output << " org.apache.uima.cpp" << endl; + output << " true " << endl; + //output << " Tpcas2Lpp" << endl; + output << " Tpcas2SingleIndex" << endl; + output << " " << endl; + //output << " Tpcas2Lpp" << endl; + output << " Tpcas2SingeIndex" << endl; + output << " Writes an XCAS to a Lucene index. " << endl; + output << " 1.0 " << endl; + output << " Textpresso " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextLuceneIndexDirectory" << endl; + output << " Directory path of Lucene index for fulltext. 
" << endl; + output << " String " << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextCaseSensitiveLuceneIndexDirectory" << endl; + output << " Directory path of case sensitive Lucene index for fulltext. " << endl; + output << " String " << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for tokens. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenCaseSensitiveLuceneIndexDirectory " << endl; + output << " Directory path of case sensitive Lucene index for tokens. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for sentences." << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceCaseSensitiveLuceneIndexDirectory " << endl; + output << " Directory path of case sensitive Lucene index for sentences." << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for lexical annotations. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalCaseSensitiveLuceneIndexDirectory " << endl; + output << " Directory path of case sensitive Lucene index for lexical annotations. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for bibliography annotations. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyCaseSensitiveLuceneIndexDirectory " << endl; + output << " Directory path of case sensitive Lucene index for bibliography annotations. 
" << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TempDirectory " << endl; + output << " temporary directory under /run/shm/ to store newindexflag " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << index_path << "/fulltext" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << index_path << "/token" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/sentence" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/lexical" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/bibliography" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextCaseSensitiveLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << index_path << "/fulltext_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenCaseSensitiveLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << index_path << "/token_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceCaseSensitiveLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/sentence_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalCaseSensitiveLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/lexical_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyCaseSensitiveLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << index_path << "/bibliography_cs" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TempDirectory " << endl; + output << " " << endl; + output << " " << tmp_conf_files_path << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " x-unspecified" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output.close(); +} \ No newline at end of file diff --git a/articles2cas/Utils.h b/articles2cas/Utils.h new file mode 100644 index 0000000..2b73b94 --- /dev/null +++ b/articles2cas/Utils.h @@ -0,0 +1,39 @@ +/** + Project: libtpc + File name: Utils.h + + @author 
valerio + @version 1.0 7/26/17. +*/ + +#ifndef LIBTPC_UTILS_H +#define LIBTPC_UTILS_H + +#include + +class Utils { +public: + /*! + * generate a random path name for a tmp directory + */ + static std::string get_temp_dir_path(); + + /*! + * write a uima descriptor for an index to file + * @param index_path the path of the index + * @param descriptor_path the path of the descriptor to be created + * @param tmp_conf_files_path the path of the directory containing the temp files for the index + */ + static void write_index_descriptor(const std::string& index_path, const std::string& descriptor_path, + const std::string& tmp_conf_files_path); + + /*! + * decompress file to a new file and return file path of the latter + * @param gz_file the gx file to decompress + * @return the file path of the decompressed file + */ + static std::string decompress_gzip(const std::string & gz_file, const std::string& tmp_dir); +}; + + +#endif //LIBTPC_UTILS_H diff --git a/articles2cas/articles2cas.cpp b/articles2cas/articles2cas.cpp new file mode 100644 index 0000000..35bd7e5 --- /dev/null +++ b/articles2cas/articles2cas.cpp @@ -0,0 +1,118 @@ +/** + Project: textpressocentral + File name: articles2cas.cpp + + @author valerio + @version 1.0 7/30/17. +*/ + +#include "CASManager.h" +#include +#include +#include +#include +#include +#include "Utils.h" + +using namespace std; +using namespace boost::filesystem; +namespace po = boost::program_options; +using namespace tpc::cas; + + +void convert_dir_recursively(const string& inputDir, const string& outputDir, const string& literature, + const set& filelist_set, const set& dirlist_set, FileType fileType, + bool use_parent_dir_as_outname) { + for (directory_iterator dit(inputDir); dit != directory_iterator(); ++dit) { + if ((is_regular_file(*dit) && (dit->path().filename().string().find(".nxml.gz") != string::npos || + dit->path().filename().string().find(".pdf") != string::npos)) && ((filelist_set.empty() || + filelist_set.find(dit->path().filename().string()) != filelist_set.end()) && + (dirlist_set.empty() || dirlist_set.find(dit->path().parent_path().filename().string()) != + dirlist_set.end()))) { + if (fileType == FileType::xml) { + string decomp_file = Utils::decompress_gzip(dit->path().string(), + dit->path().parent_path().string()); + CASManager::convert_raw_file_to_cas1(decomp_file, fileType, outputDir, use_parent_dir_as_outname); + boost::filesystem::remove(decomp_file); + } else { + CASManager::convert_raw_file_to_cas1(dit->path().string(), fileType, outputDir, + use_parent_dir_as_outname); + } + } else if (is_directory(*dit) && dit->path().filename().string() != "images"){ + convert_dir_recursively(dit->path().string(), outputDir, literature, filelist_set, dirlist_set, fileType, + use_parent_dir_as_outname); + } + } +} + + +int main(int argc, const char* argv[]) { + po::options_description desc("options"); + po::positional_options_description p; + po::variables_map vm; + + // arguments + string inputDir; + string outputDir; + int fileType; + string filelist; + string dirlist; + + try { + desc.add_options() + ("help,h", "produce help message") + ("articles-input-directory,i", po::value(&inputDir)->required(), + "input directory containing articles") + ("cas-output-directory,o", po::value(&outputDir)->required(), + "directory where to write cas files") + ("input-files-type,t", po::value(&fileType)->default_value(1), + "type of files to process. 
+                 "type of files to process. 1 for pdf, 2 for xml")
+                ("dir-list,l", po::value<string>(&dirlist)->default_value(""),
+                 "optional list of directory names containing the final files to be processed. Other "
+                 "directories are ignored")
+                ("file-list,L", po::value<string>(&filelist)->default_value(""),
+                 "optional list of file names to be processed. Other files are ignored")
+                ("use_parent_dir_as_outname,p", po::bool_switch()->default_value(false),
+                 "Use parent dir name instead of file name as output name for the cas file");
+        p.add("articles-input-directory", 1);
+        p.add("cas-output-directory", 1);
+        po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
+        po::notify(vm);
+
+        if (vm.count("help")) {
+            cout << desc << endl;
+            return 1;
+        }
+    } catch (std::exception &e) {
+        if (vm.count("help")) {
+            cout << desc << endl;
+            return (EXIT_SUCCESS);
+        }
+        std::cerr << "Error: " << e.what() << "\n";
+        return (EXIT_FAILURE);
+    }
+
+    FileType ft = fileType == 1 ? FileType::pdf : FileType::xml;
+    if (is_directory(inputDir)) {
+        path p(inputDir);
+        string literature = p.filename().string();
+        create_directories(outputDir);
+        std::fstream f;
+        f.open(dirlist, std::fstream::in);
+        string line;
+        set<string> dirlist_set;
+        while (f >> line) {
+            dirlist_set.insert(line);
+        }
+        f.close();
+        f.open(filelist, std::fstream::in);
+        set<string> filelist_set;
+        while (f >> line) {
+            filelist_set.insert(line);
+        }
+        f.close();
+        convert_dir_recursively(inputDir, outputDir, literature, filelist_set, dirlist_set, ft,
+                                vm["use_parent_dir_as_outname"].as<bool>());
+    }
+}
diff --git a/articles2cas/convertallarticles2cas.sh b/articles2cas/convertallarticles2cas.sh
new file mode 100755
index 0000000..5f689b8
--- /dev/null
+++ b/articles2cas/convertallarticles2cas.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+##### convert the articles of all literatures to cas files in parallel with a maximum number of parallel processes
+
+function usage {
+    echo "usage: $(basename $0) [-pth] <articles_root_dir> <cas_output_dir>"
+    echo "  -p --num-processes     number of parallel processes"
+    echo "  -t --file-type         type of input files. 1 for pdf, 2 for xml"
+    echo "  -h --help              display help"
+    exit 1
+}
+
+if [[ ${#} -lt 1 ]]
+then
+    usage
+fi
+
+TYPE=1
+NUM_PROCESSES=1
+ARTICLES_ROOT_DIR=""
+CAS_OUT_DIR=""
+
+while [[ $# -gt 1 ]]
+do
+key="$1"
+
+case $key in
+    -t|--file-type)
+    shift # past argument
+    TYPE="$1"
+    shift
+    ;;
+    -p|--num-processes)
+    shift # past argument
+    NUM_PROCESSES="$1"
+    shift
+    ;;
+    -h|--help)
+    usage
+    ;;
+    *)
+    if [ -d $key ]
+    then
+        ARTICLES_ROOT_DIR=$key
+    else
+        usage
+    fi
+    shift
+    if [ -d $1 ]
+    then
+        CAS_OUT_DIR=$1
+    else
+        usage
+    fi
+    shift
+    ;;
+esac
+done
+
+# check for the required arguments
+if [[ ${ARTICLES_ROOT_DIR} == "" || ${CAS_OUT_DIR} == "" ]]
+then
+    usage
+fi
+
+find -L ${ARTICLES_ROOT_DIR} -maxdepth 1 -mindepth 1 -type d | xargs -n 1 -P ${NUM_PROCESSES} -I {} sh -c "basename \"{}\" | xargs -I % articles2cas -t ${TYPE} \"{}\" ${CAS_OUT_DIR}/%"

+exit 0
\ No newline at end of file
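A typical invocation of the wrapper script above might look like this (directory names are illustrative):

# convert every literature under /data/articles with 8 parallel workers, inputs are pdf
convertallarticles2cas.sh -t 1 -p 8 /data/articles /data/tpcas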
diff --git a/cas2index/cas2index.cpp b/cas2index/cas2index.cpp
new file mode 100644
index 0000000..0d98292
--- /dev/null
+++ b/cas2index/cas2index.cpp
@@ -0,0 +1,93 @@
+/**
+    Project: textpressocentral
+    File name: cas2index.cpp
+
+    @author valerio
+    @version 1.0 7/30/17.
+*/
+
+#include "IndexManager.h"
+#include <iostream>
+#include <fstream>
+#include <set>
+#include <vector>
+#include <boost/filesystem.hpp>
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+
+using namespace std;
+using namespace boost::filesystem;
+namespace po = boost::program_options;
+
+int main(int argc, const char* argv[]) {
+    po::options_description desc("options");
+    po::positional_options_description p;
+    po::variables_map vm;
+
+    // arguments
+    string inputDir;
+    path inputdir;
+    string indexpath;
+    string fileList;
+    string onlyFilesList;
+    int numPapersPerIndex;
+
+    try {
+        desc.add_options()
+                ("help,h", "produce help message")
+                ("cas-input-directory,i", po::value<string>(&inputDir)->required(),
+                 "input directory containing cas files")
+                ("index-output-directory,o", po::value<string>(&indexpath)->required(),
+                 "directory where to write index")
+                ("subindex-size,s", po::value<int>(&numPapersPerIndex)->default_value(50000),
+                 "maximum number of papers per sub-index")
+                ("add-files,a", po::value<string>(&fileList),
+                 "add files listed in the provided file to the existing indices. File names must be in the form "
+                 "<literature>/<file_name>")
+                ("file-list,f", po::value<string>(&onlyFilesList),
+                 "create index using only the files provided in the list")
+                ("external,e", po::bool_switch()->default_value(false), "Create external index");
+        p.add("cas-input-directory", 1);
+        p.add("index-output-directory", 1);
+        po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
+        po::notify(vm);
+
+        if (vm.count("help")) {
+            cout << desc << endl;
+            return 1;
+        }
+        if (vm.count("index-output-directory")) {
+            inputdir = path(inputDir);
+        }
+    } catch (std::exception &e) {
+        if (vm.count("help")) {
+            cout << desc << endl;
+            return (EXIT_SUCCESS);
+        }
+        std::cerr << "Error: " << e.what() << "\n";
+        return (EXIT_FAILURE);
+    }
+    tpc::index::IndexManager indexManager(indexpath, false, true);
+    if (!fileList.empty()) {
+        std::hash<string> string_hash;
+        std::ifstream infile(fileList);
+        string filename;
+        string lit;
+        string cas_dirname;
+        vector<string> filename_arr;
+        while (std::getline(infile, filename))
+        {
+            boost::split(filename_arr, filename, boost::is_any_of("/"));
+            lit = filename_arr[0];
+            cas_dirname = filename_arr[1];
+            indexManager.remove_file_from_index(filename);
+            indexManager.add_file_to_index(inputDir + "/" + filename, numPapersPerIndex);
+        }
+    } else {
+        std::fstream f;
+        f.open(onlyFilesList, std::fstream::in);
+        string line;
+        set<string> filelist_set;
+        while (getline(f, line)) {
+            filelist_set.insert(line);
+        }
+        indexManager.create_index_from_existing_cas_dir(inputDir, filelist_set, numPapersPerIndex);
+    }
+}
diff --git a/cas2index/create_index_multi_literatures.sh b/cas2index/create_index_multi_literatures.sh
new file mode 100755
index 0000000..6cf6efb
--- /dev/null
+++ b/cas2index/create_index_multi_literatures.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+##### create indexes for all literatures in parallel with a maximum number of parallel processes
+
+function usage {
+    echo "usage: $(basename $0) [-mph] <cas_root_dir> <index_output_dir>"
+    echo "  -m --max-num-papers    maximum number of papers per index.
Sub-indexes are created when the threshold is exceeded" + echo " -p --num-processes number of parallel processes" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 1 ]] +then + usage +fi + +NUM_PAPERS=50000 +NUM_PROCESSES=1 +CAS_ROOT_DIR="" +INDEX_OUT_DIR="" + +while [[ $# -gt 1 ]] +do +key="$1" + +case $key in + -m|--max-num-papers) + shift + NUM_PAPERS="$1" + shift # past argument + ;; + -p|--num-processes) + shift # past argument + NUM_PROCESSES="$1" + shift + ;; + -h|--help) + usage + ;; + *) + if [ -d $key ] + then + CAS_ROOT_DIR=$key + else + usage + fi + shift + if [ -d $1 ] + then + INDEX_OUT_DIR=$1 + else + usage + fi + shift + ;; +esac +done + +# check for the required argument ROOT_DIR +if [[ $CAS_ROOT_DIR == "" || $INDEX_OUT_DIR == "" ]] +then + usage +fi + +find -L ${CAS_ROOT_DIR} -maxdepth 1 -mindepth 1 -type d | xargs -n 1 -P ${NUM_PROCESSES} -I {} sh -c "basename \"{}\" | xargs -I % cas2index \"{}\" $INDEX_OUT_DIR/" + +exit 0 \ No newline at end of file diff --git a/cas2index/create_single_index.sh b/cas2index/create_single_index.sh new file mode 100755 index 0000000..9a30c95 --- /dev/null +++ b/cas2index/create_single_index.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +##### create sub-indexes for single index + +function usage { + echo "usage: $(basename $0) [-mph] " + echo " -m --max-num-papers maximum number of papers per index. Sub-indexes are created with this maximum size, in parallel" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 1 ]] +then + usage +fi + +NUM_PAPERS=50000 +CAS_ROOT_DIR="" +INDEX_OUT_DIR="" + +while [[ $# -gt 1 ]] +do +key="$1" + +case $key in + -m|--max-num-papers) + shift + NUM_PAPERS="$1" + shift # past argument + ;; + -h|--help) + usage + ;; + *) + if [ -d $key ] + then + CAS_ROOT_DIR=$key + else + usage + fi + shift + if [ -d $1 ] + then + INDEX_OUT_DIR=$1 + else + usage + fi + shift + ;; +esac +done + +# check for the required argument ROOT_DIR +if [[ $CAS_ROOT_DIR == "" || $INDEX_OUT_DIR == "" ]] +then + usage +fi + +tempdir=$(mktemp -d) +for dir in ${CAS_ROOT_DIR}/*; do for subdir in "$dir"/*; do echo "$subdir"; done; done | tac | awk -F"/" '!x[$NF]++' | tac | awk 'BEGIN{FS="/"; OFS="/"}{print $(NF-1), $NF}' | split -l ${NUM_PAPERS} - ${tempdir}/file_to_index- +i=0 +for file_list in $(ls ${tempdir}) +do + mkdir ${INDEX_OUT_DIR}/tmpindex${i} + + counter=$(($i * ${NUM_PAPERS})) + echo "22 serialization::archive 12 "${counter} > ${INDEX_OUT_DIR}/tmpindex${i}/counter.dat + (export INDEX_PATH=${INDEX_OUT_DIR}/tmpindex${i}; cas2index -i ${CAS_ROOT_DIR} -o ${INDEX_OUT_DIR}/tmpindex${i} -s ${NUM_PAPERS} -f ${tempdir}/${file_list}) & + let i=$(($i + 1)) +done +wait +echo "22 serialization::archive 12 "$(cat ${tempdir}/file_to_index-* | wc -l | awk '{print $1}') > ${INDEX_OUT_DIR}/counter.dat +find ${INDEX_OUT_DIR} -type d -name tmpindex* | while read line +do + tmpnum=$(basename ${line} | sed 's/tmpindex//g') + cp -r ${line}/subindex_0 ${INDEX_OUT_DIR}/subindex_${tmpnum} +done +rm -rf ${INDEX_OUT_DIR}/tmpindex* +rm -rf ${tempdir} +updatecorpuscounter -i ${INDEX_OUT_DIR} +exit 0 \ No newline at end of file diff --git a/cas2index/index_merger.cpp b/cas2index/index_merger.cpp new file mode 100644 index 0000000..ba30397 --- /dev/null +++ b/cas2index/index_merger.cpp @@ -0,0 +1,115 @@ +/* + * File: main.cpp + * Author: liyuling + * + * Created on Dec, 2013 + */ + +#include "../../TextpressoCentralGlobalDefinitions.h" +#include "xercesc/util/XMLString.hpp" +#include "../../TpC/lucene/CaseSensitiveAnalyzer.h" +#include +#include 
+#include <lucene++/LuceneHeaders.h>
+#include <lucene++/FileUtils.h>
+#include <boost/filesystem.hpp>
+#include <iostream>
+#include <string>
+
+//#define TPCAS_2_LINDEX_VERSION "0.9.0"
+using namespace std;
+using namespace boost::filesystem;
+using namespace Lucene;
+
+void print_who() {
+    std::cout << std::endl << "Lucene index merger" << std::endl;
+    std::cout << "Build Date: " << __DATE__ << std::endl;
+}
+
+void print_help() {
+    std::cout << std::endl;
+    std::cout << "Usage: indexmerger [index1] [index2] [optimization yes|no]" << std::endl;
+    std::cout << std::endl;
+    std::cout << "it merges [index2] into [index1]; after merging, index1 is optimized if [optimization] = yes";
+    std::cout << std::endl;
+    std::cout << "both index1 and index2 need to be valid (non-empty) TextpressoCentral index structures";
+    std::cout << std::endl;
+}
+
+void mergeIndex(const string& indexpath1, const string& indexpath2, const string& optimization, bool caseSensitive) {
+    String IndexDir1 = StringUtils::toString(indexpath1.c_str());
+    IndexWriterPtr writer;
+    if (caseSensitive) {
+        writer = newLucene<IndexWriter>(FSDirectory::open(IndexDir1),
+                                        newLucene<CaseSensitiveAnalyzer>(LuceneVersion::LUCENE_30), false, // append
+                                        IndexWriter::MaxFieldLengthUNLIMITED);
+    } else {
+        writer = newLucene<IndexWriter>(FSDirectory::open(IndexDir1),
+                                        newLucene<StandardAnalyzer>(LuceneVersion::LUCENE_30), false, // append
+                                        IndexWriter::MaxFieldLengthUNLIMITED);
+    }
+    cout << "maxDoc(): " << writer->maxDoc() << endl;
+    String IndexDir2 = StringUtils::toString(indexpath2.c_str());
+    FSDirectoryPtr dir2 = FSDirectory::open(IndexDir2);
+    Collection<DirectoryPtr> indexes = Collection<DirectoryPtr>::newInstance(0);
+    indexes.add(dir2);
+    writer->addIndexesNoOptimize(indexes);
+    if (optimization == "yes") {
+        writer->optimize();
+    }
+    writer->close();
+}
+
+int main(int argc, char* argv[]) {
+    if (argc < 4) {
+        print_who();
+        print_help();
+        return (-1);
+    }
+
+    string indexpath1(argv[1]); // target index, receives the merged documents
+    string indexpath2(argv[2]); // source index to merge in
+    string optimization(argv[3]); // optimize after merge or not
+
+    if (optimization != "yes" && optimization != "no") {
+        cout << "optimization flag error" << endl;
+        return (-1);
+    }
+
+    // lowercase indexes
+    if (exists(indexpath1 + "/fulltext") && exists(indexpath2 + "/fulltext")) {
+        mergeIndex(indexpath1 + "/fulltext", indexpath2 + "/fulltext", optimization, false);
+    }
+    if (exists(indexpath1 + "/sentence") && exists(indexpath2 + "/sentence")) {
+        mergeIndex(indexpath1 + "/sentence", indexpath2 + "/sentence", optimization, false);
+    }
+    if (exists(indexpath1 + "/lexical") && exists(indexpath2 + "/lexical")) {
+        mergeIndex(indexpath1 + "/lexical", indexpath2 + "/lexical", optimization, false);
+    }
+    if (exists(indexpath1 + "/bibliography") && exists(indexpath2 + "/bibliography")) {
+        mergeIndex(indexpath1 + "/bibliography", indexpath2 + "/bibliography", optimization, false);
+    }
+
+    // case sensitive indexes
+    if (exists(indexpath1 + "/fulltext_cs") && exists(indexpath2 + "/fulltext_cs")) {
+        mergeIndex(indexpath1 + "/fulltext_cs", indexpath2 + "/fulltext_cs", optimization, true);
+    }
+    if (exists(indexpath1 + "/sentence_cs") && exists(indexpath2 + "/sentence_cs")) {
+        mergeIndex(indexpath1 + "/sentence_cs", indexpath2 + "/sentence_cs", optimization, true);
+    }
+    if (exists(indexpath1 + "/lexical_cs") && exists(indexpath2 + "/lexical_cs")) {
+        mergeIndex(indexpath1 + "/lexical_cs", indexpath2 + "/lexical_cs", optimization, true);
+    }
+    if
(exists(indexpath1 + "/bibliography_cs") && exists(indexpath2 + "/bibliography_cs")) { + mergeIndex(indexpath1 + "/bibliography_cs", indexpath2 + "/bibliography_cs", optimization, true); + } +} + diff --git a/cas2index/saveidstodb.cpp b/cas2index/saveidstodb.cpp new file mode 100644 index 0000000..536375e --- /dev/null +++ b/cas2index/saveidstodb.cpp @@ -0,0 +1,50 @@ +/** + Project: textpressocentral + File name: saveidstodb.cpp + + @author valerio + @version 1.0 10/9/17. +*/ + +#include "IndexManager.h" +#include +#include +#include +#include + +using namespace std; +using namespace boost::filesystem; +namespace po = boost::program_options; + +int main(int argc, const char* argv[]) { + po::options_description desc("options"); + po::positional_options_description p; + po::variables_map vm; + + // arguments + string inputDir; + + try { + desc.add_options() + ("help,h", "produce help message") + ("index_dir,i", po::value(&inputDir)->required(), + "index directory where to read the data and store the db file"); + p.add("cas-input-directory", 1); + po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << endl; + return 1; + } + } catch (std::exception &e) { + if (vm.count("help")) { + cout << desc << endl; + return (EXIT_SUCCESS); + } + std::cerr << "Error: " << e.what() << "\n"; + return (EXIT_FAILURE); + } + tpc::index::IndexManager indexManager(inputDir, false, false); + indexManager.save_all_doc_ids_for_sentences_to_db(); +} diff --git a/cas2index/update_corpus_counter.cpp b/cas2index/update_corpus_counter.cpp new file mode 100644 index 0000000..e3e6369 --- /dev/null +++ b/cas2index/update_corpus_counter.cpp @@ -0,0 +1,50 @@ +/** + Project: textpressocentral + File name: update_corpus_counter.cpp + + @author valerio + @version 1.0 10/06/17. +*/ + +#include "IndexManager.h" +#include +#include +#include +#include + +using namespace std; +using namespace boost::filesystem; +namespace po = boost::program_options; + +int main(int argc, const char* argv[]) { + po::options_description desc("options"); + po::positional_options_description p; + po::variables_map vm; + + // arguments + string inputDir; + + try { + desc.add_options() + ("help,h", "produce help message") + ("index_dir,i", po::value(&inputDir)->required(), + "index directory where to read the data and store the counter file"); + p.add("cas-input-directory", 1); + po::store(po::command_line_parser(argc, argv).options(desc).positional(p).run(), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << endl; + return 1; + } + } catch (std::exception &e) { + if (vm.count("help")) { + cout << desc << endl; + return (EXIT_SUCCESS); + } + std::cerr << "Error: " << e.what() << "\n"; + return (EXIT_FAILURE); + } + tpc::index::IndexManager indexManager(inputDir, false, false); + indexManager.calculate_and_save_corpus_counter(); +} diff --git a/cmake/Modules/FindAPR.cmake b/cmake/Modules/FindAPR.cmake new file mode 100644 index 0000000..8e01ec1 --- /dev/null +++ b/cmake/Modules/FindAPR.cmake @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# - Find Apache Portable Runtime +# Find the APR includes and libraries +# This module defines +# APR_INCLUDE_DIR and APRUTIL_INCLUDE_DIR, where to find apr.h, etc. +# APR_LIBRARIES and APRUTIL_LIBRARIES, the libraries needed to use APR. +# APR_FOUND and APRUTIL_FOUND, If false, do not try to use APR. +# also defined, but not for general use are +# APR_LIBRARY and APRUTIL_LIBRARY, where to find the APR library. + +# APR first. + +FIND_PATH(APR_INCLUDE_DIR apr.h + /opt/homebrew/opt/apr/include/apr-1 + /usr/local/include/apr-1 + /usr/local/include/apr-1.0 + /usr/include/apr-1 + /usr/include/apr-1.0 + /usr/local/apr/include/apr-1 +) + +SET(APR_NAMES ${APR_NAMES} apr-1) +FIND_LIBRARY(APR_LIBRARY + NAMES ${APR_NAMES} + HINTS + /opt/homebrew/opt/apr/lib + PATHS + /usr/lib + /usr/local/lib + /usr/local/apr-1/lib + /usr/local/apr/lib + ) + +IF (APR_LIBRARY AND APR_INCLUDE_DIR) + SET(APR_LIBRARIES ${APR_LIBRARY}) + SET(APR_FOUND "YES") +ELSE (APR_LIBRARY AND APR_INCLUDE_DIR) + SET(APR_FOUND "NO") +ENDIF (APR_LIBRARY AND APR_INCLUDE_DIR) + + +IF (APR_FOUND) + IF (NOT APR_FIND_QUIETLY) + MESSAGE(STATUS "Found APR headers: ${APR_INCLUDE_DIR}") + MESSAGE(STATUS "Found APR library: ${APR_LIBRARIES}") + ENDIF (NOT APR_FIND_QUIETLY) +ELSE (APR_FOUND) + IF (APR_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find APR library") + ENDIF (APR_FIND_REQUIRED) +ENDIF (APR_FOUND) + +# Deprecated declarations. +SET (NATIVE_APR_INCLUDE_PATH ${APR_INCLUDE_DIR} ) +GET_FILENAME_COMPONENT (NATIVE_APR_LIB_PATH ${APR_LIBRARY} PATH) + +MARK_AS_ADVANCED( + APR_LIBRARY + APR_INCLUDE_DIR + ) + +# Next, APRUTIL. + +FIND_PATH(APRUTIL_INCLUDE_DIR apu.h + /opt/homebrew/opt/apr-util/include/apr-1 + /usr/local/include/apr-1 + /usr/local/include/apr-1.0 + /usr/include/apr-1 + /usr/include/apr-1.0 + /usr/local/apr/include/apr-1 +) + +SET(APRUTIL_NAMES ${APRUTIL_NAMES} aprutil-1) +FIND_LIBRARY(APRUTIL_LIBRARY + NAMES ${APRUTIL_NAMES} + HINTS + /opt/homebrew/opt/apr-util/lib + PATHS + /usr/lib + /usr/local/lib + /usr/local/apr/lib + ) + +IF (APRUTIL_LIBRARY AND APRUTIL_INCLUDE_DIR) + SET(APRUTIL_LIBRARIES ${APRUTIL_LIBRARY}) + SET(APRUTIL_FOUND "YES") +ELSE (APRUTIL_LIBRARY AND APRUTIL_INCLUDE_DIR) + SET(APRUTIL_FOUND "NO") +ENDIF (APRUTIL_LIBRARY AND APRUTIL_INCLUDE_DIR) + + +IF (APRUTIL_FOUND) + IF (NOT APRUTIL_FIND_QUIETLY) + MESSAGE(STATUS "Found APRUTIL headers: ${APRUTIL_INCLUDE_DIR}") + MESSAGE(STATUS "Found APRUTIL library: ${APRUTIL_LIBRARIES}") + ENDIF (NOT APRUTIL_FIND_QUIETLY) +ELSE (APRUTIL_FOUND) + IF (APRUTIL_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find APRUTIL library") + ENDIF (APRUTIL_FIND_REQUIRED) +ENDIF (APRUTIL_FOUND) + +# Deprecated declarations. 
+SET (NATIVE_APRUTIL_INCLUDE_PATH ${APRUTIL_INCLUDE_DIR} ) +GET_FILENAME_COMPONENT (NATIVE_APRUTIL_LIB_PATH ${APRUTIL_LIBRARY} PATH) + +MARK_AS_ADVANCED( + APRUTIL_LIBRARY + APRUTIL_INCLUDE_DIR + ) diff --git a/cmake/Modules/FindCImg.cmake b/cmake/Modules/FindCImg.cmake new file mode 100644 index 0000000..de34032 --- /dev/null +++ b/cmake/Modules/FindCImg.cmake @@ -0,0 +1,341 @@ +# - Try to find CImg lib +# +# The following variables are defined +# +# CImg_FOUND - system has CImg lib +# CImg_INCLUDE_DIRS - the CImg include directory +# CImg_SYSTEM_LIBS - external libraries that CImg uses +# CImg_SYSTEM_LIBS_DIR - external library directories +# CImg_CFLAGS - compilation flags + + +if (CImg_INCLUDE_DIR) + set(CImg_FOUND TRUE) +else (CImg_INCLUDE_DIR) + find_path(CImg_INCLUDE_DIR + NAMES CImg.h + PATHS + ${CMAKE_INSTALL_PREFIX}/include + /usr/include + ) + mark_as_advanced(CImg_INCLUDE_DIR) +endif(CImg_INCLUDE_DIR) +list(APPEND CImg_INCLUDE_DIRS + ${CImg_INCLUDE_DIR} +) + +# To use PKG_CHECK_MODULES to find some optional packages +find_package(PkgConfig) + + +# ### CIMG related stuff +# Flags to enable fast image display, using the XSHM library. +SET(CIMG_XSHM_CCFLAGS -Dcimg_use_xshm) + +# Flags to enable screen mode switching, using the XRandr library. +SET(CIMG_XRANDR_CCFLAGS -Dcimg_use_xrandr) + +# Flags to enable native support for JPEG image files, using the JPEG library. +# ( http://www.ijg.org/ ) +SET(CIMG_JPEG_CCFLAGS -Dcimg_use_jpeg) + +# Flags to enable native support for TIFF image files, using the TIFF library. +# ( http://www.libtiff.org/ ) +SET(CIMG_TIFF_CCFLAGS -Dcimg_use_tiff) + +# Flags to enable native support for PNG image files, using the PNG library. +# ( http://www.libpng.org/ ) +SET(CIMG_PNG_CCFLAGS -Dcimg_use_png) + +#Flags to enable OPENCV support (Camera) +# ( http://www.opencv.org/ ) +SET(CIMG_OPENCV_CCFLAGS -Dcimg_use_opencv) + +# Flags to enable native support for EXR image files, using the OpenEXR library. +# ( http://www.openexr.com/ ) +SET(CIMG_OPENEXR_CCFLAGS -Dcimg_use_openexr) + +# Flags to enable native support for various video files, using the FFMPEG library. +# ( http://www.ffmpeg.org/ ) +SET(CIMG_FFMPEG_CCFLAGS -Dcimg_use_ffmpeg) + +# Flags to enable native support of most classical image file formats, using the Magick++ library. +# ( http://www.imagemagick.org/Magick++/ ) +SET(CIMG_MAGICK_CCFLAGS -Dcimg_use_magick) + +# Flags to enable faster Discrete Fourier Transform computation, using the FFTW3 library +# ( http://www.fftw.org/ ) +SET(CIMG_FFTW3_CCFLAGS -Dcimg_use_fftw3) + +# Flags to enable zlib. 
+# ( http://www.zlib.net/ ) +SET(CIMG_ZLIB_CCFLAGS -Dcimg_use_zlib) + +# ### Search Additional Libraries ########## +FIND_PACKAGE(OpenCV) +FIND_PACKAGE(JPEG) +FIND_PACKAGE(TIFF) +FIND_PACKAGE(PNG) +FIND_PACKAGE(ZLIB) +FIND_PACKAGE(LAPACK) +FIND_PACKAGE(BLAS) + +PKG_CHECK_MODULES(FFTW3 fftw3) +PKG_CHECK_MODULES(OPENEXR OpenEXR) +PKG_CHECK_MODULES(MAGICK Magick++) + +# PKG_CHECK_MODULES(LIBAVCODEC libavcodec) +# PKG_CHECK_MODULES(LIBAVFORMAT libavformat) +# PKG_CHECK_MODULES(LIBSWSCALE libswscale) +# PKG_CHECK_MODULES(LIBAVUTIL libavutil) + +if(NOT WIN32) + FIND_PACKAGE(X11) + FIND_PACKAGE(Threads REQUIRED) +endif() + +# #### End of additional libraries search ########## + +### Configure Paths according to detected packages +if(TIFF_FOUND) + get_filename_component(TIFF_LIB_DIRS ${TIFF_LIBRARIES} PATH) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_TIFF_CCFLAGS}") +# link_directories(${TIFF_LIB_DIRS}) +# include_directories(${TIFF_INCLUDE_DIR}) +# SET(CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${TIFF_LIBRARIES}) + list(APPEND CImg_INCLUDE_DIRS + ${TIFF_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${TIFF_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${TIFF_LIBRARIES} + ) +endif() + +if(JPEG_FOUND) + get_filename_component(JPEG_LIB_DIRS ${JPEG_LIBRARIES} PATH) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_JPEG_CCFLAGS}") +# link_directories(${JPEG_LIB_DIRS}) +# include_directories(${JPEG_INCLUDE_DIR}) +# SET(CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${JPEG_LIBRARIES}) + list(APPEND CImg_INCLUDE_DIRS + ${JPEG_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${JPEG_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${JPEG_LIBRARIES} + ) +endif() + +if (ZLIB_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_ZLIB_CCFLAGS}") +# link_directories(${ZLIB_LIB_DIRS}) +# include_directories(${ZLIB_INCLUDE_DIR}) +# SET(CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${ZLIB_LIBRARIES}) + list(APPEND CImg_INCLUDE_DIRS + ${ZLIB_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${ZLIB_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${ZLIB_LIBRARIES} + ) + # PNG requires ZLIB + if(PNG_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_PNG_CCFLAGS}") + # link_directories(${PNG_LIB_DIRS}) + # include_directories(${PNG_INCLUDE_DIR} ) + # SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${PNG_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${PNG_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${PNG_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${PNG_LIBRARIES} + ) + endif() +endif() + +if(FFTW3_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_FFTW3_CCFLAGS}") + #link_directories( ${FFTW3_LIBRARY_DIRS} ) + #include_directories( ${FFTW3_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${FFTW3_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${FFTW3_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${FFTW3_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${FFTW3_LIBRARIES} + ) +endif() + +if(OPENEXR_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_OPENEXR_CCFLAGS}") + #link_directories( ${OPENEXR_LIBRARY_DIRS} ) + #include_directories( ${OPENEXR_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${OPENEXR_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${OPENEXR_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${OPENEXR_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${OPENEXR_LIBRARIES} + ) +endif() + +if(MAGICK_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_MAGICK_CCFLAGS}") + #link_directories( ${MAGICK_LIBRARY_DIRS} ) + #include_directories( ${MAGICK_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS 
${CImg_SYSTEM_LIBS} ${MAGICK_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${MAGICK_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${MAGICK_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${MAGICK_LIBRARIES} + ) +endif() + +if( LIBAVCODEC_FOUND AND LIBAVFORMAT_FOUND AND LIBSWSCALE_FOUND AND LIBAVUTIL_FOUND ) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_FFMPEG_CCFLAGS}") + #link_directories( ${LIBAVFORMAT_LIBRARY_DIRS} ) + #link_directories( ${LIBAVCODEC_LIBRARY_DIRS} ) + #link_directories( ${LIBSWSCALE_LIBRARY_DIRS} ) + #link_directories( ${LIBAVUTIL_LIBRARY_DIRS} ) + #include_directories( ${LIBAVFORMAT_INCLUDE_DIRS} ${LIBAVFORMAT_INCLUDE_DIRS}/libavformat) + #include_directories( ${LIBAVCODEC_INCLUDE_DIRS} ${LIBAVCODEC_INCLUDE_DIRS}/libavcodec ) + #include_directories( ${LIBSWSCALE_INCLUDE_DIRS} ${LIBSWSCALE_INCLUDE_DIRS}/libswscale) + #include_directories( ${LIBAVUTIL_INCLUDE_DIRS} ${LIBAVUTIL_INCLUDE_DIRS}/libavutil ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LIBAVFORMAT_LIBRARIES} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LIBAVCODEC_LIBRARIES} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LIBSWSCALE_LIBRARIES} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LIBAVUTIL_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${LIBAVFORMAT_INCLUDE_DIRS} ${LIBAVFORMAT_INCLUDE_DIRS}/libavformat + ${LIBAVCODEC_INCLUDE_DIRS} ${LIBAVCODEC_INCLUDE_DIRS}/libavcodec + ${LIBSWSCALE_INCLUDE_DIRS} ${LIBSWSCALE_INCLUDE_DIRS}/libswscale + ${LIBAVUTIL_INCLUDE_DIRS} ${LIBAVUTIL_INCLUDE_DIRS}/libavutil + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${LIBAVFORMAT_LIBRARY_DIRS} + ${LIBAVCODEC_LIBRARY_DIRS} + ${LIBSWSCALE_LIBRARY_DIRS} + ${LIBAVUTIL_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${LIBAVFORMAT_LIBRARIES} + ${LIBAVCODEC_LIBRARIES} + ${LIBSWSCALE_LIBRARIES} + ${LIBAVUTIL_LIBRARIES} + ) +endif() + +if(NOT APPLE) + if(NOT WIN32) + if(X11_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_XSHM_CCFLAGS} ${CIMG_XRANDR_CCFLAGS}") + SET(CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} Xext Xrandr) + endif() + endif(NOT WIN32) +endif(NOT APPLE) + +if(X11_FOUND) + #link_directories(${X11_LIB_DIRS}) + #include_directories(${X11_INCLUDE_DIR}) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${X11_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${X11_INCLUDE_DIR} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${X11_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${X11_LIBRARIES} + ) +endif() + +if (NOT WIN32) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${CMAKE_THREAD_LIBS_INIT} ) + list(APPEND CImg_SYSTEM_LIBS + ${CMAKE_THREAD_LIBS_INIT} + ) +endif() + +if( WIN32) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} gdi32 ) + list(APPEND CImg_SYSTEM_LIBS + gdi32 + ) +endif() + +if (OpenCV_FOUND) + message("OpenCV Found") + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_OPENCV_CCFLAGS}") + #include_directories(${OpenCV_INCLUDE_DIRS}) + #link_directories(${OpenCV_LIB_DIRS}) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${OpenCV_LIBS} ) + list(APPEND CImg_INCLUDE_DIRS + ${OpenCV_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${OpenCV_LIB_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${OpenCV_LIBS} + ) +endif() + +if(LAPACK_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_LAPACK_CCFLAGS}") + #link_directories( ${LAPACK_LIBRARY_DIRS} ) + #include_directories( ${LAPACK_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${LAPACK_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${LAPACK_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${LAPACK_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${LAPACK_LIBRARIES} + ) +endif() 
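+
+# Illustrative consumer of this module (commentary only): this mirrors how
+# ppm2jpg is linked in the top-level CMakeLists.txt. CIMG_CFLAGS is appended
+# to CMAKE_CXX_FLAGS at the bottom of this file, so a consumer mainly needs
+# the include dirs and system libs collected above:
+#
+#   set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
+#   find_package(CImg REQUIRED)
+#   include_directories(${CImg_INCLUDE_DIRS})
+#   link_directories(${CImg_SYSTEM_LIBS_DIR})
+#   add_executable(ppm2jpg ppm2jpg/main.cpp)
+#   target_link_libraries(ppm2jpg ${CImg_SYSTEM_LIBS})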
+ +if(BLAS_FOUND) + SET(CIMG_CFLAGS "${CIMG_CFLAGS} ${CIMG_BLAS_CCFLAGS}") + #link_directories( ${BLAS_LIBRARY_DIRS} ) + #include_directories( ${BLAS_INCLUDE_DIRS} ) + #SET( CImg_SYSTEM_LIBS ${CImg_SYSTEM_LIBS} ${BLAS_LIBRARIES} ) + list(APPEND CImg_INCLUDE_DIRS + ${BLAS_INCLUDE_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS_DIR + ${BLAS_LIBRARY_DIRS} + ) + list(APPEND CImg_SYSTEM_LIBS + ${BLAS_LIBRARIES} + ) +endif() + +# Add CIMG Flags to Compilation Flags +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CIMG_CFLAGS}") + +foreach(program ${CIMG_FILES}) + add_executable(${program} ${program}.cpp) + target_link_libraries(${program} ${CImg_SYSTEM_LIBS} ) +endforeach(program) diff --git a/cmake/Modules/FindLucene++.cmake b/cmake/Modules/FindLucene++.cmake new file mode 100644 index 0000000..6bcb6b6 --- /dev/null +++ b/cmake/Modules/FindLucene++.cmake @@ -0,0 +1,103 @@ +# +# This module looks for lucene++ support +# It will define the following values +# +# LUCENEPP_INCLUDE_DIRS = LUCENEPP_INCLUDE_DIR + LUCENEPP_LIBRARY_DIR +# LUCENEPP_INCLUDE_DIR = where lucene++/Lucene.h can be found +# LUCENEPP_LIBRARY_DIR = where liblucene++.so can be found +# LUCENEPP_LIBRARIES = the libraries to link against lucene++ +# LUCENEPP_VERSION = The lucene++ version string +# LUCENEPP_FOUND = set to 1 if lucene++ is found +# + +INCLUDE(CheckSymbolExists) +#INCLUDE(FindLibraryWithDebug) + +IF(LUCENEPP_FIND_VERSION) + SET(LUCENEPP_MIN_VERSION ${LUCENEPP_FIND_VERSION}) +ELSEIF() + SET(LUCENEPP_MIN_VERSION "3.0.0") +ENDIF(LUCENEPP_FIND_VERSION) + +SET(TRIAL_LIBRARY_PATHS + $ENV{LUCENEPP_HOME}/lib${LIB_SUFFIX} + ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX} + ${CMAKE_INSTALL_PREFIX}/lib + /usr/local/lib${LIB_SUFFIX} + /usr/local/lib/${CMAKE_LIBRARY_ARCHITECTURE} + /usr/lib${LIB_SUFFIX} + /sw/lib${LIB_SUFFIX} + /usr/pkg/lib${LIB_SUFFIX} + /usr/lib64 + /usr/lib/${CMAKE_LIBRARY_ARCHITECTURE} + ) +SET(TRIAL_INCLUDE_PATHS + $ENV{LUCENEPP_HOME}/include + ${CMAKE_INSTALL_PREFIX}/include + /usr/local/include + /usr/include + /sw/include + /usr/pkg/include + ) +#FIND_LIBRARY_WITH_DEBUG(LUCENEPP_CORE_LIBRARY +# WIN32_DEBUG_POSTFIX d +# NAMES lucene++ +# PATHS ${TRIAL_LIBRARY_PATHS}) +IF (LUCENEPP_CORE_LIBRARY) + MESSAGE(STATUS "Found Lucene++ core library: ${LUCENEPP_CORE_LIBRARY}") +ENDIF (LUCENEPP_CORE_LIBRARY) +#FIND_LIBRARY_WITH_DEBUG(LUCENEPP_SHARED_LIBRARY +# WIN32_DEBUG_POSTFIX d +# NAMES lucene++-contrib +# PATHS ${TRIAL_LIBRARY_PATHS}) +IF (LUCENEPP_SHARED_LIBRARY) + MESSAGE(STATUS "Found Lucene++ contrib library: ${LUCENEPP_SHARED_LIBRARY}") +ENDIF (LUCENEPP_SHARED_LIBRARY) + +IF(LUCENEPP_CORE_LIBRARY AND LUCENEPP_SHARED_LIBRARY) + SET(LUCENEPP_LIBRARIES ${LUCENEPP_CORE_LIBRARY} ${LUCENEPP_SHARED_LIBRARY} ${Boost_SYSTEM_LIBRARY}) +ENDIF(LUCENEPP_CORE_LIBRARY AND LUCENEPP_SHARED_LIBRARY) + +FIND_PATH(LUCENEPP_INCLUDE_DIR + NAMES lucene++/Lucene.h + PATHS ${TRIAL_INCLUDE_PATHS}) + +IF (LUCENEPP_INCLUDE_DIR) + MESSAGE(STATUS "Found Lucene++ include dir: ${LUCENEPP_INCLUDE_DIR}") +ENDIF (LUCENEPP_INCLUDE_DIR) + +SET(LUCENEPP_GOOD_VERSION TRUE) + +FIND_PATH(LUCENEPP_LIBRARY_DIR + NAMES liblucene++.dylib liblucene++.so liblucene++.dll.a lucene++ + PATHS ${TRIAL_LIBRARY_PATHS} ${TRIAL_INCLUDE_PATHS} NO_DEFAULT_PATH) +IF (LUCENEPP_LIBRARY_DIR) + MESSAGE(STATUS "Found Lucene++ library dir: ${LUCENEPP_LIBRARY_DIR}") + + IF (LUCENEPP_VERSION STRLESS "${LUCENEPP_MIN_VERSION}") + MESSAGE(ERROR " Lucene++ version ${LUCENEPP_VERSION} is less than the required minimum ${LUCENEPP_MIN_VERSION}") + SET(LUCENEPP_GOOD_VERSION FALSE) + ENDIF 
(LUCENEPP_VERSION STRLESS "${LUCENEPP_MIN_VERSION}") +ENDIF (LUCENEPP_LIBRARY_DIR) + +IF(LUCENEPP_INCLUDE_DIR AND LUCENEPP_LIBRARIES AND LUCENEPP_LIBRARY_DIR AND LUCENEPP_GOOD_VERSION) + SET(LUCENEPP_FOUND TRUE) + SET(LUCENEPP_INCLUDE_DIRS ${LUCENEPP_LIBRARY_DIR} ${LUCENEPP_INCLUDE_DIR}) +ENDIF(LUCENEPP_INCLUDE_DIR AND LUCENEPP_LIBRARIES AND LUCENEPP_LIBRARY_DIR AND LUCENEPP_GOOD_VERSION) + +IF(LUCENEPP_FOUND) + IF(NOT LUCENEPP_FIND_QUIETLY) + MESSAGE(STATUS "Found Lucene++: ${LUCENEPP_LIBRARIES} version ${LUCENEPP_VERSION}") + ENDIF(NOT LUCENEPP_FIND_QUIETLY) +ELSE(LUCENEPP_FOUND) + IF(LUCENEPP_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find Lucene++.") + ENDIF(LUCENEPP_FIND_REQUIRED) +ENDIF(LUCENEPP_FOUND) + +MARK_AS_ADVANCED( + LUCENEPP_INCLUDE_DIRS + LUCENEPP_INCLUDE_DIR + LUCENEPP_LIBRARY_DIR + LUCENEPP_LIBRARIES + ) diff --git a/cmake/Modules/FindTextpresso.cmake b/cmake/Modules/FindTextpresso.cmake new file mode 100644 index 0000000..7b1e25b --- /dev/null +++ b/cmake/Modules/FindTextpresso.cmake @@ -0,0 +1,22 @@ +# find Textpresso core library + +FIND_PATH( Textpresso_INCLUDE_DIR NAMES CASManager.h IndexManager.h PATHS ENV PATH PATH_SUFFIXES + include textpresso) + +FIND_LIBRARY( Textpresso_LIBRARY NAMES textpresso PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) + +IF( Textpresso_LIBRARY ) + SET( Textpresso_FOUND TRUE ) + SET( Textpresso_LIBRARIES Textpresso_LIBRARY ) +ENDIF( Textpresso_LIBRARY) + +IF( Textpresso_FOUND ) + IF (NOT Textpresso_FIND_QUIETLY) + MESSAGE(STATUS "Found the Textpresso libraries at ${Textpresso_LIBRARY}") + MESSAGE(STATUS "Found the Textpresso headers at ${Textpresso_INCLUDE_DIR}") + ENDIF (NOT Textpresso_FIND_QUIETLY) +ELSE( Textpresso_FOUND ) + IF(Textpresso_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could NOT find Textpresso") + ENDIF(Textpresso_FIND_REQUIRED) +ENDIF(Textpresso_FOUND) \ No newline at end of file diff --git a/cmake/Modules/FindWt.cmake b/cmake/Modules/FindWt.cmake new file mode 100644 index 0000000..2928d32 --- /dev/null +++ b/cmake/Modules/FindWt.cmake @@ -0,0 +1,163 @@ +# Find Wt includes and libraries +# +# This script sets the following variables: +# +# Wt_INCLUDE_DIR +# Wt_LIBRARIES - Release libraries +# Wt_FOUND - True if release libraries found +# Wt_DEBUG_LIBRARIES - Debug libraries +# Wt_DEBUG_FOUND - True if debug libraries found +# +# To direct the script to a particular Wt installation, use the +# standard cmake variables CMAKE_INCLUDE_PATH and CMAKE_LIBRARY_PATH +# +# To use this script to find Wt, when using the new style for include files: +# #include +# #include +# #include +# +# include the following CMake snippet in your project: +# +# FIND_PACKAGE( Wt REQUIRED ) +# INCLUDE_DIRECTORIES( ${Wt_INCLUDE_DIR} ) +# TARGET_LINK_LIBRARIES( yourexe +# ${Wt_DEBUG_LIBRARY} # or {Wt_LIBRARY} +# ${Wt_HTTP_DEBUG_LIBRARY} # or {Wt_HTTP_LIBRARY} +# ${Wt_EXT_DEBUG_LIBRARY} # or {Wt_EXT_LIBRARY} +# ) +# +# To use this script to find Wt, when using the old include style: +# #include +# #include +# #include +# style of include files, change the INCLUDE_DIRECTORIES statement to: +# INCLUDE_DIRECTORIES( ${Wt_INCLUDE_DIR} ${Wt_INCLUDE_DIR}/Wt ) +# +# +# +# +# Copyright (c) 2007, Pau Garcia i Quiles, +# +# Redistribution and use is allowed according to the terms of the BSD license. +# For details see the accompanying COPYING-CMAKE-SCRIPTS file. 
+ +FIND_PATH( Wt_INCLUDE_DIR NAMES Wt/WObject PATHS ENV PATH PATH_SUFFIXES include wt ) + +SET( Wt_FIND_COMPONENTS Release Debug ) + +FIND_LIBRARY( Wt_LIBRARY NAMES wt PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_EXT_LIBRARY NAMES wtext PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_HTTP_LIBRARY NAMES wthttp PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_TEST_LIBRARY NAMES wttest PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_FCGI_LIBRARY NAMES wtfcgi PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBO_LIBRARY NAMES wtdbo PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBOSQLITE3_LIBRARY NAMES wtdbosqlite3 PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBOPOSTGRES_LIBRARY NAMES wtdbopostgres PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBOMYSQL_LIBRARY NAMES wtdbomysql PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) +FIND_LIBRARY( Wt_DBOFIREBIRD_LIBRARY NAMES wtdbofirebird PATHS PATH PATH_SUFFIXES lib lib-release lib_release ) + +FIND_LIBRARY( Wt_DEBUG_LIBRARY NAMES wtd wt PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_EXT_DEBUG_LIBRARY NAMES wtextd wtext PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_HTTP_DEBUG_LIBRARY NAMES wthttpd wthttp PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_TEST_DEBUG_LIBRARY NAMES wttestd wttest PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_FCGI_DEBUG_LIBRARY NAMES wtfcgid wtfcgi PATHS PATH PATH_SUFFIXES lib libd lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBO_DEBUG_LIBRARY NAMES wtdbod wtdbo PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBOSQLITE3_DEBUG_LIBRARY NAMES wtdbosqlite3d wtdbosqlite3 PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBOPOSTGRES_DEBUG_LIBRARY NAMES wtdbopostgresd wtdbopostgres PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBOMYSQL_DEBUG_LIBRARY NAMES wtdbomysqld wtdbomysql PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) +FIND_LIBRARY( Wt_DBOFIREBIRD_DEBUG_LIBRARY NAMES wtdbofirebirdd wtdbofirebird PATHS PATH PATH_SUFFIXES lib lib-debug lib_debug HINTS /usr/lib/debug/usr/lib) + +IF( Wt_LIBRARY ) + IF( Wt_HTTP_LIBRARY ) + SET( Wt_FOUND TRUE ) + SET( Wt_FIND_REQUIRED_Release TRUE ) + SET( Wt_LIBRARIES ${Wt_LIBRARY} ) + + IF( Wt_FCGI_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ) + ENDIF( Wt_FCGI_LIBRARY ) + ELSE( Wt_HTTP_LIBRARY ) + IF( Wt_FCGI_LIBRARY ) + SET( Wt_FOUND TRUE ) + SET( Wt_FIND_REQUIRED_Release TRUE ) + SET( Wt_LIBRARIES ${Wt_LIBRARY} ) + ENDIF( Wt_FCGI_LIBRARY ) + ENDIF( Wt_HTTP_LIBRARY ) +ENDIF( Wt_LIBRARY ) + +IF( Wt_EXT_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_EXT_LIBRARY} ) +ENDIF( Wt_EXT_LIBRARY ) + +IF( Wt_DBO_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_DBO_LIBRARY} ) + IF( Wt_DBOSQLITE3_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_DBOSQLITE3_LIBRARY} ) + ENDIF( Wt_DBOSQLITE3_LIBRARY ) + IF( Wt_DBOPOSTGRES_LIBRARY ) + SET( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_DBOPOSTGRES_LIBRARY} ) + ENDIF( Wt_DBOPOSTGRES_LIBRARY ) + IF( Wt_DBOMYSQL_LIBRARY ) + SET( Wt_LIBRARIES 
${Wt_LIBRARIES} ${Wt_DBOMYSQL_LIBRARY} ) + ENDIF( Wt_DBOMYSQL_LIBRARY ) + IF ( Wt_DBOFIREBIRD_LIBRARY ) + SET ( Wt_LIBRARIES ${Wt_LIBRARIES} ${Wt_DBOFIREBIRD_LIBRARY} ) + ENDIF ( Wt_DBOFIREBIRD_LIBRARY ) +ENDIF( Wt_DBO_LIBRARY ) + +IF( Wt_DEBUG_LIBRARY ) + IF ( Wt_HTTP_DEBUG_LIBRARY) + SET( Wt_DEBUG_FOUND TRUE ) + SET( Wt_FIND_REQUIRED_Debug TRUE ) + SET( Wt_DEBUG_LIBRARIES ${Wt_HTTP_DEBUG_LIBRARY} ${Wt_DEBUG_LIBRARY} ) + + IF( Wt_FCGI_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_FCGI_DEBUG_LIBRARY} ) + ENDIF( Wt_FCGI_DEBUG_LIBRARY ) + ELSE( Wt_HTTP_DEBUG_LIBRARY ) + IF( Wt_FCGI_DEBUG_LIBRARY ) + SET( Wt_DEBUG_FOUND TRUE ) + SET( Wt_FIND_REQUIRED_Debug TRUE ) + SET( Wt_DEBUG_LIBRARIES ${Wt_FCGI_DEBUG_LIBRARY} ${Wt_DEBUG_LIBRARY} ) + ENDIF( Wt_FCGI_DEBUG_LIBRARY ) + ENDIF( Wt_HTTP_DEBUG_LIBRARY) +ENDIF( Wt_DEBUG_LIBRARY ) + +IF( Wt_DBO_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBO_DEBUG_LIBRARY} ) + IF( Wt_DBOSQLITE3_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBOSQLITE3_DEBUG_LIBRARY} ) + ENDIF( Wt_DBOSQLITE3_DEBUG_LIBRARY ) + IF( Wt_DBOPOSTGRES_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBOPOSTGRES_DEBUG_LIBRARY} ) + ENDIF( Wt_DBOPOSTGRES_DEBUG_LIBRARY ) + IF( Wt_DBOMYSQL_DEBUG_LIBRARY ) + SET( Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBOMYSQL_DEBUG_LIBRARY} ) + ENDIF ( Wt_DBOMYSQL_DEBUG_LIBRARY ) + IF ( Wt_DBOFIREBIRD_DEBUG_LIBRARY ) + SET (Wt_DEBUG_LIBRARIES ${Wt_DEBUG_LIBRARIES} ${Wt_DBOFIREBIRD_DEBUG_LIBRARY} ) + ENDIF ( Wt_DBOFIREBIRD_DEBUG_LIBRARY ) +ENDIF( Wt_DBO_DEBUG_LIBRARY ) + +IF(Wt_FOUND) + IF (NOT Wt_FIND_QUIETLY) + MESSAGE(STATUS "Found the Wt libraries at ${Wt_LIBRARIES}") + MESSAGE(STATUS "Found the Wt headers at ${Wt_INCLUDE_DIR}") + ENDIF (NOT Wt_FIND_QUIETLY) +ELSE(Wt_FOUND) + IF(Wt_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could NOT find Wt") + ENDIF(Wt_FIND_REQUIRED) +ENDIF(Wt_FOUND) + +IF(Wt_DEBUG_FOUND) + IF (NOT Wt_FIND_QUIETLY) + MESSAGE(STATUS "Found the Wt debug libraries at ${Wt_DEBUG_LIBRARIES}") + MESSAGE(STATUS "Found the Wt debug headers at ${Wt_INCLUDE_DIR}") + ENDIF (NOT Wt_FIND_QUIETLY) +ELSE(Wt_DEBUG_FOUND) + IF(Wt_FIND_REQUIRED_Debug) + MESSAGE(FATAL_ERROR "Could NOT find Wt debug libraries") + ENDIF(Wt_FIND_REQUIRED_Debug) +ENDIF(Wt_DEBUG_FOUND) + diff --git a/getbib/getallbibfiles.sh b/getbib/getallbibfiles.sh new file mode 100644 index 0000000..d8108db --- /dev/null +++ b/getbib/getallbibfiles.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +function usage { + echo "usage: $(basename $0) [p] " + echo " -p --num-proc maximum number of parallel processes" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 1 ]] +then + usage +fi + +ROOT_DIR="" +N_PROC=1 + +while [[ $# -gt 0 ]] +do +key=$1 + +case $key in + -p|--num-proc) + shift + N_PROC=$1 + shift + ;; + -h|--help) + usage + ;; + *) + if [[ -d $key ]] + then + ROOT_DIR="$key" + shift + else + usage + fi + ;; +esac +done + +# check for the required argument ROOT_DIR +if [[ ${ROOT_DIR} == "" ]] +then + usage +fi + +for corpus in ${ROOT_DIR}/* +do + if [[ "${corpus}" == "C. 
elegans" || "${corpus}" == "C.elegans Supplementals" ]] + then + getbib ${ROOT_DIR}/${corpus} + else + getbib4nxml ${ROOT_DIR}/${corpus} + fi +done \ No newline at end of file diff --git a/getbib/getbib.cpp b/getbib/getbib.cpp new file mode 100644 index 0000000..a0430ae --- /dev/null +++ b/getbib/getbib.cpp @@ -0,0 +1,235 @@ +/* + * File: main.cpp + * Author: liyuling + * + * Created on November, 2013 + */ + +//#include "../TextpressoCentralGlobals.h" +#include "../../TextpressoCentralGlobalDefinitions.h" +#include "xercesc/util/XMLString.hpp" +#include +#include +#include +#include +#include "getbibUtils.h" +#include +#include + + + +#define TPCAS_2_LINDEX_VERSION "0.9.0" + + +using namespace boost::filesystem; + +void print_who() { + std::cout << std::endl << "CAS file bib extracter" << std::endl; + std::cout << "Build Date: " << __DATE__ << std::endl; + std::cout << "Version: " << TPCAS_2_LINDEX_VERSION << std::endl; +} + +void print_help() { + std::cout << std::endl; + std::cout << "Usage: getbib [tpcas_file_directory]" << std::endl; + std::cout << std::endl; + // std::cout << " CASconcumer reads in a directory of tpcas files and adds them to the lucene index(index_location specified by user), if index_location does not exist, it will create one. "; + // std::cout << std::endl; + // std::cout << " as defined in annotator that is referenced in"; + // std::cout << std::endl; + // //std::cout << " " << TPCAS2LINDEXDESCRIPTOR; + // std::cout << " " << TPCAS2SINGLEINDEXDESCRIPTOR; + // std::cout << std::endl; +} + +void addCasFile(const char* pszInput, string indexdescriptor) { + + + std::string gzfile(pszInput); + std::cout << gzfile << std::endl; + if(gzfile.find("tpcas") == std::string::npos) + return; + + + + // string doneflagpath = "/tmp/indexerdone/"+ gzfile; + // cout << "done flag is " << doneflagpath << endl; + // if(exists(doneflagpath)) + // { + // return; + // } + // + // std::ofstream doneflag(doneflagpath.c_str()); + // doneflag << "" << endl; + // doneflag.close(); + + cout << "L43 addcas file " << pszInput << endl; + //const char * descriptor = TPCAS2LINDEXDESCRIPTOR; + const char * descriptor = indexdescriptor.c_str(); + + + + string tpcasfile = uncompressGzip(gzfile); + std::cout << "L52 tpcasfile " << tpcasfile << std::endl; + + try { + + /* Create/link up to a UIMACPP resource manager instance (singleton) */ + (void) uima::ResourceManager::createInstance("TPCAS2LINDEXAE"); + + uima::ErrorInfo errorInfo; + + uima::AnalysisEngine * pEngine + = uima::Framework::createAnalysisEngine(descriptor, errorInfo); + + + + if (errorInfo.getErrorId() != UIMA_ERR_NONE) { + std::cerr << std::endl + << " Error string : " + << uima::AnalysisEngine::getErrorIdAsCString(errorInfo.getErrorId()) + << std::endl + << " UIMACPP Error info:" << std::endl + << errorInfo << std::endl; + exit((int) errorInfo.getErrorId()); + } + uima::TyErrorId utErrorId; // Variable to store UIMACPP return codes + /* Get a new CAS */ + uima::CAS* cas = pEngine->newCAS(); + if (cas == NULL) { + std::cerr << "pEngine->newCAS() failed." 
<< std::endl; + exit(1); + } + /* process input / cas */ + try { + /* initialize from an xmicas */ + + std::cout << "L69 consumer" << tpcasfile << std::endl; + XMLCh* native = XMLString::transcode(tpcasfile.c_str()); + LocalFileInputSource fileIS(native); + XMLString::release(&native); + + std::cout << "L71 tpcas " << tpcasfile.c_str() << std::endl; + uima::XmiDeserializer::deserialize(fileIS, *cas, true); + + std::string filename(tpcasfile); + std::cout << "L77 " << filename << std::endl; + + /* process the CAS */ + + // ((uima::AnalysisEngine*) pEngine)->processAndOutputNewCASes(*cas); + + ((uima::AnalysisEngine*) pEngine)->process(*cas); + + + + } catch (uima::Exception e) { + uima::ErrorInfo errInfo = e.getErrorInfo(); + std::cerr << "Error " << errInfo.getErrorId() << " " << errInfo.getMessage() << std::endl; + std::cerr << errInfo << std::endl; + } + + + + /* call collectionProcessComplete */ + utErrorId = pEngine->collectionProcessComplete(); + /* Free annotator */ + utErrorId = pEngine->destroy(); + delete cas; + delete pEngine; + + std::remove(tpcasfile.c_str()); //delete uncompressed temp casfile + + } catch (uima::Exception e) { + std::cerr << "Exception: " << e << std::endl; + } +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + print_who(); + print_help(); + return (-1); + } + + //const char * descriptor = TPCAS2LINDEXDESCRIPTOR; + + path inputdir(argv[1]); //tpcas file dir + //string indexpath(argv[2]); //index location + string indexpath(""); + // string newOradd(argv[3]); // new/add index option + + // string indexpath("/home/lyl/Dropbox/work/lucene/cas_index"); + // string indexdescriptor("/home/lyl/Dropbox/work/textpressocentral/trunk/LuceneIndexing/descriptors/Tpcas2Lindex.xml"); + + + + // if (!exists(indexpath)) { + // cout << "creating index directory " << endl; + // create_directories(indexpath); + // create_directories(indexpath + "/fulltext"); + // // create_directories(indexpath + "/token"); + // create_directories(indexpath + "/sentence"); + // // create_directories(indexpath + "/lexical"); + // create_directories(indexpath + "/bibliography"); + // } + +// string inputpath(argv[1]); +// string donedir = "/tmp/indexerdone/" + inputpath; +// if (!exists(donedir)) { +// create_directories(donedir); +// } + + + + std::string tempDir = getTempDir(); + // newindexflag = tempDir + "/newindexflag"; + bool dir_created = false; + while (dir_created != true) { + cout << "dir not created" << endl; + tempDir = getTempDir(); + + dir_created = boost::filesystem::create_directories(tempDir); + } + + + //string indexdescriptor(TPCAS2LINDEXDESCRIPTOR); + //string indexdescriptor(tempDir + "/Tpcas2Lindex.xml"); + string indexdescriptor(tempDir + "/Tpcas2Bib.xml"); + writeToIndexDescriptor(indexpath, indexdescriptor, tempDir); ///write to /run/shm/[tempDir]/Tpcas2Lindex.xml + + directory_iterator end_itr; + for (directory_iterator dit(inputdir); dit != end_itr; dit++) { + if (is_regular_file(dit->status())) { + + // cout << "extension " << dit->symlink_status() << endl; + + // addCasFile(dit->path()); + cout << "file path is " << dit->path() << endl; + //addCasFile(dit->path().string().c_str() ); + addCasFile(dit->path().string().c_str(), indexdescriptor); + + + + } else if (is_directory(dit->status())) { + path subdir(dit->path().string().c_str()); + for (directory_iterator dit2(subdir); dit2 != end_itr; dit2++) { + + if (is_regular_file(dit2->status())) { + + + addCasFile(dit2->path().string().c_str(), indexdescriptor); + + } + + } + + } + + + } + + 
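+    // clean up: remove the generated UIMA descriptor and the scratch
+    // directory under /run/shm before exiting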
boost::filesystem::remove(indexdescriptor); + boost::filesystem::remove(tempDir); + +} diff --git a/getbib/getbib4nxml.cpp b/getbib/getbib4nxml.cpp new file mode 100644 index 0000000..9e7c366 --- /dev/null +++ b/getbib/getbib4nxml.cpp @@ -0,0 +1,126 @@ +/* + * File: main.cpp + * Author: mueller + * + * Created on October 26, 2016, 12:31 PM + */ + +//#include "../TextpressoCentralGlobals.h" +#include "../../TextpressoCentralGlobalDefinitions.h" +#include "xercesc/util/XMLString.hpp" +#include +#include +#include +#include +#include "getbib4nxmlUtils.h" +#include +#include + +#define TPCAS_2_LINDEX_VERSION "0.9.0" + +//using namespace boost::filesystem; + +void print_who() { + std::cout << std::endl << "CAS file bib extracter" << std::endl; + std::cout << "Build Date: " << __DATE__ << std::endl; + std::cout << "Version: " << TPCAS_2_LINDEX_VERSION << std::endl; +} + +void print_help() { + std::cout << std::endl; + std::cout << "Usage: getbib [tpcas_file_directory]" << std::endl; + std::cout << std::endl; +} + +void addCasFile(const char* pszInput, std::string indexdescriptor) { + std::string gzfile(pszInput); + std::cout << gzfile << std::endl; + if (boost::filesystem::path(gzfile).filename().string().find("tpcas") == std::string::npos) + return; + const char * descriptor = indexdescriptor.c_str(); + std::string tpcasfile = uncompressGzip(gzfile); + try { + /* Create/link up to a UIMACPP resource manager instance (singleton) */ + (void) uima::ResourceManager::createInstance("TPCAS2LINDEXAE"); + uima::ErrorInfo errorInfo; + uima::AnalysisEngine * pEngine + = uima::Framework::createAnalysisEngine(descriptor, errorInfo); + if (errorInfo.getErrorId() != UIMA_ERR_NONE) { + std::cerr << std::endl + << " Error string : " + << uima::AnalysisEngine::getErrorIdAsCString(errorInfo.getErrorId()) + << std::endl + << " UIMACPP Error info:" << std::endl + << errorInfo << std::endl; + exit((int) errorInfo.getErrorId()); + } + uima::TyErrorId utErrorId; // Variable to store UIMACPP return codes + /* Get a new CAS */ + uima::CAS* cas = pEngine->newCAS(); + if (cas == NULL) { + std::cerr << "pEngine->newCAS() failed." 
<< std::endl; + exit(1); + } + /* process input / cas */ + try { + /* initialize from an xmicas */ + XMLCh* native = XMLString::transcode(tpcasfile.c_str()); + LocalFileInputSource fileIS(native); + XMLString::release(&native); + uima::XmiDeserializer::deserialize(fileIS, *cas, true); + std::string filename(tpcasfile); + /* process the CAS */ + ((uima::AnalysisEngine*) pEngine)->process(*cas); + } catch (uima::Exception e) { + uima::ErrorInfo errInfo = e.getErrorInfo(); + std::cerr << "Error " << errInfo.getErrorId() << " " << errInfo.getMessage() << std::endl; + std::cerr << errInfo << std::endl; + std::cerr << "Writing default bib file"; + writeDefaultBibFile(gzfile.replace(gzfile.end()-8, gzfile.end(), "bib")); + } + /* call collectionProcessComplete */ + utErrorId = pEngine->collectionProcessComplete(); + /* Free annotator */ + utErrorId = pEngine->destroy(); + delete cas; + delete pEngine; + std::remove(tpcasfile.c_str()); //delete uncompressed temp casfile + } catch (uima::Exception e) { + std::cerr << "Exception: " << e << std::endl; + } +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + print_who(); + print_help(); + return (-1); + } + boost::filesystem::path inputdir(argv[1]); //tpcas file dir + std::string indexpath(""); + std::string tempDir = getTempDir(); + bool dir_created = false; + while (dir_created != true) { + std::cout << "dir not created" << std::endl; + tempDir = getTempDir(); + dir_created = boost::filesystem::create_directories(tempDir); + } + std::string indexdescriptor(tempDir + "/Tpcas2Bib.xml"); + writeToIndexDescriptor(indexpath, indexdescriptor, tempDir); ///write to /run/shm/[tempDir]/Tpcas2Lindex.xml + boost::filesystem::directory_iterator end_itr; + for (boost::filesystem::directory_iterator dit(inputdir); dit != end_itr; dit++) { + if (boost::filesystem::is_regular_file(dit->status())) { + std::cout << "file path is " << dit->path() << std::endl; + addCasFile(dit->path().string().c_str(), indexdescriptor); + } else if (boost::filesystem::is_directory(dit->status())) { + boost::filesystem::path subdir(dit->path().string().c_str()); + for (boost::filesystem::directory_iterator dit2(subdir); dit2 != end_itr; dit2++) { + if (boost::filesystem::is_regular_file(dit2->status())) { + addCasFile(dit2->path().string().c_str(), indexdescriptor); + } + } + } + } + boost::filesystem::remove(indexdescriptor); + boost::filesystem::remove(tempDir); +} diff --git a/getbib/getbib4nxmlUtils.cpp b/getbib/getbib4nxmlUtils.cpp new file mode 100644 index 0000000..d22507c --- /dev/null +++ b/getbib/getbib4nxmlUtils.cpp @@ -0,0 +1,164 @@ +/* + * CAS file utils + * author: liyuling + * Date: Nov, 2013 + */ + +#include "getbib4nxmlUtils.h" + +std::string uncompressGzip(std::string gzFile) { + std::ifstream filein(gzFile.c_str(), std::ios_base::in | std::ios_base::binary); + boost::iostreams::filtering_streambuf in; + in.push(boost::iostreams::gzip_decompressor()); + in.push(filein); + int lastdot = gzFile.find_last_of("."); + int lastslash = gzFile.find_last_of("/"); + std::string tpFile = gzFile.substr(lastslash + 1, lastdot - lastslash - 1); + std::string shm("/run/shm/"); + std::string tempFile = shm + tpFile; + std::ofstream out(tempFile.c_str()); + boost::iostreams::copy(in, out); + out.close(); + return tempFile; +} + +std::string getTempDir() { + boost::posix_time::ptime now = boost::posix_time::microsec_clock::local_time(); + int month = static_cast (now.date().month()); + int year = static_cast (now.date().year()); + int day = static_cast (now.date().day()); + 
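+    // mixing the process id into the time-of-day microsecond count makes the
+    // generated directory name unique enough for concurrent runs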
boost::posix_time::time_duration duration(now.time_of_day()); + long microseconds = duration.total_microseconds(); + long pid = getpid(); + long random = pid + microseconds; + std::stringstream ss; + ss << year << month << day << random; + std::string tempDir = "/run/shm/" + ss.str(); + return tempDir; +} + +void writeToIndexDescriptor(std::string indexpath, std::string descriptor, std::string tempDir) { + std::ofstream output(descriptor.c_str()); + output << "" << std::endl; + output << "" << std::endl; + output << " org.apache.uima.cpp" << std::endl; + output << " true " << std::endl; + output << " Tpcas2Bib4Nxml" << std::endl; + output << " " << std::endl; + output << " Tpcas2Bib4Nxml" << std::endl; + output << " Writes an XCAS to a Lucene index. " << std::endl; + output << " 1.0 " << std::endl; + output << " Textpresso " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " FulltextLuceneIndexDirectory" << std::endl; + output << " Directory path of Lucene index for fulltext. " << std::endl; + output << " String " << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " TokenLuceneIndexDirectory " << std::endl; + output << " Directory path of Lucene index for tokens. " << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " SentenceLuceneIndexDirectory " << std::endl; + output << " Directory path of Lucene index for sentences." << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " LexicalLuceneIndexDirectory " << std::endl; + output << " Directory path of Lucene index for lexical annotations. " << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " BibliographyLuceneIndexDirectory " << std::endl; + output << " Directory path of Lucene index for bibliography annotations. 
" << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " TempDirectory " << std::endl; + output << " temporary directory under /run/shm/ to store newindexflag " << std::endl; + output << " String" << std::endl; + output << " false " << std::endl; + output << " true " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " FulltextLuceneIndexDirectory" << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/fulltext" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " TokenLuceneIndexDirectory" << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/token" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " SentenceLuceneIndexDirectory " << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/sentence" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " LexicalLuceneIndexDirectory " << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/lexical" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " BibliographyLuceneIndexDirectory " << std::endl; + output << " " << std::endl; + output << " " << indexpath << "/bibliography" << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " TempDirectory " << std::endl; + output << " " << std::endl; + output << " " << tempDir << "" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " x-unspecified" << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output << " " << std::endl; + output.close(); +} + +void writeDefaultBibFile(const std::string &file_path) { + std::ofstream output(file_path); + output << "author|" << std::endl; + output << "accession|" << std::endl; + output << "type|" << std::endl; + output << "title|" << std::endl; + output << "journal|" << std::endl; + output << "citation|" << std::endl; + output << "year|" << std::endl; + output << "abstract|" << std::endl; + output.close(); +} diff --git a/getbib/getbib4nxmlUtils.h b/getbib/getbib4nxmlUtils.h new file mode 100644 index 0000000..92f09c4 --- /dev/null +++ b/getbib/getbib4nxmlUtils.h @@ -0,0 +1,25 @@ +/* + * File: Utils.h + * Author: lyl + * + * Created on November 15, 2013, 3:48 PM + */ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include +#include +#include +#include + +extern std::string uncompressGzip(std::string gzFile); // uncompress gz file +extern std::string getTempDir(); // generate a temp dir under /run/shm to store all temp files for each run. 
using year+month+day+min +extern void writeToIndexDescriptor(std::string indexpath, std::string descriptor, std::string tempDir); //write to Tpcas2index.xml descriptor +void writeDefaultBibFile(const std::string& file_path); + +#endif /* UTILS_H */ + diff --git a/getbib/getbibUtils.cpp b/getbib/getbibUtils.cpp new file mode 100644 index 0000000..2d8b0a8 --- /dev/null +++ b/getbib/getbibUtils.cpp @@ -0,0 +1,196 @@ +/* + * CAS file utils + * author: liyuling + * Date: Nov, 2013 + */ + +#include "getbibUtils.h" + +//const char* newindexflag = "/run/shm/newindexflag"; + +string uncompressGzip(string gzFile) { + // std::cout << "166" << endl; + std::ifstream filein(gzFile.c_str(), std::ios_base::in | std::ios_base::binary); + boost::iostreams::filtering_streambuf in; + in.push(boost::iostreams::gzip_decompressor()); + in.push(filein); + + int lastdot = gzFile.find_last_of("."); + int lastslash = gzFile.find_last_of("/"); + string tpFile = gzFile.substr(lastslash + 1, lastdot - lastslash - 1); + + string shm("/run/shm/"); + string tempFile = shm + tpFile; + // string tempFile = getTempDir() + "/" +tpFile; + // std::cout << "177 " << tempFile << endl; + std::ofstream out(tempFile.c_str()); + boost::iostreams::copy(in, out); + out.close(); + + return tempFile; +} + +string getTempDir() { + // boost::posix_time::ptime now = boost::posix_time::second_clock::local_time(); + + boost::posix_time::ptime now = boost::posix_time::microsec_clock::local_time(); + + int month = static_cast (now.date().month()); + int year = static_cast (now.date().year()); + int day = static_cast (now.date().day()); + + + boost::posix_time::time_duration duration(now.time_of_day()); + + long microseconds = duration.total_microseconds(); + + + + long pid = getpid(); + //int second = time(0); + + + + //int random = pid + second; + + long random = pid + microseconds; + + //cout << "r: " << random << endl; + std::stringstream ss; + //ss << year << month << day << minutes; + ss << year << month << day << random; + std::string tempDir = "/run/shm/" + ss.str(); + //cout <<"hello" <" << endl; + output << "" << endl; + output << " org.apache.uima.cpp" << endl; + output << " true " << endl; + //output << " Tpcas2Lpp" << endl; + output << " Tpcas2Bib" << endl; + output << " " << endl; + //output << " Tpcas2Lpp" << endl; + output << " Tpcas2Bib" << endl; + output << " Writes an XCAS to a Lucene index. " << endl; + output << " 1.0 " << endl; + output << " Textpresso " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextLuceneIndexDirectory" << endl; + output << " Directory path of Lucene index for fulltext. " << endl; + output << " String " << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for tokens. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for sentences." << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for lexical annotations. 
" << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyLuceneIndexDirectory " << endl; + output << " Directory path of Lucene index for bibliography annotations. " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " TempDirectory " << endl; + output << " temporary directory under /run/shm/ to store newindexflag " << endl; + output << " String" << endl; + output << " false " << endl; + output << " true " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " FulltextLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << indexpath << "/fulltext" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TokenLuceneIndexDirectory" << endl; + output << " " << endl; + output << " " << indexpath << "/token" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " SentenceLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << indexpath << "/sentence" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " LexicalLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << indexpath << "/lexical" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " BibliographyLuceneIndexDirectory " << endl; + output << " " << endl; + output << " " << indexpath << "/bibliography" << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " TempDirectory " << endl; + output << " " << endl; + output << " " << tempDir << "" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " x-unspecified" << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + output << " " << endl; + + output.close(); + +} + +/* +void optimizeIndex(string indexpath) +{ + String TokenIndexDir = StringUtils::toString(indexpath.c_str()); + IndexWriterPtr tokenwriter = newLucene (FSDirectory::open(TokenIndexDir), + newLucene (LuceneVersion::LUCENE_CURRENT), false, + IndexWriter::MaxFieldLengthLIMITED); +} + */ \ No newline at end of file diff --git a/getbib/getbibUtils.h b/getbib/getbibUtils.h new file mode 100644 index 0000000..942330f --- /dev/null +++ b/getbib/getbibUtils.h @@ -0,0 +1,32 @@ +/* + * File: Utils.h + * Author: lyl + * + * Created on November 15, 2013, 3:48 PM + */ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace uima; +using namespace Lucene; + + +//extern const char* newindexflag; //new index lock flag +#endif /* UTILS_H */ + + +extern string uncompressGzip(string gzFile); // uncompress gz file +extern string getTempDir(); // generate a temp dir under /run/shm to store all temp files for each run. 
using year+month+day+min +extern void writeToIndexDescriptor(string indexpath, string descriptor, string tempDir); //write to Tpcas2index.xml descriptor +//extern void optimizeIndex(string indexpath); diff --git a/getbibinfoforpdffromserver/download_pdfinfo.pl b/getbibinfoforpdffromserver/download_pdfinfo.pl new file mode 100755 index 0000000..6a62804 --- /dev/null +++ b/getbibinfoforpdffromserver/download_pdfinfo.pl @@ -0,0 +1,100 @@ +#!/usr/bin/perl +# Script downloads the acedumps from postgres and +# deposits them into local files +# +# USAGE: ./01.pl +# +# +# BEGIN PROGRAM +# + +### modules + +use strict; +use HTTP::Request; +use LWP::UserAgent; + +### variables + +# path to outfile + +my $outpath = "$ARGV[0]"; + +my ($dateShort); +$|=1; # forces output buffer to flush after every print statement! + +# backs up previous data files + +&getDate(); + +print "\n\nBacking up last dumps ...."; +my @files = ("$outpath/Paper.dump", "$outpath/LongText.dump"); +for (@files){ + if (-e $_){ + my @args = ("mv", "$_", "$_.$dateShort"); + system(@args) == 0 + or die "system @args failed: $?"; + } +} +print "done.\n"; + +my $outfile1 = "$outpath"."Paper.dump"; +my $outfile2 = "$outpath"."LongText.dump"; + +print "Downloading now .......\n"; +open (OUT1, ">$outfile1") or die "Cannot create $outfile1 : $!"; +open (OUT2, ">$outfile2") or die "Cannot create $outfile2 : $!"; + +# fetch all Paper objects & abstracts +# This has been changed on 2010-06-28 +my $data = getwebpage("http://tazendra.caltech.edu/~postgres/michael/papers.ace"); +my @alllines = split /\n/, $data; +my $flag = 0; +foreach my $line (@alllines) { + if ($line =~ /Longtext \:/) { + $flag = 1; + } + if ($flag) { + # print longtext object + print OUT2 $line, "\n"; + } else { + # print out Paper objects + print OUT1 $line, "\n"; + } +} + +my @aux = $data =~ /Paper \:/g; +print scalar @aux , " paper objects downloaded.\n"; +@aux = $data =~ /\*\*\*LongTextEnd\*\*\*/g; +print scalar @aux , " abstracts downloaded.\n"; +close (OUT1) or die "Cannot close $outfile1 : $!"; +close (OUT2) or die "Cannot close $outfile2 : $!"; + +print "done.\n\n"; + +sub getDate { + + my $time_zone = 0; + my $time = time() + ($time_zone * 3600); + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time); + $year += ($year < 90) ? 2000 : 1900; + $dateShort = sprintf("%04d%02d%02d",$year,$mon+1,$mday); + return $dateShort; + +} + + + +sub getwebpage { + + my $u = shift; + my $page = ""; + + my $ua = LWP::UserAgent->new(timeout => 30); # instantiates a new user agent + my $request = HTTP::Request->new(GET => $u); # grabs url + my $response = $ua->request($request); # checks url, dies if not valid. + print "Error while getting ", $response->request->uri," -- ", $response->status_line, "\nAborting" unless $response-> is_success; + $page = $response->content; #splits by line + return $page; + +} diff --git a/getbibinfoforpdffromserver/extract_pdfbibinfo.pl b/getbibinfoforpdffromserver/extract_pdfbibinfo.pl new file mode 100755 index 0000000..88ad6d7 --- /dev/null +++ b/getbibinfoforpdffromserver/extract_pdfbibinfo.pl @@ -0,0 +1,208 @@ +#!/usr/bin/perl + +use diagnostics; +use strict; +use DBI; + +my $infile = "$ARGV[0]/Paper.dump"; +my $infile2 = "$ARGV[0]/LongText.dump"; +my $outpath = "$ARGV[0]/"; + +my $countentries = 0; +my $dateShort; +my @directories = qw ( + accession + author + abstract + title + journal + citation + year + type + ); + +# back up old and make new directories. 
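+# (note: the tar backup below is commented out, so in practice the old
+# per-field files are deleted rather than archived)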
+ +&getDate(); + +foreach (@directories){ + print "Making $outpath/$_ directory ..... "; + if (-d "$outpath/$_"){ +# my @args = ("tar", "zcf", "$outpath/$_.$dateShort.tgz", "$outpath/$_"); +# system(@args) == 0 or die "system @args failed: $?"; + my @temp = <$outpath/$_/*>; + for (@temp){ + unlink "$_" or warn "Cannot delete $_: $!"; + } + } else { + mkdir "$outpath/$_"; + } + print "done.\n"; +} + +# extracts Title, Author, Citation, Year, Type, Journal from Paper.dump + +open (FILE, "<$infile") || die "Cannot open $infile : $!"; +print "loading $infile ...."; +undef $/; +my $wholefile = ; +$/ = "\n"; +close (FILE) or die "Cannot close $infile : $!"; +print "done.\n"; +my @sections = split (/\n\n/, $wholefile); +my $count = scalar(@sections); + +# extracts abstracts from LongText.dump + +my $filename2 = ""; +open (FILE2, "<$infile2") || die "Cannot open $infile2 : $!"; +print "loading $infile2 ...."; +undef $/; +my $wholefile2 = ; +$/ = "\n"; +close (FILE2) or die "Cannot close $infile2 : $!"; +print "done.\n"; + +(my @array2) = split(/\*\*\*LongTextEnd\*\*\*\n/, $wholefile2); +my $count2 = scalar(@array2); +my %abstracttexts = (); +foreach my $entry (@array2) { + (my $id) = $entry =~ /Longtext[ \t]\:[ \t]\"(.+?)\"/; + (my $text) = $entry =~ /\"$id\"\n\n(.+?)\n\n/; + $abstracttexts{$id} = $text; +} + + + +my $dbh = DBI->connect ( "dbi:Pg:dbname=testdb;host=131.215.52.76", "acedb", "") or die "Cannot connect to database!\n"; +my $result = $dbh->prepare( "SELECT * FROM pap_curation_flags WHERE pap_curation_flags = 'non_nematode'"); +$result->execute() or die "Cannot prepare statement: $DBI::errstr\n"; +my %non_nematode = (); +while (my @row = $result->fetchrow) { + if ($row[0]) { + my $jk = shift (@row); + $non_nematode{"WBPaper$jk"} = 1; + } +} +$dbh->disconnect; + +foreach my $s (@sections){ + +# + next if (($s !~ /(^|\n)Paper/) || ($s !~ /\nAuthor/) || ($s !~ /\nTitle/)); +# + + (my $filename) = $s =~ /Paper \:[ \t]+\"(WBPaper\d{8})\"/; +# + next if ($non_nematode{$filename}); +# + (my @other_names) = $s =~ /\nName[ \t]+\"(.+?)\"/g; + (my $pmid_name) = $s =~ /\nDatabase[ \t]+\"MEDLINE\"[ \t]+\"PMID\"[ \t]+\"(\d+)\"/; + my @authors = $s =~ /\nAuthor[ \t]+\"(.+?)\"/g; + (my $aux) = $s =~ /\nVolume[ \t]+(.+)\n/; + my @volumes = $aux =~ /\"(.+?)\"/g; + ($aux) = $s =~ /\nPage[ \t]+(.+)\n/; + my @pages = $aux =~ /\"(.+?)\"/g; + my $journal = ''; + ($aux) = $s =~ /\nJournal[ \t]+\"(.+?)\"/; + $journal .= $aux; + ($aux) = $s =~ /\nTitle[ \t]+(\".+?\n)/; + (my $title) = $aux =~ /^\"(.+)\"/; + (my $type) = $s =~ /\nType[ \t]+\"(.+?)\"/; + (my $year) = $s =~ /\nPublication_date[ \t]+\"([\-\d]+)\"/; + (my $absid) = $s =~ /\nAbstract[ \t]+\"(.+?)\"/; + + $countentries++; + + my $acc = ''; + $acc .= "Other:" . "@other_names" . "\n" if (@other_names); + $acc =~ s/(doi|DOI|Doi)/$1:/g; + $acc .= "PMID:$pmid_name\n" if ($pmid_name ne ''); + +# need to remove if-loop and add empty line +# so an empty accession file can be written. 
+# this is necessary so pdf without any accession +# can be downloaded + + $acc .= "\n" if ($acc eq ''); +# if ($acc ne '') { + open (OUT, ">$outpath/accession/$filename") or die "Cannot open $outpath/Accession/$filename : $!"; + print OUT "$acc"; + close (OUT) or die "Cannot close $outpath/Accession/$filename : $!"; +# } +# + if (@authors) { + # take care of new format in author section (repetition of author lines) + my %seen = (); + my @aux = (); + foreach (@authors) { + if (!$seen{$_}) { + $seen{$_} = 1; + push @aux, $_; + } + } + open (OUT, ">$outpath/author/$filename") or die "Cannot open $outpath/Author/$filename : $!"; + print OUT join(" ; \n", @aux); + close (OUT) or die "Cannot close $outpath/Author/$filename: $!"; + } +# + if ((@volumes) || (@pages)) { + open (OUT, ">$outpath/citation/$filename") or die "Cannot open $outpath/Citation/$filename : $!"; + print OUT "V: ", join(" ", @volumes), "\n" if (@volumes); + print OUT "P: ", join(" ", @pages), "\n" if (@pages); + close (OUT) or die "Cannot close $outpath/Citation/$filename : $!"; + } +# + if ($journal ne '') { + open (OUT, ">$outpath/journal/$filename") or die "Cannot open $\outpath/Journal/$filename : $!"; + print OUT "$journal\n"; + close (OUT) or die "Cannot close $outpath/Journal/$filename : $\!"; + } +# + if ($title ne '') { + open (OUT, ">$outpath/title/$filename") or die "Cannot open $outpath/Title/$filename : $!"; + print OUT "$title\n"; + close (OUT) or die "Cannot close $outpath/Title/$filename : $!"; + } +# + if ($type ne '') { + open (OUT, ">$outpath/type/$filename") or die "Cannot open $outpath/Type/$filename : $!"; + print OUT "$type\n"; + close (OUT) or die "Cannot close $outpath/Type/$filename : $!"; + } +# + if ($year ne '') { + open (OUT, ">$outpath/year/$filename") or die "Cannot open $outpath/Year/$filename : $!"; + print OUT "$year\n"; + close (OUT) or die "Cannot close $outpath/year/$filename : $!"; + } +# + if ($abstracttexts{$absid}) { + open (OUT, ">$outpath/abstract/$filename") or die "Cannot open $outpath/$filename : $!"; + print OUT "$abstracttexts{$absid}"; + close (OUT) or die "Cannot close $outpath/$filename : $!"; + } +# +} + +print "\n\n#########################################"; +print "\nThere are $count citations total and\n"; +print "$countentries were complete enough to be usable.\n"; +print "$count2 abstracts were extracted.\n"; +print "\n\n"; +for (@directories){ + my @cnt = <$outpath/$_/*>; + my $cnt = scalar(@cnt); + print "$_ has $cnt files\n"; +} + +print "\n\n##########################################\n"; + +sub getDate{ + my $time_zone = 0; + my $time = time() + ($time_zone * 3600); + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time); + $year += ($year < 90) ? 
2000 : 1900; + $dateShort = sprintf("%04d%02d%02d",$year,$mon+1,$mday); + return $dateShort; +} diff --git a/getpdfs/getpdfs.py b/getpdfs/getpdfs.py new file mode 100755 index 0000000..b2dc654 --- /dev/null +++ b/getpdfs/getpdfs.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 + +"""Copy pdf files from server and map file names""" +import hashlib +import logging +import shutil +import urllib.request +import urllib.error +import urllib.parse +import re +import os +import argparse +import glob +import psycopg2 + +__author__ = "Valerio Arnaboldi" + +__version__ = "1.0.1" + + +def main(): + parser = argparse.ArgumentParser(description="Download pdf files from Tazendra server and store them in a local " + "directory, after applying name conversion") + parser.add_argument("-d", "--delete-old", dest="delete_old", action="store_true", + help="delete old files before downloading the new ones") + parser.add_argument("-l", "--log-file", metavar="log_file", dest="log_file", default="info.log", type=str, + help="path to log file") + parser.add_argument("-L", "--log-level", metavar="log_level", dest="log_level", default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="log level") + parser.add_argument("out_dir", metavar="out_dir", type=str, help="output directory") + parser.add_argument("xml_dir", metavar="xml_dir", type=str, help="do not download pdfs whose PMID is already " + "present in the provided xml director") + args = parser.parse_args() + logging.basicConfig(filename=args.log_file, level=getattr(logging, args.log_level.upper())) + if args.delete_old: + shutil.rmtree(args.out_dir) + try: + os.makedirs(os.path.join(args.out_dir, "C. elegans")) + os.makedirs(os.path.join(args.out_dir, "C. elegans Supplementals")) + except FileExistsError: + logging.warning("Directories already exist") + non_nematode_papers = set() + conn = psycopg2.connect("dbname='testdb' user='acedb' host='131.215.52.76'") + cur = conn.cursor() + cur.execute("""SELECT * FROM pap_curation_flags WHERE pap_curation_flags = 'non_nematode'""") + rows = cur.fetchall() + for row in rows: + non_nematode_papers.add("WBPaper" + row[0]) + + # read papers mapping + id = None + papers_cgc_map = {} + papers_pubmed_map = {} + existing_xml_pmids = set([f for f in os.listdir(args.xml_dir) if os.path.isdir(os.path.join(args.xml_dir, f))]) + wb_2_pmid = {} + for line in urllib.request.urlopen("http://tazendra.caltech.edu/~postgres/michael/papers.ace"): + line = line.decode('utf-8') + linearr = line.strip().split() + if len(linearr) > 1: + if linearr[0] == "Paper": + id = linearr[2][1:len(linearr[2])-1] + elif linearr[0] == "Name" and linearr[1].startswith("\"cgc"): + papers_cgc_map[linearr[1][4:len(linearr[1])-1]] = id + elif len(linearr) >= 4 and linearr[0] == "Database" and linearr[2] == "\"PMID\"": + papers_pubmed_map[linearr[3][1:len(linearr[3])-1]] = id + wb_2_pmid[id] = linearr[3][1:len(linearr[3])-1] + + # read papers list and map them + p = re.compile('href="(.*)"') + + pdflink = "" + all_wbpapers = set() + files_to_download = {} + for line in urllib.request.urlopen("http://tazendra.caltech.edu/~azurebrd/cgi-bin/allpdfs.cgi?action=textpresso"): + try: + line = line.decode('utf-8') + linearr = line.strip().split() + if len(linearr) > 1: + filetype = linearr[0] + namescheme = linearr[1] + pdflink = p.findall(" ".join(linearr[3:]))[0] + if pdflink.lower().endswith(".pdf"): + if filetype == "supplemental": + pdfname = re.split("_|-", pdflink.split("/")[-2])[0] + else: + pdfname = re.split("_|-", pdflink.split("/")[-1])[0] 
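+                    # map the raw name to a WBPaper ID according to the naming
+                    # scheme (wb, cgc or pubmed); papers whose PMID already has
+                    # an XML version are skipped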
+ subdir = "C. elegans" + if namescheme == "wb": + pdfname = "WBPaper" + str(pdfname) + if pdfname in wb_2_pmid and wb_2_pmid[pdfname] in existing_xml_pmids \ + and filetype != "supplemental": + continue + elif namescheme == "cgc": + if str(pdfname).lstrip("0") in papers_cgc_map: + pdfname = papers_cgc_map[str(pdfname).lstrip("0")] + if pdfname in wb_2_pmid and wb_2_pmid[pdfname] in existing_xml_pmids \ + and filetype != "supplemental": + continue + else: + continue + elif namescheme == "pubmed": + if str(pdfname).lstrip("0") in papers_pubmed_map: + pdfname = papers_pubmed_map[str(pdfname).lstrip("0")] + if str(pdfname).lstrip("0") in existing_xml_pmids: + continue + else: + continue + subdir = "C. elegans" + if pdfname in non_nematode_papers: + continue + if filetype == "supplemental": + subdir = "C. elegans Supplementals" + pdfname += ".sup." + skip_file = False + simfiles = glob.glob(os.path.join(args.out_dir, subdir, pdfname, pdfname) + "*.pdf") + for simfile_name in simfiles: + if hashlib.md5(urllib.request.urlopen(pdflink).read()).digest() == \ + hashlib.md5(open(simfile_name, "rb").read()).digest(): + skip_file = True + break + if skip_file: + continue + sup_num = len(simfiles) + 1 + while pdfname + str(sup_num) in files_to_download: + all_wbpapers.add(pdfname + str(sup_num)) + sup_num += 1 + all_wbpapers.add(pdfname + str(sup_num)) + pdfname += str(sup_num) + else: + all_wbpapers.add(pdfname) + if pdflink.lower().endswith("_temp.pdf") and pdfname in files_to_download or \ + pdflink.lower().endswith("_ocr.pdf") and pdfname in files_to_download: + continue + logging.info("Downloading paper: " + pdflink + " to " + os.path.join(args.out_dir, subdir, + pdfname, pdfname + ".pdf")) + if pdfname in files_to_download: + link_re = re.search("[0-9]+[\_\-][^\d]+([0-9]+)", pdflink.replace(" ", "")) + link_num = 0 + if link_re is not None: + link_num = int(link_re.group(1)) + stored_re = re.search("[0-9]+[\_\-][^\d]+([0-9]+)", + files_to_download[pdfname][0].replace("%20", "")) + stored_num = 0 + if stored_re is not None: + stored_num = int(stored_re.group(1)) + if link_num <= stored_num: + continue + files_to_download[pdfname] = (pdflink.replace(" ", "%20"), os.path.join(args.out_dir, subdir, + pdfname, pdfname + ".pdf")) + else: + logging.warning("Skipping file: " + pdflink) + except UnicodeDecodeError: + pass + + for pdflink, file_path in files_to_download.values(): + try: + # check if best file selected for download is already present in the dest dir + if not args.delete_old and len(glob.glob(file_path)) > 0 and \ + hashlib.md5(urllib.request.urlopen(pdflink).read()).digest() == \ + hashlib.md5(open(file_path, "rb").read()).digest(): + logging.info("File already present in collection, skipping " + pdflink) + continue + os.makedirs(os.path.dirname(file_path)) + urllib.request.urlretrieve(pdflink, file_path) + except urllib.error.HTTPError: + logging.error("Paper not found: " + pdflink) + continue + + # delete local files that have been removed from server + local_files = set(os.listdir(os.path.join(args.out_dir, "C. elegans"))) + for file_to_remove in local_files.difference(all_wbpapers): + shutil.rmtree(os.path.join("C. elegans", file_to_remove)) + local_files = set(os.listdir(os.path.join(args.out_dir, "C. elegans Supplementals"))) + for file_to_remove in local_files.difference(all_wbpapers): + shutil.rmtree(os.path.join("C. 
elegans Supplementals", file_to_remove)) + +if __name__ == '__main__': + main() diff --git a/lucene/CaseSensitiveAnalyzer.cpp b/lucene/CaseSensitiveAnalyzer.cpp new file mode 100644 index 0000000..5a5cc0e --- /dev/null +++ b/lucene/CaseSensitiveAnalyzer.cpp @@ -0,0 +1,84 @@ +/** + Project: textpressocentral + File name: CaseSensitiveAnalyzer.cpp + + @author valerio + @version 1.0 6/9/17. +*/ + +#include "CaseSensitiveAnalyzer.h" +#include +#include + +using namespace Lucene; + +DECLARE_SHARED_PTR(CaseSensitiveAnalyzer); + +/// Construct an analyzer with the given stop words. +const int32_t CaseSensitiveAnalyzer::DEFAULT_MAX_TOKEN_LENGTH = 255; + +CaseSensitiveAnalyzer::CaseSensitiveAnalyzer(LuceneVersion::Version matchVersion) { + ConstructAnalyser(matchVersion, StopAnalyzer::ENGLISH_STOP_WORDS_SET()); +} + +CaseSensitiveAnalyzer::CaseSensitiveAnalyzer(LuceneVersion::Version matchVersion, HashSet stopWords) { + ConstructAnalyser(matchVersion, stopWords); +} + +CaseSensitiveAnalyzer::CaseSensitiveAnalyzer(LuceneVersion::Version matchVersion, const String& stopwords) { + ConstructAnalyser(matchVersion, WordlistLoader::getWordSet(stopwords)); +} + +CaseSensitiveAnalyzer::CaseSensitiveAnalyzer(LuceneVersion::Version matchVersion, const ReaderPtr& stopwords) { + ConstructAnalyser(matchVersion, WordlistLoader::getWordSet(stopwords)); +} + +CaseSensitiveAnalyzer::~CaseSensitiveAnalyzer() { +} + +void CaseSensitiveAnalyzer::ConstructAnalyser(LuceneVersion::Version matchVersion, HashSet stopWords) { + stopSet = stopWords; + enableStopPositionIncrements = StopFilter::getEnablePositionIncrementsVersionDefault(matchVersion); + replaceInvalidAcronym = LuceneVersion::onOrAfter(matchVersion, LuceneVersion::LUCENE_24); + this->matchVersion = matchVersion; + this->maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; +} + +TokenStreamPtr CaseSensitiveAnalyzer::tokenStream(const String& fieldName, const ReaderPtr& reader) { + StandardTokenizerPtr tokenStream(newLucene(matchVersion, reader)); + tokenStream->setMaxTokenLength(maxTokenLength); + TokenStreamPtr result(newLucene(tokenStream)); + //result = newLucene(result); + result = newLucene(enableStopPositionIncrements, result, stopSet); + return result; +} + +void CaseSensitiveAnalyzer::setMaxTokenLength(int32_t length) { + maxTokenLength = length; +} + +int32_t CaseSensitiveAnalyzer::getMaxTokenLength() { + return maxTokenLength; +} +DECLARE_SHARED_PTR(CaseSensitiveAnalyzerSavedStreams); +TokenStreamPtr CaseSensitiveAnalyzer::reusableTokenStream(const String& fieldName, const ReaderPtr& reader) { + CaseSensitiveAnalyzerSavedStreamsPtr streams = boost::dynamic_pointer_cast(getPreviousTokenStream()); + if (!streams) { + streams = newLucene(); + setPreviousTokenStream(streams); + streams->tokenStream = newLucene(matchVersion, reader); + streams->filteredTokenStream = newLucene(streams->tokenStream); + //streams->filteredTokenStream = newLucene(streams->filteredTokenStream); + streams->filteredTokenStream = newLucene(enableStopPositionIncrements, streams->filteredTokenStream, stopSet); + } else { + streams->tokenStream->reset(reader); + } + streams->tokenStream->setMaxTokenLength(maxTokenLength); + + streams->tokenStream->setReplaceInvalidAcronym(replaceInvalidAcronym); + + return streams->filteredTokenStream; +} + +CaseSensitiveAnalyzerSavedStreams::~CaseSensitiveAnalyzerSavedStreams() { +} \ No newline at end of file diff --git a/lucene/CaseSensitiveAnalyzer.h b/lucene/CaseSensitiveAnalyzer.h new file mode 100644 index 0000000..532d457 --- /dev/null +++ 
b/lucene/CaseSensitiveAnalyzer.h @@ -0,0 +1,58 @@ +/** + Project: textpressocentral + File name: CaseSensitiveAnalyzer.h + + @author valerio + @version 1.0 6/9/17. +*/ + +#ifndef TEXTPRESSOCENTRAL_CASESENSITIVEANALYZER_H +#define TEXTPRESSOCENTRAL_CASESENSITIVEANALYZER_H + +//#include +#include + +using namespace Lucene; + +class CaseSensitiveAnalyzerSavedStreams : public LuceneObject { +public: + virtual ~CaseSensitiveAnalyzerSavedStreams(); + LUCENE_CLASS(CaseSensitiveAnalyzerSavedStreams); + +public: + StandardTokenizerPtr tokenStream; + TokenStreamPtr filteredTokenStream; +}; + +class CaseSensitiveAnalyzer: public Analyzer { + +public: + CaseSensitiveAnalyzer(Lucene::LuceneVersion::Version matchVersion); + CaseSensitiveAnalyzer(Lucene::LuceneVersion::Version matchVersion, Lucene::HashSet stopWords); + CaseSensitiveAnalyzer(Lucene::LuceneVersion::Version matchVersion, const Lucene::String &stopwords); + CaseSensitiveAnalyzer(Lucene::LuceneVersion::Version matchVersion, const Lucene::ReaderPtr &stopwords); + virtual ~CaseSensitiveAnalyzer(); + + LUCENE_CLASS(CaseSensitiveAnalyzer); + +public: + static const int32_t DEFAULT_MAX_TOKEN_LENGTH; + +protected: + Lucene::HashSet stopSet; + + bool replaceInvalidAcronym; + bool enableStopPositionIncrements; + Lucene::LuceneVersion::Version matchVersion; + int32_t maxTokenLength; + +public: + virtual Lucene::TokenStreamPtr tokenStream(const Lucene::String &fieldName, const Lucene::ReaderPtr &reader); + void setMaxTokenLength(int32_t length); + int32_t getMaxTokenLength(); + virtual Lucene::TokenStreamPtr reusableTokenStream(const Lucene::String &fieldName, const Lucene::ReaderPtr &reader); + void ConstructAnalyser(Lucene::LuceneVersion::Version matchVersion, Lucene::HashSet stopWords); +}; + + +#endif //TEXTPRESSOCENTRAL_CASESENSITIVEANALYZER_H diff --git a/lucene/LazySelector.h b/lucene/LazySelector.h new file mode 100644 index 0000000..9eb6181 --- /dev/null +++ b/lucene/LazySelector.h @@ -0,0 +1,36 @@ +/** + Project: textpressocentral + File name: LazySelector.h + + @author valerio + @version 1.0 6/10/17. +*/ + +#ifndef TEXTPRESSOCENTRAL_LAZYSELECTOR_H +#define TEXTPRESSOCENTRAL_LAZYSELECTOR_H + +#include + +DECLARE_SHARED_PTR(LazySelector); +class LazySelector : public FieldSelector { +public: + LazySelector(const String& magicField) { + this->magicField = magicField; + } + virtual ~LazySelector() { + } + LUCENE_CLASS(LazySelector); +protected: + String magicField; + +public: + virtual FieldSelectorResult accept(const String& fieldName) { + if (fieldName == magicField) { + return FieldSelector::SELECTOR_LOAD; + } else { + return FieldSelector::SELECTOR_NO_LOAD; + } + } +}; + +#endif //TEXTPRESSOCENTRAL_LAZYSELECTOR_H diff --git a/ppm2jpg/main.cpp b/ppm2jpg/main.cpp new file mode 100644 index 0000000..ff34c0d --- /dev/null +++ b/ppm2jpg/main.cpp @@ -0,0 +1,97 @@ +/** + Project: textpressocentral + File name: main.cpp + + @author valerio + @version 1.0 6/5/17. 
+*/ + +#include +#include +#include +#include +#include + +namespace po = boost::program_options; +namespace fs = boost::filesystem; +using namespace std; +using namespace cimg_library; + +void convertFile(const string& inputFileName, bool remove) { + fs::path inputFilePath(inputFileName); + fs::path outputFilePath = inputFilePath; + outputFilePath.replace_extension(fs::path("jpg")); + try { + CImg image(inputFilePath.string().c_str()); + image.save(outputFilePath.string().c_str()); + if (remove) { + fs::remove(inputFilePath); + } + } catch (Magick::ErrorCorruptImage) { + cout << "cannot convert corrupted file " << inputFileName << endl; + } +} + +int main(int argc, char* argv[]) { + + po::options_description desc("options"); + po::positional_options_description p; + po::variables_map vm; + + bool remove = false; + bool recursive = false; + string startDir; + string inputFileName; + + try { + + desc.add_options() + ("help,h", "produce help message") + ("input-file,i", po::value(&inputFileName)->required(), "input file or directory") + ("delete,d", "delete original ppm files") + ("recursive,r", "apply conversion recursively"); + + + p.add("input-file", -1); + po::store(po::command_line_parser(argc, argv). + options(desc).positional(p).run(), vm); + po::notify(vm); + + if (vm.count("help")) { + cout << desc << endl; + return 1; + } + + if (vm.count("delete")) { + remove = true; + } + + if (vm.count("recursive")) { + recursive = true; + startDir = inputFileName; + } + } catch(std::exception& e) { + if (vm.count("help")) { + cout << desc << endl; + return (EXIT_SUCCESS); + } + std::cerr << "Error: " << e.what() << "\n"; + return (EXIT_FAILURE); + } + + if (recursive) { + fs::recursive_directory_iterator dir_end; + fs::recursive_directory_iterator dir(startDir); + while (dir != dir_end) { + fs::path _path(*dir); + ++dir; + if (!fs::is_directory(_path) && _path.extension().string() == ".ppm") { + convertFile(_path.string(), remove); + } + } + } else { + convertFile(inputFileName, remove); + } + + return (EXIT_SUCCESS); +} \ No newline at end of file diff --git a/ppm2jpg/ppm2jpgCas.sh b/ppm2jpg/ppm2jpgCas.sh new file mode 100755 index 0000000..8d57e56 --- /dev/null +++ b/ppm2jpg/ppm2jpgCas.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +##### simple script to change suffix .ppm to .jpg in compressed cas files and delete original .ppm images ##### + +function fixcas() { + root_dir=$1 + find "${root_dir}" -name *.tpcas.gz | xargs -n 1 -P $2 -I {} sudo bash -c "zcat '{}' | sed 's/\.ppm/.jpg/g' | gzip > '{}'.tmp; mv '{}'.tmp '{}'" +} + +function remppm() { + root_dir=$1 + find "${root_dir}" -name *.ppm | xargs -I {} sudo bash -c "rm '{}'" +} + +function usage { + echo "usage: $(basename $0) [-fdh] " + echo " -f --fix-cas fix cas files in recursively by changing substituting .ppm by .jpg suffix" + echo " -d --delete-ppm delete .ppm images in recursively" + echo " -p --num-proc maximum number of parallel processes" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 2 ]] +then + usage +fi + +FIX_CAS=false +REM_PPM=false +ROOT_DIR="" +N_PROC=1 + +while [[ $# -gt 0 ]] +do +key=$1 + +case $key in + -f|--fix-cas) + FIX_CAS=true + shift # past argument + ;; + -d|--delete-ppm) + REM_PPM=true + shift # past argument + ;; + -p|--num-proc) + shift + N_PROC=$1 + shift + ;; + -h|--help) + usage + ;; + *) + if [[ -d $key ]] + then + ROOT_DIR="$key" + shift + else + usage + fi + ;; +esac +done + +# check for the required argument ROOT_DIR +if [[ ${ROOT_DIR} == "" ]] +then + usage +fi + +if [ ${FIX_CAS} = 
true ] +then + fixcas "${ROOT_DIR}" ${N_PROC} +fi +if [ ${REM_PPM} = true ] +then + remppm "${ROOT_DIR}" +fi + +exit 0 \ No newline at end of file diff --git a/printbibfromnxmlorcasfile/.dep.inc b/printbibfromnxmlorcasfile/.dep.inc new file mode 100644 index 0000000..4560e55 --- /dev/null +++ b/printbibfromnxmlorcasfile/.dep.inc @@ -0,0 +1,5 @@ +# This code depends on make tool being used +DEPFILES=$(wildcard $(addsuffix .d, ${OBJECTFILES})) +ifneq (${DEPFILES},) +include ${DEPFILES} +endif diff --git a/printbibfromnxmlorcasfile/Makefile b/printbibfromnxmlorcasfile/Makefile new file mode 100644 index 0000000..05de621 --- /dev/null +++ b/printbibfromnxmlorcasfile/Makefile @@ -0,0 +1,128 @@ +# +# There exist several targets which are by default empty and which can be +# used for execution of your targets. These targets are usually executed +# before and after some main targets. They are: +# +# .build-pre: called before 'build' target +# .build-post: called after 'build' target +# .clean-pre: called before 'clean' target +# .clean-post: called after 'clean' target +# .clobber-pre: called before 'clobber' target +# .clobber-post: called after 'clobber' target +# .all-pre: called before 'all' target +# .all-post: called after 'all' target +# .help-pre: called before 'help' target +# .help-post: called after 'help' target +# +# Targets beginning with '.' are not intended to be called on their own. +# +# Main targets can be executed directly, and they are: +# +# build build a specific configuration +# clean remove built files from a configuration +# clobber remove all built files +# all build all configurations +# help print help mesage +# +# Targets .build-impl, .clean-impl, .clobber-impl, .all-impl, and +# .help-impl are implemented in nbproject/makefile-impl.mk. +# +# Available make variables: +# +# CND_BASEDIR base directory for relative paths +# CND_DISTDIR default top distribution directory (build artifacts) +# CND_BUILDDIR default top build directory (object files, ...) +# CONF name of current configuration +# CND_PLATFORM_${CONF} platform name (current configuration) +# CND_ARTIFACT_DIR_${CONF} directory of build artifact (current configuration) +# CND_ARTIFACT_NAME_${CONF} name of build artifact (current configuration) +# CND_ARTIFACT_PATH_${CONF} path to build artifact (current configuration) +# CND_PACKAGE_DIR_${CONF} directory of package (current configuration) +# CND_PACKAGE_NAME_${CONF} name of package (current configuration) +# CND_PACKAGE_PATH_${CONF} path to package (current configuration) +# +# NOCDDL + + +# Environment +MKDIR=mkdir +CP=cp +CCADMIN=CCadmin + + +# build +build: .build-post + +.build-pre: +# Add your pre 'build' code here... + +.build-post: .build-impl +# Add your post 'build' code here... + + +# clean +clean: .clean-post + +.clean-pre: +# Add your pre 'clean' code here... + +.clean-post: .clean-impl +# Add your post 'clean' code here... + + +# clobber +clobber: .clobber-post + +.clobber-pre: +# Add your pre 'clobber' code here... + +.clobber-post: .clobber-impl +# Add your post 'clobber' code here... + + +# all +all: .all-post + +.all-pre: +# Add your pre 'all' code here... + +.all-post: .all-impl +# Add your post 'all' code here... + + +# build tests +build-tests: .build-tests-post + +.build-tests-pre: +# Add your pre 'build-tests' code here... + +.build-tests-post: .build-tests-impl +# Add your post 'build-tests' code here... + + +# run tests +test: .test-post + +.test-pre: build-tests +# Add your pre 'test' code here... 
+
+.test-post: .test-impl
+# Add your post 'test' code here...
+
+
+# help
+help: .help-post
+
+.help-pre:
+# Add your pre 'help' code here...
+
+.help-post: .help-impl
+# Add your post 'help' code here...
+
+
+
+# include project implementation makefile
+include nbproject/Makefile-impl.mk
+
+# include project make variables
+include nbproject/Makefile-variables.mk
diff --git a/printbibfromnxmlorcasfile/cmdline.h b/printbibfromnxmlorcasfile/cmdline.h
new file mode 100644
index 0000000..f142850
--- /dev/null
+++ b/printbibfromnxmlorcasfile/cmdline.h
@@ -0,0 +1,773 @@
+/*
+Copyright (c) 2009, Hideyuki Tanaka
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY <copyright holder> ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <string>
+#include <stdexcept>
+#include <typeinfo>
+#include <cstring>
+#include <algorithm>
+#include <cxxabi.h>
+#include <cstdlib>
+
+namespace cmdline{
+
+namespace detail{
+
+template <typename Target, typename Source, bool Same>
+class lexical_cast_t{
+public:
+  static Target cast(const Source &arg){
+    Target ret;
+    std::stringstream ss;
+    if (!(ss<<arg && ss>>ret && ss.eof()))
+      throw std::bad_cast();
+
+    return ret;
+  }
+};
+
+template <typename Target, typename Source>
+class lexical_cast_t<Target, Source, true>{
+public:
+  static Target cast(const Source &arg){
+    return arg;
+  }
+};
+
+template <typename Source>
+class lexical_cast_t<std::string, Source, false>{
+public:
+  static std::string cast(const Source &arg){
+    std::ostringstream ss;
+    ss<<arg;
+    return ss.str();
+  }
+};
+
+template <typename Target>
+class lexical_cast_t<Target, std::string, false>{
+public:
+  static Target cast(const std::string &arg){
+    Target ret;
+    std::istringstream ss(arg);
+    if (!(ss>>ret && ss.eof()))
+      throw std::bad_cast();
+    return ret;
+  }
+};
+
+template <typename T1, typename T2>
+struct is_same {
+  static const bool value = false;
+};
+
+template <typename T>
+struct is_same<T, T>{
+  static const bool value = true;
+};
+
+template<typename Target, typename Source>
+Target lexical_cast(const Source &arg)
+{
+  return lexical_cast_t<Target, Source, detail::is_same<Target, Source>::value>::cast(arg);
+}
+
+static inline std::string demangle(const std::string &name)
+{
+  int status=0;
+  char *p=abi::__cxa_demangle(name.c_str(), 0, 0, &status);
+  std::string ret(p);
+  free(p);
+  return ret;
+}
+
+template <class T>
+std::string readable_typename()
+{
+  return demangle(typeid(T).name());
+}
+
+template <>
+std::string readable_typename<std::string>()
+{
+  return "string";
+}
+
+} // detail
+
+//-----
+
+class cmdline_error : public std::exception {
+public:
+  cmdline_error(const std::string &msg): msg(msg){}
+  ~cmdline_error() throw() {}
+  const char *what() const throw() { return msg.c_str(); }
+private:
+  std::string msg;
+};
+
+template <class T>
+struct default_reader{
+  T operator()(const std::string &str){
+    return detail::lexical_cast<T>(str);
+  }
+};
+
+template <class T>
+struct range_reader{
+  range_reader(const T &low, const T &high): low(low), high(high) {}
+  T operator()(const std::string &s) const {
+    T ret=default_reader<T>()(s);
+    if (!(ret>=low && ret<=high)) throw cmdline::cmdline_error("range_error");
+    return ret;
+  }
+private:
+  T low, high;
+};
+
+template <class T>
+range_reader<T> range(const T &low, const T &high)
+{
+  return range_reader<T>(low, high);
+}
+
+template <class T>
+struct oneof_reader{
+  T operator()(const std::string &s){
+    T ret=default_reader<T>()(s);
+    if (std::find(alt.begin(), alt.end(), s)==alt.end())
+      throw cmdline_error("");
+    return ret;
+  }
+  void add(const T &v){ alt.push_back(v); }
+private:
+  std::vector<T> alt;
+};
+
+template <class T>
+oneof_reader<T> oneof(T a1)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  ret.add(a9);
+  return ret;
+}
+
+template <class T>
+oneof_reader<T> oneof(T a1, T a2, T a3, T a4, T a5, T a6, T a7, T a8, T a9, T a10)
+{
+  oneof_reader<T> ret;
+  ret.add(a1);
+  ret.add(a2);
+  ret.add(a3);
+  ret.add(a4);
+  ret.add(a5);
+  ret.add(a6);
+  ret.add(a7);
+  ret.add(a8);
+  ret.add(a9);
+  ret.add(a10);
+  return ret;
+}
+
+//-----
+
+class parser{
+public:
+  parser(){
+  }
+  ~parser(){
+    for (std::map<std::string, option_base*>::iterator p=options.begin();
+         p!=options.end(); p++)
+      delete p->second;
+  }
+
+  void add(const std::string &name,
+           char short_name=0,
+           const std::string &desc=""){
+    if (options.count(name)) throw cmdline_error("multiple definition: "+name);
+    options[name]=new option_without_value(name, short_name, desc);
+    ordered.push_back(options[name]);
+  }
+
+  template <class T>
+  void add(const std::string &name,
+           char short_name=0,
+           const std::string &desc="",
+           bool need=true,
+           const T def=T()){
+    add(name, short_name, desc, need, def, default_reader<T>());
+  }
+
+  template <class T, class F>
+  void add(const std::string &name,
+           char short_name=0,
+           const std::string &desc="",
+           bool need=true,
+           const T def=T(),
+           F reader=F()){
+    if (options.count(name)) throw cmdline_error("multiple definition: "+name);
+    options[name]=new option_with_value_with_reader<T, F>(name, short_name, need, def, desc, reader);
+    ordered.push_back(options[name]);
+  }
+
+  void footer(const std::string &f){
+    ftr=f;
+  }
+
+  void set_program_name(const std::string &name){
+    prog_name=name;
+  }
+
+  bool exist(const std::string &name) const {
+    if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name);
+    return options.find(name)->second->has_set();
+  }
+
+  template <class T>
+  const T &get(const std::string &name) const {
+    if (options.count(name)==0) throw cmdline_error("there is no flag: --"+name);
+    const option_with_value<T> *p=dynamic_cast<const option_with_value<T>*>(options.find(name)->second);
+    if (p==NULL) throw cmdline_error("type mismatch flag '"+name+"'");
+    return p->get();
+  }
+
+  const std::vector<std::string> &rest() const {
+    return others;
+  }
+
+  bool parse(const std::string &arg){
+    std::vector<std::string> args;
+
+    std::string buf;
+    bool in_quote=false;
+    for (std::string::size_type i=0; i<arg.length(); i++){
+      if (arg[i]=='\"'){
+        in_quote=!in_quote;
+        continue;
+      }
+
+      if (arg[i]==' ' && !in_quote){
+        args.push_back(buf);
+        buf="";
+        continue;
+      }
+
+      if (arg[i]=='\\'){
+        i++;
+        if (i>=arg.length()){
+          errors.push_back("unexpected occurrence of '\\' at end of string");
+          return false;
+        }
+      }
+
+      buf+=arg[i];
+    }
+
+    if (in_quote){
+      errors.push_back("quote is not closed");
+      return false;
+    }
+
+    if (buf.length()>0)
+      args.push_back(buf);
+
+    for (size_t i=0; i<args.size(); i++)
+      std::cout<<"\""<<args[i]<<"\""<<std::endl;
+
+    return parse(args);
+  }
+
+  bool parse(const std::vector<std::string> &args){
+    int argc=static_cast<int>(args.size());
+    std::vector<const char*> argv(argc);
+
+    for (int i=0; i<argc; i++)
+      argv[i]=args[i].c_str();
+
+    return parse(argc, &argv[0]);
+  }
+
+  bool parse(int argc, const char * const argv[]){
+    errors.clear();
+    others.clear();
+
+    if (argc<1){
+      errors.push_back("argument number must be longer than 0");
+      return false;
+    }
+    if (prog_name=="")
+      prog_name=argv[0];
+
+    std::map<char, std::string> lookup;
+    for (std::map<std::string, option_base*>::iterator p=options.begin();
+         p!=options.end(); p++){
+      if (p->first.length()==0) continue;
+      char initial=p->second->short_name();
+      if (initial){
+        if (lookup.count(initial)>0){
+          lookup[initial]="";
+          errors.push_back(std::string("short option '")+initial+"' is ambiguous");
+          return false;
+        }
+        else lookup[initial]=p->first;
+      }
+    }
+
+    for (int i=1; i<argc; i++){
+      if (strncmp(argv[i], "--", 2)==0){
+        const char *p=strchr(argv[i]+2, '=');
+        if (p){
+          std::string name(argv[i]+2, p);
+          std::string val(p+1);
+          set_option(name, val);
+        }
+        else{
+          std::string name(argv[i]+2);
+          if (options.count(name)==0){
+            errors.push_back("undefined option: --"+name);
+            continue;
+          }
+          if (options[name]->has_value()){
+            if (i+1>=argc){
+              errors.push_back("option needs value: --"+name);
+              continue;
+            }
+            else{
+              i++;
+              set_option(name, argv[i]);
+            }
+          }
+          else{
+            set_option(name);
+          }
+        }
+      }
+      else if (strncmp(argv[i], "-", 1)==0){
+        if (!argv[i][1]) continue;
+        char last=argv[i][1];
+        for (int j=2; argv[i][j]; j++){
+          last=argv[i][j];
+          if (lookup.count(argv[i][j-1])==0){
+            errors.push_back(std::string("undefined short option: -")+argv[i][j-1]);
+            continue;
+          }
+          if (lookup[argv[i][j-1]]==""){
+            errors.push_back(std::string("ambiguous short option: -")+argv[i][j-1]);
+            continue;
+          }
+          set_option(lookup[argv[i][j-1]]);
+        }
+
+        if (lookup.count(last)==0){
+          errors.push_back(std::string("undefined short option: -")+last);
+          continue;
+        }
+        if (lookup[last]==""){
+          errors.push_back(std::string("ambiguous short option: -")+last);
+          continue;
+        }
+
+        if (i+1<argc && options[lookup[last]]->has_value()){
+          set_option(lookup[last], argv[i+1]);
+          i++;
+        }
+        else{
+          set_option(lookup[last]);
+        }
+      }
+      else{
+        others.push_back(argv[i]);
+      }
+    }
+
+    for (std::map<std::string, option_base*>::iterator p=options.begin();
+         p!=options.end(); p++)
+      if (!p->second->valid())
+        errors.push_back("need option: --"+std::string(p->first));
+
+    return errors.size()==0;
+  }
+
+  void parse_check(const std::string &arg){
+    if (!options.count("help"))
+      add("help", '?', "print this message");
+    check(0, parse(arg));
+  }
+
+  void parse_check(const std::vector<std::string> &args){
+    if (!options.count("help"))
+      add("help", '?', "print this message");
+    check(args.size(), parse(args));
+  }
+
+  void parse_check(int argc, char *argv[]){
+    if (!options.count("help"))
+      add("help", '?', "print this message");
+    check(argc, parse(argc, argv));
+  }
+
+  std::string error() const{
+    return errors.size()>0?errors[0]:"";
+  }
+
+  std::string error_full() const{
+    std::ostringstream oss;
+    for (size_t i=0; i<errors.size(); i++)
+      oss<<errors[i]<<std::endl;
+    return oss.str();
+  }
+
+  std::string usage() const {
+    std::ostringstream oss;
+    oss<<"usage: "<<prog_name<<" ";
+    for (size_t i=0; i<ordered.size(); i++){
+      if (ordered[i]->must())
+        oss<<ordered[i]->short_description()<<" ";
+    }
+
+    oss<<"[options] ... "<<ftr<<std::endl;
+    oss<<"options:"<<std::endl;
+
+    size_t max_width=0;
+    for (size_t i=0; i<ordered.size(); i++){
+      max_width=std::max(max_width, ordered[i]->name().length());
+    }
+    for (size_t i=0; i<ordered.size(); i++){
+      if (ordered[i]->short_name()){
+        oss<<"  -"<<ordered[i]->short_name()<<", ";
+      }
+      else{
+        oss<<"      ";
+      }
+
+      oss<<"--"<<ordered[i]->name();
+      for (size_t j=ordered[i]->name().length(); j<max_width+4; j++)
+        oss<<' ';
+      oss<<ordered[i]->description()<<std::endl;
+    }
+    return oss.str();
+  }
+
+private:
+
+  void check(int argc, bool ok){
+    if ((argc==1 && !ok) || exist("help")){
+      std::cerr<<usage();
+      exit(0);
+    }
+
+    if (!ok){
+      std::cerr<<error()<<std::endl<<usage();
+      exit(1);
+    }
+  }
+
+  void set_option(const std::string &name){
+    if (options.count(name)==0){
+      errors.push_back("undefined option: --"+name);
+      return;
+    }
+    if (!options[name]->set()){
+      errors.push_back("option needs value: --"+name);
+      return;
+    }
+  }
+
+  void set_option(const std::string &name, const std::string &value){
+    if (options.count(name)==0){
+      errors.push_back("undefined option: --"+name);
+      return;
+    }
+    if (!options[name]->set(value)){
+      errors.push_back("option value is invalid: --"+name+"="+value);
+      return;
+    }
+  }
+
+  class option_base{
+  public:
+    virtual ~option_base(){}
+
+    virtual bool has_value() const=0;
+    virtual bool set()=0;
+    virtual bool set(const std::string &value)=0;
+    virtual bool has_set() const=0;
+    virtual bool valid() const=0;
+    virtual bool must() const=0;
+
+    virtual const std::string &name() const=0;
+    virtual char short_name() const=0;
+    virtual const std::string &description() const=0;
+    virtual std::string short_description() const=0;
+  };
+
+  class option_without_value : public option_base {
+  public:
+    option_without_value(const std::string &name,
+                         char short_name,
+                         const std::string &desc)
+      :nam(name), snam(short_name), desc(desc), has(false){
+    }
+    ~option_without_value(){}
+
+    bool has_value() const { return false; }
+
+    bool set(){
+      has=true;
+      return true;
+    }
+
+    bool set(const std::string &){
+      return false;
+    }
+
+    bool has_set() const {
+      return has;
+    }
+
+    bool valid() const{
+      return true;
+    }
+
+    bool must() const{
+      return false;
+    }
+
+    const std::string &name() const{
+      return nam;
+    }
+
+    char short_name() const{
+      return snam;
+    }
+
+    const std::string &description() const {
+      return desc;
+    }
+
+    std::string short_description() const{
+      return "--"+nam;
+    }
+
+  private:
+    std::string nam;
+    char snam;
+    std::string desc;
+    bool has;
+  };
+
+  template <class T>
+  class option_with_value : public option_base {
+  public:
+    option_with_value(const std::string &name,
+                      char short_name,
+                      bool need,
+                      const T &def,
+                      const std::string &desc)
+      : nam(name), snam(short_name), need(need), has(false)
+      , def(def), actual(def) {
+      this->desc=full_description(desc);
+    }
+    ~option_with_value(){}
+
+    const T &get() const {
+      return actual;
+    }
+
+    bool has_value() const { return true; }
+
+    bool set(){
+      return false;
+    }
+
+    bool set(const std::string &value){
+      try{
+        actual=read(value);
+        has=true;
+      }
+      catch(const std::exception &e){
+        return false;
+      }
+      return true;
+    }
+
+    bool has_set() const{
+      return has;
+    }
+
+    bool valid() const{
+      if (need && !has) return false;
+      return true;
+    }
+
+    bool must() const{
+      return need;
+    }
+
+    const std::string &name() const{
+      return nam;
+    }
+
+    char short_name() const{
+      return snam;
+    }
+
+    const std::string &description() const {
+      return desc;
+    }
+
+    std::string short_description() const{
+      return "--"+nam+"="+detail::readable_typename<T>();
+    }
+
+  protected:
+    std::string full_description(const std::string &desc){
+      return
+        desc+" ("+detail::readable_typename<T>()+
+        (need?"":" [="+detail::lexical_cast<std::string>(def)+"]")
+        +")";
+    }
+
+    virtual T read(const std::string &s)=0;
+
+    std::string nam;
+    char snam;
+    bool need;
+    std::string desc;
+
+    bool has;
+    T def;
+    T actual;
+  };
+
+  template <class T, class F>
+  class option_with_value_with_reader : public option_with_value<T> {
+  public:
+    option_with_value_with_reader(const std::string &name,
+                                  char short_name,
+                                  bool need,
+                                  const T def,
+                                  const std::string &desc,
+                                  F reader)
+      : option_with_value<T>(name, short_name, need, def, desc), reader(reader){
+    }
+
+  private:
+    T read(const std::string &s){
+      return reader(s);
+    }
+
+    F reader;
+  };
+
+  std::map<std::string, option_base*> options;
+  std::vector<option_base*> ordered;
+  std::string ftr;
+
+  std::string prog_name;
+  std::vector<std::string> others;
+
+  std::vector<std::string> errors;
+};
+
+} // cmdline
diff --git a/printbibfromnxmlorcasfile/main.cpp b/printbibfromnxmlorcasfile/main.cpp
new file mode 100644
index 0000000..5c87813
--- /dev/null
+++ b/printbibfromnxmlorcasfile/main.cpp
@@ -0,0 +1,271 @@
+/*
+ * File:   main.cpp
+ * Author: mueller
+ *
+ * Created on July 18, 2014, 10:27 AM
+ */
+
+#define TPCAS2TPCENTRALDESCRIPTOR "/usr/local/uima_descriptors/Tpcas2TpCentral.xml"
+
+#include "xercesc/util/XMLString.hpp"
+#include <uima/api.hpp>
+#include "uima/xmideserializer.hpp"
+#include <boost/regex.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/iostreams/filtering_streambuf.hpp>
+#include <boost/iostreams/copy.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <fstream>
+#include <iostream>
+#include "cmdline.h"
+
+/*
+ * getXMLstring and GetBibFromXML were written by Yuling Li.
+ */
+
+namespace {
+
+    std::string getXMLstring(uima::CAS & tcas) {
+        uima::UnicodeStringRef usdocref = tcas.getDocumentText();
+        if (usdocref.length() > 0) {
+            std::string xmlstring = usdocref.asUTF8();
+            return xmlstring;
+        } else {
+            return "";
+        }
+    }
+
+    std::vector<std::string> GetBibFromXML(std::string xml_text) {
+        boost::regex nline("\\n");
+        xml_text = boost::regex_replace(xml_text, nline, "");
+        //find author
+        std::string t_xmltext = xml_text;
+        boost::regex authorregex("\<contrib-group\>(.+?)\<\/contrib-group\>");
+        boost::smatch author_matches;
+        std::string author = "";
+        while (boost::regex_search(t_xmltext, author_matches, authorregex)) {
+            int size = author_matches.size();
+            std::string hit_text = author_matches[1];
+            boost::smatch name_matches;
+            boost::regex nameregex("\<surname\>(.+?)\<\/surname\>\\s+\<given-names\>(.+?)\<\/given-names\>");
+            while (boost::regex_search(hit_text, name_matches, nameregex)) {
+                author = author + name_matches[1] + " " + name_matches[2] + ", ";
+                hit_text = name_matches.suffix().str();
+            }
+            t_xmltext = author_matches.suffix().str();
+        }
+        boost::regex comma("\\, $");
+        author = boost::regex_replace(author, comma, "");
+        //find subject
+        t_xmltext = xml_text;
+        boost::regex subjectregex("\<subject\>(.+?)\<\/subject>");
+        boost::smatch subject_matches;
+        std::string subject = "";
+        while (boost::regex_search(t_xmltext, subject_matches, subjectregex)) {
+            subject = subject + subject_matches[1] + ", ";
+            t_xmltext = subject_matches.suffix().str();
+        }
+        subject = boost::regex_replace(subject, comma, "");
+        //find accession
+        t_xmltext = xml_text;
+        std::string accession = "";
+        boost::regex pmidregex("\<article-id pub-id-type=\"pmid\"\>(\\d+?)\<\/article-id\>");
+        boost::regex pmcregex("\<article-id pub-id-type=\"pmc\"\>(\\d+?)\<\/article-id\>");
+        boost::smatch pmid_matches;
+        boost::smatch pmc_matches;
+        if (boost::regex_search(t_xmltext, pmid_matches, pmidregex)) {
+            accession = "PMID " + pmid_matches[1];
+        } else if (boost::regex_search(t_xmltext, pmc_matches, pmcregex)) {
+            accession = "PMC " + pmc_matches[1];
+        }
+        // find article type
+        t_xmltext = xml_text;
+        std::string type = "";
+        boost::regex typeregex("article-type=\"(.+?)\"");
+        boost::smatch type_matches;
+        if (boost::regex_search(t_xmltext, type_matches, typeregex)) {
+            type = type_matches[1];
+        }
+        // find journal
+        t_xmltext = xml_text;
+        std::string journal = "";
+        boost::regex journalregex("\<journal-title\>(.+?)\<\/journal-title\>");
+        boost::smatch journal_matches;
+        if (boost::regex_search(t_xmltext, journal_matches, journalregex)) {
+            journal = journal_matches[1];
+        }
+        // find article title
+        t_xmltext = xml_text;
+        std::string title = "";
+        boost::regex articleregex("\<article-title\>(.+?)\<\/article-title\>");
+
+        boost::smatch article_matches;
+        if (boost::regex_search(t_xmltext, article_matches, articleregex)) {
+            title = article_matches[1];
+        }
+        // find abstract
+        t_xmltext = xml_text;
+        std::string abstract = "";
+        boost::regex abstractregex("\<abstract\>(.+?)\<\/abstract\>");
+        boost::smatch abstract_matches;
+        if (boost::regex_search(t_xmltext, abstract_matches, abstractregex)) {
+            abstract = abstract_matches[1];
+        }
+        // find citation
+        t_xmltext = xml_text;
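+        // The citation string assembled below has the fixed layout
+        // "V : <volume> (<issue>) pp. <fpage>-<lpage>"; each component is
+        // optional and is simply skipped when its tag is absent from the XML.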
+ std::string citation = ""; + boost::regex volumeregex("\(\\d+)\<\/volume\>"); + boost::smatch volume_matches; + if (boost::regex_search(t_xmltext, volume_matches, volumeregex)) { + citation = citation + "V : " + volume_matches[1] + " "; + } + boost::regex issueregex("\(\\d+)\<\/issue\>"); + boost::smatch issue_matches; + if (boost::regex_search(t_xmltext, issue_matches, issueregex)) { + citation = citation + "(" + issue_matches[1] + ") "; + } + boost::regex pageregex("\(\\d+)\<\/fpage\>\\s+\(\\d+)\<\/lpage\>"); + boost::smatch page_matches; + if (boost::regex_search(t_xmltext, page_matches, pageregex)) { + citation = citation + "pp. " + page_matches[1] + "-" + page_matches[2]; + } + // find year + t_xmltext = xml_text; + std::string year = ""; + boost::regex yearregex("\.*?\(\\d+)\<\/year\>\\s+\<\/pub-date\>"); + boost::smatch year_matches; + if (boost::regex_search(t_xmltext, year_matches, yearregex)) { + year = year_matches[1]; + } + std::vector bibinfo; + bibinfo.push_back(author); + bibinfo.push_back(accession); + bibinfo.push_back(type); + bibinfo.push_back(title); + bibinfo.push_back(journal); + bibinfo.push_back(citation); + bibinfo.push_back(year); + bibinfo.push_back(abstract); + bibinfo.push_back(subject); + return bibinfo; + } + + std::string uncompressGzip2(std::string gzFile) { + std::ifstream filein(gzFile.c_str(), std::ios_base::in | std::ios_base::binary); + boost::iostreams::filtering_streambuf in; + in.push(boost::iostreams::gzip_decompressor()); + in.push(filein); + char tmpname[L_tmpnam]; + char * pDummy = tmpnam(tmpname); + std::string tmpfile(tmpname); + while (boost::filesystem::exists(tmpfile)) { + char * pDummy = tmpnam(tmpname); + tmpfile = std::string(tmpname); + } + std::ofstream out(tmpfile.c_str()); + boost::iostreams::copy(in, out); + out.close(); + return tmpfile; + } + + //[ Uima related + + uima::AnalysisEngine * CreateUimaEngine(const char * descriptor) { + uima::ErrorInfo errorInfo; + uima::AnalysisEngine * ret = uima::Framework::createAnalysisEngine(descriptor, errorInfo); + if (errorInfo.getErrorId() != UIMA_ERR_NONE) { + std::cerr << std::endl + << " Error string : " + << uima::AnalysisEngine::getErrorIdAsCString(errorInfo.getErrorId()) + << std::endl + << " UIMACPP Error info:" << std::endl + << errorInfo << std::endl; + } + return ret; + } + + uima::CAS * GetCas(const char * pszInputFile, uima::AnalysisEngine * pEngine) { + uima::CAS * ret = pEngine->newCAS(); + if (ret == NULL) { + std::cerr << "pEngine_->newCAS() failed." 
<< std::endl; + } else { + try { + /* initialize from an xmicas */ + XMLCh * native = XMLString::transcode(pszInputFile); + LocalFileInputSource fileIS(native); + XMLString::release(&native); + uima::XmiDeserializer::deserialize(fileIS, * ret, true); + } catch (uima::Exception e) { + uima::ErrorInfo errInfo = e.getErrorInfo(); + std::cerr << "Error " << errInfo.getErrorId() << " " << errInfo.getMessage() << std::endl; + std::cerr << errInfo << std::endl; + } + } + return ret; + } + //] Uima related +} + +int main(int argc, char * argv[]) { + + cmdline::parser p; + p.set_program_name("printbibfromcasfile"); + p.add("abstract", 'a', "print abstract"); + p.add("author", 'u', "print author"); + p.add("accession", 'c', "print citation"); + p.add("citation", 'i', "print citation"); + p.add("journal", 'j', "print journal"); + p.add("subject", 's', "print subject"); + p.add("title", 't', "print title"); + p.add("type", 'p', "print type"); + p.add("year", 'y', "print year"); + p.add("nxml", 'n', "file is in nxml format, not gzipped cas."); + p.footer(""); + if (p.parse(argc, argv) == 0) { + std::cerr << "Error:" << p.error() << std::endl + << p.usage() << std::endl; + return -1; + } + if (argc < 3) { + std::cerr << p.usage() << std::endl; + return -1; + } + std::string filename; + if (p.rest().size() > 0) filename = p.rest()[0]; + // + std::vector bib_info; + if (p.exist("nxml")) { + std::ifstream f(filename.c_str()); + std::string in; + std::string all; + while (getline(f, in)) all += in; + f.close(); + bib_info = GetBibFromXML(all); + } else { + (void) uima::ResourceManager::createInstance("TPCAS2TPCENTRALAE"); + uima::AnalysisEngine * pEngine = CreateUimaEngine(TPCAS2TPCENTRALDESCRIPTOR); + std::string tmpfl = uncompressGzip2(filename); + uima::CAS * pcas = GetCas(tmpfl.c_str(), pEngine); + boost::filesystem::remove(tmpfl); + bib_info = GetBibFromXML(getXMLstring(*pcas)); + } + std::string l_author = bib_info[0]; + std::string l_accession = bib_info[1]; + std::string l_type = bib_info[2]; + std::string l_title = bib_info[3]; + std::string l_journal = bib_info[4]; + std::string l_citation = bib_info[5]; + std::string l_year = bib_info[6]; + std::string l_abstract = bib_info[7]; + std::string l_subject = bib_info[8]; + std::cout << "Filename:" << filename << "\t"; + if (p.exist("title")) std::cout << "Title:" << l_title << "\t"; + if (p.exist("author")) std::cout << "Author:" << l_author << "\t"; + if (p.exist("accession")) std::cout << "Accession:" << l_accession << "\t"; + if (p.exist("type")) std::cout << "Type:" << l_type << "\t"; + if (p.exist("journal")) std::cout << "Journal:" << l_journal << "\t"; + if (p.exist("citation")) std::cout << "Citation:" << l_citation << "\t"; + if (p.exist("year")) std::cout << "Year:" << l_year << "\t"; + if (p.exist("abstract")) std::cout << "Abstract:" << l_abstract << "\t"; + if (p.exist("subject")) std::cout << "Subject:" << l_subject << "\t"; + std::cout << std::endl; + return 0; +} diff --git a/run_tpc_pipeline_incremental.sh b/run_tpc_pipeline_incremental.sh new file mode 100644 index 0000000..3dbb24a --- /dev/null +++ b/run_tpc_pipeline_incremental.sh @@ -0,0 +1,253 @@ +#!/usr/bin/env bash + +function usage { + echo "usage: $(basename $0) [-p]" + echo " -p --pdf-dir directory where raw pdf files will be stored" + echo " -x --xml-dir directory where raw xml files will be stored" + echo " -c --cas1-dir directory where generated cas1 files will be stored" + echo " -C --cas2-dir directory where generated cas2 files will be stored" + echo " -t --tmp-dir 
temp directory" + echo " -f --ftp-dir ftp mount point for pmcoa papers" + echo " -P --num-proc maximum number of parallel processes" + echo " -h --help display help" + exit 1 +} + +if [[ "${#}" < 2 ]] +then + usage +fi + +PDF_DIR="/data/textpresso/raw_files/pdf" +XML_DIR="/data/textpresso/raw_files/xml" +CAS2_DIR="/data/textpresso/tpcas" +CAS1_DIR="/data/textpresso/tpcas-1" +TMP_DIR="/data/textpresso/tmp" +FTP_MNTPNT="/mnt/pmc_ftp" +INDEX_DIR="/data/textpresso/luceneindex" +N_PROC=1 + +while [[ $# -gt 0 ]] +do +key=$1 + +case $key in + -p|--pdf-dir) + shift + if [[ -d $key ]] + then + PDF_DIR="$key" + fi + shift + ;; + -x|--xml-dir) + shift + if [[ -d $key ]] + then + XML_DIR="$key" + fi + shift + ;; + -c|--cas1-dir) + shift + if [[ -d $key ]] + then + CAS1_DIR="$key" + fi + shift + ;; + -C|--cas2-dir) + shift + if [[ -d $key ]] + then + CAS2_DIR="$key" + fi + shift + ;; + -t|--tmp-dir) + shift + if [[ -d $key ]] + then + TMP_DIR="$key" + fi + shift + ;; + -f|--ftp-dir) + shift + if [[ -d $key ]] + then + FTP_MNTPNT="$key" + fi + shift + ;; + -P|--num-proc) + shift + N_PROC=$1 + shift + ;; + -h|--help) + usage + ;; + *) + if [[ -d $key ]] + then + ROOT_DIR="$key" + shift + else + usage + fi + ;; +esac +done + +# temp files +logfile=$(mktemp) +newpdf_list=$(mktemp) +newxml_list=$(mktemp) +newxml_local_list=$(mktemp) + +# download new xml files from pmcoa +## create directory for unclassified xml files +mkdir -p ${XML_DIR} +## mount pmcoa ftp locally through curl +curlftpfs ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/ ${FTP_MNTPNT} +#find ${FTP_MNTPNT} *.gz | xargs ls -d -l --time-style="full-iso" | awk '{if (substr($1,0,1) == "-") print $6, $7, $9}' > ${newxml_list} +# save list of tazendra files locally +for dir in ${FTP_MNTPNT}/*; do for subdir in ${dir}/*; do ls -d -l --time-style="full-iso" ${subdir}/* | awk '{print $6, $7, $9}' >> ${newxml_list}; done; done +if [[ -e ${XML_DIR}/current_filelist.txt ]] +then + # download diff files + diff ${newxml_list} ${XML_DIR}/current_filelist.txt | grep "^<" | awk '{print $3}' | xargs -I {} tar xfz {} --exclude="*.pdf" --exclude="*.PDF" --exclude="*.mp4" --exclude="*.webm" --exclude="*.flv" --exclude="*.avi" --exclude="*.zip" --exclude="*.mov" --exclude="*.csv" --exclude="*.xls*" --exclude="*.doc*" --exclude="*.ppt*" --exclude="*.rar" --exclude="*.txt" --exclude="*.TXT" --exclude="*.wmv" --exclude="*.DOC*" -C ${XML_DIR} + # save new current list + diff ${newxml_list} ${XML_DIR}/current_filelist.txt | grep "^<" | awk '{print $3}' >> ${XML_DIR}/current_filelist.txt +else + # download all files + awk '{print $3}' ${newxml_list} | xargs -I {} tar xfz {} --exclude="*.pdf" --exclude="*.PDF" --exclude="*.mp4" --exclude="*.webm" --exclude="*.flv" --exclude="*.avi" --exclude="*.zip" --exclude="*.mov" --exclude="*.csv" --exclude="*.xls*" --exclude="*.doc*" --exclude="*.ppt*" --exclude="*.rar" --exclude="*.txt" --exclude="*.TXT" --exclude="*.wmv" --exclude="*.DOC*" -C ${XML_DIR} + # save file list as current + cp ${newxml_list} ${XML_DIR}/current_filelist.txt +fi +umount ${FTP_MNTPNT} + +# save new xml local file list +cut -d " " -f 3 ${newxml_list} | sed "s/\/mnt\/pmc\_ftp\/.\{2\}\/.\{2\}\///g;s/\.tar\.gz//g" | xargs -I {} echo ${XML_DIR}/{} > ${newxml_local_list} + +# compress nxml and put images in a separate directory +cat ${newxml_local_list} | while read line +do + gzip $line/*.nxml; mkdir $line/images; ls -d $line/* | grep -v .nxml | grep -v $line/images | xargs -I [] mv [] $line/images +done + +# download new pdf files incrementally from tazendra 
+getpdfs.py -l ${logfile} -L INFO ${PDF_DIR} "${XML_DIR}/PMCOA C. elegans"
+grep -oP "Downloading paper: .* to \K.*\.pdf" ${logfile} > ${newpdf_list}
+
+# download bib info for pdfs
+mkdir -p /usr/local/textpresso/celegans_bib
+download_pdfinfo.pl /usr/local/textpresso/celegans_bib/
+extract_pdfbibinfo.pl /usr/local/textpresso/celegans_bib/
+
+# generate tpcas-1 from new pdf files
+mkdir -p ${CAS1_DIR}/C.\ elegans
+mkdir -p ${CAS1_DIR}/C.\ elegans\ Supplementals
+cd ${CAS1_DIR}
+articles2cas -i ${PDF_DIR}/C.\ elegans -l ${newpdf_list} -t 1 -o C.\ elegans -p
+articles2cas -i ${PDF_DIR}/C.\ elegans\ Supplementals -l ${newpdf_list} -t 1 -o C.\ elegans\ Supplementals -p
+
+# generate tpcas-1 from new nxml files
+mkdir -p ${CAS1_DIR}/PMCOA
+cd ${CAS1_DIR}
+articles2cas -i "${XML_DIR}" -l <(awk 'BEGIN{FS="/"}{print $NF}' ${newxml_local_list}) -t 2 -o PMCOA -p
+
+# add images to tpcas directory and gzip
+## xml
+cat ${newxml_local_list} | while read line
+do
+    dirname=$(echo ${line} | awk 'BEGIN{FS="/"}{print $NF}')
+    rm -rf "${CAS1_DIR}/PMCOA/${dirname}/images"
+    ln -fs "${XML_DIR}/${dirname}/images" "${CAS1_DIR}/PMCOA/${dirname}/images"
+    find "${CAS1_DIR}/PMCOA/${dirname}" -name "*.tpcas" | xargs -I {} gzip "{}"
+done
+## pdf
+cat ${newpdf_list} | while read line
+do
+    gzip "${CAS1_DIR}/$(echo "${line}" | awk 'BEGIN{FS="/"}{print $(NF-2)"/"$(NF-1)"/"$NF}')"
+done
+
+# generate cas2 files from cas1
+## copy files to temp directory
+rm -rf ${TMP_DIR}/tpcas-1
+## xml
+mkdir -p ${TMP_DIR}/tpcas-1/xml
+cat ${newxml_local_list} | while read line
+do
+    dirname=$(echo ${line} | awk 'BEGIN{FS="/"}{print $NF}')
+    find "${CAS1_DIR}/PMCOA/${dirname}" -name "*.tpcas.gz" | xargs -I {} cp "{}" ${TMP_DIR}/tpcas-1/xml/${dirname}.tpcas.gz
+done
+
+mkdir -p ${TMP_DIR}/tpcas-1/pdf_celegans
+mkdir -p ${TMP_DIR}/tpcas-1/pdf_celegans_sup
+# TODO check if "line" contains only the file name
+grep -v "Supplementals" ${newpdf_list} | while read line
+do
+    find "${CAS1_DIR}/C. elegans/${line}" -name "*.tpcas.gz" | xargs -I {} cp "{}" ${TMP_DIR}/tpcas-1/pdf_celegans/${line}.tpcas.gz
+done
+grep "Supplementals" ${newpdf_list} | while read line
+do
+    find "${CAS1_DIR}/C. elegans Supplementals/${line}" -name "*.tpcas.gz" | xargs -I {} cp "{}" ${TMP_DIR}/tpcas-1/pdf_celegans_sup/${line}.tpcas.gz
+done
+
+## apply uima analysis
+rm -rf "${TMP_DIR}/tpcas-2"
+mkdir -p "${TMP_DIR}/tpcas-2/xml"
+mkdir -p "${TMP_DIR}/tpcas-2/pdf_celegans"
+mkdir -p "${TMP_DIR}/tpcas-2/pdf_celegans_sup"
+find ${TMP_DIR}/tpcas-1 -name "*.tpcas.gz" | xargs -n 1 -P ${N_PROC} gunzip
+runAECpp /usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml -xmi ${TMP_DIR}/tpcas-1/xml ${TMP_DIR}/tpcas-2/xml
+runAECpp /usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml -xmi ${TMP_DIR}/tpcas-1/pdf_celegans ${TMP_DIR}/tpcas-2/pdf_celegans
+runAECpp /usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml -xmi ${TMP_DIR}/tpcas-1/pdf_celegans_sup ${TMP_DIR}/tpcas-2/pdf_celegans_sup
+find ${TMP_DIR}/tpcas-2 -name "*.tpcas" | xargs -n 1 -P ${N_PROC} gzip
+
+# copy tpcas1 dirs to tpcas2 and replace tpcas files with the new ones
+mkdir -p "${CAS2_DIR}/PMCOA"
+mkdir -p "${CAS2_DIR}/C. elegans"
+mkdir -p "${CAS2_DIR}/C. elegans Supplementals"
+## xml
+cat ${newxml_local_list} | while read line
+do
+    dirname=$(echo ${line} | awk 'BEGIN{FS="/"}{print $NF}')
+    tpcas_file_name=$(ls ${CAS1_DIR}/PMCOA/${dirname}/*.tpcas.gz | awk 'BEGIN{FS="/"}{print $NF}')
+    mkdir "${CAS2_DIR}/PMCOA/${dirname}"
+    ln -s "${CAS1_DIR}/PMCOA/${dirname}/images" "${CAS2_DIR}/PMCOA/${dirname}/images"
+    cp ${TMP_DIR}/tpcas-2/xml/${dirname}.tpcas.gz "${CAS2_DIR}/PMCOA/${dirname}/${tpcas_file_name}"
+done
+## pdf (the annotated cas2 files live under tpcas-2; copy those, not the tpcas-1 inputs)
+grep -v "Supplementals" ${newpdf_list} | while read line
+do
+    mkdir "${CAS2_DIR}/C. elegans/${line}"
+    ln -s "${CAS1_DIR}/C. elegans/${line}/images" "${CAS2_DIR}/C. elegans/${line}/images"
+    find "${CAS1_DIR}/C. elegans/${line}/" -name "*.tpcas.gz" | awk 'BEGIN{FS="/"}{print $NF}' | xargs -I {} cp ${TMP_DIR}/tpcas-2/pdf_celegans/"{}" "${CAS2_DIR}/C. elegans/${line}/"
+done
+grep "Supplementals" ${newpdf_list} | while read line
+do
+    mkdir "${CAS2_DIR}/C. elegans Supplementals/${line}"
+    ln -s "${CAS1_DIR}/C. elegans Supplementals/${line}/images" "${CAS2_DIR}/C. elegans Supplementals/${line}/images"
+    find "${CAS1_DIR}/C. elegans Supplementals/${line}/" -name "*.tpcas.gz" | awk 'BEGIN{FS="/"}{print $NF}' | xargs -I {} cp ${TMP_DIR}/tpcas-2/pdf_celegans_sup/"{}" "${CAS2_DIR}/C. elegans Supplementals/${line}/"
+done
+
+# generate bib files for cas files
+# TODO: check from here
+getallbibfiles.sh -p ${N_PROC} ${CAS2_DIR}
+
+if [[ ! -d ${INDEX_DIR} || $(ls ${INDEX_DIR} | grep -v "subindex.config" | wc -l) == "0" ]]
+then
+    mkdir -p ${INDEX_DIR}
+    createallindexes -p ${N_PROC} ${CAS2_DIR} ${INDEX_DIR}
+else
+    cas2index -i ${CAS2_DIR} -o ${INDEX_DIR}
+fi
+# cleanup tmp files
+rm -rf ${TMP_DIR}
+rm ${logfile}
+rm ${newpdf_list}
+rm ${newxml_list}
+rm ${newxml_local_list}
\ No newline at end of file
diff --git a/useruploads/run_userupload_pipeline_incremental.sh b/useruploads/run_userupload_pipeline_incremental.sh
new file mode 100755
index 0000000..7e90633
--- /dev/null
+++ b/useruploads/run_userupload_pipeline_incremental.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+USERUPLOADS_DIR="/usr/local/textpresso/useruploads"
+
+for user_dir in ${USERUPLOADS_DIR}/*
+do
+    cd ${user_dir}
+    username=${PWD##*/}
+    mkdir -p ${user_dir}/tpcas
+    mkdir -p ${user_dir}/tmp/cas1
+    mkdir -p ${user_dir}/tmp/cas2
+    mkdir -p ${user_dir}/useruploads/${username}
+    touch ${user_dir}/tpcas/processed_files.txt
+    touch ${user_dir}/tpcas/tokenized_files.txt
+    tmpfile=$(mktemp)
+    grep -vxf ${user_dir}/tpcas/processed_files.txt <(ls -1 ${user_dir}/uploadedfiles) > ${tmpfile}
+    if [[ $(grep ".pdf" ${tmpfile} | wc -l | awk '{print $1}') != "0" ]]
+    then
+        articles2cas -t 1 -i uploadedfiles -o useruploads/${username} -l <(grep ".pdf" ${tmpfile})
+    fi
+    if [[ $(grep ".nxml" ${tmpfile} | wc -l | awk '{print $1}') != "0" ]]
+    then
+        articles2cas -t 2 -i ${user_dir}/uploadedfiles -o useruploads/${username} -l <(grep ".nxml" ${tmpfile})
+    fi
+    # TODO process compressed archives
+    mv useruploads/${username}/* ${user_dir}/tpcas/
+    rm -rf useruploads
+    cat ${tmpfile} >> ${user_dir}/tpcas/tokenized_files.txt
+    grep -xf <(sed -e 's/\.[^.]*$//' ${tmpfile}) <(ls ${user_dir}/tpcas/) | xargs -I {} cp ${user_dir}/tpcas/{}/{}.tpcas ${user_dir}/tmp/cas1
+    if [[ $(ls ${user_dir}/tmp/cas1/ | wc -l | awk '{print $1}') != "0" ]]
+    then
+        runAECpp /usr/local/uima_descriptors/TpLexiconAnnotatorFromPg.xml -xmi ${user_dir}/tmp/cas1 ${user_dir}/tmp/cas2
+    fi
+    for tpcas2_file in $(ls ${user_dir}/tmp/cas2/*)
+    do
+        mv ${tpcas2_file} ${user_dir}/tpcas/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//')
+        if [[ -f ${user_dir}/uploadedfiles/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//').bib ]]
+        then
+            cp ${user_dir}/uploadedfiles/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//').bib ${user_dir}/tpcas/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//')
+        fi
+        gzip ${user_dir}/tpcas/$(basename ${tpcas2_file} | sed -e 's/\.[^.]*$//')/$(basename ${tpcas2_file})
+    done
+    rm -rf ${user_dir}/tmp/
+    cat ${tmpfile} >> ${user_dir}/tpcas/processed_files.txt
+    rm ${tmpfile}
+    mkdir -p /usr/local/textpresso/tpcas/useruploads/${username}
+    cd tpcas
+    find . -mindepth 1 -maxdepth 1 -type d | xargs -I {} ln -s ${user_dir}/tpcas/{} /usr/local/textpresso/tpcas/useruploads/${username}/{}
+    if [[ ! -d ${user_dir}/luceneindex ]]
+    then
+        mkdir -p ${user_dir}/luceneindex
+        cas2index -i ${user_dir}/tpcas -o ${user_dir}/luceneindex -s 300000 -e
+    fi
+done
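+
+# note: this script is meant to be run periodically over all user upload
+# directories; a minimal sketch of a cron entry (assuming the script is
+# installed in /usr/local/bin; path and schedule are illustrative only):
+#   0 2 * * * /usr/local/bin/run_userupload_pipeline_incremental.sh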