diff --git a/.github/workflows/linux-self-hosted.yml b/.github/workflows/linux-self-hosted.yml index 0e9b420..0b368e4 100644 --- a/.github/workflows/linux-self-hosted.yml +++ b/.github/workflows/linux-self-hosted.yml @@ -23,7 +23,7 @@ jobs: runs-on: [linux-self-hosted , agc] needs: checkout env: - EXE_AGC: ./agc + EXE_AGC: ./bin/agc steps: # - name: make (g++ 9) # run: | @@ -31,16 +31,20 @@ jobs: # make CXX=g++-9 agc # make clean # - - name: make (g++ 10) - run: | - make clean - make CXX=g++-10 agc - make clean - +# - name: make (g++ 10) +# run: | +# make clean +# make CXX=g++-10 agc +# make clean +# - name: make (g++ 11) run: | make CXX=g++-11 agc make clean + - name: make (g++ 12) + run: | + make CXX=g++-12 agc + make clean - name: make (default) run: | @@ -70,7 +74,7 @@ jobs: - name: camp all together run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN @@ -78,7 +82,7 @@ jobs: - name: camp one by one run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN @@ -108,7 +112,7 @@ jobs: - name: camp all together PAR $PARAMS run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN $PARAMS @@ -116,7 +120,7 @@ jobs: - name: camp one by one $PARAMS run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN $ADAPTIVE_APPEND $PARAMS @@ -142,7 +146,7 @@ jobs: - name: salmo all together run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN @@ -151,7 +155,7 @@ jobs: - name: salmo one by one run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN $ADAPTIVE_APPEND @@ 
-177,7 +181,7 @@ jobs: - name: salmo all together run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN $PARAMS @@ -185,7 +189,7 @@ jobs: - name: salmo one by one run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN $ADAPTIVE_APPEND $PARAMS @@ -212,7 +216,7 @@ jobs: - name: covid all together create + getcol run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME mkdir $OUT_NAME @@ -226,7 +230,7 @@ jobs: - name: covid all together create -a + getset run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME mkdir $OUT_NAME @@ -237,7 +241,7 @@ jobs: do echo $i SMPL=`sed " $i q;d" $OUT_NAME/$SET_LIST` - echo "./agc getset -l $LINE_LEN $OUT_NAME/$OUT_NAME.agc $SMPL > $OUT_NAME/$SMPL.fa" + echo "./bin/agc getset -l $LINE_LEN $OUT_NAME/$OUT_NAME.agc $SMPL > $OUT_NAME/$SMPL.fa" $EXE_AGC getset -l $LINE_LEN $OUT_NAME/$OUT_NAME.agc $SMPL > $OUT_NAME/$SMPL.fa done cd $OUT_NAME @@ -248,7 +252,7 @@ jobs: - name: covid all together create -a + append + getset run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME mkdir $OUT_NAME @@ -260,7 +264,7 @@ jobs: do echo $i SMPL=`sed " $i q;d" $OUT_NAME/$SET_LIST` - echo "./agc getset -l $LINE_LEN $OUT_NAME/$OUT_NAME-APPEND.agc $SMPL > $OUT_NAME/$SMPL.fa" + echo "./bin/agc getset -l $LINE_LEN $OUT_NAME/$OUT_NAME-APPEND.agc $SMPL > $OUT_NAME/$SMPL.fa" $EXE_AGC getset -l $LINE_LEN $OUT_NAME/$OUT_NAME-APPEND.agc $SMPL > $OUT_NAME/$SMPL.fa done cd $OUT_NAME @@ -291,7 +295,7 @@ jobs: - name: covid all together create + getcol run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME mkdir $OUT_NAME @@ -324,7 +328,7 @@ jobs: - name: hprc all together run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN @@ -332,7 +336,7 @@ jobs: - 
name: hprc one by one run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN @@ -359,7 +363,7 @@ jobs: - name: hprc all together run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN $PARAMS @@ -367,7 +371,7 @@ jobs: - name: hprc one by one run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN $ADAPTIVE_APPEND $PARAMS @@ -391,7 +395,7 @@ jobs: - name: hgsvc all together run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN @@ -416,7 +420,7 @@ jobs: - name: hgsvc one by one run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN diff --git a/.github/workflows/mac-self-hosted.yml b/.github/workflows/mac-self-hosted.yml index 186fd65..c9057ab 100644 --- a/.github/workflows/mac-self-hosted.yml +++ b/.github/workflows/mac-self-hosted.yml @@ -33,9 +33,9 @@ jobs: with: submodules: recursive - - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: latest-stable + # - uses: maxim-lobanov/setup-xcode@v1 + # with: + # xcode-version: latest-stable # xcode-version: '14.1-beta' make-tests: @@ -46,14 +46,14 @@ jobs: runs-on: ['${{ matrix.runner }}'] needs: checkout env: - EXE_AGC: ./agc + EXE_AGC: ./bin/agc steps: - uses: actions/checkout@v4 with: submodules: recursive - - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: latest-stable + # - uses: maxim-lobanov/setup-xcode@v1 + # with: + # xcode-version: latest-stable # xcode-version: '14.1-beta' # - name: make (g++ 9) @@ -67,10 +67,10 @@ jobs: # make CXX=g++-10 agc # make clean - - name: make (g++ 11) + - name: gmake (g++ 13) run: | - make clean - make CXX=g++-11 agc + gmake clean + 
gmake CXX=g++-13 agc #- name: make (default) # run: | @@ -104,7 +104,7 @@ jobs: - name: camp all together run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN @@ -113,7 +113,7 @@ jobs: - name: camp one by one run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN @@ -147,7 +147,7 @@ jobs: - name: camp all together PAR $PARAMS run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN $PARAMS @@ -156,7 +156,7 @@ jobs: - name: camp one by one $PARAMS run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN $ADAPTIVE_APPEND $PARAMS @@ -186,7 +186,7 @@ jobs: - name: salmo all together run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN @@ -196,7 +196,7 @@ jobs: - name: salmo one by one run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN $ADAPTIVE_APPEND @@ -226,7 +226,7 @@ jobs: - name: salmo all together run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN $PARAMS @@ -235,7 +235,7 @@ jobs: - name: salmo one by one run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN $ADAPTIVE_APPEND $PARAMS @@ -266,7 +266,7 @@ jobs: - name: covid all together create + getcol run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME mkdir 
$OUT_NAME @@ -281,7 +281,7 @@ jobs: - name: covid all together create -a + getset run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME mkdir $OUT_NAME @@ -304,7 +304,7 @@ jobs: - name: covid all together create -a + append + getset run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME mkdir $OUT_NAME @@ -351,7 +351,7 @@ jobs: - name: covid all together create + getcol run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME mkdir $OUT_NAME @@ -388,7 +388,7 @@ jobs: - name: hprc all together run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN @@ -396,7 +396,7 @@ jobs: - name: hprc one by one run: | - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN @@ -427,7 +427,7 @@ jobs: - name: hprc all together run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN $PARAMS @@ -436,7 +436,7 @@ jobs: - name: hprc one by one run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN $ADAPTIVE_APPEND $PARAMS @@ -464,7 +464,7 @@ jobs: - name: hgsvc all together run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-all ./run_agc_complete_compr_test $OUT_NAME-all $FASTA $REF_NUM $LINE_LEN @@ -493,7 +493,7 @@ jobs: - name: hgsvc one by one run: | rm -f $DATA/agc - cp agc $DATA/ + cp bin/agc $DATA/ cd $DATA rm -rf $OUT_NAME-1by1 ./run_agc_complete_one_by_one_test $OUT_NAME-1by1 $FASTA $LINE_LEN diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1821bd6..a05e4e1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -10,18 +10,18 @@ jobs: 
toy-example: strategy: matrix: - os: [ubuntu-latest, ubuntu-22.04] + os: [ubuntu-24.04, ubuntu-22.04] name: Toy Test GH linux runs-on: ${{ matrix.os }} env: - EXE_AGC: ../agc + EXE_AGC: ../bin/agc steps: - uses: actions/checkout@v4 with: submodules: recursive - - name: make CXX=g++-11 agc - run: make CXX=g++-11 agc + - name: make CXX=g++-12 agc + run: make CXX=g++-12 agc #CREATE @@ -150,7 +150,7 @@ jobs: cmp list list_dec - - name: toy example getset -c + - name: toy example getset -c run: | cd toy_ex $EXE_AGC getset toy.agc chr1 > chr1_dec.fa @@ -354,25 +354,26 @@ jobs: toy-example-macos: strategy: matrix: - os: [macos-latest, macos-14, macos-13, macos-12] + os: [macos-13, macos-12] - name: Make Test GitHub macos + name: Make Test GitHub macos runs-on: ${{ matrix.os }} env: - EXE_AGC: ../agc + EXE_AGC: ../bin/agc steps: - uses: actions/checkout@v4 with: submodules: recursive - - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: latest-stable - # xcode-version: '14.1' - - name: make CXX=g++-11 agc + # - uses: maxim-lobanov/setup-xcode@v4 + # with: + # xcode-version: latest-stable + # # xcode-version: '14.1' + - name: gmake CXX=g++-12 agc run: | - make clean - make CXX=g++-11 agc + brew install make + gmake clean + gmake CXX=g++-12 agc - name: toy example run: | diff --git a/.github/workflows/self-hosted-toy.yml b/.github/workflows/self-hosted-toy.yml index d677ccd..809fbad 100644 --- a/.github/workflows/self-hosted-toy.yml +++ b/.github/workflows/self-hosted-toy.yml @@ -1,5 +1,7 @@ name: Self-hosted CI TOY EXAMPLE + + on: #push: # paths-ignore: @@ -12,28 +14,42 @@ jobs: matrix: runner: [linux-self-hosted, mac-self-hosted-M1, mac-self-hosted-Intel] - name: Toy Test Runner + name: Toy Test Runner runs-on: ['${{ matrix.runner }}'] env: - EXE_AGC: ../agc + EXE_AGC: ../bin/agc steps: - uses: actions/checkout@v4 with: submodules: recursive - - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: latest-stable - # xcode-version: '14.1-beta' - if: ${{ 
matrix.runner == 'self-hosted-mac-M1' }} - - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: latest-stable + # - uses: maxim-lobanov/setup-xcode@v1.6.0 + # with: + # xcode-version: latest-stable # xcode-version: '14.1-beta' - if: ${{ matrix.runner == 'self-hosted-mac-Intel' }} - - name: make CXX=g++-11 agc - run: | - make clean - make CXX=g++-11 agc + # if: ${{ matrix.runner == 'self-hosted-mac-M1' }} + # - uses: maxim-lobanov/setup-xcode@v1.6.0 + # with: + # xcode-version: latest-stable + # # xcode-version: '14.1-beta' + # if: ${{ matrix.runner == 'self-hosted-mac-Intel' }} + # - name: make CXX=g++-11 agc + # run: | + # make clean + # make CXX=g++-11 agc + # - name: make CXX=g++-12 agc + # run: | + # make clean + # make CXX=g++-12 agc + + - name: make CXX=g++-13 agc + run: | + gmake clean + gmake CXX=g++-13 agc + # + # - name: make CXX=g++-14 agc + # run: | + # make clean + # make CXX=g++-14 agc #CREATE @@ -78,7 +94,7 @@ jobs: cmp list list_dec cmp list listgz_dec - - name: toy example getset + - name: toy example getset run: | cd toy_ex $EXE_AGC getset toy.agc a > a_dec.fa @@ -90,7 +106,21 @@ jobs: cmp c.fa c_dec.fa cmp ref.fa ref_dec.fa - - name: toy example getset -g (+gz) + + - name: toy example getset -p -s + run: | + cd toy_ex + $EXE_AGC getset -p -s toy.agc a > a_dec.fa + $EXE_AGC getset -p -s toy.agc b > b_dec.fa + $EXE_AGC getset -p -s toy.agc c > c_dec.fa + $EXE_AGC getset -p -s toy.agc ref > ref_dec.fa + cmp a.fa a_dec.fa + cmp b.fa b_dec.fa + cmp c.fa c_dec.fa + cmp ref.fa ref_dec.fa + + + - name: toy example getset -g (+gz) run: | cd toy_ex $EXE_AGC getset -g 9 toygz.agc a > a_dec.fa.gz @@ -190,6 +220,37 @@ jobs: $EXE_AGC getctg toy.agc g:3-4 > ctg_dec.fa cmp ctg.fa ctg_dec.fa + + - name: toy example getctg -p -s + run: | + cd toy_ex + head -n 2 a.fa > chr1a.fa + $EXE_AGC getctg -p -s toy.agc chr1a > chr1a_dec.fa + cmp chr1a.fa chr1a_dec.fa + + tail -n 2 b.fa > t.fa + $EXE_AGC getctg -p -s toy.agc t > t_dec.fa + cmp t.fa t_dec.fa + + head 
-n 2 ref.fa > ref-chr1.fa + $EXE_AGC getctg -p -s toy.agc chr1@ref > ref-chr1_dec.fa + cmp ref-chr1.fa ref-chr1_dec.fa + + echo ">chr1:3-10" > ctg.fa + echo "TAGCTAGC" >> ctg.fa + $EXE_AGC getctg -p -s toy.agc chr1@ref:3-10 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + + echo ">chr3a:1-3" > ctg.fa + echo "GTT" >> ctg.fa + $EXE_AGC getctg -p -s toy.agc chr3a:1-3 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + + echo ">g h i 21:3-4" > ctg.fa + echo "AG" >> ctg.fa + $EXE_AGC getctg -p -s toy.agc g:3-4 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + - name: toy example getctg -g (+ from gz) run: | @@ -250,7 +311,7 @@ jobs: cmp list list_dec - - name: toy example getset -c + - name: toy example getset -c run: | cd toy_ex $EXE_AGC getset toy.agc chr1 > chr1_dec.fa @@ -282,6 +343,40 @@ jobs: cmp c_.fa c_dec.fa cat t.fa t_dec.fa cmp t.fa t_dec.fa + + + - name: toy example getset -c -p -s + run: | + cd toy_ex + $EXE_AGC getset -p -s toy.agc chr1 > chr1_dec.fa + $EXE_AGC getset -p -s toy.agc 1 > 1_dec.fa + $EXE_AGC getset -p -s toy.agc 2 > 2_dec.fa + $EXE_AGC getset -p -s toy.agc 3 > 3_dec.fa + head -n 2 c.fa > 1.fa + head -n 4 c.fa | tail -n 2 > 2.fa + tail -n 2 c.fa > 3.fa + cat 1.fa 1_dec.fa + cmp 1.fa 1_dec.fa + cat 2.fa 2_dec.fa + cmp 2.fa 2_dec.fa + cat 3.fa 3_dec.fa + cmp 3.fa 3_dec.fa + cmp ref2.fa chr1_dec.fa + $EXE_AGC getset -p -s toy2.agc chr1 > chr1_dec.fa + cat ref2.fa chr1_dec.fa + cmp ref2.fa chr1_dec.fa + $EXE_AGC getset -p -s toy2.agc c > c_dec.fa + $EXE_AGC getset -p -s toy2.agc g > g_dec.fa + $EXE_AGC getset -p -s toy2.agc t > t_dec.fa + sed -n '3p;4p' < b.fa > g.fa + sed -n '5p;6p' < b.fa > c_.fa + sed -n '7p;8p' < b.fa > t.fa + cat g.fa g_dec.fa + cmp g.fa g_dec.fa + cat c_.fa c_dec.fa + cmp c_.fa c_dec.fa + cat t.fa t_dec.fa + cmp t.fa t_dec.fa - name: toy example getset -c -g, from gz run: | @@ -398,8 +493,34 @@ jobs: cmp ctg.fa ctg_dec.fa $EXE_AGC getctg toy2gz.agc g:3-4 > ctg_dec.fa cmp ctg.fa ctg_dec.fa - - - name: toy example append -a + getset + + + - name: toy example 
getctg -c (-p -s) + run: | + cd toy_ex + $EXE_AGC getctg -p -s toy.agc chr1 > chr1_dec.fa + cmp ref2.fa chr1_dec.fa + + echo ">chr1:3-7" > ctg.fa + echo "TAGCT" >> ctg.fa + $EXE_AGC getctg -p -s toy.agc chr1:3-7 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + $EXE_AGC getctg -p -s toygz.agc chr1:3-7 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + + echo ">3:3-10" > ctg.fa + echo "TCCCGGGA" >> ctg.fa + $EXE_AGC getctg -p -s toy.agc 3:3-10 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + + echo ">g h i 21:3-4" > ctg.fa + echo "AG" >> ctg.fa + $EXE_AGC getctg -p -s toy2.agc g:3-4 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + $EXE_AGC getctg -p -s toy2gz.agc g:3-4 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + + - name: toy example append -a + getset (+ getset -p -s) run: | cd toy_ex mkdir test-toy @@ -422,6 +543,9 @@ jobs: $EXE_AGC getset test-toy/toy3.agc chr1 > test-toy/chr1_dec.fa cat ref2.fa test-toy/chr1_dec.fa cmp ref2.fa test-toy/chr1_dec.fa + $EXE_AGC getset -p -s test-toy/toy3.agc chr1 > test-toy/chr1_dec.fa + cat ref2.fa test-toy/chr1_dec.fa + cmp ref2.fa test-toy/chr1_dec.fa $EXE_AGC getset test-toy/toy3.agc c > test-toy/c_dec.fa $EXE_AGC getset test-toy/toy3.agc g > test-toy/g_dec.fa $EXE_AGC getset test-toy/toy3.agc t > test-toy/t_dec.fa @@ -441,7 +565,26 @@ jobs: cmp 2.fa 2_dec.fa cat 3.fa 3_dec.fa cmp 3.fa 3_dec.fa - + cd .. 
+ $EXE_AGC getset -p -s test-toy/toy3.agc c > test-toy/c_dec.fa + $EXE_AGC getset -p -s test-toy/toy3.agc g > test-toy/g_dec.fa + $EXE_AGC getset -p -s test-toy/toy3.agc t > test-toy/t_dec.fa + $EXE_AGC getset -p -s test-toy/toy3.agc 1 > test-toy/1_dec.fa + $EXE_AGC getset -p -s test-toy/toy3.agc 2 > test-toy/2_dec.fa + $EXE_AGC getset -p -s test-toy/toy3.agc 3 > test-toy/3_dec.fa + cd test-toy + cat g.fa g_dec.fa + cmp g.fa g_dec.fa + cat c_.fa c_dec.fa + cmp c_.fa c_dec.fa + cat t.fa t_dec.fa + cmp t.fa t_dec.fa + cat 1.fa 1_dec.fa + cmp 1.fa 1_dec.fa + cat 2.fa 2_dec.fa + cmp 2.fa 2_dec.fa + cat 3.fa 3_dec.fa + cmp 3.fa 3_dec.fa - name: toy example append -a + getset (-g from gz) run: | @@ -533,7 +676,7 @@ jobs: #APPEND - - name: toy example create-append-getset + - name: toy example create-append-getset (+getset -p -s) run: | cd toy_ex $EXE_AGC create -o tmp0.agc ref.fa @@ -549,6 +692,14 @@ jobs: cmp b.fa b_dec.fa cmp c.fa c_dec.fa cmp ref.fa ref_dec.fa + $EXE_AGC getset -p -s toy_app.agc a > a_dec.fa + $EXE_AGC getset -p -s toy_app.agc b > b_dec.fa + $EXE_AGC getset -p -s toy_app_gz.agc c > c_dec.fa + $EXE_AGC getset -p -s toy_app.agc ref > ref_dec.fa + cmp a.fa a_dec.fa + cmp b.fa b_dec.fa + cmp c.fa c_dec.fa + cmp ref.fa ref_dec.fa - name: toy example append listset run: | @@ -578,29 +729,37 @@ jobs: cmp c.fa getcola/c.fa cmp ref.fa getcola/ref.fa - - name: toy example append getctg + - name: toy example append getctg (+ getctg -p -s) run: | cd toy_ex head -n 2 a.fa > chr1a.fa $EXE_AGC getctg toy_app.agc chr1a > chr1a_dec.fa cmp chr1a.fa chr1a_dec.fa + $EXE_AGC getctg -p -s toy_app.agc chr1a > chr1a_dec.fa + cmp chr1a.fa chr1a_dec.fa tail -n 2 b.fa > t.fa $EXE_AGC getctg toy_app.agc t > t_dec.fa cmp t.fa t_dec.fa + $EXE_AGC getctg -p -s toy_app.agc t > t_dec.fa + cmp t.fa t_dec.fa head -n 2 ref.fa > ref-chr1.fa $EXE_AGC getctg toy_app.agc chr1@ref > ref-chr1_dec.fa cmp ref-chr1.fa ref-chr1_dec.fa + $EXE_AGC getctg -p -s toy_app.agc chr1@ref > ref-chr1_dec.fa 
+ cmp ref-chr1.fa ref-chr1_dec.fa echo ">chr1:3-10" > ctg.fa echo "TAGCTAGC" >> ctg.fa $EXE_AGC getctg toy_app.agc chr1@ref:3-10 > ctg_dec.fa cmp ctg.fa ctg_dec.fa + $EXE_AGC getctg -p -s toy_app.agc chr1@ref:3-10 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa $EXE_AGC getctg -g 1 toy_app.agc chr1@ref:3-10 > ctg_dec.fa.gz gzip -fd ctg_dec.fa.gz cmp ctg.fa ctg_dec.fa - + echo ">chr3a:1-3" > ctg.fa echo "GTT" >> ctg.fa $EXE_AGC getctg toy_app.agc chr3a:1-3 > ctg_dec.fa @@ -608,10 +767,14 @@ jobs: $EXE_AGC getctg -g 3 toy_app_gz.agc chr3a:1-3 > ctg_dec.fa.gz gzip -fd ctg_dec.fa.gz cmp ctg.fa ctg_dec.fa + + $EXE_AGC getctg -p -s toy_app.agc chr3a:1-3 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + #ADAPTIVE APPEND - - name: toy example create-append adaptive-getset + - name: toy example create-append adaptive-getset (+getset -p -s) run: | cd toy_ex $EXE_AGC create -a -o tmp0.agc ref.fa @@ -627,6 +790,14 @@ jobs: cmp b.fa b_dec.fa cmp c.fa c_dec.fa cmp ref.fa ref_dec.fa + $EXE_AGC getset -p -s toy_app.agc a > a_dec.fa + $EXE_AGC getset -p -s toy_app.agc b > b_dec.fa + $EXE_AGC getset -p -s toy_app_gz.agc c > c_dec.fa + $EXE_AGC getset -p -s toy_app.agc ref > ref_dec.fa + cmp a.fa a_dec.fa + cmp b.fa b_dec.fa + cmp c.fa c_dec.fa + cmp ref.fa ref_dec.fa - name: toy example append adaptive listset run: | @@ -676,6 +847,32 @@ jobs: echo "GTT" >> ctg.fa $EXE_AGC getctg toy_app.agc chr3a:1-3 > ctg_dec.fa cmp ctg.fa ctg_dec.fa + + + - name: toy example append adaptive getctg (-p -s) + run: | + cd toy_ex + head -n 2 a.fa > chr1a.fa + $EXE_AGC getctg -p -s toy_app.agc chr1a > chr1a_dec.fa + cmp chr1a.fa chr1a_dec.fa + + tail -n 2 b.fa > t.fa + $EXE_AGC getctg -p -s toy_app.agc t > t_dec.fa + cmp t.fa t_dec.fa + + head -n 2 ref.fa > ref-chr1.fa + $EXE_AGC getctg -p -s toy_app.agc chr1@ref > ref-chr1_dec.fa + cmp ref-chr1.fa ref-chr1_dec.fa + + echo ">chr1:3-10" > ctg.fa + echo "TAGCTAGC" >> ctg.fa + $EXE_AGC getctg -p -s toy_app.agc chr1@ref:3-10 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa + + echo 
">chr3a:1-3" > ctg.fa + echo "GTT" >> ctg.fa + $EXE_AGC getctg -p -s toy_app.agc chr3a:1-3 > ctg_dec.fa + cmp ctg.fa ctg_dec.fa diff --git a/.gitignore b/.gitignore index ae7bf86..5e1362c 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,6 @@ x64 /agc /bin/obj/libzstd_x64_Release /bin/x64_Release +/3rd_party/nasm +/bin +/obj diff --git a/.gitmodules b/.gitmodules index 27ed8bb..ab42dee 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,3 +16,6 @@ [submodule "3rd_party/libdeflate"] path = 3rd_party/libdeflate url = https://github.com/refresh-bio-dependencies/libdeflate.git +[submodule "3rd_party/pybind11"] + path = 3rd_party/pybind11 + url = https://github.com/refresh-bio-dependencies/pybind11.git diff --git a/libs/ketopt.h b/3rd_party/ketopt.h similarity index 97% rename from libs/ketopt.h rename to 3rd_party/ketopt.h index 1c7bcf6..6b0f5a1 100644 --- a/libs/ketopt.h +++ b/3rd_party/ketopt.h @@ -1,120 +1,120 @@ -#ifndef KETOPT_H -#define KETOPT_H - -#include /* for strchr() and strncmp() */ - -#define ko_no_argument 0 -#define ko_required_argument 1 -#define ko_optional_argument 2 - -typedef struct { - int ind; /* equivalent to optind */ - int opt; /* equivalent to optopt */ - const char *arg; /* equivalent to optarg */ - int longidx; /* index of a long option; or -1 if short */ - /* private variables not intended for external uses */ - int i, pos, n_args; -} ketopt_t; - -typedef struct { - char *name; - int has_arg; - int val; -} ko_longopt_t; - -static ketopt_t KETOPT_INIT = { 1, 0, 0, -1, 1, 0, 0 }; - -static void ketopt_permute(const char *argv[], int j, int n) /* move argv[j] over n elements to the left */ -{ - int k; - const char *p = argv[j]; - for (k = 0; k < n; ++k) - argv[j - k] = argv[j - k - 1]; - argv[j - k] = p; -} - -/** - * Parse command-line options and arguments - * - * This fuction has a similar interface to GNU's getopt_long(). Each call - * parses one option and returns the option name. s->arg points to the option - * argument if present. 
The function returns -1 when all command-line arguments - * are parsed. In this case, s->ind is the index of the first non-option - * argument. - * - * @param s status; shall be initialized to KETOPT_INIT on the first call - * @param argc length of argv[] - * @param argv list of command-line arguments; argv[0] is ignored - * @param permute non-zero to move options ahead of non-option arguments - * @param ostr option string - * @param longopts long options - * - * @return ASCII for a short option; ko_longopt_t::val for a long option; -1 if - * argv[] is fully processed; '?' for an unknown option or an ambiguous - * long option; ':' if an option argument is missing - */ -static int ketopt(ketopt_t *s, int argc, const char *argv[], int permute, const char *ostr, const ko_longopt_t *longopts) -{ - int opt = -1, i0, j; - if (permute) { - while (s->i < argc && (argv[s->i][0] != '-' || argv[s->i][1] == '\0')) - ++s->i, ++s->n_args; - } - s->arg = 0, s->longidx = -1, i0 = s->i; - if (s->i >= argc || argv[s->i][0] != '-' || argv[s->i][1] == '\0') { - s->ind = s->i - s->n_args; - return -1; - } - if (argv[s->i][0] == '-' && argv[s->i][1] == '-') { /* "--" or a long option */ - if (argv[s->i][2] == '\0') { /* a bare "--" */ - ketopt_permute(argv, s->i, s->n_args); - ++s->i, s->ind = s->i - s->n_args; - return -1; - } - s->opt = 0, opt = '?', s->pos = -1; - if (longopts) { /* parse long options */ - int k, n_exact = 0, n_partial = 0; - const ko_longopt_t *o = 0, *o_exact = 0, *o_partial = 0; - for (j = 2; argv[s->i][j] != '\0' && argv[s->i][j] != '='; ++j) {} /* find the end of the option name */ - for (k = 0; longopts[k].name != 0; ++k) - if (strncmp(&argv[s->i][2], longopts[k].name, j - 2) == 0) { - if (longopts[k].name[j - 2] == 0) ++n_exact, o_exact = &longopts[k]; - else ++n_partial, o_partial = &longopts[k]; - } - if (n_exact > 1 || (n_exact == 0 && n_partial > 1)) return '?'; - o = n_exact == 1? o_exact : n_partial == 1? 
o_partial : 0; - if (o) { - s->opt = opt = o->val, s->longidx = o - longopts; - if (argv[s->i][j] == '=') s->arg = &argv[s->i][j + 1]; - if (o->has_arg == 1 && argv[s->i][j] == '\0') { - if (s->i < argc - 1) s->arg = argv[++s->i]; - else opt = ':'; /* missing option argument */ - } - } - } - } else { /* a short option */ - char *p; - if (s->pos == 0) s->pos = 1; - opt = s->opt = argv[s->i][s->pos++]; - p = strchr((char*)ostr, opt); - if (p == 0) { - opt = '?'; /* unknown option */ - } else if (p[1] == ':') { - if (argv[s->i][s->pos] == 0) { - if (s->i < argc - 1) s->arg = argv[++s->i]; - else opt = ':'; /* missing option argument */ - } else s->arg = &argv[s->i][s->pos]; - s->pos = -1; - } - } - if (s->pos < 0 || argv[s->i][s->pos] == 0) { - ++s->i, s->pos = 0; - if (s->n_args > 0) /* permute */ - for (j = i0; j < s->i; ++j) - ketopt_permute(argv, j, s->n_args); - } - s->ind = s->i - s->n_args; - return opt; -} - -#endif +#ifndef KETOPT_H +#define KETOPT_H + +#include /* for strchr() and strncmp() */ + +#define ko_no_argument 0 +#define ko_required_argument 1 +#define ko_optional_argument 2 + +typedef struct { + int ind; /* equivalent to optind */ + int opt; /* equivalent to optopt */ + const char *arg; /* equivalent to optarg */ + int longidx; /* index of a long option; or -1 if short */ + /* private variables not intended for external uses */ + int i, pos, n_args; +} ketopt_t; + +typedef struct { + char *name; + int has_arg; + int val; +} ko_longopt_t; + +static ketopt_t KETOPT_INIT = { 1, 0, 0, -1, 1, 0, 0 }; + +static void ketopt_permute(const char *argv[], int j, int n) /* move argv[j] over n elements to the left */ +{ + int k; + const char *p = argv[j]; + for (k = 0; k < n; ++k) + argv[j - k] = argv[j - k - 1]; + argv[j - k] = p; +} + +/** + * Parse command-line options and arguments + * + * This fuction has a similar interface to GNU's getopt_long(). Each call + * parses one option and returns the option name. 
s->arg points to the option + * argument if present. The function returns -1 when all command-line arguments + * are parsed. In this case, s->ind is the index of the first non-option + * argument. + * + * @param s status; shall be initialized to KETOPT_INIT on the first call + * @param argc length of argv[] + * @param argv list of command-line arguments; argv[0] is ignored + * @param permute non-zero to move options ahead of non-option arguments + * @param ostr option string + * @param longopts long options + * + * @return ASCII for a short option; ko_longopt_t::val for a long option; -1 if + * argv[] is fully processed; '?' for an unknown option or an ambiguous + * long option; ':' if an option argument is missing + */ +static int ketopt(ketopt_t *s, int argc, const char *argv[], int permute, const char *ostr, const ko_longopt_t *longopts) +{ + int opt = -1, i0, j; + if (permute) { + while (s->i < argc && (argv[s->i][0] != '-' || argv[s->i][1] == '\0')) + ++s->i, ++s->n_args; + } + s->arg = 0, s->longidx = -1, i0 = s->i; + if (s->i >= argc || argv[s->i][0] != '-' || argv[s->i][1] == '\0') { + s->ind = s->i - s->n_args; + return -1; + } + if (argv[s->i][0] == '-' && argv[s->i][1] == '-') { /* "--" or a long option */ + if (argv[s->i][2] == '\0') { /* a bare "--" */ + ketopt_permute(argv, s->i, s->n_args); + ++s->i, s->ind = s->i - s->n_args; + return -1; + } + s->opt = 0, opt = '?', s->pos = -1; + if (longopts) { /* parse long options */ + int k, n_exact = 0, n_partial = 0; + const ko_longopt_t *o = 0, *o_exact = 0, *o_partial = 0; + for (j = 2; argv[s->i][j] != '\0' && argv[s->i][j] != '='; ++j) {} /* find the end of the option name */ + for (k = 0; longopts[k].name != 0; ++k) + if (strncmp(&argv[s->i][2], longopts[k].name, j - 2) == 0) { + if (longopts[k].name[j - 2] == 0) ++n_exact, o_exact = &longopts[k]; + else ++n_partial, o_partial = &longopts[k]; + } + if (n_exact > 1 || (n_exact == 0 && n_partial > 1)) return '?'; + o = n_exact == 1? 
o_exact : n_partial == 1? o_partial : 0; + if (o) { + s->opt = opt = o->val, s->longidx = o - longopts; + if (argv[s->i][j] == '=') s->arg = &argv[s->i][j + 1]; + if (o->has_arg == 1 && argv[s->i][j] == '\0') { + if (s->i < argc - 1) s->arg = argv[++s->i]; + else opt = ':'; /* missing option argument */ + } + } + } + } else { /* a short option */ + char *p; + if (s->pos == 0) s->pos = 1; + opt = s->opt = argv[s->i][s->pos++]; + p = strchr((char*)ostr, opt); + if (p == 0) { + opt = '?'; /* unknown option */ + } else if (p[1] == ':') { + if (argv[s->i][s->pos] == 0) { + if (s->i < argc - 1) s->arg = argv[++s->i]; + else opt = ':'; /* missing option argument */ + } else s->arg = &argv[s->i][s->pos]; + s->pos = -1; + } + } + if (s->pos < 0 || argv[s->i][s->pos] == 0) { + ++s->i, s->pos = 0; + if (s->n_args > 0) /* permute */ + for (j = i0; j < s->i; ++j) + ketopt_permute(argv, j, s->n_args); + } + s->ind = s->i - s->n_args; + return opt; +} + +#endif diff --git a/libs/kseq.h b/3rd_party/kseq.h similarity index 97% rename from libs/kseq.h rename to 3rd_party/kseq.h index e85486a..15d7983 100644 --- a/libs/kseq.h +++ b/3rd_party/kseq.h @@ -1,256 +1,256 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Last Modified: 05MAR2012 */ - -#ifndef AC_KSEQ_H -#define AC_KSEQ_H - -#include -#include -#include - -#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r -#define KS_SEP_TAB 1 // isspace() && !' ' -#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) -#define KS_SEP_MAX 2 - -#ifndef klib_unused -#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) -#define klib_unused __attribute__ ((__unused__)) -#else -#define klib_unused -#endif -#endif /* klib_unused */ - -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - int begin, end; \ - int is_eof:2, bufsize:30; \ - type_t f; \ - unsigned char *buf; \ - } kstream_t; - -#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) -#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) - -#define __KS_BASIC(SCOPE, type_t, __bufsize) \ - SCOPE kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; ks->bufsize = __bufsize; \ - ks->buf = (unsigned char*)malloc(__bufsize); \ - return ks; \ - } \ - SCOPE void ks_destroy(kstream_t *ks) \ - { \ - if (!ks) return; \ - free(ks->buf); \ - free(ks); \ - } - -#define __KS_INLINED(__read) \ - static inline klib_unused int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, ks->bufsize); \ - if (ks->end < ks->bufsize) ks->is_eof = 1; \ - if (ks->end == 
0) return -1; \ - } \ - return (int)ks->buf[ks->begin++]; \ - } \ - static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { return ks_getuntil2(ks, delimiter, str, dret, 0); } - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; -#endif - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#define __KS_GETUNTIL(SCOPE, __read) \ - SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ - { \ - if (dret) *dret = 0; \ - str->l = append? str->l : 0; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, ks->bufsize); \ - if (ks->end < ks->bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ - } else break; \ - } \ - if (delimiter == KS_SEP_LINE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == '\n') break; \ - } else if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; /* never come to here! 
*/ \ - if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (str->s == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ - } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ - str->s[str->l] = '\0'; \ - return str->l; \ - } - -#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(SCOPE, type_t, __bufsize) \ - __KS_GETUNTIL(SCOPE, __read) \ - __KS_INLINED(__read) - -#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) - -#define KSTREAM_DECLARE(type_t, __read) \ - __KS_TYPE(type_t) \ - extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ - extern kstream_t *ks_init(type_t f); \ - extern void ks_destroy(kstream_t *ks); \ - __KS_INLINED(__read) - -/****************** - * FASTA/Q parser * - ******************/ - -#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) - -#define __KSEQ_BASIC(SCOPE, type_t) \ - SCOPE kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - SCOPE void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ - free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ - } - -/* Return value: - >=0 length of the sequence (normal) - -1 end-of-file - -2 truncated quality string - */ -#define __KSEQ_READ(SCOPE) \ - SCOPE int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ - if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = 
ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* else: the first header char has been read in the previous call */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ - if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ - if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ - seq->seq.m = 256; \ - seq->seq.s = (char*)malloc(seq->seq.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (c == '\n') continue; /* skip empty lines */ \ - seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ - ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ - } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ - seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ - return seq->seq.l; \ - } - -#define 
__KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ - } kseq_t; - -#define KSEQ_INIT2(SCOPE, type_t, __read) \ - KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(SCOPE, type_t) \ - __KSEQ_READ(SCOPE) - -#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) - -#define KSEQ_DECLARE(type_t) \ - __KS_TYPE(type_t) \ - __KSEQ_TYPE(type_t) \ - extern kseq_t *kseq_init(type_t fd); \ - void kseq_destroy(kseq_t *ks); \ - int kseq_read(kseq_t *seq); - -#endif +/* The MIT License + + Copyright (c) 2008, 2009, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Last Modified: 05MAR2012 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' 
' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + int begin, end; \ + int is_eof:2, bufsize:30; \ + type_t f; \ + unsigned char *buf; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(SCOPE, type_t, __bufsize) \ + SCOPE kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; ks->bufsize = __bufsize; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + SCOPE void ks_destroy(kstream_t *ks) \ + { \ + if (!ks) return; \ + free(ks->buf); \ + free(ks); \ + } + +#define __KS_INLINED(__read) \ + static inline klib_unused int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, ks->bufsize); \ + if (ks->end < ks->bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(SCOPE, __read) \ + SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + if (dret) *dret = 0; \ + str->l = append? 
str->l : 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, ks->bufsize); \ + if (ks->end < ks->bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(SCOPE, type_t, __bufsize) \ + __KS_GETUNTIL(SCOPE, __read) \ + __KS_INLINED(__read) + +#define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) + +#define KSTREAM_DECLARE(type_t, __read) \ + __KS_TYPE(type_t) \ + extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ + extern kstream_t *ks_init(type_t f); \ + extern void ks_destroy(kstream_t *ks); \ + 
__KS_INLINED(__read) + +/****************** + * FASTA/Q parser * + ******************/ + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + 
kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); + +#endif diff --git a/3rd_party/prebuild.bat b/3rd_party/prebuild.bat index 11f4bcf..fab8cdf 100644 --- a/3rd_party/prebuild.bat +++ b/3rd_party/prebuild.bat @@ -3,8 +3,31 @@ rem %2 - $(Configuration) cd %1\3rd_party +@echo "nasm" + +if exist nasm/nasm.exe ( + @echo "nasm.exe already exists" + cd nasm +) else ( + rmdir /S /Q nasm + mkdir nasm + cd nasm + curl -L --ssl-no-revoke https://github.com/refresh-bio-dependencies/nasm/releases/download/v2.16.01/nasm-2.16.01-win64.zip --output nasm-2.16.01-win64.zip + tar -xf nasm-2.16.01-win64.zip --strip-components 1 +) + +set 
PATH=%PATH%;%cd% +cd .. + +@echo %PATH% + @echo "zlib-ng" cd zlib-ng cmake -B build-vs -S . -DZLIB_COMPAT=ON cmake --build build-vs --config %2 cd .. + + +@echo "isa-l" +cd isa-l +nmake -f Makefile.nmake diff --git a/3rd_party/pybind11 b/3rd_party/pybind11 new file mode 160000 index 0000000..0ed20f2 --- /dev/null +++ b/3rd_party/pybind11 @@ -0,0 +1 @@ +Subproject commit 0ed20f26acee626ac989568ecc6347e159ddbb47 diff --git a/py_agc_api/pybind11-2.11.1/LICENSE b/3rd_party/pybind11-2.11.1.old/LICENSE similarity index 100% rename from py_agc_api/pybind11-2.11.1/LICENSE rename to 3rd_party/pybind11-2.11.1.old/LICENSE diff --git a/py_agc_api/pybind11-2.11.1/MANIFEST.in b/3rd_party/pybind11-2.11.1.old/MANIFEST.in similarity index 100% rename from py_agc_api/pybind11-2.11.1/MANIFEST.in rename to 3rd_party/pybind11-2.11.1.old/MANIFEST.in diff --git a/py_agc_api/pybind11-2.11.1/PKG-INFO b/3rd_party/pybind11-2.11.1.old/PKG-INFO similarity index 100% rename from py_agc_api/pybind11-2.11.1/PKG-INFO rename to 3rd_party/pybind11-2.11.1.old/PKG-INFO diff --git a/py_agc_api/pybind11-2.11.1/README.rst b/3rd_party/pybind11-2.11.1.old/README.rst similarity index 100% rename from py_agc_api/pybind11-2.11.1/README.rst rename to 3rd_party/pybind11-2.11.1.old/README.rst diff --git a/py_agc_api/pybind11-2.11.1/SECURITY.md b/3rd_party/pybind11-2.11.1.old/SECURITY.md similarity index 100% rename from py_agc_api/pybind11-2.11.1/SECURITY.md rename to 3rd_party/pybind11-2.11.1.old/SECURITY.md diff --git a/py_agc_api/pybind11-2.11.1/pybind11.egg-info/PKG-INFO b/3rd_party/pybind11-2.11.1.old/pybind11.egg-info/PKG-INFO similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11.egg-info/PKG-INFO rename to 3rd_party/pybind11-2.11.1.old/pybind11.egg-info/PKG-INFO diff --git a/py_agc_api/pybind11-2.11.1/pybind11.egg-info/SOURCES.txt b/3rd_party/pybind11-2.11.1.old/pybind11.egg-info/SOURCES.txt similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11.egg-info/SOURCES.txt rename 
to 3rd_party/pybind11-2.11.1.old/pybind11.egg-info/SOURCES.txt diff --git a/py_agc_api/pybind11-2.11.1/pybind11.egg-info/dependency_links.txt b/3rd_party/pybind11-2.11.1.old/pybind11.egg-info/dependency_links.txt similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11.egg-info/dependency_links.txt rename to 3rd_party/pybind11-2.11.1.old/pybind11.egg-info/dependency_links.txt diff --git a/py_agc_api/pybind11-2.11.1/pybind11.egg-info/entry_points.txt b/3rd_party/pybind11-2.11.1.old/pybind11.egg-info/entry_points.txt similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11.egg-info/entry_points.txt rename to 3rd_party/pybind11-2.11.1.old/pybind11.egg-info/entry_points.txt diff --git a/py_agc_api/pybind11-2.11.1/pybind11.egg-info/not-zip-safe b/3rd_party/pybind11-2.11.1.old/pybind11.egg-info/not-zip-safe similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11.egg-info/not-zip-safe rename to 3rd_party/pybind11-2.11.1.old/pybind11.egg-info/not-zip-safe diff --git a/py_agc_api/pybind11-2.11.1/pybind11.egg-info/requires.txt b/3rd_party/pybind11-2.11.1.old/pybind11.egg-info/requires.txt similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11.egg-info/requires.txt rename to 3rd_party/pybind11-2.11.1.old/pybind11.egg-info/requires.txt diff --git a/py_agc_api/pybind11-2.11.1/pybind11.egg-info/top_level.txt b/3rd_party/pybind11-2.11.1.old/pybind11.egg-info/top_level.txt similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11.egg-info/top_level.txt rename to 3rd_party/pybind11-2.11.1.old/pybind11.egg-info/top_level.txt diff --git a/py_agc_api/pybind11-2.11.1/pybind11/__init__.py b/3rd_party/pybind11-2.11.1.old/pybind11/__init__.py similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/__init__.py rename to 3rd_party/pybind11-2.11.1.old/pybind11/__init__.py diff --git a/py_agc_api/pybind11-2.11.1/pybind11/__main__.py b/3rd_party/pybind11-2.11.1.old/pybind11/__main__.py similarity index 100% rename 
from py_agc_api/pybind11-2.11.1/pybind11/__main__.py rename to 3rd_party/pybind11-2.11.1.old/pybind11/__main__.py diff --git a/py_agc_api/pybind11-2.11.1/pybind11/_version.py b/3rd_party/pybind11-2.11.1.old/pybind11/_version.py similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/_version.py rename to 3rd_party/pybind11-2.11.1.old/pybind11/_version.py diff --git a/py_agc_api/pybind11-2.11.1/pybind11/commands.py b/3rd_party/pybind11-2.11.1.old/pybind11/commands.py similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/commands.py rename to 3rd_party/pybind11-2.11.1.old/pybind11/commands.py diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/attr.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/attr.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/attr.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/attr.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/buffer_info.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/buffer_info.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/buffer_info.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/buffer_info.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/cast.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/cast.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/cast.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/cast.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/chrono.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/chrono.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/chrono.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/chrono.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/common.h 
b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/common.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/common.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/common.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/complex.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/complex.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/complex.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/complex.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/class.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/class.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/class.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/class.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/common.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/common.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/common.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/common.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/descr.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/descr.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/descr.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/descr.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/init.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/init.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/init.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/init.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/internals.h 
b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/internals.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/internals.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/internals.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/type_caster_base.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/type_caster_base.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/type_caster_base.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/type_caster_base.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/typeid.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/typeid.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/detail/typeid.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/detail/typeid.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eigen.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eigen.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eigen.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eigen.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eigen/common.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eigen/common.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eigen/common.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eigen/common.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eigen/matrix.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eigen/matrix.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eigen/matrix.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eigen/matrix.h diff --git 
a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eigen/tensor.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eigen/tensor.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eigen/tensor.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eigen/tensor.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/embed.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/embed.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/embed.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/embed.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eval.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eval.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/eval.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/eval.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/functional.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/functional.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/functional.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/functional.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/gil.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/gil.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/gil.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/gil.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/iostream.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/iostream.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/iostream.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/iostream.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/numpy.h 
b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/numpy.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/numpy.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/numpy.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/operators.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/operators.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/operators.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/operators.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/options.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/options.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/options.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/options.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/pybind11.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/pybind11.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/pybind11.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/pybind11.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/pytypes.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/pytypes.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/pytypes.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/pytypes.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/stl.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/stl.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/stl.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/stl.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/stl/filesystem.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/stl/filesystem.h similarity index 100% 
rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/stl/filesystem.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/stl/filesystem.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/stl_bind.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/stl_bind.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/stl_bind.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/stl_bind.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/type_caster_pyobject_ptr.h b/3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/type_caster_pyobject_ptr.h similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/include/pybind11/type_caster_pyobject_ptr.h rename to 3rd_party/pybind11-2.11.1.old/pybind11/include/pybind11/type_caster_pyobject_ptr.h diff --git a/py_agc_api/pybind11-2.11.1/pybind11/py.typed b/3rd_party/pybind11-2.11.1.old/pybind11/py.typed similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/py.typed rename to 3rd_party/pybind11-2.11.1.old/pybind11/py.typed diff --git a/py_agc_api/pybind11-2.11.1/pybind11/setup_helpers.py b/3rd_party/pybind11-2.11.1.old/pybind11/setup_helpers.py similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/setup_helpers.py rename to 3rd_party/pybind11-2.11.1.old/pybind11/setup_helpers.py diff --git a/py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/FindPythonLibsNew.cmake b/3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/FindPythonLibsNew.cmake similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/FindPythonLibsNew.cmake rename to 3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/FindPythonLibsNew.cmake diff --git a/py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11Common.cmake b/3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11Common.cmake similarity index 100% rename from 
py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11Common.cmake rename to 3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11Common.cmake diff --git a/py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11Config.cmake b/3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11Config.cmake similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11Config.cmake rename to 3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11Config.cmake diff --git a/py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11ConfigVersion.cmake b/3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11ConfigVersion.cmake similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11ConfigVersion.cmake rename to 3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11ConfigVersion.cmake diff --git a/py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11NewTools.cmake b/3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11NewTools.cmake similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11NewTools.cmake rename to 3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11NewTools.cmake diff --git a/py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11Targets.cmake b/3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11Targets.cmake similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11Targets.cmake rename to 3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11Targets.cmake diff --git a/py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11Tools.cmake b/3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11Tools.cmake similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/share/cmake/pybind11/pybind11Tools.cmake 
rename to 3rd_party/pybind11-2.11.1.old/pybind11/share/cmake/pybind11/pybind11Tools.cmake diff --git a/py_agc_api/pybind11-2.11.1/pybind11/share/pkgconfig/pybind11.pc b/3rd_party/pybind11-2.11.1.old/pybind11/share/pkgconfig/pybind11.pc similarity index 100% rename from py_agc_api/pybind11-2.11.1/pybind11/share/pkgconfig/pybind11.pc rename to 3rd_party/pybind11-2.11.1.old/pybind11/share/pkgconfig/pybind11.pc diff --git a/py_agc_api/pybind11-2.11.1/pyproject.toml b/3rd_party/pybind11-2.11.1.old/pyproject.toml similarity index 100% rename from py_agc_api/pybind11-2.11.1/pyproject.toml rename to 3rd_party/pybind11-2.11.1.old/pyproject.toml diff --git a/py_agc_api/pybind11-2.11.1/setup.cfg b/3rd_party/pybind11-2.11.1.old/setup.cfg similarity index 100% rename from py_agc_api/pybind11-2.11.1/setup.cfg rename to 3rd_party/pybind11-2.11.1.old/setup.cfg diff --git a/py_agc_api/pybind11-2.11.1/setup.py b/3rd_party/pybind11-2.11.1.old/setup.py similarity index 100% rename from py_agc_api/pybind11-2.11.1/setup.py rename to 3rd_party/pybind11-2.11.1.old/setup.py diff --git a/libs/file_wrapper.h b/3rd_party/refresh/compression/lib/file_wrapper.h similarity index 95% rename from libs/file_wrapper.h rename to 3rd_party/refresh/compression/lib/file_wrapper.h index 768dd6a..13bd88f 100644 --- a/libs/file_wrapper.h +++ b/3rd_party/refresh/compression/lib/file_wrapper.h @@ -1,949 +1,954 @@ -#ifndef _FILE_WRAPPER_H -#define _FILE_WRAPPER_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 -#include -#include -#endif - -#if defined(ARCH_X64) -#if defined(REFRESH_USE_IGZIP) -#define REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP -#else -#define REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB -#endif -#else -#define REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB -#endif - -#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP -#include -#endif - -#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB -#include -#endif - 
-#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZSTD -#include -#endif - - -namespace refresh -{ - // ********************************************************************************** - // Base class for basic I/O operations, like reading from file, stdin, ... - // ********************************************************************************** - class stream_in_base - { - public: - stream_in_base() = default; - virtual ~stream_in_base() = default; - - virtual std::pair read() = 0; - virtual void release(char*) = 0; - virtual std::string get_file_name() const = 0; - }; - - // ********************************************************************************** - // Base class for buffered reading - // ********************************************************************************** - class stream_in_buffered : public stream_in_base - { - protected: - size_t buffer_size; - char* buffer; - size_t buffer_filled; - bool buffer_released; - - public: - stream_in_buffered(size_t buffer_size) : - stream_in_base(), - buffer_size(buffer_size) - { - buffer = new char[buffer_size]; - buffer_released = true; - buffer_filled = 0; - } - - virtual ~stream_in_buffered() - { - delete[] buffer; - } - }; - - // ********************************************************************************** - // Low-level reading from stdin - // ********************************************************************************** - class stream_in_stdin : public stream_in_buffered - { - public: - stream_in_stdin(size_t buffer_size = 8 << 20) : - stream_in_buffered(buffer_size) - { -#ifdef _WIN32 - _setmode(_fileno(stdin), _O_BINARY); -#endif - } - - virtual ~stream_in_stdin() - {} - - virtual std::pair read() - { - if (!buffer_released) - assert(0); - - if (!feof(stdin)) - buffer_filled = fread(buffer, 1, buffer_size, stdin); - else - buffer_filled = 0; - - buffer_released = false; - - return std::make_pair(buffer, buffer_filled); - } - - virtual void release(char *ptr) - { - buffer_released = true; - } - 
- virtual std::string get_file_name() const - { - return ""; - } - }; - - // ********************************************************************************** - // Low-level reading from file - // ********************************************************************************** - class stream_in_file : public stream_in_buffered - { - std::string file_name; - size_t io_buffer_size; - FILE* file = nullptr; - bool test_extension = true; - - public: - stream_in_file(const std::string& file_name, size_t io_buffer_size = 16 << 20, size_t buffer_size = 8 << 20, bool test_extension = true) : - stream_in_buffered(buffer_size), - file_name(file_name), - io_buffer_size(io_buffer_size), - test_extension(test_extension) - { - open(file_name); - } - - virtual ~stream_in_file() - { - close(); - } - - virtual bool open(const std::string& _file_name) - { - file_name = _file_name; - - if (file) - close(); - - file = fopen(file_name.c_str(), "rb"); - - if (!file) - return false; - - setvbuf(file, nullptr, _IOFBF, io_buffer_size); - - buffer_released = true; - - return true; - } - - virtual bool close() - { - if (file) - { - fclose(file); - file = nullptr; - - return true; - } - - return false; - } - - virtual bool restart() - { - if (!file) - return false; - -#ifdef _WIN32 - _fseeki64(file, 0, SEEK_SET); -#else - fseek(file, 0, SEEK_SET); -#endif - - buffer_filled = 0; - buffer_released = true; - - return true; - } - - virtual std::pair read() - { - if (!buffer_released) - assert(0); - - buffer_filled = fread(buffer, 1, buffer_size, file); - buffer_released = false; - - return std::make_pair(buffer, buffer_filled); - } - - virtual void release(char *ptr) - { - buffer_released = true; - } - - virtual std::string get_file_name() const - { - return test_extension ? 
file_name : ""; - } - - bool is_open() const - { - return file != nullptr; - } - }; - - // ********************************************************************************** - // Base class of decompression engines, like zlib, igzip, plain-text, zstd, ... - // ********************************************************************************** - class stream_decompression_engine - { - protected: - stream_in_base* stream_in; - size_t out_buffer_size; - char* in_buffer_data = nullptr; - size_t in_buffer_filled = 0; - size_t in_buffer_pos = 0; - - public: - stream_decompression_engine(stream_in_base* stream_in, size_t out_buffer_size, char *in_buffer_data, size_t in_buffer_filled) : - stream_in(stream_in), - out_buffer_size(out_buffer_size), - in_buffer_data(in_buffer_data), - in_buffer_filled(in_buffer_filled) - {} - - virtual ~stream_decompression_engine() - {} - - static std::string ext(const std::string& fn, const size_t len) - { - if (fn.length() <= len) - return ""; - - return fn.substr(fn.length() - len, std::string::npos); - } - - virtual int read(char* ptr, size_t &readed) = 0; - }; - -// ********************************************************************************** -// Decompression engine passing text files without any decompression -// ********************************************************************************** - class stream_decompression_engine_text : public stream_decompression_engine - { - - public: - stream_decompression_engine_text(stream_in_base* stream_in, size_t out_buffer_size, char* in_buffer_data, size_t in_buffer_filled) - : stream_decompression_engine(stream_in, out_buffer_size, in_buffer_data, in_buffer_filled) - {} - - virtual ~stream_decompression_engine_text() - { - } - - static bool knows_it(const std::string& file_name, char* data, size_t size) - { - return true; - } - - virtual int read(char* ptr, size_t& readed) - { - readed = 0; - - while (true) - { - size_t to_copy = std::min(in_buffer_filled - in_buffer_pos, out_buffer_size 
- readed); - memcpy(ptr + readed, in_buffer_data + in_buffer_pos, to_copy); - in_buffer_pos += to_copy; - readed += to_copy; - - if (in_buffer_pos == in_buffer_filled) - { - stream_in->release(in_buffer_data); - in_buffer_pos = 0; - std::tie(in_buffer_data, in_buffer_filled) = stream_in->read(); - - if (in_buffer_filled == 0) - return readed ? 0 : -1; - } - - if (readed == out_buffer_size) - return 0; - } - } - }; - -#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP - // ********************************************************************************** - // Decompression engine using igzip library (for .gz files) - // ********************************************************************************** - class stream_decompression_engine_igzip : public stream_decompression_engine - { - constexpr static std::array magic_numbers = {0x1f, 0x8b}; - - struct isal_gzip_header gz_hdr; - struct inflate_state state; - - enum class internal_state_t {none, file_start, before_header, inside}; - internal_state_t internal_state = internal_state_t::none; - - void initialize() - { - isal_gzip_header_init(&gz_hdr); - isal_inflate_init(&state); -// state.crc_flag = ISAL_GZIP_NO_HDR_VER; - state.crc_flag = ISAL_GZIP; - - state.next_in = (uint8_t*) in_buffer_data; - state.avail_in = (uint32_t) in_buffer_filled; - - internal_state = internal_state_t::file_start; - } - - bool check_header() - { - if (state.avail_in >= magic_numbers.size()) - return std::equal(magic_numbers.begin(), magic_numbers.end(), state.next_in); - - uint32_t in_state_len = state.avail_in; - - if (!std::equal(magic_numbers.begin(), magic_numbers.begin() + in_state_len, state.next_in)) - return false; - - if (isal_inflate(&state) != ISAL_DECOMP_OK) - return false; - - if (!load_new_part()) - return false; - - return std::equal(magic_numbers.begin() + in_state_len, magic_numbers.end(), state.next_in); - } - - bool load_new_part() - { - if(in_buffer_data) - stream_in->release(in_buffer_data); - std::tie(in_buffer_data, 
in_buffer_filled) = stream_in->read(); - - state.next_in = (uint8_t*)in_buffer_data; - state.avail_in = (uint32_t)in_buffer_filled; - - return in_buffer_filled != 0; - } - - public: - stream_decompression_engine_igzip(stream_in_base *stream_in, size_t out_buffer_size, char *in_buffer_data, size_t in_buffer_filled) - : stream_decompression_engine(stream_in, out_buffer_size, in_buffer_data, in_buffer_filled) - { - isal_inflate_init(&state); - isal_gzip_header_init(&gz_hdr); - } - - virtual ~stream_decompression_engine_igzip() - { - if (in_buffer_data) - stream_in->release(in_buffer_data); - } - - static bool knows_it(const std::string& file_name, const char* data, const size_t size) - { - if (file_name.size() > 3 && ext(file_name, 3) != std::string(".gz")) - return false; - - if (size < magic_numbers.size()) - return false; - - return std::equal(magic_numbers.begin(), magic_numbers.end(), (uint8_t*) data); - } - - virtual int read(char* ptr, size_t& readed) - { - if (internal_state == internal_state_t::none) - initialize(); - - readed = 0; - - state.next_out = (uint8_t*) ptr; - state.avail_out = out_buffer_size; - - if (internal_state == internal_state_t::before_header || internal_state == internal_state_t::file_start) - { - isal_inflate_reset(&state); - - if (!check_header()) - { - if (internal_state == internal_state_t::file_start) - return -2; // Error, no data in file - else - return -1; // Just end-of-file, maybe some junk data are still present, but do not care about them - } - - internal_state = internal_state_t::inside; - } - - do - { - if (state.avail_in == 0) - if (!load_new_part()) - return -1; - - int ret = isal_inflate(&state); - - if (ret != ISAL_DECOMP_OK) - return -3; // Error, broken gzip file - - readed = state.next_out - (uint8_t*)ptr; - state.avail_in = (uint32_t)in_buffer_filled - (state.next_in - (uint8_t*)in_buffer_data); - state.avail_out = (uint32_t)(out_buffer_size - readed); - - if (state.block_state == ISAL_BLOCK_FINISH) - { - 
internal_state = internal_state_t::before_header; - break; - } - - if (readed == out_buffer_size) - return 0; - - } while (true); - - return 0; - } - }; -#endif - -#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB - // ********************************************************************************** - // Decompression engine using zlib library (for .gz files) - // ********************************************************************************** - class stream_decompression_engine_zlib : public stream_decompression_engine - { - constexpr static std::array magic_numbers = { 0x1f, 0x8b }; - - z_stream state; - - enum class internal_state_t { none, file_start, before_header, inside }; - internal_state_t internal_state = internal_state_t::none; - - void initialize() - { - state.next_in = (uint8_t*)in_buffer_data; - state.avail_in = (uint32_t)in_buffer_filled; - - state.zalloc = NULL; - state.zfree = NULL; - state.opaque = NULL; - - inflateInit2(&state, 15 + 32); - - internal_state = internal_state_t::file_start; - } - - bool check_header() - { - if (state.avail_in >= magic_numbers.size()) - return std::equal(magic_numbers.begin(), magic_numbers.end(), state.next_in); - - uint32_t in_state_len = state.avail_in; - - if (!std::equal(magic_numbers.begin(), magic_numbers.begin() + in_state_len, state.next_in)) - return false; - - if (inflate(&state, Z_NO_FLUSH) != Z_OK) - return false; - - if (!load_new_part()) - return false; - - return std::equal(magic_numbers.begin() + in_state_len, magic_numbers.end(), state.next_in); - } - - bool load_new_part() - { - if (in_buffer_data) - stream_in->release(in_buffer_data); - std::tie(in_buffer_data, in_buffer_filled) = stream_in->read(); - - state.next_in = (uint8_t*)in_buffer_data; - state.avail_in = (uint32_t)in_buffer_filled; - - return in_buffer_filled != 0; - } - - public: - stream_decompression_engine_zlib(stream_in_base *stream_in, size_t buffer_size, char* in_buffer_data, size_t in_buffer_filled) - : 
stream_decompression_engine(stream_in, buffer_size, in_buffer_data, in_buffer_filled) - { - } - - virtual ~stream_decompression_engine_zlib() - { - if (in_buffer_data) - stream_in->release(in_buffer_data); - - if(internal_state != internal_state_t::none) - inflateEnd(&state); - } - - static bool knows_it(const std::string& file_name, const char* data, const size_t size) - { - if (file_name.size() > 3 && ext(file_name, 3) != std::string(".gz")) - return false; - - if (size < magic_numbers.size()) - return false; - - return std::equal(magic_numbers.begin(), magic_numbers.end(), (uint8_t*) data); - } - - virtual int read(char* ptr, size_t& readed) - { - if (internal_state == internal_state_t::none) - initialize(); - - readed = 0; - - state.next_out = (uint8_t*)ptr; - state.avail_out = out_buffer_size; - - if (internal_state == internal_state_t::before_header || internal_state == internal_state_t::file_start) - { - if(internal_state == internal_state_t::before_header) - inflateReset(&state); - - if (!check_header()) - { - if (internal_state == internal_state_t::file_start) - return -2; // Error, no data in file - else - return -1; // Just end-of-file, maybe some junk data are still present, but do not care about them - } - - internal_state = internal_state_t::inside; - } - - do - { - if (state.avail_in == 0) - if (!load_new_part()) - return -1; - - int ret = inflate(&state, Z_NO_FLUSH); - - if (ret == Z_DATA_ERROR) - return -3; // Error, broken gzip file - - readed = state.next_out - (uint8_t*)ptr; - state.avail_in = (uint32_t)in_buffer_filled - (state.next_in - (uint8_t*)in_buffer_data); - state.avail_out = (uint32_t)(out_buffer_size - readed); - - if (ret == Z_STREAM_END) - { - internal_state = internal_state_t::before_header; - break; - } - - if (readed == out_buffer_size) - return 0; - - } while (true); - - return 0; - } - }; -#endif - -#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZSTD - // 
********************************************************************************** - // Decompression engine using zstd library (for .zst files) - // ********************************************************************************** - class stream_decompression_engine_zst : public stream_decompression_engine - { - constexpr static std::array magic_numbers = { 0x28, 0xb5, 0x2f, 0xfd }; - - ZSTD_DStream *state; - - ZSTD_inBuffer in_buffer; - ZSTD_outBuffer out_buffer; - - enum class internal_state_t { none, file_start, before_header, inside }; - internal_state_t internal_state = internal_state_t::none; - - void initialize() - { - ZSTD_initDStream(state); - - in_buffer.src = in_buffer_data; - in_buffer.size = in_buffer_filled; - in_buffer.pos = 0; - - internal_state = internal_state_t::file_start; - } - - bool check_header() - { - if (in_buffer.size - in_buffer.pos >= magic_numbers.size()) - return std::equal(magic_numbers.begin(), magic_numbers.end(), (uint8_t*) in_buffer.src + in_buffer.pos); - - uint32_t in_state_len = in_buffer.size - in_buffer.pos; - - if (!std::equal(magic_numbers.begin(), magic_numbers.begin() + in_state_len, (uint8_t*)in_buffer.src + in_buffer.pos)) - return false; - - if(ZSTD_decompressStream(state, &out_buffer, &in_buffer) < 0) - return false; - - if (!load_new_part()) - return false; - - return std::equal(magic_numbers.begin() + in_state_len, magic_numbers.end(), (uint8_t*) in_buffer.src); - } - - bool load_new_part() - { - if (in_buffer_data) - stream_in->release(in_buffer_data); - std::tie(in_buffer_data, in_buffer_filled) = stream_in->read(); - - in_buffer.src = in_buffer_data; - in_buffer.size = in_buffer_filled; - in_buffer.pos = 0; - - return in_buffer_filled != 0; - } - - public: - stream_decompression_engine_zst(stream_in_base *stream_in, size_t buffer_size, char* in_buffer_data, size_t in_buffer_filled) - : stream_decompression_engine(stream_in, buffer_size, in_buffer_data, in_buffer_filled) - { - state = ZSTD_createDStream(); - } 
- - virtual ~stream_decompression_engine_zst() - { - ZSTD_freeDStream(state); - } - - static bool knows_it(const std::string& file_name, const char* data, const size_t size) - { - if (file_name.size() > 4 && ext(file_name, 4) != std::string(".zst")) - return false; - - if (size < magic_numbers.size()) - return false; - - return std::equal(magic_numbers.begin(), magic_numbers.end(), (uint8_t*) data); - } - - virtual int read(char* ptr, size_t& readed) - { - if (internal_state == internal_state_t::none) - initialize(); - - readed = 0; - - out_buffer.dst = ptr; - out_buffer.size = out_buffer_size; - out_buffer.pos = 0; - - if (internal_state == internal_state_t::before_header || internal_state == internal_state_t::file_start) - { - if (internal_state == internal_state_t::before_header) - ZSTD_initDStream(state); - - if (!check_header()) - { - if (internal_state == internal_state_t::file_start) - return -2; // Error, no data in file - else - return -1; // Just end-of-file, maybe some junk data are still present, but do not care about them - } - - internal_state = internal_state_t::inside; - } - - do - { - if (in_buffer.pos == in_buffer.size) - if (!load_new_part()) - return -1; - - int ret = ZSTD_decompressStream(state, &out_buffer, &in_buffer); - - if (ret < 0) - return -3; // Error, broken gzip file - - readed = out_buffer.pos; - - if (ret == 0) - { - internal_state = internal_state_t::before_header; - break; - } - - if (readed == out_buffer_size) - return 0; - - } while (true); - - return 0; - } - }; -#endif - -#if defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP) && defined (REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB) - using stream_decompression_engine_gz = stream_decompression_engine_igzip; -#else -#if defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP) - using stream_decompression_engine_gz = stream_decompression_engine_igzip; -#else -#if defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB) - using stream_decompression_engine_gz = stream_decompression_engine_zlib; 
-#endif -#endif -#endif - - // ********************************************************************************** - // Main class for decompression of stream data (file, stdin, ...) in some compressed format (.gz, .zstd, ...) - // ********************************************************************************** - class stream_decompression - { - public: - enum class format_t { unknown, text, gzip, zstd }; - - private: - stream_decompression_engine* engine = nullptr; - format_t format = format_t::unknown; - size_t engine_part_size; - - char* buffer; - size_t size; - size_t filled; - size_t pos; - bool eof_marker = true; - - bool determine_format(stream_in_base* stream_in) - { - format = format_t::unknown; - if (engine) - { - delete engine; - engine = nullptr; - } - - if (!stream_in) - return false; - - char* ptr; - std::tie(ptr, filled) = stream_in->read(); - -#if defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP) || defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB) - if (stream_decompression_engine_gz::knows_it(stream_in->get_file_name(), ptr, filled)) - { - format = format_t::gzip; - engine = new stream_decompression_engine_gz(stream_in, engine_part_size, ptr, filled); - } - else -#endif -#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZSTD - if (stream_decompression_engine_zst::knows_it(stream_in->get_file_name(), ptr, filled)) - { - format = format_t::zstd; - engine = new stream_decompression_engine_zst(stream_in, engine_part_size, ptr, filled); - } - else -#endif - if (stream_decompression_engine_text::knows_it(stream_in->get_file_name(), ptr, filled)) - { - format = format_t::text; - engine = new stream_decompression_engine_text(stream_in, engine_part_size, ptr, filled); - } - - return format != format_t::unknown; - } - - int fill_buffer() - { - int ret = engine->read(buffer, filled); - pos = 0; - - return ret; - } - - public: - stream_decompression(stream_in_base *stream_in, size_t engine_part_size = 16 << 20) : - engine_part_size(engine_part_size) - { - 
determine_format(stream_in); - eof_marker = false; - filled = 0; - - buffer = new char[engine_part_size]; - pos = 0; - } - - ~stream_decompression() - { - if (engine) - delete engine; - - delete[] buffer; - } - - bool restart(stream_in_base* stream_in) - { - auto ret = determine_format(stream_in); - - filled = 0; - pos = 0; - - eof_marker = false; - - return ret; - } - - bool release() - { - if (!engine) - return false; - - delete engine; - engine = nullptr; - - return true; - } - - int getc() - { - if (pos < filled) - return buffer[pos++]; - - auto ret = fill_buffer(); - - if (ret < 0) - { - eof_marker = true; - return ret; - } - - if (filled == 0) - return -1; - - return buffer[pos++]; - } - - int read(char* dest, size_t requested, size_t &readed) - { - char* p = dest; - readed = 0; - int ret = 0; - - while (readed < requested) - { - size_t to_copy = std::min(requested, filled - pos); - memcpy(p, buffer + pos, to_copy); - readed += to_copy; - pos += to_copy; - p += to_copy; - - if (pos == filled) - { - ret = fill_buffer(); - - if(ret < 0) - { - eof_marker = true; - return ret; - } - } - } - - return 0; - } - - int getline(std::string& str) - { - str.clear(); - - int ret = 0; - - while (true) - { - auto q = std::find(buffer + pos, buffer + filled, (char) 0x0a); - - str.append(buffer + pos, q); - - pos = q - buffer; - - if (pos != filled) - { - ++pos; - break; - } - - ret = fill_buffer(); - - if (ret < 0) - { - eof_marker = true; - break; - } - } - - if (!str.empty() && str.back() == 0x0d) - str.pop_back(); - - return ret; - } - - bool eof() const - { - return eof_marker; - } - - format_t get_format() const - { - return format; - } - }; -} - -#endif +#ifndef _FILE_WRAPPER_H +#define _FILE_WRAPPER_H + +// *** History of updates +// *** v. 1.0.1 (2024-03-11) - bug fix (wrong zlib initialization) +// *** v. 
1.0.2 (2024-05-01) - bug fix (wrong reading from file) +// *** + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#endif + +#if defined(ARCH_X64) +#if defined(REFRESH_USE_IGZIP) +#define REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP +#else +#define REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB +#endif +#else +#define REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB +#endif + +#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP +#include +#endif + +#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB +#include +#endif + +#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZSTD +#include +#endif + + +namespace refresh +{ + // ********************************************************************************** + // Base class for basic I/O operations, like reading from file, stdin, ... + // ********************************************************************************** + class stream_in_base + { + public: + stream_in_base() = default; + virtual ~stream_in_base() = default; + + virtual std::pair read() = 0; + virtual void release(char*) = 0; + virtual std::string get_file_name() const = 0; + }; + + // ********************************************************************************** + // Base class for buffered reading + // ********************************************************************************** + class stream_in_buffered : public stream_in_base + { + protected: + size_t buffer_size; + char* buffer; + size_t buffer_filled; + bool buffer_released; + + public: + stream_in_buffered(size_t buffer_size) : + stream_in_base(), + buffer_size(buffer_size) + { + buffer = new char[buffer_size]; + buffer_released = true; + buffer_filled = 0; + } + + virtual ~stream_in_buffered() + { + delete[] buffer; + } + }; + + // ********************************************************************************** + // Low-level reading from stdin + // 
********************************************************************************** + class stream_in_stdin : public stream_in_buffered + { + public: + stream_in_stdin(size_t buffer_size = 8 << 20) : + stream_in_buffered(buffer_size) + { +#ifdef _WIN32 + _setmode(_fileno(stdin), _O_BINARY); +#endif + } + + virtual ~stream_in_stdin() + {} + + virtual std::pair read() + { + if (!buffer_released) + assert(0); + + if (!feof(stdin)) + buffer_filled = fread(buffer, 1, buffer_size, stdin); + else + buffer_filled = 0; + + buffer_released = false; + + return std::make_pair(buffer, buffer_filled); + } + + virtual void release(char *ptr) + { + buffer_released = true; + } + + virtual std::string get_file_name() const + { + return ""; + } + }; + + // ********************************************************************************** + // Low-level reading from file + // ********************************************************************************** + class stream_in_file : public stream_in_buffered + { + std::string file_name; + size_t io_buffer_size; + FILE* file = nullptr; + bool test_extension = true; + + public: + stream_in_file(const std::string& file_name, size_t io_buffer_size = 16 << 20, size_t buffer_size = 8 << 20, bool test_extension = true) : + stream_in_buffered(buffer_size), + file_name(file_name), + io_buffer_size(io_buffer_size), + test_extension(test_extension) + { + open(file_name); + } + + virtual ~stream_in_file() + { + close(); + } + + virtual bool open(const std::string& _file_name) + { + file_name = _file_name; + + if (file) + close(); + + file = fopen(file_name.c_str(), "rb"); + + if (!file) + return false; + + setvbuf(file, nullptr, _IOFBF, io_buffer_size); + + buffer_released = true; + + return true; + } + + virtual bool close() + { + if (file) + { + fclose(file); + file = nullptr; + + return true; + } + + return false; + } + + virtual bool restart() + { + if (!file) + return false; + +#ifdef _WIN32 + _fseeki64(file, 0, SEEK_SET); +#else + fseek(file, 
0, SEEK_SET); +#endif + + buffer_filled = 0; + buffer_released = true; + + return true; + } + + virtual std::pair read() + { + if (!buffer_released) + assert(0); + + buffer_filled = fread(buffer, 1, buffer_size, file); + buffer_released = false; + + return std::make_pair(buffer, buffer_filled); + } + + virtual void release(char *ptr) + { + buffer_released = true; + } + + virtual std::string get_file_name() const + { + return test_extension ? file_name : ""; + } + + bool is_open() const + { + return file != nullptr; + } + }; + + // ********************************************************************************** + // Base class of decompression engines, like zlib, igzip, plain-text, zstd, ... + // ********************************************************************************** + class stream_decompression_engine + { + protected: + stream_in_base* stream_in; + size_t out_buffer_size; + char* in_buffer_data = nullptr; + size_t in_buffer_filled = 0; + size_t in_buffer_pos = 0; + + public: + stream_decompression_engine(stream_in_base* stream_in, size_t out_buffer_size, char *in_buffer_data, size_t in_buffer_filled) : + stream_in(stream_in), + out_buffer_size(out_buffer_size), + in_buffer_data(in_buffer_data), + in_buffer_filled(in_buffer_filled) + {} + + virtual ~stream_decompression_engine() + {} + + static std::string ext(const std::string& fn, const size_t len) + { + if (fn.length() <= len) + return ""; + + return fn.substr(fn.length() - len, std::string::npos); + } + + virtual int read(char* ptr, size_t &readed) = 0; + }; + +// ********************************************************************************** +// Decompression engine passing text files without any decompression +// ********************************************************************************** + class stream_decompression_engine_text : public stream_decompression_engine + { + + public: + stream_decompression_engine_text(stream_in_base* stream_in, size_t out_buffer_size, char* in_buffer_data, 
size_t in_buffer_filled) + : stream_decompression_engine(stream_in, out_buffer_size, in_buffer_data, in_buffer_filled) + {} + + virtual ~stream_decompression_engine_text() + { + } + + static bool knows_it(const std::string& file_name, char* data, size_t size) + { + return true; + } + + virtual int read(char* ptr, size_t& readed) + { + readed = 0; + + while (true) + { + size_t to_copy = std::min(in_buffer_filled - in_buffer_pos, out_buffer_size - readed); + memcpy(ptr + readed, in_buffer_data + in_buffer_pos, to_copy); + in_buffer_pos += to_copy; + readed += to_copy; + + if (in_buffer_pos == in_buffer_filled) + { + stream_in->release(in_buffer_data); + in_buffer_pos = 0; + std::tie(in_buffer_data, in_buffer_filled) = stream_in->read(); + + if (in_buffer_filled == 0) + return readed ? 0 : -1; + } + + if (readed == out_buffer_size) + return 0; + } + } + }; + +#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP + // ********************************************************************************** + // Decompression engine using igzip library (for .gz files) + // ********************************************************************************** + class stream_decompression_engine_igzip : public stream_decompression_engine + { + constexpr static std::array magic_numbers = {0x1f, 0x8b}; + + struct isal_gzip_header gz_hdr; + struct inflate_state state; + + enum class internal_state_t {none, file_start, before_header, inside}; + internal_state_t internal_state = internal_state_t::none; + + void initialize() + { + isal_gzip_header_init(&gz_hdr); + isal_inflate_init(&state); +// state.crc_flag = ISAL_GZIP_NO_HDR_VER; + state.crc_flag = ISAL_GZIP; + + state.next_in = (uint8_t*) in_buffer_data; + state.avail_in = (uint32_t) in_buffer_filled; + + internal_state = internal_state_t::file_start; + } + + bool check_header() + { + if (state.avail_in >= magic_numbers.size()) + return std::equal(magic_numbers.begin(), magic_numbers.end(), state.next_in); + + uint32_t in_state_len = 
state.avail_in; + + if (!std::equal(magic_numbers.begin(), magic_numbers.begin() + in_state_len, state.next_in)) + return false; + + if (isal_inflate(&state) != ISAL_DECOMP_OK) + return false; + + if (!load_new_part()) + return false; + + return std::equal(magic_numbers.begin() + in_state_len, magic_numbers.end(), state.next_in); + } + + bool load_new_part() + { + if(in_buffer_data) + stream_in->release(in_buffer_data); + std::tie(in_buffer_data, in_buffer_filled) = stream_in->read(); + + state.next_in = (uint8_t*)in_buffer_data; + state.avail_in = (uint32_t)in_buffer_filled; + + return in_buffer_filled != 0; + } + + public: + stream_decompression_engine_igzip(stream_in_base *stream_in, size_t out_buffer_size, char *in_buffer_data, size_t in_buffer_filled) + : stream_decompression_engine(stream_in, out_buffer_size, in_buffer_data, in_buffer_filled) + { + isal_inflate_init(&state); + isal_gzip_header_init(&gz_hdr); + } + + virtual ~stream_decompression_engine_igzip() + { + if (in_buffer_data) + stream_in->release(in_buffer_data); + } + + static bool knows_it(const std::string& file_name, const char* data, const size_t size) + { + if (file_name.size() > 3 && ext(file_name, 3) != std::string(".gz")) + return false; + + if (size < magic_numbers.size()) + return false; + + return std::equal(magic_numbers.begin(), magic_numbers.end(), (uint8_t*) data); + } + + virtual int read(char* ptr, size_t& readed) + { + if (internal_state == internal_state_t::none) + initialize(); + + readed = 0; + + state.next_out = (uint8_t*) ptr; + state.avail_out = (uint32_t) out_buffer_size; + + if (internal_state == internal_state_t::before_header || internal_state == internal_state_t::file_start) + { + isal_inflate_reset(&state); + + if (!check_header()) + { + if (internal_state == internal_state_t::file_start) + return -2; // Error, no data in file + else + return -1; // Just end-of-file, maybe some junk data are still present, but do not care about them + } + + internal_state = 
internal_state_t::inside; + } + + do + { + if (state.avail_in == 0) + if (!load_new_part()) + return -1; + + int ret = isal_inflate(&state); + + if (ret != ISAL_DECOMP_OK) + return -3; // Error, broken gzip file + + readed = state.next_out - (uint8_t*)ptr; + state.avail_in = (uint32_t)(in_buffer_filled - (state.next_in - (uint8_t*)in_buffer_data)); + state.avail_out = (uint32_t)(out_buffer_size - readed); + + if (state.block_state == ISAL_BLOCK_FINISH) + { + internal_state = internal_state_t::before_header; + break; + } + + if (readed == out_buffer_size) + return 0; + + } while (true); + + return 0; + } + }; +#endif + +#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB + // ********************************************************************************** + // Decompression engine using zlib library (for .gz files) + // ********************************************************************************** + class stream_decompression_engine_zlib : public stream_decompression_engine + { + constexpr static std::array magic_numbers = { 0x1f, 0x8b }; + + z_stream state; + + enum class internal_state_t { none, file_start, before_header, inside }; + internal_state_t internal_state = internal_state_t::none; + + void initialize() + { + state.next_in = (uint8_t*)in_buffer_data; + state.avail_in = (uint32_t)in_buffer_filled; + + state.zalloc = NULL; + state.zfree = NULL; + state.opaque = NULL; + + inflateInit2(&state, 15 + 32); + + internal_state = internal_state_t::file_start; + } + + bool check_header() + { + if (state.avail_in >= magic_numbers.size()) + return std::equal(magic_numbers.begin(), magic_numbers.end(), state.next_in); + + uint32_t in_state_len = state.avail_in; + + if (!std::equal(magic_numbers.begin(), magic_numbers.begin() + in_state_len, state.next_in)) + return false; + + if (inflate(&state, Z_NO_FLUSH) != Z_OK) + return false; + + if (!load_new_part()) + return false; + + return std::equal(magic_numbers.begin() + in_state_len, magic_numbers.end(), 
state.next_in); + } + + bool load_new_part() + { + if (in_buffer_data) + stream_in->release(in_buffer_data); + std::tie(in_buffer_data, in_buffer_filled) = stream_in->read(); + + state.next_in = (uint8_t*)in_buffer_data; + state.avail_in = (uint32_t)in_buffer_filled; + + return in_buffer_filled != 0; + } + + public: + stream_decompression_engine_zlib(stream_in_base *stream_in, size_t buffer_size, char* in_buffer_data, size_t in_buffer_filled) + : stream_decompression_engine(stream_in, buffer_size, in_buffer_data, in_buffer_filled) + { + } + + virtual ~stream_decompression_engine_zlib() + { + if (in_buffer_data) + stream_in->release(in_buffer_data); + + if(internal_state != internal_state_t::none) + inflateEnd(&state); + } + + static bool knows_it(const std::string& file_name, const char* data, const size_t size) + { + if (file_name.size() > 3 && ext(file_name, 3) != std::string(".gz")) + return false; + + if (size < magic_numbers.size()) + return false; + + return std::equal(magic_numbers.begin(), magic_numbers.end(), (uint8_t*) data); + } + + virtual int read(char* ptr, size_t& readed) + { + if (internal_state == internal_state_t::none) + initialize(); + + readed = 0; + + state.next_out = (uint8_t*)ptr; + state.avail_out = out_buffer_size; + + if (internal_state == internal_state_t::before_header || internal_state == internal_state_t::file_start) + { + if(internal_state == internal_state_t::before_header) + inflateReset(&state); + + if (!check_header()) + { + if (internal_state == internal_state_t::file_start) + return -2; // Error, no data in file + else + return -1; // Just end-of-file, maybe some junk data are still present, but do not care about them + } + + internal_state = internal_state_t::inside; + } + + do + { + if (state.avail_in == 0) + if (!load_new_part()) + return -1; + + int ret = inflate(&state, Z_NO_FLUSH); + + if (ret == Z_DATA_ERROR) + return -3; // Error, broken gzip file + + readed = state.next_out - (uint8_t*)ptr; + state.avail_in = 
(uint32_t)in_buffer_filled - (state.next_in - (uint8_t*)in_buffer_data); + state.avail_out = (uint32_t)(out_buffer_size - readed); + + if (ret == Z_STREAM_END) + { + internal_state = internal_state_t::before_header; + break; + } + + if (readed == out_buffer_size) + return 0; + + } while (true); + + return 0; + } + }; +#endif + +#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZSTD + // ********************************************************************************** + // Decompression engine using zstd library (for .zst files) + // ********************************************************************************** + class stream_decompression_engine_zst : public stream_decompression_engine + { + constexpr static std::array magic_numbers = { 0x28, 0xb5, 0x2f, 0xfd }; + + ZSTD_DStream *state; + + ZSTD_inBuffer in_buffer; + ZSTD_outBuffer out_buffer; + + enum class internal_state_t { none, file_start, before_header, inside }; + internal_state_t internal_state = internal_state_t::none; + + void initialize() + { + ZSTD_initDStream(state); + + in_buffer.src = in_buffer_data; + in_buffer.size = in_buffer_filled; + in_buffer.pos = 0; + + internal_state = internal_state_t::file_start; + } + + bool check_header() + { + if (in_buffer.size - in_buffer.pos >= magic_numbers.size()) + return std::equal(magic_numbers.begin(), magic_numbers.end(), (uint8_t*) in_buffer.src + in_buffer.pos); + + uint32_t in_state_len = in_buffer.size - in_buffer.pos; + + if (!std::equal(magic_numbers.begin(), magic_numbers.begin() + in_state_len, (uint8_t*)in_buffer.src + in_buffer.pos)) + return false; + + if(ZSTD_isError(ZSTD_decompressStream(state, &out_buffer, &in_buffer))) // result is an unsigned size_t, so "< 0" could never detect a failure + return false; + + if (!load_new_part()) + return false; + + return std::equal(magic_numbers.begin() + in_state_len, magic_numbers.end(), (uint8_t*) in_buffer.src); + } + + bool load_new_part() + { + if (in_buffer_data) + stream_in->release(in_buffer_data); + std::tie(in_buffer_data, in_buffer_filled) = stream_in->read(); + + in_buffer.src = 
in_buffer_data; + in_buffer.size = in_buffer_filled; + in_buffer.pos = 0; + + return in_buffer_filled != 0; + } + + public: + stream_decompression_engine_zst(stream_in_base *stream_in, size_t buffer_size, char* in_buffer_data, size_t in_buffer_filled) + : stream_decompression_engine(stream_in, buffer_size, in_buffer_data, in_buffer_filled) + { + state = ZSTD_createDStream(); + } + + virtual ~stream_decompression_engine_zst() + { + ZSTD_freeDStream(state); + } + + static bool knows_it(const std::string& file_name, const char* data, const size_t size) + { + if (file_name.size() > 4 && ext(file_name, 4) != std::string(".zst")) + return false; + + if (size < magic_numbers.size()) + return false; + + return std::equal(magic_numbers.begin(), magic_numbers.end(), (uint8_t*) data); + } + + virtual int read(char* ptr, size_t& readed) + { + if (internal_state == internal_state_t::none) + initialize(); + + readed = 0; + + out_buffer.dst = ptr; + out_buffer.size = out_buffer_size; + out_buffer.pos = 0; + + if (internal_state == internal_state_t::before_header || internal_state == internal_state_t::file_start) + { + if (internal_state == internal_state_t::before_header) + ZSTD_initDStream(state); + + if (!check_header()) + { + if (internal_state == internal_state_t::file_start) + return -2; // Error, no data in file + else + return -1; // Just end-of-file, maybe some junk data are still present, but do not care about them + } + + internal_state = internal_state_t::inside; + } + + do + { + if (in_buffer.pos == in_buffer.size) + if (!load_new_part()) + return -1; + + size_t ret = ZSTD_decompressStream(state, &out_buffer, &in_buffer); // size_t: 0 means the frame is complete, errors are detected with ZSTD_isError + + if (ZSTD_isError(ret)) + return -3; // Error, broken zstd file + + readed = out_buffer.pos; + + if (ret == 0) + { + internal_state = internal_state_t::before_header; + break; + } + + if (readed == out_buffer_size) + return 0; + + } while (true); + + return 0; + } + }; +#endif + +#if defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP) && defined 
(REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB) + using stream_decompression_engine_gz = stream_decompression_engine_igzip; +#else +#if defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP) + using stream_decompression_engine_gz = stream_decompression_engine_igzip; +#else +#if defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB) + using stream_decompression_engine_gz = stream_decompression_engine_zlib; +#endif +#endif +#endif + + // ********************************************************************************** + // Main class for decompression of stream data (file, stdin, ...) in some compressed format (.gz, .zstd, ...) + // ********************************************************************************** + class stream_decompression + { + public: + enum class format_t { unknown, text, gzip, zstd }; + + private: + stream_decompression_engine* engine = nullptr; + format_t format = format_t::unknown; + size_t engine_part_size; + + char* buffer; + size_t size; + size_t filled; + size_t pos; + bool eof_marker = true; + + bool determine_format(stream_in_base* stream_in) + { + format = format_t::unknown; + if (engine) + { + delete engine; + engine = nullptr; + } + + if (!stream_in) + return false; + + char* ptr; + std::tie(ptr, filled) = stream_in->read(); + +#if defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_IGZIP) || defined(REFRESH_STREAM_DECOMPRESSION_ENABLE_ZLIB) + if (stream_decompression_engine_gz::knows_it(stream_in->get_file_name(), ptr, filled)) + { + format = format_t::gzip; + engine = new stream_decompression_engine_gz(stream_in, engine_part_size, ptr, filled); + } + else +#endif +#ifdef REFRESH_STREAM_DECOMPRESSION_ENABLE_ZSTD + if (stream_decompression_engine_zst::knows_it(stream_in->get_file_name(), ptr, filled)) + { + format = format_t::zstd; + engine = new stream_decompression_engine_zst(stream_in, engine_part_size, ptr, filled); + } + else +#endif + if (stream_decompression_engine_text::knows_it(stream_in->get_file_name(), ptr, filled)) + { + format = 
format_t::text; + engine = new stream_decompression_engine_text(stream_in, engine_part_size, ptr, filled); + } + + return format != format_t::unknown; + } + + int fill_buffer() + { + int ret = engine->read(buffer, filled); + pos = 0; + + return ret; + } + + public: + stream_decompression(stream_in_base *stream_in, size_t engine_part_size = 16 << 20) : + engine_part_size(engine_part_size) + { + determine_format(stream_in); + eof_marker = false; + filled = 0; + + buffer = new char[engine_part_size]; + pos = 0; + } + + ~stream_decompression() + { + if (engine) + delete engine; + + delete[] buffer; + } + + bool restart(stream_in_base* stream_in) + { + auto ret = determine_format(stream_in); + + filled = 0; + pos = 0; + + eof_marker = false; + + return ret; + } + + bool release() + { + if (!engine) + return false; + + delete engine; + engine = nullptr; + + return true; + } + + int getc() + { + if (pos < filled) + return (unsigned char) buffer[pos++]; // via unsigned char, so bytes >= 0x80 are never mistaken for negative error codes + + auto ret = fill_buffer(); + + if (ret < 0) + { + eof_marker = true; + return ret; + } + + if (filled == 0) + return -1; + + return (unsigned char) buffer[pos++]; // same conversion as above + } + + int read(char* dest, size_t requested, size_t &readed) + { + char* p = dest; + readed = 0; + int ret = 0; + + while (readed < requested) + { + size_t to_copy = std::min(requested - readed, filled - pos); + memcpy(p, buffer + pos, to_copy); + readed += to_copy; + pos += to_copy; + p += to_copy; + + if (pos == filled) + { + ret = fill_buffer(); + + if(ret < 0) + { + eof_marker = true; + return ret; + } + } + } + + return 0; + } + + int getline(std::string& str) + { + str.clear(); + + int ret = 0; + + while (true) + { + auto q = std::find(buffer + pos, buffer + filled, (char) 0x0a); + + str.append(buffer + pos, q); + + pos = q - buffer; + + if (pos != filled) + { + ++pos; + break; + } + + ret = fill_buffer(); + + if (ret < 0) + { + eof_marker = true; + break; + } + } + + if (!str.empty() && str.back() == 0x0d) + str.pop_back(); + + return ret; + } + + bool eof() const + { + return 
eof_marker; + } + + format_t get_format() const + { + return format; + } + }; +} + +#endif diff --git a/libs/gz_wrapper.h b/3rd_party/refresh/compression/lib/gz_wrapper.h similarity index 95% rename from libs/gz_wrapper.h rename to 3rd_party/refresh/compression/lib/gz_wrapper.h index dc106bb..e206e0d 100644 --- a/libs/gz_wrapper.h +++ b/3rd_party/refresh/compression/lib/gz_wrapper.h @@ -1,189 +1,189 @@ -#ifndef _GZ_WRAPPER_H -#define _GZ_WRAPPER_H - -#include -#include -#include - -#include - -namespace refresh -{ - // ********************************************************************************** - class gz_in_memory - { - static const int min_compression_level = 1; - static const int max_compression_level = 12; - - int compression_level; - bool low_memory; - - libdeflate_compressor *ld_comp = nullptr; - libdeflate_decompressor *ld_decomp = nullptr; - - enum class mode_t { none, buffer }; - - mode_t working_mode = mode_t::none; - - void ensure_comp() - { - if (!ld_comp) - ld_comp = libdeflate_alloc_compressor(compression_level); - } - - void ensure_decomp() - { - if (!ld_decomp) - ld_decomp = libdeflate_alloc_decompressor(); - } - - void free_comp() - { - if (ld_comp) - { - libdeflate_free_compressor(ld_comp); - ld_comp = nullptr; - } - } - - void free_decomp() - { - if (ld_decomp) - { - libdeflate_free_decompressor(ld_decomp); - ld_decomp = nullptr; - } - } - - void check_and_set_compression_level(int _compression_level) - { - compression_level = _compression_level; - - if (compression_level < min_compression_level) - compression_level = min_compression_level; - - if (compression_level > max_compression_level) - compression_level = max_compression_level; - } - - public: - gz_in_memory(int _compression_level = 9, bool low_memory = false) : - low_memory(low_memory) - { - check_and_set_compression_level(_compression_level); - } - - gz_in_memory(gz_in_memory&& rhs) noexcept - { - compression_level = rhs.compression_level; - low_memory = rhs.low_memory; - - 
working_mode = rhs.working_mode; - rhs.working_mode = mode_t::none; - - ld_comp = rhs.ld_comp; - rhs.ld_comp = nullptr; - - ld_decomp = rhs.ld_decomp; - rhs.ld_decomp = nullptr; - }; - - gz_in_memory& operator=(gz_in_memory&& rhs) noexcept - { - compression_level = rhs.compression_level; - low_memory = rhs.low_memory; - - working_mode = rhs.working_mode; - rhs.working_mode = mode_t::none; - - free_comp(); - ld_comp = rhs.ld_comp; - rhs.ld_comp = nullptr; - - free_decomp(); - ld_decomp = rhs.ld_decomp; - rhs.ld_decomp = nullptr; - - return *this; - } - - ~gz_in_memory() - { - free_comp(); - free_decomp(); - } - - void set_compression_level(size_t _compression_level) - { - check_and_set_compression_level(_compression_level); - - free_comp(); - } - - static int get_min_compression_level() - { - return min_compression_level; - } - - static int get_max_compression_level() - { - return max_compression_level; - } - - size_t get_overhead(size_t to_compress_size) - { - ensure_comp(); - - auto r = libdeflate_gzip_compress_bound(ld_comp, to_compress_size) - to_compress_size; - - if (low_memory) - free_comp(); - - return r; - } - - size_t compress(const void* src, const size_t src_size, void* dest, size_t dest_size, int level = 0) - { - if (working_mode == mode_t::none) - working_mode = mode_t::buffer; - else if (working_mode != mode_t::buffer) - return 0; - - if (level == 0) - level = compression_level; - - if (level != compression_level) - { - free_comp(); - check_and_set_compression_level(level); - } - - ensure_comp(); - - if (libdeflate_gzip_compress_bound(ld_comp, src_size) > dest_size) - return 0; - - auto r = libdeflate_gzip_compress(ld_comp, src, src_size, dest, dest_size); - - if (low_memory) - free_comp(); - - return r; - } - - size_t decompress(const void* src, const size_t src_size, void* dest, size_t dest_size) - { - ensure_decomp(); - - size_t decoded_size; - auto r = libdeflate_gzip_decompress(ld_decomp, src, src_size, dest, dest_size, &decoded_size); - - if 
(low_memory) - free_decomp(); - - return decoded_size; - } - }; -} - -#endif +#ifndef _GZ_WRAPPER_H +#define _GZ_WRAPPER_H + +#include +#include +#include + +#include + +namespace refresh +{ + // ********************************************************************************** + class gz_in_memory + { + static const int min_compression_level = 1; + static const int max_compression_level = 12; + + int compression_level; + bool low_memory; + + libdeflate_compressor *ld_comp = nullptr; + libdeflate_decompressor *ld_decomp = nullptr; + + enum class mode_t { none, buffer }; + + mode_t working_mode = mode_t::none; + + void ensure_comp() + { + if (!ld_comp) + ld_comp = libdeflate_alloc_compressor(compression_level); + } + + void ensure_decomp() + { + if (!ld_decomp) + ld_decomp = libdeflate_alloc_decompressor(); + } + + void free_comp() + { + if (ld_comp) + { + libdeflate_free_compressor(ld_comp); + ld_comp = nullptr; + } + } + + void free_decomp() + { + if (ld_decomp) + { + libdeflate_free_decompressor(ld_decomp); + ld_decomp = nullptr; + } + } + + void check_and_set_compression_level(int _compression_level) + { + compression_level = _compression_level; + + if (compression_level < min_compression_level) + compression_level = min_compression_level; + + if (compression_level > max_compression_level) + compression_level = max_compression_level; + } + + public: + gz_in_memory(int _compression_level = 9, bool low_memory = false) : + low_memory(low_memory) + { + check_and_set_compression_level(_compression_level); + } + + gz_in_memory(gz_in_memory&& rhs) noexcept + { + compression_level = rhs.compression_level; + low_memory = rhs.low_memory; + + working_mode = rhs.working_mode; + rhs.working_mode = mode_t::none; + + ld_comp = rhs.ld_comp; + rhs.ld_comp = nullptr; + + ld_decomp = rhs.ld_decomp; + rhs.ld_decomp = nullptr; + }; + + gz_in_memory& operator=(gz_in_memory&& rhs) noexcept + { + compression_level = rhs.compression_level; + low_memory = rhs.low_memory; + + working_mode 
= rhs.working_mode; + rhs.working_mode = mode_t::none; + + free_comp(); + ld_comp = rhs.ld_comp; + rhs.ld_comp = nullptr; + + free_decomp(); + ld_decomp = rhs.ld_decomp; + rhs.ld_decomp = nullptr; + + return *this; + } + + ~gz_in_memory() + { + free_comp(); + free_decomp(); + } + + void set_compression_level(size_t _compression_level) + { + check_and_set_compression_level(_compression_level); + + free_comp(); + } + + static int get_min_compression_level() + { + return min_compression_level; + } + + static int get_max_compression_level() + { + return max_compression_level; + } + + size_t get_overhead(size_t to_compress_size) + { + ensure_comp(); + + auto r = libdeflate_gzip_compress_bound(ld_comp, to_compress_size) - to_compress_size; + + if (low_memory) + free_comp(); + + return r; + } + + size_t compress(const void* src, const size_t src_size, void* dest, size_t dest_size, int level = 0) + { + if (working_mode == mode_t::none) + working_mode = mode_t::buffer; + else if (working_mode != mode_t::buffer) + return 0; + + if (level == 0) + level = compression_level; + + if (level != compression_level) + { + free_comp(); + check_and_set_compression_level(level); + } + + ensure_comp(); + + if (libdeflate_gzip_compress_bound(ld_comp, src_size) > dest_size) + return 0; + + auto r = libdeflate_gzip_compress(ld_comp, src, src_size, dest, dest_size); + + if (low_memory) + free_comp(); + + return r; + } + + size_t decompress(const void* src, const size_t src_size, void* dest, size_t dest_size) + { + ensure_decomp(); + + size_t decoded_size = 0; + auto r = libdeflate_gzip_decompress(ld_decomp, src, src_size, dest, dest_size, &decoded_size); + + if (low_memory) + free_decomp(); + + return r == LIBDEFLATE_SUCCESS ? decoded_size : 0; // 0 on any decompression failure, like compress() + } + }; +} + +#endif diff --git a/3rd_party/refresh/string_operations/lib/string_operations.h b/3rd_party/refresh/string_operations/lib/string_operations.h new file mode 100644 index 0000000..b342ff7 --- /dev/null +++ b/3rd_party/refresh/string_operations/lib/string_operations.h @@ 
-0,0 +1,153 @@ +#pragma once + +#include + +#if defined(__x86_64__) || defined(_M_AMD64) +#include +#include +#elif defined(__aarch64__) +#include +#endif + + +namespace refresh +{ + namespace details + { + template + size_t matching_length_naive(Iter* first, Iter* second, size_t max_length) + { + size_t len; + + for (len = 0; len < max_length; ++len) + if (first[len] != second[len]) + break; + + return len; + } + +#if defined(__x86_64__) || defined(_M_AMD64) + // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html + + template + size_t matching_length_avx(Iter* first, Iter* second, size_t max_length) + { + const size_t CHAR_SIZE = sizeof(*first); + + size_t len; + + if (max_length < 16 / CHAR_SIZE) + return matching_length_naive(first, second, max_length); + + const uint32_t VEC_SIZE = 16 / CHAR_SIZE; + + for (len = 0; len + VEC_SIZE < max_length; len += VEC_SIZE) + { + const __m128i x = _mm_loadu_si128((__m128i const*) (first + len)); + const __m128i y = _mm_loadu_si128((__m128i const*) (second + len)); + + uint32_t m = (uint32_t)(_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xffff); + + if (m != 0) + return len + std::countr_zero(m) / CHAR_SIZE; + } + + auto rest = max_length % VEC_SIZE; + + if(rest) + len -= VEC_SIZE - rest; + + const __m128i x = _mm_loadu_si128((__m128i const*) (first + len)); + const __m128i y = _mm_loadu_si128((__m128i const*) (second + len)); + + uint32_t m = (uint32_t)(_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)) ^ 0xffff); + + if (m != 0) + return len + std::countr_zero(m) / CHAR_SIZE; + + return max_length; + } +#endif + +#if defined(__aarch64__) + // https://developer.arm.com/architectures/instruction-sets/intrinsics + + template + size_t matching_length_neon(Iter* first, Iter* second, size_t max_length) + { + const size_t CHAR_SIZE = sizeof(*first); + + size_t len; + + if (max_length < 16 / CHAR_SIZE) + return matching_length_naive(first, second, max_length); + + const uint32_t VEC_SIZE = 16 / CHAR_SIZE; + + for (len = 0; 
len + VEC_SIZE < max_length; len += VEC_SIZE) + { + const uint8x16_t x = vld1q_u8((uint8_t const*) (first + len)); + const uint8x16_t y = vld1q_u8((uint8_t const*) (second + len)); + + uint8x16_t c = vceqq_u8(x, y); + uint64x2_t m = vreinterpretq_u64_u8(c); + + if (~m[0]) + return len + std::countr_zero(~m[0]) / 8 / CHAR_SIZE; + else if(~m[1]) + return len + 8 / CHAR_SIZE + std::countr_zero(~m[1]) / 8 / CHAR_SIZE; + } + + auto rest = max_length % VEC_SIZE; + + if (rest) + len -= VEC_SIZE - rest; + + const uint8x16_t x = vld1q_u8((uint8_t const*)(first + len)); + const uint8x16_t y = vld1q_u8((uint8_t const*)(second + len)); + + uint8x16_t c = vceqq_u8(x, y); + uint64x2_t m = vreinterpretq_u64_u8(c); + + if (~m[0]) + return len + std::countr_zero(~m[0]) / 8 / CHAR_SIZE; + else if (~m[1]) + return len + 8 / CHAR_SIZE + std::countr_zero(~m[1]) / 8 / CHAR_SIZE; + + return max_length; + } +#endif + } + + template + size_t matching_length(Iter* first, Iter* second, size_t max_length) + { + +#if defined(__x86_64__) || defined(_M_AMD64) + if constexpr (sizeof(*first) == 1) + return details::matching_length_avx(first, second, max_length); + else if constexpr (sizeof(*first) == 2) + return details::matching_length_avx(first, second, max_length); + else if constexpr (sizeof(*first) == 4) + return details::matching_length_avx(first, second, max_length); + else if constexpr (sizeof(*first) == 8) + return details::matching_length_avx(first, second, max_length); + else + return details::matching_length_naive(first, second, max_length); +#elif defined(__aarch64__) + if constexpr (sizeof(*first) == 1) + return details::matching_length_neon(first, second, max_length); + else if constexpr (sizeof(*first) == 2) + return details::matching_length_neon(first, second, max_length); + else if constexpr (sizeof(*first) == 4) + return details::matching_length_neon(first, second, max_length); + else if constexpr (sizeof(*first) == 8) + return details::matching_length_neon(first, second, 
max_length); + else + return details::matching_length_naive(first, second, max_length); +#else + return details::matching_length_naive(first, second, max_length); // portable fallback when neither SSE nor NEON path is compiled +#endif + + } +} \ No newline at end of file diff --git a/agc-dev.sln b/agc-dev.sln index 967d2bf..ce21dda 100644 --- a/agc-dev.sln +++ b/agc-dev.sln @@ -11,8 +11,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libzstd", "3rd_party\zstd\b EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Raduls", "3rd_party\raduls-inplace\Raduls\Raduls.vcxproj", "{4C0F01B6-4967-40E6-A39A-90D9BB2281AD}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "isa-l-vc", "3rd_party\isa-l\isa-l-vc.vcxproj", "{68837F79-3D7F-4F4A-AF10-99383B70F7A9}" -EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libdeflate_static", "3rd_party\libdeflate\build-vs\libdeflate_static.vcxproj", "{0DF64642-6604-30B0-9FFA-1BB593BA2CB1}" EndProject Global @@ -91,22 +89,6 @@ Global {4C0F01B6-4967-40E6-A39A-90D9BB2281AD}.RelWithDebInfo|x64.Build.0 = Release|x64 {4C0F01B6-4967-40E6-A39A-90D9BB2281AD}.RelWithDebInfo|x86.ActiveCfg = Release|Win32 {4C0F01B6-4967-40E6-A39A-90D9BB2281AD}.RelWithDebInfo|x86.Build.0 = Release|Win32 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.Debug|x64.ActiveCfg = Debug|x64 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.Debug|x64.Build.0 = Debug|x64 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.Debug|x86.ActiveCfg = Debug|Win32 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.Debug|x86.Build.0 = Debug|Win32 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.MinSizeRel|x64.ActiveCfg = Debug|x64 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.MinSizeRel|x64.Build.0 = Debug|x64 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.MinSizeRel|x86.ActiveCfg = Debug|Win32 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.MinSizeRel|x86.Build.0 = Debug|Win32 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.Release|x64.ActiveCfg = Release|x64 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.Release|x64.Build.0 = Release|x64 - 
{68837F79-3D7F-4F4A-AF10-99383B70F7A9}.Release|x86.ActiveCfg = Release|Win32 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.Release|x86.Build.0 = Release|Win32 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.RelWithDebInfo|x64.ActiveCfg = Release|x64 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.RelWithDebInfo|x64.Build.0 = Release|x64 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.RelWithDebInfo|x86.ActiveCfg = Release|Win32 - {68837F79-3D7F-4F4A-AF10-99383B70F7A9}.RelWithDebInfo|x86.Build.0 = Release|Win32 {0DF64642-6604-30B0-9FFA-1BB593BA2CB1}.Debug|x64.ActiveCfg = Debug|x64 {0DF64642-6604-30B0-9FFA-1BB593BA2CB1}.Debug|x64.Build.0 = Debug|x64 {0DF64642-6604-30B0-9FFA-1BB593BA2CB1}.Debug|x86.ActiveCfg = Debug|x64 diff --git a/makefile b/makefile index 8b5ddbb..1312c67 100644 --- a/makefile +++ b/makefile @@ -1,254 +1,88 @@ -AGC_ROOT_DIR = . -AGC_MAIN_DIR = src -AGC_EXAMPLES_DIR = src/examples -AGC_CORE_DIR = src/core -AGC_APP_DIR = src/app -AGC_CXX_DIR = src/lib-cxx -AGC_LIBS_DIR = libs -LIBS_DIR = . #/usr/local/lib -INC_DIRS =. 
/usr/local/include 3rd_party/mimalloc/include 3rd_party/zstd/lib 3rd_party/zlib-ng/ 3rd_party/raduls-inplace/Raduls 3rd_party/isa-l/include 3rd_party/libdeflate -INCLUDE_DIR=$(foreach d, $(INC_DIRS), -I$d) -PY_AGC_API_DIR = py_agc_api -PYBIND11_LIB = $(PY_AGC_API_DIR)/pybind11-2.11.1/pybind11/include - -ifdef MSVC # Avoid the MingW/Cygwin sections - uname_S := Windows - uname_M := "x86_64" -else # If uname not available => 'not' - uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') - uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') +# all: agc libagc py_agc_api +all: agc libagc py_agc_api + +# *** REFRESH makefile utils +include refresh.mk + +$(call INIT_SUBMODULES) +$(call INIT_GLOBALS) +$(call CHECK_OS_ARCH, $(PLATFORM)) + +# *** Project directories +$(call SET_SRC_OBJ_BIN,src,obj,bin) +3RD_PARTY_DIR := ./3rd_party + +# *** Project configuration +$(call CHECK_NASM) +$(call ADD_MIMALLOC, $(3RD_PARTY_DIR)/mimalloc) +$(call PROPOSE_ISAL, $(3RD_PARTY_DIR)/isa-l) +$(call PROPOSE_ZLIB_NG, $(3RD_PARTY_DIR)/zlib-ng) +$(call CHOOSE_GZIP_DECOMPRESSION) +$(call ADD_LIBDEFLATE, $(3RD_PARTY_DIR)/libdeflate) +$(call ADD_LIBZSTD, $(3RD_PARTY_DIR)/zstd) +$(call ADD_RADULS_INPLACE,$(3RD_PARTY_DIR)/raduls-inplace) +$(call ADD_PYBIND11,$(3RD_PARTY_DIR)/pybind11/include) +$(call SET_STATIC, $(STATIC_LINK)) + +$(call SET_C_CPP_STANDARDS, c11, c++20) +$(call ADD_REFRESH_LIB, $(3RD_PARTY_DIR)) + +$(call SET_GIT_COMMIT) + +$(call SET_FLAGS, $(TYPE)) + +$(call SET_COMPILER_VERSION_ALLOWED, GCC, Linux_x86_64, 10, 20) +$(call SET_COMPILER_VERSION_ALLOWED, GCC, Linux_aarch64, 11, 20) +$(call SET_COMPILER_VERSION_ALLOWED, GCC, Darwin_x86_64, 11, 13) +$(call SET_COMPILER_VERSION_ALLOWED, GCC, Darwin_arm64, 11, 13) + +ifneq ($(MAKECMDGOALS),clean) +$(call CHECK_COMPILER_VERSION) endif -NASM_V := $(shell nasm --version 2>/dev/null) - -RADULS_DIR = 3rd_party/raduls-inplace/Raduls -ZSTD_DIR = 3rd_party/zstd -ZLIB_DIR = 3rd_party/zlib-ng -ISAL_DIR = 3rd_party/isa-l 
-LIBDEFLATE_DIR = 3rd_party/libdeflate - -MIMALLOC_INLUCDE_DIR = 3rd_party/mimalloc/include -MIMALLOC_OBJ=libs/mimalloc.o - - -ifeq ($(PLATFORM), arm8) -$(info *** ARMv8 with NEON extensions ***) - ARCH_FLAGS := -march=armv8-a -DARCH_ARM -else ifeq ($(PLATFORM), m1) -$(info *** Apple M1(or never) with NEON extensions ***) - ARCH_FLAGS := -march=armv8.4-a -DARCH_ARM -else ifeq ($(PLATFORM), sse2) -$(info *** x86-64 with SSE2 extensions ***) - ARCH_FLAGS := -msse2 -m64 -DARCH_X64 -else ifeq ($(PLATFORM), avx) -$(info *** x86-64 with AVX extensions ***) - ARCH_FLAGS := -mavx -m64 -DARCH_X64 -else ifeq ($(PLATFORM), avx2) -$(info *** x86-64 with AVX2 extensions ***) - ARCH_FLAGS := -mavx2 -m64 -DARCH_X64 -else -$(info *** Unspecified platform - use native compilation) - ifeq ($(uname_M),x86_64) - ARCH_FLAGS := -march=native -DARCH_X64 - else - ARCH_FLAGS := -march=native -DARCH_ARM - endif -endif - - -#CXX = g++ #(by default) - -AR = ar -CFLAGS = -fPIC -Wall -g -O3 $(ARCH_FLAGS) -std=c++17 -pthread -I $(INCLUDE_DIR) -fpermissive -#CLINK = -lm -lz -lpthread -std=c++17 -#CLINK = -lm -lpthread -std=c++17 -lc -PY_CFLAGS = -Wl,-undefined,dynamic_lookup -fPIC -Wall -shared -std=c++17 -O3 -I $(INCLUDE_DIR) - -ifeq ($(uname_S),Linux) - CLINK = -lm -static -O -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++17 -lc -fabi-version=6 - AR_OPT=rcs -o - PY_AGC_API_CFLAGS = -fPIC -Wall -shared -std=c++14 -O3 -endif - -ifeq ($(uname_S),Darwin) - AR_OPT=-rcs - PY_AGC_API_CFLAGS = -Wl,-undefined,dynamic_lookup -fPIC -Wall -shared -std=c++14 -O3 - CLINK = -lm -lpthread -std=c++17 -lc -static-libgcc -endif - -LIB_ZSTD=libzstd.a -LIB_RADULS=libraduls.a -LIB_DEFLATE=libdeflate.a - -ifeq ($(uname_M),x86_64) - ifdef NASM_V - GZ_LIB = isa-l.a - gz_target = isa-l - CFLAGS+=-DREFRESH_USE_IGZIP - else - GZ_LIB = libz.a - gz_target = ng_zlib - CFLAGS+=-DREFRESH_USE_ZLIB - endif -else - GZ_LIB = libz.a - gz_target = ng_zlib - CFLAGS+=-DREFRESH_USE_ZLIB -endif - - -# default install 
location (binary placed in the /bin folder) -prefix = /usr/local - -# optional install location -exec_prefix = $(prefix) - - -all: agc libagc py_agc_api raduls zstd $(gz_target) libdeflate - -%.o: %.cpp - $(CXX) $(CFLAGS) -c $< -o $@ - -agc: raduls zstd $(gz_target) libdeflate $(MIMALLOC_OBJ) \ - $(AGC_APP_DIR)/main.o \ - $(AGC_CORE_DIR)/agc_basic.o \ - $(AGC_CORE_DIR)/agc_compressor.o \ - $(AGC_CORE_DIR)/agc_decompressor.o \ - $(AGC_CORE_DIR)/agc_decompressor_lib.o \ - $(AGC_APP_DIR)/application.o \ - $(AGC_CORE_DIR)/archive.o \ - $(AGC_CORE_DIR)/collection.o \ - $(AGC_CORE_DIR)/collection_v1.o \ - $(AGC_CORE_DIR)/collection_v2.o \ - $(AGC_CORE_DIR)/collection_v3.o \ - $(AGC_CORE_DIR)/genome_io.o \ - $(AGC_CORE_DIR)/lz_diff.o \ - $(AGC_CORE_DIR)/segment.o \ - $(AGC_CORE_DIR)/utils.o - $(CXX) -o $(AGC_ROOT_DIR)/$@ \ +# *** Source files and rules +$(eval $(call PREPARE_DEFAULT_COMPILE_RULE,APP,app)) +$(eval $(call PREPARE_DEFAULT_COMPILE_RULE,CORE,core)) +$(eval $(call PREPARE_DEFAULT_COMPILE_RULE,COMMON,common)) +$(eval $(call PREPARE_DEFAULT_COMPILE_RULE,EXAMPLES,examples)) +$(eval $(call PREPARE_DEFAULT_COMPILE_RULE,LIB_CXX,lib-cxx)) +$(eval $(call PREPARE_DEFAULT_COMPILE_RULE,PY_AGC_API,py_agc_api,$(PY_FLAGS))) + + +# *** Targets +agc: $(OUT_BIN_DIR)/agc +$(OUT_BIN_DIR)/agc: \ + $(OBJ_APP) $(OBJ_CORE) $(OBJ_COMMON) + -mkdir -p $(OUT_BIN_DIR) + $(CXX) -o $@ \ $(MIMALLOC_OBJ) \ - $(AGC_APP_DIR)/main.o \ - $(AGC_CORE_DIR)/agc_basic.o \ - $(AGC_CORE_DIR)/agc_compressor.o \ - $(AGC_CORE_DIR)/agc_decompressor.o \ - $(AGC_CORE_DIR)/agc_decompressor_lib.o \ - $(AGC_APP_DIR)/application.o \ - $(AGC_CORE_DIR)/archive.o \ - $(AGC_CORE_DIR)/collection.o \ - $(AGC_CORE_DIR)/collection_v1.o \ - $(AGC_CORE_DIR)/collection_v2.o \ - $(AGC_CORE_DIR)/collection_v3.o \ - $(AGC_CORE_DIR)/genome_io.o \ - $(AGC_CORE_DIR)/lz_diff.o \ - $(AGC_CORE_DIR)/segment.o \ - $(AGC_CORE_DIR)/utils.o \ - $(AGC_LIBS_DIR)/$(LIB_ZSTD) \ - $(AGC_LIBS_DIR)/$(GZ_LIB) \ - $(AGC_LIBS_DIR)/$(LIB_RADULS) \ 
- $(AGC_LIBS_DIR)/$(LIB_DEFLATE) \ - $(CLINK) - -libagc: zstd \ - $(AGC_CXX_DIR)/lib-cxx.o \ - $(AGC_CORE_DIR)/agc_basic.o \ - $(AGC_CORE_DIR)/agc_decompressor_lib.o \ - $(AGC_CORE_DIR)/archive.o \ - $(AGC_CORE_DIR)/collection.o \ - $(AGC_CORE_DIR)/collection_v1.o \ - $(AGC_CORE_DIR)/collection_v2.o \ - $(AGC_CORE_DIR)/collection_v3.o \ - $(AGC_CORE_DIR)/genome_io.o \ - $(AGC_CORE_DIR)/lz_diff.o \ - $(AGC_CORE_DIR)/segment.o \ - $(AGC_CORE_DIR)/utils.o - $(AR) $(AR_OPT) $(AGC_ROOT_DIR)/$@.a \ - $(AGC_CXX_DIR)/lib-cxx.o \ - $(AGC_CORE_DIR)/agc_basic.o \ - $(AGC_CORE_DIR)/agc_decompressor_lib.o \ - $(AGC_CORE_DIR)/archive.o \ - $(AGC_CORE_DIR)/collection.o \ - $(AGC_CORE_DIR)/collection_v1.o \ - $(AGC_CORE_DIR)/collection_v2.o \ - $(AGC_CORE_DIR)/collection_v3.o \ - $(AGC_CORE_DIR)/genome_io.o \ - $(AGC_CORE_DIR)/lz_diff.o \ - $(AGC_CORE_DIR)/segment.o \ - $(AGC_CORE_DIR)/utils.o - -.PHONY:py_agc_api -py_agc_api: zstd \ - $(PY_AGC_API_DIR)/py_agc_api.cpp $(AGC_CXX_DIR)/lib-cxx.o \ - $(AGC_CORE_DIR)/agc_basic.o \ - $(AGC_CORE_DIR)/agc_decompressor_lib.o \ - $(AGC_CORE_DIR)/archive.o \ - $(AGC_CORE_DIR)/collection.o \ - $(AGC_CORE_DIR)/collection_v1.o \ - $(AGC_CORE_DIR)/collection_v2.o \ - $(AGC_CORE_DIR)/collection_v3.o \ - $(AGC_CORE_DIR)/lz_diff.o \ - $(AGC_CORE_DIR)/segment.o \ - $(AGC_CORE_DIR)/utils.o - $(CXX) $(PY_CFLAGS) \ - $(PY_AGC_API_DIR)/py_agc_api.cpp \ - $(AGC_CXX_DIR)/lib-cxx.o \ - $(AGC_CORE_DIR)/agc_basic.o \ - $(AGC_CORE_DIR)/agc_decompressor_lib.o \ - $(AGC_CORE_DIR)/archive.o \ - $(AGC_CORE_DIR)/collection.o \ - $(AGC_CORE_DIR)/collection_v1.o \ - $(AGC_CORE_DIR)/collection_v2.o \ - $(AGC_CORE_DIR)/collection_v3.o \ - $(AGC_CORE_DIR)/lz_diff.o \ - $(AGC_CORE_DIR)/segment.o \ - $(AGC_CORE_DIR)/utils.o \ - $(AGC_LIBS_DIR)/$(LIB_ZSTD) \ - -I $(AGC_MAIN_DIR) \ - -I $(PYBIND11_LIB) \ - -I `python3 -c "import sysconfig;print(sysconfig.get_paths()['include'])"` \ - -o $@`python3-config --extension-suffix` - -raduls: - cd $(RADULS_DIR) && $(MAKE) - cp 
$(RADULS_DIR)/libraduls.a $(AGC_LIBS_DIR) - -zstd: - cd $(ZSTD_DIR) && $(MAKE) lib - cp $(ZSTD_DIR)/lib/libzstd.* $(AGC_LIBS_DIR) - -ng_zlib: - cd $(ZLIB_DIR) && ./configure --zlib-compat && $(MAKE) - cp $(ZLIB_DIR)/libz.* $(AGC_LIBS_DIR) - -libdeflate: - cd $(LIBDEFLATE_DIR) && cmake -B build && cmake --build build - cp $(LIBDEFLATE_DIR)/build/libdeflate.* $(AGC_LIBS_DIR) - -isa-l: - cd $(ISAL_DIR) && $(MAKE) -f Makefile.unx - cp $(ISAL_DIR)/bin/isa-l.a $(AGC_LIBS_DIR) - cp $(ISAL_DIR)/bin/libisal.* $(AGC_LIBS_DIR) - -$(MIMALLOC_OBJ): - $(CC) -DMI_MALLOC_OVERRIDE -O3 -DNDEBUG -fPIC -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden -Wstrict-prototypes -ftls-model=initial-exec -fno-builtin-malloc -std=gnu11 -c -I 3rd_party/mimalloc/include 3rd_party/mimalloc/src/static.c -o $(MIMALLOC_OBJ) - -clean: - -rm $(AGC_EXAMPLES_DIR)/*.o - -rm $(AGC_APP_DIR)/*.o - -rm $(AGC_CORE_DIR)/*.o - -rm $(AGC_CXX_DIR)/*.o - -rm agc - -rm libagc.a - -rm -f $(PY_AGC_API_DIR)/*.o - -rm *.so - -rm $(AGC_LIBS_DIR)/libraduls.* - -rm $(AGC_LIBS_DIR)/libzstd.* - -rm $(AGC_LIBS_DIR)/libz.* - -rm $(AGC_LIBS_DIR)/isa-l.* - -rm $(AGC_LIBS_DIR)/libisal.* - -rm $(AGC_LIBS_DIR)/mimalloc.* - -rm $(AGC_LIBS_DIR)/libdeflate.* - cd $(RADULS_DIR) && $(MAKE) clean - cd $(ZSTD_DIR) && $(MAKE) clean - cd $(ZLIB_DIR) && $(MAKE) -f Makefile.in clean - cd $(ISAL_DIR) && $(MAKE) -f Makefile.unx clean - -cd $(LIBDEFLATE_DIR) && rm -r build + $(OBJ_APP) $(OBJ_CORE) $(OBJ_COMMON) \ + $(LIBRARY_FILES) $(LINKER_FLAGS) $(LINKER_DIRS) + +libagc: $(OUT_BIN_DIR)/libagc +$(OUT_BIN_DIR)/libagc: \ + $(OBJ_LIB_CXX) $(OBJ_COMMON) + -mkdir -p $(OUT_BIN_DIR) + $(AR) $(AR_OPT) $@.a \ + $(OBJ_LIB_CXX) $(OBJ_COMMON) + + +#.PHONY:py_agc_api +py_agc_api: $(OUT_BIN_DIR)/py_agc_api +$(OUT_BIN_DIR)/py_agc_api: \ + $(OBJ_PY_AGC_API) $(OBJ_LIB_CXX) $(OBJ_COMMON) + -mkdir -p $(OUT_BIN_DIR) + $(CXX) $(PY_FLAGS) $(INCLUDE_DIRS) \ + $(OBJ_PY_AGC_API) $(OBJ_LIB_CXX) $(OBJ_COMMON) \ + $(LIBRARY_FILES) $(LINKER_FLAGS) $(LINKER_DIRS) \ + 
-o $@$(PY_EXTENSION_SUFFIX) + + +# *** Cleaning +.PHONY: clean init +clean: clean-libzstd clean-zlib-ng clean-isa-l clean-libdeflate clean-mimalloc_obj + -rm -r $(OBJ_DIR) + -rm -r $(OUT_BIN_DIR) + +init: + $(call INIT_SUBMODULES) diff --git a/makefile.noavx b/makefile.noavx index babae68..516a2de 100644 --- a/makefile.noavx +++ b/makefile.noavx @@ -112,7 +112,6 @@ libagc: $(AGC_CXX_DIR)/lib-cxx.o \ $(AGC_CORE_DIR)/collection_v1.o \ $(AGC_CORE_DIR)/collection_v2.o \ $(AGC_CORE_DIR)/collection_v3.o \ - $(AGC_CORE_DIR)/genome_io.o \ $(AGC_CORE_DIR)/lz_diff.o \ $(AGC_CORE_DIR)/segment.o \ $(AGC_CORE_DIR)/utils.o diff --git a/makefile.old b/makefile.old new file mode 100644 index 0000000..926b844 --- /dev/null +++ b/makefile.old @@ -0,0 +1,253 @@ +AGC_ROOT_DIR = . +AGC_MAIN_DIR = src +AGC_EXAMPLES_DIR = src/examples +AGC_CORE_DIR = src/core +AGC_APP_DIR = src/app +AGC_CXX_DIR = src/lib-cxx +AGC_LIBS_DIR = libs +LIBS_DIR = . #/usr/local/lib +INC_DIRS =. /usr/local/include 3rd_party/mimalloc/include 3rd_party/zstd/lib 3rd_party/zlib-ng/ 3rd_party/raduls-inplace/Raduls 3rd_party/isa-l/include 3rd_party/libdeflate +INCLUDE_DIR=$(foreach d, $(INC_DIRS), -I$d) +PY_AGC_API_DIR = py_agc_api +PYBIND11_LIB = $(PY_AGC_API_DIR)/pybind11-2.11.1/pybind11/include + +ifdef MSVC # Avoid the MingW/Cygwin sections + uname_S := Windows + uname_M := "x86_64" +else # If uname not available => 'not' + uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') + uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') +endif + +NASM_V := $(shell nasm --version 2>/dev/null) + +RADULS_DIR = 3rd_party/raduls-inplace/Raduls +ZSTD_DIR = 3rd_party/zstd +ZLIB_DIR = 3rd_party/zlib-ng +ISAL_DIR = 3rd_party/isa-l +LIBDEFLATE_DIR = 3rd_party/libdeflate + +MIMALLOC_INLUCDE_DIR = 3rd_party/mimalloc/include +MIMALLOC_OBJ=libs/mimalloc.o + + +ifeq ($(PLATFORM), arm8) +$(info *** ARMv8 with NEON extensions ***) + ARCH_FLAGS := -march=armv8-a -DARCH_ARM +else ifeq ($(PLATFORM), m1) +$(info *** Apple 
M1(or never) with NEON extensions ***) + ARCH_FLAGS := -march=armv8.4-a -DARCH_ARM +else ifeq ($(PLATFORM), sse2) +$(info *** x86-64 with SSE2 extensions ***) + ARCH_FLAGS := -msse2 -m64 -DARCH_X64 +else ifeq ($(PLATFORM), avx) +$(info *** x86-64 with AVX extensions ***) + ARCH_FLAGS := -mavx -m64 -DARCH_X64 +else ifeq ($(PLATFORM), avx2) +$(info *** x86-64 with AVX2 extensions ***) + ARCH_FLAGS := -mavx2 -m64 -DARCH_X64 +else +$(info *** Unspecified platform - use native compilation) + ifeq ($(uname_M),x86_64) + ARCH_FLAGS := -march=native -DARCH_X64 + else + ARCH_FLAGS := -march=native -DARCH_ARM + endif +endif + + +#CXX = g++ #(by default) + +AR = ar +CFLAGS = -fPIC -Wall -g -O3 $(ARCH_FLAGS) -std=c++20 -pthread -I $(INCLUDE_DIR) -fpermissive +#CLINK = -lm -lz -lpthread -std=c++17 +#CLINK = -lm -lpthread -std=c++17 -lc +PY_CFLAGS = -Wl,-undefined,dynamic_lookup -fPIC -Wall -shared -std=c++20 -O3 -I $(INCLUDE_DIR) + +ifeq ($(uname_S),Linux) + CLINK = -lm -static -O -Wl,--whole-archive -lpthread -Wl,--no-whole-archive -std=c++20 -lc -fabi-version=6 + AR_OPT=rcs -o + PY_AGC_API_CFLAGS = -fPIC -Wall -shared -std=c++14 -O3 +endif + +ifeq ($(uname_S),Darwin) + AR_OPT=-rcs + PY_AGC_API_CFLAGS = -Wl,-undefined,dynamic_lookup -fPIC -Wall -shared -std=c++14 -O3 + CLINK = -lm -lpthread -std=c++20 -lc -static-libgcc +endif + +LIB_ZSTD=libzstd.a +LIB_RADULS=libraduls.a +LIB_DEFLATE=libdeflate.a + +ifeq ($(uname_M),x86_64) + ifdef NASM_V + GZ_LIB = isa-l.a + gz_target = isa-l + CFLAGS+=-DREFRESH_USE_IGZIP + else + GZ_LIB = libz.a + gz_target = ng_zlib + CFLAGS+=-DREFRESH_USE_ZLIB + endif +else + GZ_LIB = libz.a + gz_target = ng_zlib + CFLAGS+=-DREFRESH_USE_ZLIB +endif + + +# default install location (binary placed in the /bin folder) +prefix = /usr/local + +# optional install location +exec_prefix = $(prefix) + + +all: agc libagc py_agc_api raduls zstd $(gz_target) libdeflate + +%.o: %.cpp + $(CXX) $(CFLAGS) -c $< -o $@ + +agc: raduls zstd $(gz_target) libdeflate 
$(MIMALLOC_OBJ) \ + $(AGC_APP_DIR)/main.o \ + $(AGC_CORE_DIR)/agc_basic.o \ + $(AGC_CORE_DIR)/agc_compressor.o \ + $(AGC_CORE_DIR)/agc_decompressor.o \ + $(AGC_CORE_DIR)/agc_decompressor_lib.o \ + $(AGC_APP_DIR)/application.o \ + $(AGC_CORE_DIR)/archive.o \ + $(AGC_CORE_DIR)/collection.o \ + $(AGC_CORE_DIR)/collection_v1.o \ + $(AGC_CORE_DIR)/collection_v2.o \ + $(AGC_CORE_DIR)/collection_v3.o \ + $(AGC_CORE_DIR)/genome_io.o \ + $(AGC_CORE_DIR)/lz_diff.o \ + $(AGC_CORE_DIR)/segment.o \ + $(AGC_CORE_DIR)/utils.o + $(CXX) -o $(AGC_ROOT_DIR)/$@ \ + $(MIMALLOC_OBJ) \ + $(AGC_APP_DIR)/main.o \ + $(AGC_CORE_DIR)/agc_basic.o \ + $(AGC_CORE_DIR)/agc_compressor.o \ + $(AGC_CORE_DIR)/agc_decompressor.o \ + $(AGC_CORE_DIR)/agc_decompressor_lib.o \ + $(AGC_APP_DIR)/application.o \ + $(AGC_CORE_DIR)/archive.o \ + $(AGC_CORE_DIR)/collection.o \ + $(AGC_CORE_DIR)/collection_v1.o \ + $(AGC_CORE_DIR)/collection_v2.o \ + $(AGC_CORE_DIR)/collection_v3.o \ + $(AGC_CORE_DIR)/genome_io.o \ + $(AGC_CORE_DIR)/lz_diff.o \ + $(AGC_CORE_DIR)/segment.o \ + $(AGC_CORE_DIR)/utils.o \ + $(AGC_LIBS_DIR)/$(LIB_ZSTD) \ + $(AGC_LIBS_DIR)/$(GZ_LIB) \ + $(AGC_LIBS_DIR)/$(LIB_RADULS) \ + $(AGC_LIBS_DIR)/$(LIB_DEFLATE) \ + $(CLINK) + +libagc: zstd \ + $(AGC_CXX_DIR)/lib-cxx.o \ + $(AGC_CORE_DIR)/agc_basic.o \ + $(AGC_CORE_DIR)/agc_decompressor_lib.o \ + $(AGC_CORE_DIR)/archive.o \ + $(AGC_CORE_DIR)/collection.o \ + $(AGC_CORE_DIR)/collection_v1.o \ + $(AGC_CORE_DIR)/collection_v2.o \ + $(AGC_CORE_DIR)/collection_v3.o \ + $(AGC_CORE_DIR)/genome_io.o \ + $(AGC_CORE_DIR)/lz_diff.o \ + $(AGC_CORE_DIR)/segment.o \ + $(AGC_CORE_DIR)/utils.o + $(AR) $(AR_OPT) $(AGC_ROOT_DIR)/$@.a \ + $(AGC_CXX_DIR)/lib-cxx.o \ + $(AGC_CORE_DIR)/agc_basic.o \ + $(AGC_CORE_DIR)/agc_decompressor_lib.o \ + $(AGC_CORE_DIR)/archive.o \ + $(AGC_CORE_DIR)/collection.o \ + $(AGC_CORE_DIR)/collection_v1.o \ + $(AGC_CORE_DIR)/collection_v2.o \ + $(AGC_CORE_DIR)/collection_v3.o \ + $(AGC_CORE_DIR)/lz_diff.o \ + $(AGC_CORE_DIR)/segment.o \ 
+ $(AGC_CORE_DIR)/utils.o + +.PHONY:py_agc_api +py_agc_api: zstd \ + $(PY_AGC_API_DIR)/py_agc_api.cpp $(AGC_CXX_DIR)/lib-cxx.o \ + $(AGC_CORE_DIR)/agc_basic.o \ + $(AGC_CORE_DIR)/agc_decompressor_lib.o \ + $(AGC_CORE_DIR)/archive.o \ + $(AGC_CORE_DIR)/collection.o \ + $(AGC_CORE_DIR)/collection_v1.o \ + $(AGC_CORE_DIR)/collection_v2.o \ + $(AGC_CORE_DIR)/collection_v3.o \ + $(AGC_CORE_DIR)/lz_diff.o \ + $(AGC_CORE_DIR)/segment.o \ + $(AGC_CORE_DIR)/utils.o + $(CXX) $(PY_CFLAGS) \ + $(PY_AGC_API_DIR)/py_agc_api.cpp \ + $(AGC_CXX_DIR)/lib-cxx.o \ + $(AGC_CORE_DIR)/agc_basic.o \ + $(AGC_CORE_DIR)/agc_decompressor_lib.o \ + $(AGC_CORE_DIR)/archive.o \ + $(AGC_CORE_DIR)/collection.o \ + $(AGC_CORE_DIR)/collection_v1.o \ + $(AGC_CORE_DIR)/collection_v2.o \ + $(AGC_CORE_DIR)/collection_v3.o \ + $(AGC_CORE_DIR)/lz_diff.o \ + $(AGC_CORE_DIR)/segment.o \ + $(AGC_CORE_DIR)/utils.o \ + $(AGC_LIBS_DIR)/$(LIB_ZSTD) \ + -I $(AGC_MAIN_DIR) \ + -I $(PYBIND11_LIB) \ + -I `python3 -c "import sysconfig;print(sysconfig.get_paths()['include'])"` \ + -o $@`python3-config --extension-suffix` + +raduls: + cd $(RADULS_DIR) && $(MAKE) + cp $(RADULS_DIR)/libraduls.a $(AGC_LIBS_DIR) + +zstd: + cd $(ZSTD_DIR) && $(MAKE) lib + cp $(ZSTD_DIR)/lib/libzstd.* $(AGC_LIBS_DIR) + +ng_zlib: + cd $(ZLIB_DIR) && ./configure --zlib-compat && $(MAKE) + cp $(ZLIB_DIR)/libz.* $(AGC_LIBS_DIR) + +libdeflate: + cd $(LIBDEFLATE_DIR) && cmake -B build && cmake --build build + cp $(LIBDEFLATE_DIR)/build/libdeflate.* $(AGC_LIBS_DIR) + +isa-l: + cd $(ISAL_DIR) && $(MAKE) -f Makefile.unx + cp $(ISAL_DIR)/bin/isa-l.a $(AGC_LIBS_DIR) + cp $(ISAL_DIR)/bin/libisal.* $(AGC_LIBS_DIR) + +$(MIMALLOC_OBJ): + $(CC) -DMI_MALLOC_OVERRIDE -O3 -DNDEBUG -fPIC -Wall -Wextra -Wno-unknown-pragmas -fvisibility=hidden -Wstrict-prototypes -ftls-model=initial-exec -fno-builtin-malloc -std=gnu11 -c -I 3rd_party/mimalloc/include 3rd_party/mimalloc/src/static.c -o $(MIMALLOC_OBJ) + +clean: + -rm $(AGC_EXAMPLES_DIR)/*.o + -rm 
$(AGC_APP_DIR)/*.o + -rm $(AGC_CORE_DIR)/*.o + -rm $(AGC_CXX_DIR)/*.o + -rm agc + -rm libagc.a + -rm -f $(PY_AGC_API_DIR)/*.o + -rm *.so + -rm $(AGC_LIBS_DIR)/libraduls.* + -rm $(AGC_LIBS_DIR)/libzstd.* + -rm $(AGC_LIBS_DIR)/libz.* + -rm $(AGC_LIBS_DIR)/isa-l.* + -rm $(AGC_LIBS_DIR)/libisal.* + -rm $(AGC_LIBS_DIR)/mimalloc.* + -rm $(AGC_LIBS_DIR)/libdeflate.* + cd $(RADULS_DIR) && $(MAKE) clean + cd $(ZSTD_DIR) && $(MAKE) clean + cd $(ZLIB_DIR) && $(MAKE) -f Makefile.in clean + cd $(ISAL_DIR) && $(MAKE) -f Makefile.unx clean + -cd $(LIBDEFLATE_DIR) && rm -r build diff --git a/makefile.release b/makefile.release index e601227..a9a3dad 100644 --- a/makefile.release +++ b/makefile.release @@ -112,7 +112,6 @@ libagc: $(AGC_CXX_DIR)/lib-cxx.o \ $(AGC_CORE_DIR)/collection_v1.o \ $(AGC_CORE_DIR)/collection_v2.o \ $(AGC_CORE_DIR)/collection_v3.o \ - $(AGC_CORE_DIR)/genome_io.o \ $(AGC_CORE_DIR)/lz_diff.o \ $(AGC_CORE_DIR)/segment.o \ $(AGC_CORE_DIR)/utils.o diff --git a/prebuild/prebuild.vcxproj b/prebuild/prebuild.vcxproj new file mode 100644 index 0000000..a6d2e44 --- /dev/null +++ b/prebuild/prebuild.vcxproj @@ -0,0 +1,94 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 17.0 + {939F5626-13ED-4481-B6C7-320F186C2453} + Win32Proj + + + + Makefile + true + v143 + + + Makefile + false + v143 + + + Makefile + true + v143 + + + Makefile + false + v143 + + + + + + + + + + + + + + + + + + + + + call "$(SolutionDir)3rd_party\prebuild.bat" "$(SolutionDir)" $(Configuration) + prebuild.exe + _DEBUG;$(NMakePreprocessorDefinitions) + call "$(SolutionDir)3rd_party\prebuild.bat" "$(SolutionDir)" $(Configuration) + + + prebuild.bat 3rd_party %{Solution + prebuild.exe + WIN32;_DEBUG;$(NMakePreprocessorDefinitions) + + + prebuild.bat 3rd_party %{Solution + prebuild.exe + WIN32;NDEBUG;$(NMakePreprocessorDefinitions) + + + call "$(SolutionDir)3rd_party\prebuild.bat" "$(SolutionDir)" $(Configuration) + prebuild.exe + 
NDEBUG;$(NMakePreprocessorDefinitions) + call "$(SolutionDir)3rd_party\prebuild.bat" "$(SolutionDir)" $(Configuration) + + + + + + + + + \ No newline at end of file diff --git a/prebuild/prebuild.vcxproj.filters b/prebuild/prebuild.vcxproj.filters new file mode 100644 index 0000000..a8a6563 --- /dev/null +++ b/prebuild/prebuild.vcxproj.filters @@ -0,0 +1,17 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + \ No newline at end of file diff --git a/prebuild/prebuild.vcxproj.user b/prebuild/prebuild.vcxproj.user new file mode 100644 index 0000000..88a5509 --- /dev/null +++ b/prebuild/prebuild.vcxproj.user @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/refresh.mk b/refresh.mk new file mode 100644 index 0000000..53c4f67 --- /dev/null +++ b/refresh.mk @@ -0,0 +1,714 @@ +### REFRESH group macros - v.1.0.7 (2024-11-14) + +### Macros for initialization +define INIT_GLOBALS + $(info *** Initialization of global values ***) + $(eval INCLUDE_DIRS:=-I.) 
+ $(eval REFRESH_DIR:=) + $(eval LIBRARY_FILES:=) + $(eval LINKER_DIRS:=) + $(eval C_FLAGS:=) + $(eval CPP_FLAGS:=) + $(eval PY_FLAGS:=) + $(eval DEFINE_FLAGS:=) + $(eval LINKER_FLAGS:=) + $(eval CMAKE_OSX_FIX:=) + $(eval COMPILER_ALLOWED:=) + $(eval TYPE?=release) + $(eval PREBUILD_JOBS:=) + $(eval SRC_DIR:=./src) + $(eval OBJ_DIR:=./obj) + $(eval OUT_BIN_DIR:=./bin) + $(eval AR?=ar) +endef + +### Macros for 3rd-party libraries registration +# Add zlib-ng +define ADD_ZLIB_NG + $(info *** Adding zlib-ng ***) + $(eval ZLIB_DIR:=$(1)) + $(eval ZLIB_A_DIR:=$(1)/build-g++/zlib-ng) + $(eval ZLIB_A:=$(ZLIB_A_DIR)/libz.a) + $(eval INCLUDE_DIRS+=-I$(ZLIB_DIR)/build-g++) + $(eval LIBRARY_FILES+=$(ZLIB_A)) + $(eval LINKER_DIRS+=-L $(ZLIB_A_DIR)) + $(eval PREBUILD_JOBS+=zlib-ng) + + $(eval zlib-ng: $(ZLIB_A)) + $(eval $(ZLIB_A) : ; \ + cd $(ZLIB_DIR) && cmake $(CMAKE_OSX_FIX) -DCMAKE_CXX_COMPILER=$(CXX) -DCMAKE_C_COMPILER=$(CC) -B build-g++/zlib-ng -S . -DZLIB_COMPAT=ON; cmake --build build-g++/zlib-ng --config Release) +endef + +# Propose zlib-ng (to be considered by CHOOSE_...) +define PROPOSE_ZLIB_NG + $(info *** Proposing zlib-ng ***) + $(eval ZLIB_DIR:=$(1)) + $(eval ZLIB_A_DIR:=$(1)/build-g++/zlib-ng) + $(eval ZLIB_A:=$(ZLIB_A_DIR)/libz.a) + + $(eval zlib-ng: $(ZLIB_A)) + $(eval $(ZLIB_A) : ; \ + cd $(ZLIB_DIR) && cmake $(CMAKE_OSX_FIX) -DCMAKE_CXX_COMPILER=$(CXX) -DCMAKE_C_COMPILER=$(CC) -B build-g++/zlib-ng -S . -DZLIB_COMPAT=ON; cmake --build build-g++/zlib-ng --config Release) +endef + +# Propose isa-l (to be considered by CHOOSE_...) 
+define PROPOSE_ISAL
+	$(info *** Proposing isal ***)
+	$(eval ISAL_DIR:=$(1))
+	$(eval ISAL_A_DIR:=$(1)/bin)
+	$(eval ISAL_A:=$(1)/bin/isa-l.a)
+
+	$(eval isa-l: $(ISAL_A))
+	$(eval $(ISAL_A) : ; \
+		cd $(ISAL_DIR) && $(MAKE) -f Makefile.unx)
+endef
+
+# Add libdeflate (arg 1 = its directory): registers the include dir, build/libdeflate.a and a cmake-based prebuild rule; requires cmake
+define ADD_LIBDEFLATE
+	$(info *** Adding libdeflate ***)
+	$(eval INCLUDE_DIRS+=-I$(1))
+	$(eval LIBDEFLATE_DIR:=$(1))
+	$(eval LIBDEFLATE_A_DIR:=$(1))
+	$(eval LIBDEFLATE_A:=$(1)/build/libdeflate.a)
+	$(eval LIBRARY_FILES+=$(LIBDEFLATE_A))
+	$(eval LINKER_DIRS+=-L $(LIBDEFLATE_A_DIR))
+	$(call TEST_SOFT,cmake)
+	$(eval PREBUILD_JOBS+=libdeflate)
+
+	$(eval libdeflate: $(LIBDEFLATE_A))
+	$(eval $(LIBDEFLATE_A): ; \
+		cd $(LIBDEFLATE_DIR) && cmake $(CMAKE_OSX_FIX) -DCMAKE_CXX_COMPILER=$(CXX) -DCMAKE_C_COMPILER=$(CC) -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -B build && cmake --build build)
+endef
+
+# Add zstd (!!! CHECK) (arg 1 = its directory): registers the include dir, lib/libzstd.a and a prebuild rule running $(MAKE) there
+define ADD_LIBZSTD
+	$(info *** Adding libzstd ***)
+	$(eval INCLUDE_DIRS+=-I$(1))
+	$(eval LIBZSTD_DIR:=$(1))
+	$(eval LIBZSTD_A_DIR:=$(1))
+	$(eval LIBZSTD_A:=$(1)/lib/libzstd.a)
+	$(eval LIBRARY_FILES+=$(LIBZSTD_A))
+	$(eval LINKER_DIRS+=-L $(LIBZSTD_A_DIR))
+	$(eval PREBUILD_JOBS+=libzstd)
+
+	$(eval libzstd: $(LIBZSTD_A))
+	$(eval $(LIBZSTD_A): ; \
+		cd $(LIBZSTD_DIR) && $(MAKE))
+endef
+
+# Add mimalloc (arg 1 = its directory): adds the include dir and a prebuild rule compiling src/static.c into mimalloc.o
+define ADD_MIMALLOC
+	$(info *** Adding mimalloc ***)
+	$(eval MIMALLOC_INCLUDE_DIR:=$(1)/include)
+	$(eval INCLUDE_DIRS+=-I$(1)/include)
+	$(eval MIMALLOC_DIR:=$(1))
+	$(eval MIMALLOC_OBJ:=$(1)/mimalloc.o)
+	$(eval PREBUILD_JOBS+=mimalloc_obj)
+
+	$(eval mimalloc_obj: $(MIMALLOC_OBJ))
+	$(eval $(MIMALLOC_OBJ): ; \
+		$(CXX) -DMI_MALLOC_OVERRIDE -O3 -DNDEBUG -fPIC -Wall -Wextra -Wno-unknown-pragmas \
+		-fvisibility=hidden -ftls-model=initial-exec -fno-builtin-malloc -c -I $(MIMALLOC_INCLUDE_DIR) \
+		$(MIMALLOC_DIR)/src/static.c -o $(MIMALLOC_OBJ))
+endef
+
+# Add cdflib (arg 1 = its directory): adds the include dir and a prebuild rule compiling cdflib.cpp into cdflib.cpp.o
+define ADD_CDFLIB
+	$(info *** Adding cdflib ***)
+	$(eval CDFLIB_INCLUDE_DIR:=$(1))
+	$(eval INCLUDE_DIRS+=-I$(1))
+	$(eval CDFLIB_DIR:=$(1))
+	$(eval CDFLIB_OBJ:=$(1)/cdflib.cpp.o)
+	$(eval PREBUILD_JOBS+=cdflib_obj)
+
+	$(eval cdflib_obj: $(CDFLIB_OBJ))
+	$(eval $(CDFLIB_OBJ): ; \
+		cd $(CDFLIB_DIR) && $(CXX) $(CPP_FLAGS) $(OPTIMIZATION_FLAGS) $(DEFINE_FLAGS) $(INCLUDE_DIRS) -c cdflib.cpp -o cdflib.cpp.o)
+endef
+
+# Add REFRESH - parallel queues monitor (arg 1 = libs root): prebuild rule compiling parallel-queues-monitor.cpp in place
+define ADD_REFRESH_PARALLEL_QUEUES_MONITOR
+	$(info *** Adding refresh - parallel queues monitor ***)
+	$(eval REFRESH_PARALLEL_QUEUES_MONITOR_DIR:=$(1)/refresh/parallel_queues/lib/)
+	$(eval REFRESH_PARALLEL_QUEUES_MONITOR_OBJ:=$(1)/refresh/parallel_queues/lib/parallel-queues-monitor.cpp.o)
+	$(eval PREBUILD_JOBS+=refresh_parallel_queues_monitor_obj)
+	$(eval refresh_parallel_queues_monitor_obj: $(REFRESH_PARALLEL_QUEUES_MONITOR_OBJ))
+	$(eval $(REFRESH_PARALLEL_QUEUES_MONITOR_OBJ): ; \
+		cd $(REFRESH_PARALLEL_QUEUES_MONITOR_DIR) && $(CXX) $(CPP_FLAGS) $(OPTIMIZATION_FLAGS) $(DEFINE_FLAGS) $(INCLUDE_DIRS) -c parallel-queues-monitor.cpp -o parallel-queues-monitor.cpp.o)
+endef
+
+# Add RADULS-inplace (arg 1 = its directory): registers Raduls/libraduls.a and a prebuild rule running $(MAKE) in Raduls
+define ADD_RADULS_INPLACE
+	$(info *** Adding raduls-inplace ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/Raduls)
+	$(eval RADULS_INPLACE_DIR:=$(1)/Raduls)
+	$(eval RADULS_INPLACE_A_DIR:=$(1)/Raduls)
+	$(eval RADULS_INPLACE_A:=$(1)/Raduls/libraduls.a)
+	$(eval LIBRARY_FILES+=$(RADULS_INPLACE_A))
+	$(eval LINKER_DIRS+=-L $(RADULS_INPLACE_A_DIR))
+	$(eval PREBUILD_JOBS+=raduls-inplace)
+
+	$(eval raduls-inplace: $(RADULS_INPLACE_A))
+	$(eval $(RADULS_INPLACE_A) : ; \
+		cd $(RADULS_INPLACE_DIR) && $(MAKE))
+endef
+
+# Add igraph (arg 1 = its directory): registers build/src/libigraph.a and a cmake prebuild rule; requires cmake, bison and flex
+define ADD_IGRAPH
+	$(info *** Adding igraph ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include -I$(1)/build/include)
+	$(eval IGRAPH_DIR:=$(1))
+	$(eval IGRAPH_A_DIR:=$(1)/build/src)
+	$(eval IGRAPH_A:=$(IGRAPH_A_DIR)/libigraph.a)
+	$(eval LIBRARY_FILES+=$(IGRAPH_A))
+	$(eval LINKER_DIRS+=-L $(IGRAPH_A_DIR))
+	$(eval IGRAPH_TARGET:=igraph)
+	$(call TEST_SOFT,cmake)
+	$(call TEST_SOFT,bison)
+	$(call TEST_SOFT,flex)
+	$(eval PREBUILD_JOBS+=igraph)
+
+	$(eval igraph: $(IGRAPH_A))
+	$(eval $(IGRAPH_A): ; \
+		$(if $(filter Darwin,$(OS_TYPE)), \
+			$(eval IEEE754_DOUBLE_ENDIANNESS_MATCHES_FIX:=-DIEEE754_DOUBLE_ENDIANNESS_MATCHES=TRUE), \
+			$(eval IEEE754_DOUBLE_ENDIANNESS_MATCHES_FIX:=) \
+		) \
+		mkdir -p $(IGRAPH_DIR)/build && cmake $(CMAKE_OSX_FIX) $(IEEE754_DOUBLE_ENDIANNESS_MATCHES_FIX) -DCMAKE_CXX_COMPILER=$(CXX) -DCMAKE_C_COMPILER=$(CC) -S $(IGRAPH_DIR) -B $(IGRAPH_DIR)/build && cmake --build $(IGRAPH_DIR)/build )
+endef
+
+# Add SBWT (arg 1 = its directory): registers libsbwt_static.a plus the sdsl and KMC archives produced by the same cmake build
+define ADD_SBWT
+	$(info *** Adding SBWT ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include -I$(1)/sdsl-lite/include -I$(1)/SeqIO/include -I$(1)/build/external/sdsl-lite/build/external/libdivsufsort/include/)
+	$(eval SBWT_DIR:=$(1))
+	$(eval SBWT_A_DIR:=$(1)/build)
+	$(eval SBWT_A:=$(SBWT_A_DIR)/libsbwt_static.a)
+	$(eval SBWT_SDSL_A:=$(SBWT_A_DIR)/external/sdsl-lite/build/lib/libsdsl.a)
+	$(eval SBWT_KMC_CORE_A:=$(SBWT_A_DIR)/external/KMC/build/libkmc_core.a)
+	$(eval SBWT_KMC_TOOLS_A:=$(SBWT_A_DIR)/external/KMC/build/libkmc_tools.a)
+	$(eval LIBRARY_FILES+=$(SBWT_A) $(SBWT_SDSL_A) $(SBWT_KMC_CORE_A) $(SBWT_KMC_TOOLS_A))
+	$(eval LINKER_DIRS+=-L $(SBWT_A_DIR))
+	$(eval PREBUILD_JOBS+=sbwt)
+
+	$(eval sbwt: $(SBWT_A) $(SBWT_SDSL_A) $(SBWT_KMC_CORE_A) $(SBWT_KMC_TOOLS_A))
+	$(eval $(SBWT_A): ; \
+		mkdir -p $(SBWT_DIR)/build && cd $(SBWT_DIR)/build && cmake $(CMAKE_OSX_FIX) -DCMAKE_CXX_COMPILER=$(CXX) -DCMAKE_C_COMPILER=$(CC) .. -DMAX_KMER_LENGTH=32 && $(MAKE) -j)
+	$(eval $(SBWT_SDSL_A) : $(SBWT_A))
+	$(eval $(SBWT_KMC_CORE_A) : $(SBWT_A))
+	$(eval $(SBWT_KMC_TOOLS_A) : $(SBWT_A))
+endef
+
+# Add Pybind11 (arg 1 = its include directory): also adds the Python include dir and stores the extension-module suffix
+define ADD_PYBIND11
+	$(eval PYBIND11_DIR:=$(1))
+	$(eval INCLUDE_DIRS+=-I$(PYBIND11_DIR))
+	$(eval INCLUDE_DIRS+=-I $(shell python3 -c "import sysconfig;print(sysconfig.get_paths()['include'])"))
+	$(eval PY_EXTENSION_SUFFIX:=$(shell python3-config --extension-suffix))
+endef
+
+# Add REFRESH libs (header-only as far as this macro is concerned: only an include flag is stored in REFRESH_DIR)
+define ADD_REFRESH_LIB
+	$(info *** Adding REFRESH libs ***)
+	$(eval REFRESH_DIR:=-I$(1))
+endef
+
+# Add StatsLib (include path only)
+define ADD_STATS_LIB
+	$(info *** Adding StatsLib ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include)
+endef
+
+# Add Annoy (include path only)
+define ADD_ANNOY_LIB
+	$(info *** Adding Annoy lib ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include)
+endef
+
+# Add hnswlib (include path only)
+define ADD_HNSWLIB
+	$(info *** Adding hnswlib ***)
+	$(eval INCLUDE_DIRS+=-I$(1))
+endef
+
+# Add umappp lib (include path only)
+define ADD_UMAPPP_LIB
+	$(info *** Adding UMAPPP lib ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include)
+endef
+
+# Add CppIrlba lib (include path only)
+define ADD_CPPIRLBA_LIB
+	$(info *** Adding CppIrlba lib ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include)
+endef
+
+# Add CppKmeans lib (include path only)
+define ADD_CPPKMEANS_LIB
+	$(info *** Adding CppKmeans lib ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include)
+endef
+
+# Add aarand lib (include path only)
+define ADD_AARAND_LIB
+	$(info *** Adding aarand lib ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include)
+endef
+
+# Add knncolle lib (include path only)
+define ADD_KNNCOLLE_LIB
+	$(info *** Adding knncolle lib ***)
+	$(eval INCLUDE_DIRS+=-I$(1)/include)
+endef
+
+# Add Eigen lib (include path only)
+define ADD_EIGEN_LIB
+	$(info *** Adding Eigen lib ***)
+	$(eval INCLUDE_DIRS+=-I$(1))
+endef
+
+### Macros configuring compiler/linker flags
+# Add os-specific flags for static linking (only when arg 1 is "true"; the result is stored in STATIC_LFLAGS)
+define SET_STATIC
+	$(if $(filter true,$(1)), \
+		$(if $(filter Darwin,$(OS_TYPE)), \
+			$(eval STATIC_LFLAGS:=-static-libgcc -static-libstdc++ -pthread), \
+			$(if $(filter x86_64,$(ARCH_TYPE)), \
+				$(eval STATIC_LFLAGS:=-static -Wl,--whole-archive -lpthread -Wl,--no-whole-archive), \
+				$(eval STATIC_LFLAGS:=-static-libgcc -static-libstdc++ -lpthread) \
+			)
+		)
+	)
+endef
+
+# Add C, C++ standards (arg 1 -> C_STD, arg 2 -> CPP_STD; both are later passed to -std=)
+define SET_C_CPP_STANDARDS
+	$(eval C_STD:=$(1))
+	$(eval CPP_STD:=$(2))
+endef
+
+# Define allowed compiler version and type (args: compiler name, OS_ARCH type, minimal major version, maximal major version)
+define SET_COMPILER_VERSION_ALLOWED
+	$(eval COMPILER_VERSION_$(strip $(1))_$(strip $(2))_MIN:=$(strip $(3)))
+	$(eval COMPILER_VERSION_$(strip $(1))_$(strip $(2))_MAX:=$(strip $(4)))
+	$(eval COMPILER_ALLOWED+=COMPILER_VERSION_$(strip $(1))_$(strip $(2)))
+endef
+
+# Set source, object and binary directories (args 1-3, in that order)
+define SET_SRC_OBJ_BIN
+	$(eval SRC_DIR:=$(1))
+	$(eval OBJ_DIR:=$(2))
+	$(eval OUT_BIN_DIR:=$(3))
+endef
+
+# *** Utility functions (LESS_THAN / GREATER_THAN yield 1 or 0 for the integer arguments 1 and 2)
+define LESS_THAN
+	$(if $(filter 0,$(shell [ $(1) -lt $(2) ]; echo $$?)),1,0)
+endef
+
+define GREATER_THAN
+	$(if $(filter 0,$(shell [ $(1) -gt $(2) ]; echo $$?)),1,0)
+endef
+# IN_RANGE: 1 when arg 1 lies within [arg 2, arg 3], else 0; an empty argument falls back to the legacy COMP / MIN / MAX variable
+define IN_RANGE
+	$(shell if [ $(or $(1),$(COMP)) -ge $(or $(2),$(MIN)) ] && [ $(or $(1),$(COMP)) -le $(or $(3),$(MAX)) ]; then echo 1; else echo 0; fi)
+endef
+# TEST_SOFT: stops make with an error unless the program named by arg 1 is found by the shell
+define TEST_SOFT
+	$(if $(shell command -v $(1) >/dev/null 2>&1 && echo found),, \
+		$(error The required software '$(1)' is not installed or not in PATH))
+endef
+
+# Check Git commit id and append -DGIT_COMMIT=<id> to DEFINE_FLAGS (appended, so flags registered earlier are kept)
+define SET_GIT_COMMIT
+	$(eval GIT_COMMIT:=$(shell git describe --always --dirty))
+	$(eval DEFINE_FLAGS+=-DGIT_COMMIT=$(GIT_COMMIT))
+endef
+
+# Prepare file variables: SRC_<arg 1>_DIR, OBJ_<arg 1>_DIR, SRC_<arg 1> (all *.cpp there) and OBJ_<arg 1>; arg 2 = subdirectory name
+define LOAD_FILES
+$(eval SRC_$(1)_DIR := $(SRC_DIR)/$(2))
+$(eval OBJ_$(1)_DIR := $(OBJ_DIR)/$(2))
+$(eval SRC_$(1) := $(wildcard $(SRC_$(1)_DIR)/*.cpp))
+$(eval OBJ_$(1) := $(patsubst $(SRC_$(1)_DIR)/%.cpp, $(OBJ_$(1)_DIR)/%.cpp.o, $(SRC_$(1))))
+endef
+
+# Dynamic creation of build rules (pattern rule for the directory variables prepared by LOAD_FILES; meant to be passed through eval)
+define DEFAULT_COMPILE_RULE =
+$(OBJ_$(1)_DIR)/%.cpp.o: $(SRC_$(1)_DIR)/%.cpp | prebuild
+	@mkdir -p $(OBJ_$(1)_DIR)
+	$(CXX) $(CPP_FLAGS) $(OPTIMIZATION_FLAGS) $(DEFINE_FLAGS) $(INCLUDE_DIRS) -MMD -MF $$@.d -c $$< -o $$@
+endef
+
+# 
Dynamic creation of build rules for files in directory (arg 1 = variable-name tag, arg 2 = subdirectory of SRC_DIR, arg 3 = optional extra compiler flags)
+define PREPARE_DEFAULT_COMPILE_RULE
+# Source files: directory variables, the list of .cpp sources and the matching .cpp.o objects
+$(eval SRC_$(1)_DIR := $(SRC_DIR)/$(2))
+$(eval OBJ_$(1)_DIR := $(OBJ_DIR)/$(2))
+$(eval SRC_$(1) := $(wildcard $(SRC_$(1)_DIR)/*.cpp))
+$(eval OBJ_$(1) := $(patsubst $(SRC_$(1)_DIR)/%.cpp, $(OBJ_$(1)_DIR)/%.cpp.o, $(SRC_$(1))))
+# Compilation rule: prebuild is an order-only prerequisite; -MMD -MF writes a .d dependency file next to each object
+$(OBJ_$(1)_DIR)/%.cpp.o: $(SRC_$(1)_DIR)/%.cpp | prebuild
+	mkdir -p $(OBJ_$(1)_DIR)
+	$(CXX) $(3) $(CPP_FLAGS) $(OPTIMIZATION_FLAGS) $(ARCH_FLAGS) $(DEFINE_FLAGS) $(INCLUDE_DIRS) -MMD -MF $$@.d -c $$< -o $$@
+# Dependency files: the leading dash makes the include silent when they do not exist yet
+-include $(OBJ_$(1):.o=.o.d)
+endef
+
+# Check compiler version: errors out when CXX is not found, when no MIN/MAX is registered for it on this OS_ARCH_TYPE, or when its major version is out of range (MAX is only tested if a MIN is registered)
+define CHECK_COMPILER_VERSION
+	$(info *** Checking compiler version ***)
+	$(eval COMPILER_DESC:=$(shell command -v $(CXX) >/dev/null 2>&1 && basename $(CXX) | sed 's/-.*//' || echo ""))
+
+	$(if $(COMPILER_DESC),,\
+		$(error Compiler does not exist) \
+	)
+
+	$(eval COMPILER_VERSION_FULL:=$(shell $(CXX) --version | sed -n '1s/^[^0-9]*\([0-9\.]*\).*$$/\1/p'))
+	$(eval COMPILER_VERSION_MAJOR:=$(firstword $(subst ., ,$(COMPILER_VERSION_FULL))))
+
+	$(eval COMPILER_DESC:=$(subst g++,GCC,$(COMPILER_DESC)))
+	$(eval COMPILER_DESC:=$(subst clang,CLANG,$(COMPILER_DESC)))
+
+	$(info Compiler: $(COMPILER_DESC))
+	$(info Version: $(COMPILER_VERSION_MAJOR))
+
+	$(if $(or $(COMPILER_VERSION_$(COMPILER_DESC)_$(OS_ARCH_TYPE)_MIN),$(COMPILER_VERSION_$(COMPILER_DESC)_$(OS_ARCH_TYPE)_MAX)),\
+		,\
+		$(error Compiler not supported) \
+	)
+
+	$(if $(COMPILER_VERSION_$(COMPILER_DESC)_$(OS_ARCH_TYPE)_MIN), \
+		$(if $(filter 1,$(call LESS_THAN,$(COMPILER_VERSION_MAJOR),$(COMPILER_VERSION_$(COMPILER_DESC)_$(OS_ARCH_TYPE)_MIN))), \
+			$(error Too low compiler version), \
+			$(if $(COMPILER_VERSION_$(COMPILER_DESC)_$(OS_ARCH_TYPE)_MAX), \
+				$(if $(filter 1,$(call GREATER_THAN,$(COMPILER_VERSION_MAJOR),$(COMPILER_VERSION_$(COMPILER_DESC)_$(OS_ARCH_TYPE)_MAX))), \
+					$(error Too high compiler version) \
+				), \
+			)
+		), \
+	)
+endef
+
+# Add 
type-specific flags (arg 1 = build type: release, debug, ASan, TSan, UBSan, LSan or MSan); the expansion ends with the prebuild target line
+define SET_FLAGS
+	$(if $(filter Linux_x86_64,$(OS_ARCH_TYPE)), \
+		$(eval PLATFORM_SPECIFIC_C_FLAGS:=) \
+		$(eval PLATFORM_SPECIFIC_CPP_FLAGS:=) \
+		$(eval PLATFORM_SPECIFIC_LINKER_FLAGS:=-fabi-version=6), \
+		$(if $(filter Linux_aarch64,$(OS_ARCH_TYPE)), \
+			$(eval PLATFORM_SPECIFIC_C_FLAGS:=) \
+			$(eval PLATFORM_SPECIFIC_CPP_FLAGS:=-ffp-contract=off) \
+			$(eval PLATFORM_SPECIFIC_LINKER_FLAGS:=-fabi-version=6), \
+			$(if $(filter Darwin_arm64,$(OS_ARCH_TYPE)), \
+				$(eval PLATFORM_SPECIFIC_C_FLAGS:=) \
+				$(eval PLATFORM_SPECIFIC_CPP_FLAGS:=) \
+				$(eval PLATFORM_SPECIFIC_LINKER_FLAGS:=), \
+				$(if $(filter Darwin_x86_64,$(OS_ARCH_TYPE)), \
+					$(eval PLATFORM_SPECIFIC_C_FLAGS:=) \
+					$(eval PLATFORM_SPECIFIC_CPP_FLAGS:=) \
+					$(eval PLATFORM_SPECIFIC_LINKER_FLAGS:=) \
+				) \
+			) \
+		) \
+	)
+
+	$(eval C_FLAGS+=-std=$(C_STD) -Wall -fPIC -pthread -fpermissive $(PLATFORM_SPECIFIC_C_FLAGS))
+	$(eval CPP_FLAGS+=-std=$(CPP_STD) -Wall -fPIC -pthread -fpermissive $(PLATFORM_SPECIFIC_CPP_FLAGS))
+	$(eval LINKER_FLAGS+=-lm -lpthread $(PLATFORM_SPECIFIC_LINKER_FLAGS) $(STATIC_LFLAGS))
+	$(eval PY_FLAGS:=-Wl,-undefined,dynamic_lookup -shared)
+
+
+	$(if $(filter release,$(1)), \
+		$(eval OPTIMIZATION_FLAGS+=-O3) \
+		$(eval C_FLAGS+=) \
+		$(eval CPP_FLAGS+= ), \
+		$(if $(filter debug,$(1)), \
+			$(eval OPTIMIZATION_FLAGS+=-O0 -g) \
+			$(eval C_FLAGS+=) \
+			$(eval CPP_FLAGS+= ), \
+			$(if $(filter ASan,$(1)), \
+				$(eval OPTIMIZATION_FLAGS+=-O3 -g) \
+				$(eval C_FLAGS+=-fsanitize=address) \
+				$(eval CPP_FLAGS+=-fsanitize=address) \
+				$(eval LINKER_FLAGS+=-fsanitize=address), \
+				$(if $(filter TSan,$(1)), \
+					$(eval OPTIMIZATION_FLAGS+=-O3 -g) \
+					$(eval C_FLAGS+=-fsanitize=thread) \
+					$(eval CPP_FLAGS+=-fsanitize=thread) \
+					$(eval LINKER_FLAGS+=-fsanitize=thread -static-libgcc -static-libstdc++), \
+					$(if $(filter UBSan,$(1)), \
+						$(eval OPTIMIZATION_FLAGS+=-O3 -g) \
+						$(eval C_FLAGS+=-fsanitize=undefined) \
+						$(eval CPP_FLAGS+=-fsanitize=undefined) \
+						$(eval LINKER_FLAGS+=-fsanitize=undefined), \
+						$(if $(filter LSan,$(1)), \
+							$(eval OPTIMIZATION_FLAGS+=-O3 -g) \
+							$(eval C_FLAGS+=-fsanitize=leak) \
+							$(eval CPP_FLAGS+=-fsanitize=leak) \
+							$(eval LINKER_FLAGS+=-fsanitize=leak), \
+							$(if $(filter MSan,$(1)), \
+								$(eval OPTIMIZATION_FLAGS+=-O3 -g) \
+								$(eval C_FLAGS+=-fsanitize=memory) \
+								$(eval CPP_FLAGS+=-fsanitize=memory) \
+								$(eval LINKER_FLAGS+=-fsanitize=memory), \
+							) \
+						) \
+					) \
+				) \
+			) \
+		) \
+	)
+
+	$(eval CPP_FLAGS_SSE2:=$(CPP_FLAGS) -msse2)
+	$(eval CPP_FLAGS_SSE4:=$(CPP_FLAGS) -msse4)
+	$(eval CPP_FLAGS_AVX:=$(CPP_FLAGS) -mavx)
+	$(eval CPP_FLAGS_AVX2:=$(CPP_FLAGS) -mavx2)
+	$(eval CPP_FLAGS_AVX512:=$(CPP_FLAGS) -mavx512f)
+	$(eval CPP_FLAGS_NEON:=$(CPP_FLAGS))
+
+	$(eval INCLUDE_DIRS+=$(REFRESH_DIR))
+	$(info Prebuild jobs: $(PREBUILD_JOBS))
+prebuild:
+	$(PREBUILD_JOBS)
+endef
+
+
+### Macros checking system and software
+# Check for NASM: stores the output of nasm --version in NASM_VERSION (stderr is discarded)
+define CHECK_NASM
+	$(eval NASM_VERSION:=$(shell nasm --version 2>/dev/null))
+endef
+
+# Choose lib for gzip decompression: isa-l on x86_64 when both NASM_VERSION and ISAL_DIR are set, zlib-ng in every other case
+define CHOOSE_GZIP_DECOMPRESSION
+	$(if $(filter x86_64,$(ARCH_TYPE)), \
+		$(if $(and $(NASM_VERSION),$(ISAL_DIR)), \
+			$(eval GZ_TARGET:=isa-l) \
+			$(eval PREBUILD_JOBS+=isa-l) \
+			$(eval INCLUDE_DIRS+=-I$(ISAL_DIR)/include) ,\
+			$(eval GZ_TARGET:=zlib-ng) \
+			$(eval PREBUILD_JOBS+=zlib-ng) \
+			$(eval INCLUDE_DIRS+=-I$(ZLIB_DIR)/build-g++) \
+		), \
+		$(eval GZ_TARGET:=zlib-ng) \
+		$(eval PREBUILD_JOBS+=zlib-ng) \
+		$(eval INCLUDE_DIRS+=-I$(ZLIB_DIR)/build-g++) \
+	)
+
+	$(if $(filter isa-l,$(GZ_TARGET)), \
+		$(info ISAL will be used for gzip decompression) \
+		$(eval GZ_LIB:=isa-l.a) \
+		$(eval LIBRARY_FILES+=$(ISAL_A)) \
+		$(eval LINKER_DIRS+=-L $(ISAL_A_DIR))
+		$(eval C_FLAGS+=-DREFRESH_USE_IGZIP) \
+		$(eval CPP_FLAGS+=-DREFRESH_USE_IGZIP), \
+		$(info zlib-ng will be used for gzip decompression) \
+		$(eval GZ_LIB:=libz.a) \
+		$(eval LIBRARY_FILES+=$(ZLIB_A)) \
+		$(eval LINKER_DIRS+=-L $(ZLIB_A_DIR))
+		$(eval C_FLAGS+=-DREFRESH_USE_ZLIB) \
+		$(eval CPP_FLAGS+=-DREFRESH_USE_ZLIB) \
+	)
+endef
+
+# Check for OS and architecture: sets OS_TYPE, ARCH_TYPE, OS_ARCH_TYPE, ARCH_FLAGS (arg 1 = platform name) and Darwin-specific cmake / ar options
+define CHECK_OS_ARCH
+	$(if $(MSVC), \
+		$(eval OS_TYPE:=windows) \
+		$(eval ARCH_TYPE:=x86_64), \
+		$(eval OS_TYPE:=$(shell uname -s 2>/dev/null || echo not)) \
+		$(eval ARCH_TYPE:=$(shell uname -m 2>/dev/null || echo not)) \
+	)
+
+	$(eval OS_ARCH_TYPE:=$(OS_TYPE)_$(ARCH_TYPE))
+
+	$(if $(filter arm8,$(1)), \
+		$(eval ARCH_FLAGS:=-march=armv8-a -DARCH_ARM) \
+		$(info *** ARMv8 with NEON extensions ***), \
+		$(if $(filter m1,$(1)), \
+			$(eval ARCH_FLAGS:=-march=armv8.4-a -DARCH_ARM) \
+			$(info *** Apple M1 (or newer) with NEON extensions ***), \
+			$(if $(filter sse2,$(1)), \
+				$(eval ARCH_FLAGS:=-msse2 -m64 -DARCH_X64) \
+				$(info *** x86-64 with SSE2 extensions ***), \
+				$(if $(filter avx,$(1)), \
+					$(eval ARCH_FLAGS:=-mavx -m64 -DARCH_X64) \
+					$(info *** x86-64 with AVX extensions ***), \
+					$(if $(filter avx2,$(1)), \
+						$(eval ARCH_FLAGS:=-mavx2 -m64 -DARCH_X64) \
+						$(info *** x86-64 with AVX2 extensions ***), \
+						$(if $(filter avx512,$(1)), \
+							$(eval ARCH_FLAGS:=-mavx512f -m64 -DARCH_X64) \
+							$(info *** x86-64 with AVX512 extensions ***), \
+							$(if $(filter x86_64,$(ARCH_TYPE)), \
+								$(eval ARCH_FLAGS:=-march=native -DARCH_X64) \
+								$(info *** Unspecified platform - using native compilation for x86_64 ***), \
+								$(eval ARCH_FLAGS:=-march=native -DARCH_ARM) \
+								$(info *** Unspecified platform - using native compilation for ARM ***))))))))
+
+	$(if $(filter Darwin,$(OS_TYPE)), \
+		$(eval SDK_PATH:=$(shell $(CXX) -v 2>&1 | grep -- '--with-sysroot' | sed -E 's/.*--with-sysroot=([^ ]+).*/\1/')) \
+		$(eval CMAKE_OSX_FIX:=-DCMAKE_OSX_SYSROOT=$(SDK_PATH)) \
+	)
+
+	$(if $(filter Darwin,$(OS_TYPE)), \
+		$(eval AR_OPT:=-rcs) \
+		$(eval PY_AGC_API_CFLAGS:=-Wl,-undefined,dynamic_lookup -fPIC -Wall -shared -std=c++14 -O3), \
+		$(eval AR_OPT:=rcs -o) \
+		$(eval PY_AGC_API_CFLAGS:=-fPIC -Wall -shared -std=c++14 -O3) \
+	)
+endef
+
+# Load submodules if necessary
+define INIT_SUBMODULES
+	$(info 
*** Initialization of submodules ***) + $(eval dummy:=$(shell git submodule update --init --recursive)) +endef + + +### Clean library targets +clean-zlib-ng: + -cd $(ZLIB_DIR) && $(MAKE) -f Makefile.in clean && rm -r build-g++ + +clean-isa-l: + -cd $(ISAL_DIR) && $(MAKE) -f Makefile.unx clean + +clean-libdeflate: + -cd $(LIBDEFLATE_DIR) && rm -r build + +clean-libzstd: + -cd $(LIBZSTD_DIR) && $(MAKE) clean + +clean-raduls-inplace: + -cd $(RADULS_INPLACE_DIR) && $(MAKE) clean + +clean-igraph: + -rm -r $(IGRAPH_DIR)/build + +clean-mimalloc_obj: + -rm $(MIMALLOC_OBJ) + +clean-cdflib_obj: + -rm $(CDFLIB_OBJ) + +clean-refresh_parallel_queues_monitor_obj: + -rm $(CDFLIB_OBJ) + +clean-sbwt: + -rm $(SBWT_A) + -rm $(SBWT_SDSL_A) + -rm $(SBWT_KMC_CORE_A) + -rm $(SBWT_KMC_TOOLS_A) + -rm -r $(SBWT_A_DIR) + +### Testing +define show_var + $(info $(1): $($(1))) +endef + +define show_var_opt + $(if $(1), \ + $(info $(1): $($(1))) \ + ) +endef + +_testing: + $(info *** General ***) + $(call show_var,OS_TYPE) + $(call show_var,ARCH_TYPE) + $(call show_var,OS_ARCH_TYPE) + $(call show_var,ARCH_FLAGS) + $(call show_var,NASM_VERSION) + + $(info *** Compilers ***) + $(call show_var,COMPILER_DESC) + $(call show_var,COMPILER_VERSION_FULL) + $(call show_var,COMPILER_VERSION_MAJOR) + $(call show_var,COMPILER_ALLOWED) + $(foreach desc,\ + $(wordlist 1,$(words $(COMPILER_ALLOWED)),$(COMPILER_ALLOWED)), \ + $(call show_var,$(desc)_MIN) \ + $(call show_var,$(desc)_MAX) \ + ) + + $(info *** Main directories ***) + $(call show_var,INCLUDE_DIRS) + $(call show_var,LIBRARY_DIRS) + + $(info *** Compiler and linker flags ***) + $(call show_var,C_STD) + $(call show_var,CPP_STD) + $(call show_var,C_FLAGS) + $(call show_var,CPP_FLAGS) + $(call show_var,OPTIMIZATION_FLAGS) + $(call show_var,DEFINE_FLAGS) + $(call show_var,LINKER_FLAGS) + $(call show_var,STATIC_LFLAGS) + $(call show_var,CPP_FLAGS_SSE2) + $(call show_var,CPP_FLAGS_SSE4) + $(call show_var,CPP_FLAGS_AVX) + $(call show_var,CPP_FLAGS_AVX2) + 
$(call show_var,CPP_FLAGS_AVX512) + $(call show_var,CPP_FLAGS_NEON) + + $(info *** Files ***) + $(call show_var,SRC_DIR) + $(call show_var,OBJ_DIR) + $(call show_var,OUT_BIN_DIR) + $(call show_var,FILES_DEFINED) + $(foreach item,\ + $(wordlist 1,$(words $(FILES_DEFINED)),$(FILES_DEFINED)), \ + $(call show_var,SRC_$(item)_DIR) \ + $(call show_var,OBJ_$(item)_DIR) \ + $(call show_var,SRC_$(item)) \ + $(call show_var,OBJ_$(item)) \ + ) + + $(info *** Libraries ***) + $(info * gzip decompression *) + $(call show_var,GZ_TARGET) + + $(info * zlib-ng *) + $(call show_var_opt,ZLIB_DIR) + $(call show_var_opt,ZLIB_A_DIR) + $(call show_var_opt,ZLIB_A) + + $(info * isa-l *) + $(call show_var_opt,ISAL_DIR) + $(call show_var_opt,ISAL_A_DIR) + $(call show_var_opt,ISAL_A) + + $(info * libdeflate *) + $(call show_var_opt,LIBDEFLATE_DIR) + $(call show_var_opt,LIBDEFLATE_A_DIR) + $(call show_var_opt,LIBDEFLATE_A) + + $(info * libzstd *) + $(call show_var_opt,LIBZSTD_DIR) + $(call show_var_opt,LIBZSTD_A_DIR) + $(call show_var_opt,LIBZSTD_A) + + $(info * mimalloc *) + $(call show_var_opt,MIMALLOC_INCLUDE_DIR) + $(call show_var_opt,MIMALLOC_DIR) + $(call show_var_opt,MIMALLOC_OBJ) + + $(info * raduls *) + $(call show_var_opt,RADULS_INPLACE_DIR) + $(call show_var_opt,RADULS_INPLACE_A_DIR) + $(call show_var_opt,RADULS_INPLACE_A) + + $(info * igraph *) + $(call show_var_opt,IGRAPH_DIR) + $(call show_var_opt,IGRAPH_A_DIR) + $(call show_var_opt,IGRAPH_A) + + $(info * SBWT *) + $(call show_var_opt,SBWT_DIR) + $(call show_var_opt,SBWT_A_DIR) + $(call show_var_opt,SBWT_A) + $(call show_var_opt,SBWT_SDSL_A) + $(call show_var_opt,SBWT_KMC_CORE_A) + $(call show_var_opt,SBWT_KMC_TOOLS_A) + diff --git a/src/app/agc-dev.vcxproj b/src/app/agc-dev.vcxproj index e5ab79a..2ac91ac 100644 --- a/src/app/agc-dev.vcxproj +++ b/src/app/agc-dev.vcxproj @@ -81,11 +81,13 @@ true - 
../../3rd_party/libdeflate;../../3rd_party/isa-l/include;../../3rd_party/raduls-inplace/raduls;../../3rd_party/zlib-ng/build-vs;../../3rd_party/zstd/lib;../../3rd_party/mimalloc/include;$(VC_IncludePath);$(WindowsSDK_IncludePath) + ../../3rd_party/libdeflate;../../3rd_party/isa-l/include;../../3rd_party/raduls-inplace/raduls;../..\3rd_party\zlib-ng\build-vs;../../3rd_party;../../3rd_party/mimalloc/include;$(VC_IncludePath);$(WindowsSDK_IncludePath) + ../3rd_party/;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64) false - ../../3rd_party/libdeflate;../../3rd_party/isa-l/include;../../3rd_party/raduls-inplace/raduls;../../3rd_party/zlib-ng/build-vs;../../3rd_party/zstd/lib;../../3rd_party/mimalloc/include;$(VC_IncludePath);$(WindowsSDK_IncludePath) + ../../3rd_party/libdeflate;../../3rd_party/isa-l/include;../../3rd_party/raduls-inplace/raduls;../../3rd_party\zlib-ng\build-vs;../../3rd_party;../../3rd_party/mimalloc/include;$(VC_IncludePath);$(WindowsSDK_IncludePath) + ../3rd_party/;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64) false @@ -126,7 +128,7 @@ _DEBUG;_CONSOLE;%(PreprocessorDefinitions) true /D_CRT_SECURE_NO_WARNINGS /DREFRESH_USE_IGZIP %(AdditionalOptions) - stdcpp17 + stdcpp20 stdc11 ..\..\libs\mimalloc true @@ -135,10 +137,10 @@ Console true - ..\..\3rd_party\zlib-ng\build-vs\$(Configuration)\zlibstaticd.lib;$(CoreLibraryDependencies);%(AdditionalDependencies) + ../..\3rd_party\isa-l\isa-l_static.lib;../..\3rd_party\isa-l\isa-l.lib;;../..\3rd_party\zlib-ng\build-vs\$(Configuration)\zlibstaticd.lib;$(CoreLibraryDependencies);%(AdditionalDependencies) - call $(SolutionDir)3rd_party\prebuild.bat $(SolutionDir) $(Configuration) + call "$(SolutionDir)3rd_party\prebuild.bat" "$(SolutionDir)" $(Configuration) @@ -150,7 +152,7 @@ NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true /D_CRT_SECURE_NO_WARNINGS %(AdditionalOptions) - stdcpp17 + stdcpp20 stdc11 true AdvancedVectorExtensions @@ -166,58 +168,55 @@ true true true - 
..\..\3rd_party\zlib-ng\build-vs\$(Configuration)\zlibstatic.lib;$(CoreLibraryDependencies);%(AdditionalDependencies) + ../..\3rd_party\isa-l\isa-l_static.lib;../..\3rd_party\isa-l\isa-l.lib;../..\3rd_party\zlib-ng\build-vs\$(Configuration)\zlibstatic.lib;$(CoreLibraryDependencies);%(AdditionalDependencies) - call $(SolutionDir)3rd_party\prebuild.bat $(SolutionDir) $(Configuration) + call "$(SolutionDir)3rd_party\prebuild.bat" "$(SolutionDir)" $(Configuration) - - + + + + + + + + + + + + + - - - - + - - - - - - - - false + + + + + + + + + + - - - - - - - - - - - - {68837f79-3d7f-4f4a-af10-99383b70f7a9} - {0df64642-6604-30b0-9ffa-1bb593ba2cb1} diff --git a/src/app/agc-dev.vcxproj.filters b/src/app/agc-dev.vcxproj.filters index eed8bbf..96d6305 100644 --- a/src/app/agc-dev.vcxproj.filters +++ b/src/app/agc-dev.vcxproj.filters @@ -7,109 +7,109 @@ Source files - + Source files - + Source files - + Source files - - Source files + + Library files - + Source files - + Source files - + Source files - + Source files - + Source files - + Source files - + Source files - + Source files - + Source files - - Library files + + Source files Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - + Header files - - Library files + + Header files diff --git a/src/app/agc-dev.vcxproj.user b/src/app/agc-dev.vcxproj.user index 8131efc..9a51bd1 100644 --- a/src/app/agc-dev.vcxproj.user +++ b/src/app/agc-dev.vcxproj.user @@ -1,13 +1,13 @@  - getcol -g 4 -o dec/ -v 1 hpp_small.agc - j:\agc + create -a -o f50.agc -i fl50 -f 0.0001 -v 1 -t 32 col\100000_CHM13.pri.fa.gz + j:\agc\heng WindowsLocalDebugger - create -o test-camp-1.agc -v 2 -t 1 -i fl1 camp-gz/SAMEA1010245.contigs.fa - j:\agc + create -t 16 -i salmo.fl -o t.agc -a -k 17 -b 300 -s 3000 
-v 1 -r salmonella_enterica__01/SAMEA104364886.fa -f 0.005 + k:\atb WindowsLocalDebugger \ No newline at end of file diff --git a/src/app/application.cpp b/src/app/application.cpp index 8014a49..80c7b5b 100644 --- a/src/app/application.cpp +++ b/src/app/application.cpp @@ -4,8 +4,8 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* #include "application.h" @@ -13,8 +13,8 @@ #include #include #include -#include "../core/utils.h" -#include "../../libs/ketopt.h" +#include "../common/utils.h" +#include "../../3rd_party/ketopt.h" // ******************************************************************************************* bool CApplication::parse_params(const int argc, const char** argv) @@ -111,6 +111,7 @@ void CApplication::usage_create() const cerr << " -b - batch size " << execution_params.pack_cardinality.info() << "\n"; cerr << " -c - concatenated genomes in a single file (default: " << boolalpha << execution_params.concatenated_genomes << noboolalpha << ")\n"; cerr << " -d - do not store cmd-line (default: " << boolalpha << execution_params.store_cmd_line << noboolalpha << ")\n"; + cerr << " -f - fraction of fall-back minimizers " << execution_params.fallback_frac.info() << "\n"; cerr << " -i - file with FASTA file names (alterantive to listing file names explicitely in command line)\n"; cerr << " -k - k-mer length" << execution_params.k.info() << "\n"; cerr << " -l - min. 
match length " << execution_params.min_match_length.info() << "\n"; @@ -126,7 +127,7 @@ bool CApplication::parse_params_create(const int argc, const char** argv) ketopt_t o = KETOPT_INIT; int i, c; - while ((c = ketopt(&o, argc, argv, 1, "t:b:s:k:l:acdfi:o:v:", 0)) >= 0) { + while ((c = ketopt(&o, argc, argv, 1, "t:b:s:k:f:l:acdfi:o:v:", 0)) >= 0) { if (c == 't') { execution_params.no_threads.assign(atoi(o.arg)); } else if (c == 'b') { @@ -135,6 +136,8 @@ bool CApplication::parse_params_create(const int argc, const char** argv) execution_params.segment_size.assign(atoi(o.arg)); } else if (c == 'k') { execution_params.k.assign(atoi(o.arg)); + } else if (c == 'f') { + execution_params.fallback_frac.assign(atof(o.arg)); } else if (c == 'l') { execution_params.min_match_length.assign(atoi(o.arg)); } else if (c == 'a') { @@ -176,6 +179,7 @@ void CApplication::usage_append() const cerr << " -a - adaptive mode (default: " << boolalpha << execution_params.adaptive_compression << noboolalpha << ")\n"; cerr << " -c - concatenated genomes in a single file (default: " << boolalpha << execution_params.concatenated_genomes << noboolalpha << ")\n"; cerr << " -d - do not store cmd-line (default: " << boolalpha << execution_params.store_cmd_line << noboolalpha << ")\n"; + cerr << " -f - fraction of fall-back minimizers " << execution_params.fallback_frac.info() << "\n"; cerr << " -i - file with FASTA file names (alterantive to listing file names explicitely in command line)\n"; cerr << " -o - output to file (default: output is sent to stdout)\n"; cerr << " -t - no of threads " << execution_params.no_threads.info() << "\n"; @@ -188,9 +192,12 @@ bool CApplication::parse_params_append(const int argc, const char** argv) ketopt_t o = KETOPT_INIT; int i, c; - while ((c = ketopt(&o, argc, argv, 1, "t:acdfi:o:v:", 0)) >= 0) { + while ((c = ketopt(&o, argc, argv, 1, "t:f:acdfi:o:v:", 0)) >= 0) { if (c == 't') { execution_params.no_threads.assign(atoi(o.arg)); + } + else if (c == 'f') { + 
execution_params.fallback_frac.assign(atof(o.arg)); } else if (c == 'c') { execution_params.concatenated_genomes = true; } else if (c == 'd') { @@ -230,9 +237,11 @@ void CApplication::usage_getcol() const cerr << "Usage: agc getcol [options] > \n"; cerr << "Options:\n"; cerr << " -g - optional gzip with given level " << execution_params.gzip_level.info() << "\n"; + cerr << " -f - fast mode (needs more RAM) (default: " << boolalpha << execution_params.fast << ")\n"; cerr << " -l - line length " << execution_params.line_length.info() << "\n"; cerr << " -o - output to files at path (default: output is sent to stdout)\n"; - cerr << " -t - no of threads " << execution_params.no_threads.info() << "\n"; + cerr << " -r - without reference (default: " << boolalpha << execution_params.no_ref << ")\n"; + cerr << " -t - no of threads " << execution_params.no_threads.info() << "\n"; cerr << " -v - verbosity level " << execution_params.verbosity.info() << "\n"; } @@ -244,7 +253,7 @@ bool CApplication::parse_params_getcol(const int argc, const char** argv) execution_params.prefetch = true; - while ((c = ketopt(&o, argc, argv, 1, "g:t:l:o:v:", 0)) >= 0) { + while ((c = ketopt(&o, argc, argv, 1, "g:t:l:o:v:fr", 0)) >= 0) { if (c == 'g') { execution_params.gzip_level.assign(atoi(o.arg)); } @@ -258,6 +267,12 @@ bool CApplication::parse_params_getcol(const int argc, const char** argv) execution_params.output_name = o.arg; execution_params.use_stdout = false; } + else if (c == 'f') { + execution_params.fast = true; + } + else if (c == 'r') { + execution_params.no_ref = true; + } else if (c == 'v') { execution_params.verbosity.assign(atoi(o.arg)); } @@ -283,6 +298,7 @@ void CApplication::usage_getset() const cerr << " -l - line length " << execution_params.line_length.info() << "\n"; cerr << " -o - output to file (default: output is sent to stdout)\n"; cerr << " -p - disable file prefetching (useful for small genomes)" << "\n"; + cerr << " -s - enable streaming mode (slower but need 
less memory)" << "\n"; cerr << " -t - no of threads " << execution_params.no_threads.info() << "\n"; cerr << " -v - verbosity level " << execution_params.verbosity.info() << "\n"; } @@ -295,7 +311,7 @@ bool CApplication::parse_params_getset(const int argc, const char** argv) execution_params.prefetch = true; - while ((c = ketopt(&o, argc, argv, 1, "g:t:l:o:pv:", 0)) >= 0) { + while ((c = ketopt(&o, argc, argv, 1, "g:t:l:o:psv:", 0)) >= 0) { if (c == 'g') { execution_params.gzip_level.assign(atoi(o.arg)); } @@ -309,7 +325,11 @@ bool CApplication::parse_params_getset(const int argc, const char** argv) } else if (c == 'p') { execution_params.prefetch = false; - } else if (c == 'v') { + } + else if (c == 's') { + execution_params.streaming = true; + } + else if (c == 'v') { execution_params.verbosity.assign(atoi(o.arg)); } } @@ -345,6 +365,7 @@ void CApplication::usage_getctg() const cerr << " -l - line length " << execution_params.line_length.info() << "\n"; cerr << " -o - output to file (default: output is sent to stdout)\n"; cerr << " -p - disable file prefetching (useful for short queries)" << "\n"; + cerr << " -s - enable streaming mode (slower but need less memory)" << "\n"; cerr << " -t - no of threads " << execution_params.no_threads.info() << "\n"; cerr << " -v - verbosity level " << execution_params.verbosity.info() << "\n"; } @@ -357,7 +378,7 @@ bool CApplication::parse_params_getctg(const int argc, const char** argv) execution_params.prefetch = true; - while ((c = ketopt(&o, argc, argv, 1, "g:t:l:o:pv:", 0)) >= 0) { + while ((c = ketopt(&o, argc, argv, 1, "g:t:l:o:psv:", 0)) >= 0) { if (c == 'g') { execution_params.gzip_level.assign(atoi(o.arg)); } @@ -370,7 +391,11 @@ bool CApplication::parse_params_getctg(const int argc, const char** argv) execution_params.use_stdout = false; } else if (c == 'p') { execution_params.prefetch = false; - } else if (c == 'v') { + } + else if (c == 's') { + execution_params.streaming = true; + } + else if (c == 'v') { 
execution_params.verbosity.assign(atoi(o.arg)); } } diff --git a/src/app/application.h b/src/app/application.h index 4240ede..edcd55c 100644 --- a/src/app/application.h +++ b/src/app/application.h @@ -7,8 +7,8 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2022-12-22 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* #include @@ -68,6 +68,7 @@ struct CParams b_value line_length{ 80, 40, 2'000'000'000 }; b_value verbosity{ 0, 0, 2 }; b_value gzip_level{ 0, 0, 9 }; + b_value fallback_frac{ 0, 0, 0.05 }; uint32_t no_segments = 0; bool concatenated_genomes = false; @@ -75,6 +76,9 @@ struct CParams bool store_cmd_line = true; bool prefetch = true; bool adaptive_compression = false; + bool no_ref = false; + bool fast = false; + bool streaming = false; CParams() = default; }; diff --git a/src/app/main.cpp b/src/app/main.cpp index 76bb80d..5f40860 100644 --- a/src/app/main.cpp +++ b/src/app/main.cpp @@ -4,8 +4,8 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* #include @@ -16,8 +16,8 @@ #include #ifdef _MSC_VER -//#include -//#include +#include +#include #endif #include "../app/application.h" @@ -89,7 +89,8 @@ bool CApplication::create() execution_params.concatenated_genomes, execution_params.adaptive_compression, execution_params.verbosity(), - execution_params.no_threads()); + execution_params.no_threads(), + execution_params.fallback_frac()); if (!r) { @@ -127,8 +128,15 @@ bool CApplication::append() sanitize_input_file_names(execution_params.input_names); - bool r = agc_c.Append(execution_params.in_archive_name, execution_params.out_archive_name, execution_params.verbosity(), true, execution_params.concatenated_genomes, execution_params.adaptive_compression, - 
execution_params.no_threads()); + bool r = agc_c.Append( + execution_params.in_archive_name, + execution_params.out_archive_name, + execution_params.verbosity(), + true, + execution_params.concatenated_genomes, + execution_params.adaptive_compression, + execution_params.no_threads(), + execution_params.fallback_frac()); if (!r) { @@ -177,6 +185,8 @@ bool CApplication::getcol() execution_params.line_length(), execution_params.no_threads(), execution_params.gzip_level(), + execution_params.no_ref, + execution_params.fast, execution_params.verbosity()); r &= agc_d.Close(); @@ -197,14 +207,23 @@ bool CApplication::getset() return false; } - r &= agc_d.GetSampleFile( - execution_params.output_name, - execution_params.sample_names, - execution_params.line_length(), - execution_params.no_threads(), - execution_params.gzip_level(), - execution_params.verbosity()); - + if(execution_params.streaming) + r &= agc_d.GetSampleForStreaming( + execution_params.output_name, + execution_params.sample_names, + execution_params.line_length(), + execution_params.no_threads(), + execution_params.gzip_level(), + execution_params.verbosity()); + else + r &= agc_d.GetSampleFile( + execution_params.output_name, + execution_params.sample_names, + execution_params.line_length(), + execution_params.no_threads(), + execution_params.gzip_level(), + execution_params.verbosity()); + r &= agc_d.Close(); return r; @@ -223,13 +242,22 @@ bool CApplication::getctg() return false; } - r &= agc_d.GetContigFile( - execution_params.output_name, - execution_params.contig_names, - execution_params.line_length(), - execution_params.no_threads(), - execution_params.gzip_level(), - execution_params.verbosity()); + if (execution_params.streaming) + r &= agc_d.GetContigForStreaming( + execution_params.output_name, + execution_params.contig_names, + execution_params.line_length(), + execution_params.no_threads(), + execution_params.gzip_level(), + execution_params.verbosity()); + else + r &= agc_d.GetContigFile( + 
execution_params.output_name, + execution_params.contig_names, + execution_params.line_length(), + execution_params.no_threads(), + execution_params.gzip_level(), + execution_params.verbosity()); r &= agc_d.Close(); @@ -358,16 +386,19 @@ bool CApplication::info() uint32_t kmer_length; uint32_t min_match_len; uint32_t pack_cardinality; + uint32_t segment_size; string ref_name; agc_d.ListSamples(v_sample_names); agc_d.GetCmdLines(cmd_lines); - agc_d.GetParams(kmer_length, min_match_len, pack_cardinality); + agc_d.GetParams(kmer_length, min_match_len, pack_cardinality, segment_size); agc_d.GetReferenceSample(ref_name); cerr << "No. samples : " << v_sample_names.size() << endl; cerr << "k-mer length : " << kmer_length << endl; cerr << "Min. match length: " << min_match_len << endl; + if(segment_size) + cerr << "Segment size : " << segment_size << endl; cerr << "Batch size : " << pack_cardinality << endl; cerr << "Reference name : " << ref_name << endl; cerr << "Command lines:" << endl; diff --git a/src/core/agc_basic.cpp b/src/common/agc_basic.cpp similarity index 95% rename from src/core/agc_basic.cpp rename to src/common/agc_basic.cpp index d577105..2cd3c62 100644 --- a/src/core/agc_basic.cpp +++ b/src/common/agc_basic.cpp @@ -1,317 +1,317 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include "../core/agc_basic.h" -#include -#include - -using namespace std::chrono; - -// ******************************************************************************************* -CAGCBasic::CAGCBasic() -{ - compression_params.kmer_length = 0; - compression_params.min_match_len = 0; - compression_params.pack_cardinality = 1; - compression_params.segment_size = 0; - - pack_cardinality = 1; - - working_mode = working_mode_t::none; - - is_app_mode = true; - kmer_length = 0; - min_match_len = 0; - - archive_version = AGC_FILE_MAJOR * 1000 + AGC_FILE_MINOR; - - if (archive_version < 2000) - collection_desc = static_pointer_cast(make_shared()); - else if(archive_version < 3000) - collection_desc = static_pointer_cast(make_shared()); - else if(archive_version < 4000) - collection_desc = static_pointer_cast(make_shared()); - - verbosity = 0; -} - -// ******************************************************************************************* -CAGCBasic::~CAGCBasic() -{ -} - -// ******************************************************************************************* -bool CAGCBasic::load_file_type_info(const string& archive_name) -{ - vector v_data; - - if (prefetch_archive) - in_archive = make_shared(true, ~0ull, ss_prefix(archive_version)); // ~0ull - special value - buffers whole archive - else - in_archive = make_shared(true, 32 << 10, ss_prefix(archive_version)); - - if (!in_archive->Open(archive_name)) - { - if (is_app_mode) - cerr << "Cannot open archive " << archive_name << endl; - return false; - } - - m_file_type_info.clear(); - - auto s_id = in_archive->GetStreamId("file_type_info"); - if (s_id < 0) - return false; - - uint64_t n_items; - - if (!in_archive->GetPart(s_id, v_data, 
n_items)) - return false; - - auto p = v_data.begin(); - string key, val; - - for (size_t i = 0; i < n_items; ++i) - { - read(p, key); - read(p, val); - - m_file_type_info.emplace(key, val); - } - - archive_version = stoi(m_file_type_info["file_version_major"]) * 1000 + stoi(m_file_type_info["file_version_minor"]); - - if (archive_version < 2000) - collection_desc = static_pointer_cast(make_shared()); - else if (archive_version < 3000) - collection_desc = static_pointer_cast(make_shared()); - else if (archive_version < 4000) - collection_desc = static_pointer_cast(make_shared()); - - return true; -} - -// ******************************************************************************************* -bool CAGCBasic::load_metadata_impl_v1() -{ - int desc_sid_v1 = in_archive->GetStreamId("collection-desc"); - - vector v_desc_zstd; - uint64_t tmp; - - if (!in_archive->GetPart(desc_sid_v1, v_desc_zstd, tmp)) - { - in_archive->Close(); - if (is_app_mode) - cerr << "Problem with archive\n"; - return false; - } - - vector v_desc; - - v_desc.resize(tmp); - ZSTD_decompress(v_desc.data(), v_desc.size(), v_desc_zstd.data(), v_desc_zstd.size()); - v_desc_zstd.clear(); - v_desc_zstd.shrink_to_fit(); - - auto collection_desc_v1 = dynamic_pointer_cast(collection_desc); - - if (!collection_desc_v1->deserialize(v_desc)) - { - in_archive->Close(); - if (is_app_mode) - cerr << "Cannot deserialize\n"; - return false; - } - - return true; -} - -// ******************************************************************************************* -bool CAGCBasic::load_metadata_impl_v2() -{ - int desc_main_sid_v2 = in_archive->GetStreamId("collection-main"); - int desc_details_sid_v2 = in_archive->GetStreamId("collection-details"); - - vector v_desc_zstd; - uint64_t tmp; - - if (!in_archive->GetPart(desc_main_sid_v2, v_desc_zstd, tmp)) - { - in_archive->Close(); - if (is_app_mode) - cerr << "Problem with archive\n"; - return false; - } - - vector v_desc; - - v_desc.resize(tmp); - 
ZSTD_decompress(v_desc.data(), v_desc.size(), v_desc_zstd.data(), v_desc_zstd.size()); - v_desc_zstd.clear(); - v_desc_zstd.shrink_to_fit(); - - bool expensive_collection_structures; - if (is_app_mode) - { - if (working_mode == working_mode_t::decompression || working_mode == working_mode_t::none) - expensive_collection_structures = false; - else - expensive_collection_structures = true; - } - else - expensive_collection_structures = prefetch_archive; - - auto collection_desc_v2 = dynamic_pointer_cast(collection_desc); - - if (!collection_desc_v2->deserialize_main(v_desc, expensive_collection_structures)) - { - in_archive->Close(); - if (is_app_mode) - cerr << "Cannot deserialize\n"; - return false; - } - - while (in_archive->GetPart(desc_details_sid_v2, v_desc_zstd, tmp)) - collection_desc_v2->deserialize_details(v_desc_zstd, tmp, - working_mode == working_mode_t::appending || working_mode == working_mode_t::pre_appending); - - return true; -} - -// ******************************************************************************************* -bool CAGCBasic::load_metadata_impl_v3() -{ - return true; - - // Do nothing here - loading is made in other place -} - -// ******************************************************************************************* -bool CAGCBasic::load_metadata() -{ - if (archive_version >= 4000) - { - archive_version = 0; // Invalid archive - - in_archive->Close(); - if (is_app_mode) - cerr << "Unsupported archive version. 
Please use the most recent AGC application" << endl; - return false; - } - - if (archive_version < 2000) // v1 - load_metadata_impl_v1(); - else if (archive_version < 3000) // v2 - load_metadata_impl_v2(); - else if (archive_version < 4000) // v3 - load_metadata_impl_v3(); - - uint64_t tmp; - vector v_params; - - if (!in_archive->GetPart(in_archive->GetStreamId("params"), v_params, tmp)) - { - in_archive->Close(); - if (is_app_mode) - cerr << "Archive does not contain parameters section\n"; - return false; - } - - auto p = v_params.begin(); - read(p, compression_params.kmer_length); - read(p, compression_params.min_match_len); - read(p, compression_params.pack_cardinality); - - if (archive_version >= 2000) - read(p, compression_params.segment_size); - else - compression_params.segment_size = 0; - - kmer_length = compression_params.kmer_length; - pack_cardinality = compression_params.pack_cardinality; - min_match_len = compression_params.min_match_len; - segment_size = compression_params.segment_size; - - return true; -} - -// ******************************************************************************************* -void CAGCBasic::join_threads(vector &v_threads) -{ - for (auto& t : v_threads) - t.join(); - - v_threads.clear(); -} - -// ******************************************************************************************* -void CAGCBasic::reverse_complement(contig_t& contig) -{ - if (contig.empty()) - return; - - int size = (int) contig.size(); - - uint8_t* p = contig.data() + size - 1; - uint8_t* q = contig.data(); - - for (int i = 0; i < size / 2; ++i) - { - uint8_t x = (*p < 4) ? 3 - *p : *p; - uint8_t y = (*q < 4) ? 
3 - *q : *q; - - *q++ = x; - *p-- = y; - } - - if (size % 2) - if (*p < 4) - *p = 3 - *p; -} - -// ******************************************************************************************* -void CAGCBasic::reverse_complement_copy(contig_t& src_contig, contig_t& dest_contig) -{ - int size = (int) src_contig.size(); - - dest_contig.resize(size); - - if (src_contig.empty()) - return; - - int i = 0; - uint8_t* p = src_contig.data() + size - 1; - uint8_t* q = dest_contig.data(); - - switch (size % 4) - { - case 3: - *q++ = (*p < 4) ? 3 - *p : *p; --p; - ++i; - case 2: - *q++ = (*p < 4) ? 3 - *p : *p; --p; - ++i; - case 1: - *q++ = (*p < 4) ? 3 - *p : *p; --p; - ++i; - } - - for (; i < size; i += 4) - { - *q++ = (*p < 4) ? 3 - *p : *p; --p; - *q++ = (*p < 4) ? 3 - *p : *p; --p; - *q++ = (*p < 4) ? 3 - *p : *p; --p; - *q++ = (*p < 4) ? 3 - *p : *p; --p; - } -} - +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include "agc_basic.h" +#include +#include + +using namespace std::chrono; + +// ******************************************************************************************* +CAGCBasic::CAGCBasic() +{ + compression_params.kmer_length = 0; + compression_params.min_match_len = 0; + compression_params.pack_cardinality = 1; + compression_params.segment_size = 0; + + pack_cardinality = 1; + + working_mode = working_mode_t::none; + + is_app_mode = true; + kmer_length = 0; + min_match_len = 0; + + archive_version = AGC_FILE_MAJOR * 1000 + AGC_FILE_MINOR; + + if (archive_version < 2000) + collection_desc = static_pointer_cast(make_shared()); + else if(archive_version < 3000) + collection_desc = static_pointer_cast(make_shared()); + else if(archive_version < 4000) + collection_desc = static_pointer_cast(make_shared()); + + verbosity = 0; +} + +// ******************************************************************************************* +CAGCBasic::~CAGCBasic() +{ +} + +// ******************************************************************************************* +bool CAGCBasic::load_file_type_info(const string& archive_name) +{ + vector v_data; + + if (prefetch_archive) + in_archive = make_shared(true, ~0ull, ss_prefix(archive_version)); // ~0ull - special value - buffers whole archive + else + in_archive = make_shared(true, 32 << 10, ss_prefix(archive_version)); + + if (!in_archive->Open(archive_name)) + { + if (is_app_mode) + cerr << "Cannot open archive " << archive_name << endl; + return false; + } + + m_file_type_info.clear(); + + auto s_id = in_archive->GetStreamId("file_type_info"); + if (s_id < 0) + return false; + + uint64_t n_items; + + if (!in_archive->GetPart(s_id, v_data, 
n_items)) + return false; + + auto p = v_data.begin(); + string key, val; + + for (size_t i = 0; i < n_items; ++i) + { + read(p, key); + read(p, val); + + m_file_type_info.emplace(key, val); + } + + archive_version = stoi(m_file_type_info["file_version_major"]) * 1000 + stoi(m_file_type_info["file_version_minor"]); + + if (archive_version < 2000) + collection_desc = static_pointer_cast(make_shared()); + else if (archive_version < 3000) + collection_desc = static_pointer_cast(make_shared()); + else if (archive_version < 4000) + collection_desc = static_pointer_cast(make_shared()); + + return true; +} + +// ******************************************************************************************* +bool CAGCBasic::load_metadata_impl_v1() +{ + int desc_sid_v1 = in_archive->GetStreamId("collection-desc"); + + vector v_desc_zstd; + uint64_t tmp; + + if (!in_archive->GetPart(desc_sid_v1, v_desc_zstd, tmp)) + { + in_archive->Close(); + if (is_app_mode) + cerr << "Problem with archive\n"; + return false; + } + + vector v_desc; + + v_desc.resize(tmp); + ZSTD_decompress(v_desc.data(), v_desc.size(), v_desc_zstd.data(), v_desc_zstd.size()); + v_desc_zstd.clear(); + v_desc_zstd.shrink_to_fit(); + + auto collection_desc_v1 = dynamic_pointer_cast(collection_desc); + + if (!collection_desc_v1->deserialize(v_desc)) + { + in_archive->Close(); + if (is_app_mode) + cerr << "Cannot deserialize\n"; + return false; + } + + return true; +} + +// ******************************************************************************************* +bool CAGCBasic::load_metadata_impl_v2() +{ + int desc_main_sid_v2 = in_archive->GetStreamId("collection-main"); + int desc_details_sid_v2 = in_archive->GetStreamId("collection-details"); + + vector v_desc_zstd; + uint64_t tmp; + + if (!in_archive->GetPart(desc_main_sid_v2, v_desc_zstd, tmp)) + { + in_archive->Close(); + if (is_app_mode) + cerr << "Problem with archive\n"; + return false; + } + + vector v_desc; + + v_desc.resize(tmp); + 
ZSTD_decompress(v_desc.data(), v_desc.size(), v_desc_zstd.data(), v_desc_zstd.size()); + v_desc_zstd.clear(); + v_desc_zstd.shrink_to_fit(); + + bool expensive_collection_structures; + if (is_app_mode) + { + if (working_mode == working_mode_t::decompression || working_mode == working_mode_t::none) + expensive_collection_structures = false; + else + expensive_collection_structures = true; + } + else + expensive_collection_structures = prefetch_archive; + + auto collection_desc_v2 = dynamic_pointer_cast(collection_desc); + + if (!collection_desc_v2->deserialize_main(v_desc, expensive_collection_structures)) + { + in_archive->Close(); + if (is_app_mode) + cerr << "Cannot deserialize\n"; + return false; + } + + while (in_archive->GetPart(desc_details_sid_v2, v_desc_zstd, tmp)) + collection_desc_v2->deserialize_details(v_desc_zstd, tmp, + working_mode == working_mode_t::appending || working_mode == working_mode_t::pre_appending); + + return true; +} + +// ******************************************************************************************* +bool CAGCBasic::load_metadata_impl_v3() +{ + return true; + + // Do nothing here - loading is made in other place +} + +// ******************************************************************************************* +bool CAGCBasic::load_metadata() +{ + if (archive_version >= 4000) + { + archive_version = 0; // Invalid archive + + in_archive->Close(); + if (is_app_mode) + cerr << "Unsupported archive version. 
Please use the most recent AGC application" << endl; + return false; + } + + if (archive_version < 2000) // v1 + load_metadata_impl_v1(); + else if (archive_version < 3000) // v2 + load_metadata_impl_v2(); + else if (archive_version < 4000) // v3 + load_metadata_impl_v3(); + + uint64_t tmp; + vector v_params; + + if (!in_archive->GetPart(in_archive->GetStreamId("params"), v_params, tmp)) + { + in_archive->Close(); + if (is_app_mode) + cerr << "Archive does not contain parameters section\n"; + return false; + } + + auto p = v_params.begin(); + read(p, compression_params.kmer_length); + read(p, compression_params.min_match_len); + read(p, compression_params.pack_cardinality); + + if (archive_version >= 2000) + read(p, compression_params.segment_size); + else + compression_params.segment_size = 0; + + kmer_length = compression_params.kmer_length; + pack_cardinality = compression_params.pack_cardinality; + min_match_len = compression_params.min_match_len; + segment_size = compression_params.segment_size; + + return true; +} + +// ******************************************************************************************* +void CAGCBasic::join_threads(vector &v_threads) +{ + for (auto& t : v_threads) + t.join(); + + v_threads.clear(); +} + +// ******************************************************************************************* +void CAGCBasic::reverse_complement(contig_t& contig) +{ + if (contig.empty()) + return; + + int size = (int) contig.size(); + + uint8_t* p = contig.data() + size - 1; + uint8_t* q = contig.data(); + + for (int i = 0; i < size / 2; ++i) + { + uint8_t x = (*p < 4) ? 3 - *p : *p; + uint8_t y = (*q < 4) ? 
3 - *q : *q; + + *q++ = x; + *p-- = y; + } + + if (size % 2) + if (*p < 4) + *p = 3 - *p; +} + +// ******************************************************************************************* +void CAGCBasic::reverse_complement_copy(contig_t& src_contig, contig_t& dest_contig) +{ + int size = (int) src_contig.size(); + + dest_contig.resize(size); + + if (src_contig.empty()) + return; + + int i = 0; + uint8_t* p = src_contig.data() + size - 1; + uint8_t* q = dest_contig.data(); + + switch (size % 4) + { + case 3: + *q++ = (*p < 4) ? 3 - *p : *p; --p; + ++i; + case 2: + *q++ = (*p < 4) ? 3 - *p : *p; --p; + ++i; + case 1: + *q++ = (*p < 4) ? 3 - *p : *p; --p; + ++i; + } + + for (; i < size; i += 4) + { + *q++ = (*p < 4) ? 3 - *p : *p; --p; + *q++ = (*p < 4) ? 3 - *p : *p; --p; + *q++ = (*p < 4) ? 3 - *p : *p; --p; + *q++ = (*p < 4) ? 3 - *p : *p; --p; + } +} + // EOL \ No newline at end of file diff --git a/src/core/agc_basic.h b/src/common/agc_basic.h similarity index 90% rename from src/core/agc_basic.h rename to src/common/agc_basic.h index 338e50a..f465671 100644 --- a/src/core/agc_basic.h +++ b/src/common/agc_basic.h @@ -1,135 +1,135 @@ -#ifndef _AGC_BASIC_H -#define _AGC_BASIC_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../core/archive.h" -#include "../core/segment.h" -#include "../core/collection_v1.h" -#include "../core/collection_v2.h" -#include "../core/collection_v3.h" -#include "../core/queue.h" - -using namespace std; - -// ******************************************************************************************* -// Basic compression class -class CAGCBasic -{ - friend class CAGCDecompressor; - -protected: - enum class working_mode_t { none, compression, decompression, appending, pre_appending }; - - const uint8_t cnv_num[128] = { - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - 'A', 'C', 'G', 'T', 'N', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'U', - ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', - ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', - ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', - ' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30, - 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30, - ' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30, - 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30 - }; - - // ******************************************************************************************* - struct compression_params_t - { - uint32_t kmer_length; - uint32_t min_match_len; - uint32_t pack_cardinality; - uint32_t segment_size; - }; - - working_mode_t working_mode; - bool is_app_mode; - - uint32_t kmer_length; - uint32_t min_match_len; - uint32_t pack_cardinality; - uint32_t segment_size; - - string in_archive_name; - bool prefetch_archive = false; - uint32_t 
archive_version; - - shared_ptr in_archive; // internal mutexes - - shared_ptr collection_desc; - - map m_file_type_info; - - compression_params_t compression_params; - - const uint32_t no_raw_groups = 16; - - uint32_t verbosity; - - // ******************************************************************************************* - void read(vector::iterator& p, uint32_t& num) - { - num = 0; - - for (int i = 0; i < 4; ++i) - num += ((uint32_t)p[i]) << (8 * i); - - p += 4; - } - - // ******************************************************************************************* - void read64(vector::iterator& p, uint64_t& num) - { - num = 0; - - for (int i = 0; i < 8; ++i) - num += ((uint64_t)p[i]) << (8 * i); - - p += 8; - } - - // ******************************************************************************************* - void read(vector::iterator& p, string& str) - { - str.clear(); - - for (; *p != 0; ++p) - str.push_back((char)*p); - ++p; - } - - // ******************************************************************************************* - void join_threads(vector &v_threads); - bool load_metadata_impl_v1(); - bool load_metadata_impl_v2(); - bool load_metadata_impl_v3(); - - bool load_metadata(); - bool load_file_type_info(const string& archive_name); - - void reverse_complement(contig_t& contig); - void reverse_complement_copy(contig_t& src_contig, contig_t& dest_contig); - -public: - CAGCBasic(); - ~CAGCBasic(); -}; - -// EOF +#ifndef _AGC_BASIC_H +#define _AGC_BASIC_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../common/archive.h" +#include "../common/segment.h" +#include "../common/collection_v1.h" +#include "../common/collection_v2.h" +#include "../common/collection_v3.h" +#include "../common/queue.h" + +using namespace std; + +// ******************************************************************************************* +// Basic compression class +class CAGCBasic +{ + friend class CAGCDecompressor; + +protected: + enum class working_mode_t { none, compression, decompression, appending, pre_appending }; + + const uint8_t cnv_num[128] = { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + 'A', 'C', 'G', 'T', 'N', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'U', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30, + 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30, + ' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30, + 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30 + }; + + // ******************************************************************************************* + struct compression_params_t + { + uint32_t kmer_length; + uint32_t min_match_len; + uint32_t pack_cardinality; + uint32_t segment_size; + }; + + working_mode_t working_mode; + bool is_app_mode; + + uint32_t kmer_length; + uint32_t min_match_len; + uint32_t pack_cardinality; + uint32_t segment_size; + + string in_archive_name; + bool prefetch_archive = false; + uint32_t 
archive_version; + + shared_ptr in_archive; // internal mutexes + + shared_ptr collection_desc; + + map m_file_type_info; + + compression_params_t compression_params; + + const uint32_t no_raw_groups = 16; + + uint32_t verbosity; + + // ******************************************************************************************* + void read(vector::iterator& p, uint32_t& num) + { + num = 0; + + for (int i = 0; i < 4; ++i) + num += ((uint32_t)p[i]) << (8 * i); + + p += 4; + } + + // ******************************************************************************************* + void read64(vector::iterator& p, uint64_t& num) + { + num = 0; + + for (int i = 0; i < 8; ++i) + num += ((uint64_t)p[i]) << (8 * i); + + p += 8; + } + + // ******************************************************************************************* + void read(vector::iterator& p, string& str) + { + str.clear(); + + for (; *p != 0; ++p) + str.push_back((char)*p); + ++p; + } + + // ******************************************************************************************* + void join_threads(vector &v_threads); + bool load_metadata_impl_v1(); + bool load_metadata_impl_v2(); + bool load_metadata_impl_v3(); + + bool load_metadata(); + bool load_file_type_info(const string& archive_name); + + void reverse_complement(contig_t& contig); + void reverse_complement_copy(contig_t& src_contig, contig_t& dest_contig); + +public: + CAGCBasic(); + ~CAGCBasic(); +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/agc_decompressor_lib.cpp b/src/common/agc_decompressor_lib.cpp similarity index 60% rename from src/core/agc_decompressor_lib.cpp rename to src/common/agc_decompressor_lib.cpp index 4dae6e2..6f61201 100644 --- a/src/core/agc_decompressor_lib.cpp +++ b/src/common/agc_decompressor_lib.cpp @@ -1,404 +1,647 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-18 -// ******************************************************************************************* - -#include "../core/agc_decompressor_lib.h" - -// ******************************************************************************************* -CAGCDecompressorLibrary::CAGCDecompressorLibrary(bool _is_app_mode) : CAGCBasic() -{ - is_app_mode = _is_app_mode; -} - -// ******************************************************************************************* -CAGCDecompressorLibrary::~CAGCDecompressorLibrary() -{ - if (working_mode == working_mode_t::decompression) - close_decompression(); -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::ListSamples(vector& v_sample_names) -{ - return collection_desc->get_samples_list(v_sample_names); -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::ListContigs(const string& sample_name, vector& v_contig_names) -{ - return collection_desc->get_contig_list_in_sample(sample_name, v_contig_names); -} - -// ******************************************************************************************* -void CAGCDecompressorLibrary::GetFileTypeInfo(map& _m_file_type_info) -{ - _m_file_type_info = m_file_type_info; -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::IsOpened() -{ - return working_mode != working_mode_t::none; -} - -// ******************************************************************************************* -int32_t CAGCDecompressorLibrary::GetNoSamples() -{ - return static_cast(collection_desc->get_no_samples()); -} - -// ******************************************************************************************* 
-int32_t CAGCDecompressorLibrary::GetNoContigs(const string& sample_name) -{ - return collection_desc->get_no_contigs(sample_name); -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::analyze_contig_query(const string& query, string& sample, name_range_t& name_range) -{ - smatch sm; - - name_range.from = -1; - name_range.to = -1; - - sample.clear(); - - if (regex_match(query, sm, re_csr)) - { - name_range.name = sm[1].str(); - sample = sm[2].str(); - name_range.from = atoll(sm[3].str().c_str()); - name_range.to = atoll(sm[4].str().c_str()); - } - else if (regex_match(query, sm, re_cs)) - { - name_range.name = sm[1].str(); - sample = sm[2].str(); - } - else if (regex_match(query, sm, re_cr)) - { - name_range.name = sm[1].str(); - name_range.from = atoll(sm[2].str().c_str()); - name_range.to = atoll(sm[3].str().c_str()); - } - else if (regex_match(query, sm, re_c)) - { - name_range.name = sm[1].str(); - } - else - { - return false; - } - - return true; -} - -// ******************************************************************************************* -int CAGCDecompressorLibrary::GetContigString(const string& sample_name, const string& contig_name, const int start, const int end, string& contig_data) -{ - if (working_mode != working_mode_t::decompression) - return -1; - - uint32_t id = 0; - vector contig_desc; - string det_sample_name = sample_name; - - if (sample_name.empty()) - { - auto v_cand_samples = collection_desc->get_samples_for_contig(contig_name); - if (v_cand_samples.size() == 0) - return -1; - if (v_cand_samples.size() > 1) - return -2; - - det_sample_name = v_cand_samples.front(); - } - - string full_contig_name = contig_name; - - if (!collection_desc->get_contig_desc(det_sample_name, full_contig_name, contig_desc)) - return -1; - - contig_task_t task{ id++, "", name_range_t(full_contig_name, start, end), contig_desc }; - contig_t ctg; - - decompress_contig(task, nullptr, 
ctg); - - contig_data.clear(); - contig_data.reserve(ctg.size()); - for (auto& c : ctg) - contig_data.push_back(cnv_num[static_cast(c)]); - - return 0; -} - -// ******************************************************************************************* -int64_t CAGCDecompressorLibrary::GetContigLength(const string& sample_name, const string& contig_name) -{ - vector contig_desc; - string det_sample_name = sample_name; - - if (sample_name.empty()) - { - auto v_cand_samples = collection_desc->get_samples_for_contig(contig_name); - if (v_cand_samples.size() == 0) - return -1; - if (v_cand_samples.size() > 1) - return -2; - - det_sample_name = v_cand_samples.front(); - } - - string full_contig_name = contig_name; - - if (!collection_desc->get_contig_desc(det_sample_name, full_contig_name, contig_desc)) - return -1; - - int64_t len = 0; - for (auto& x : contig_desc) - len += x.raw_length; - - return len - (contig_desc.size() - 1) * kmer_length; -} - -// ******************************************************************************************* -void CAGCDecompressorLibrary::convert_to_alpha(contig_t& ctg) -{ - size_t size = ctg.size(); - size_t i = size % 8; - - switch (i) - { - case 7: ctg[6] = cnv_num[ctg[6]]; - case 6: ctg[5] = cnv_num[ctg[5]]; - case 5: ctg[4] = cnv_num[ctg[4]]; - case 4: ctg[3] = cnv_num[ctg[3]]; - case 3: ctg[2] = cnv_num[ctg[2]]; - case 2: ctg[1] = cnv_num[ctg[1]]; - case 1: ctg[0] = cnv_num[ctg[0]]; - } - - for (; i < size; i += 8) - { - ctg[i] = cnv_num[ctg[i]]; - ctg[i+1] = cnv_num[ctg[i+1]]; - ctg[i+2] = cnv_num[ctg[i+2]]; - ctg[i+3] = cnv_num[ctg[i+3]]; - ctg[i+4] = cnv_num[ctg[i+4]]; - ctg[i+5] = cnv_num[ctg[i+5]]; - ctg[i+6] = cnv_num[ctg[i+6]]; - ctg[i+7] = cnv_num[ctg[i+7]]; - } -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::decompress_contig(contig_task_t& contig_desc, ZSTD_DCtx* zstd_ctx, contig_t& ctg) -{ - name_range_t &contig_name_range = 
contig_desc.name_range; - vector v_segments_loc; - - bool need_free_zstd = false; - - if (!zstd_ctx) - { - zstd_ctx = ZSTD_createDCtx(); - need_free_zstd = true; - } - - int64_t from = contig_name_range.from; - int64_t to = contig_name_range.to; - int64_t curr_pos = 0; - - if (from < 0 && to < 0) - { - from = 0; - to = 0x7fffffffffffffffu; - } - else - { - if (from < 0) - { - if (is_app_mode) - cerr << "Warning: Start of range (" + to_string(from) + ") is below 0, so changed to 0\n"; - from = 0; - contig_name_range.from = 0; - } - if (to < 0) - { - if (is_app_mode) - cerr << "Warning: End of range (" + to_string(to) + ") is below 0, so changed to max value\n"; - to = 0x7fffffffffffffffu; - contig_name_range.to = 0x7fffffffffffffffu; - } - if (from > to) - { - if (is_app_mode) - cerr << "Warning: End of range (" + to_string(to) + ") is prior to start of range (" + to_string(from) + ") so changed to whole contig\n"; - - from = 0; - to = 0x7fffffffffffffffu; - contig_name_range.from = -1; - contig_name_range.to = -1; - } - } - - for (auto seg : contig_desc.segments) - { - int32_t seg_len = seg.raw_length; - - if (curr_pos + seg_len < from) - { - from -= seg_len - kmer_length; - to -= seg_len - kmer_length; - continue; - } - else if (curr_pos > to) - break; - -// cout << seg.group_id << " " << seg.in_group_id << endl; - - decompress_segment(seg.group_id, seg.in_group_id, ctg, zstd_ctx); - if (seg.is_rev_comp) - reverse_complement(ctg); - - v_segments_loc.emplace_back(ctg); - - curr_pos += seg_len - kmer_length; - } - - if (!v_segments_loc.empty()) - { - ctg = v_segments_loc.front(); - - for (uint32_t j = 1; j < v_segments_loc.size(); ++j) - if (v_segments_loc[j].size() < compression_params.kmer_length) - { - if (is_app_mode) - cerr << "Corrupted archive!" 
<< endl; - } - else - { - ctg.insert(ctg.end(), v_segments_loc[j].begin() + compression_params.kmer_length, v_segments_loc[j].end()); - v_segments_loc[j].clear(); - v_segments_loc[j].shrink_to_fit(); - } - - if (ctg.size() > (uint64_t)to + 1) - ctg.resize((uint64_t)to + 1); - - if (from != 0) - ctg.erase(ctg.begin(), ctg.begin() + from); - } - else - ctg.clear(); - - if (need_free_zstd) - ZSTD_freeDCtx(zstd_ctx); - - return true; -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::Open(const string& _archive_fn, const bool _prefetch_archive) -{ - if (working_mode != working_mode_t::none) - return false; - - in_archive_name = _archive_fn; - prefetch_archive = _prefetch_archive; - - working_mode = working_mode_t::decompression; - - if (!load_file_type_info(in_archive_name)) - return false; - - if (archive_version < 3000) - { - if (!load_metadata()) - return false; - } - else if (archive_version < 4000) - { - if (!load_metadata() || - !dynamic_pointer_cast(collection_desc)->set_archives(in_archive, nullptr, 1, pack_cardinality, segment_size, kmer_length)) - return false; - } - - return true; -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::close_decompression() -{ - if (working_mode != working_mode_t::decompression) - return false; - - return true; -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::decompress_segment(const uint32_t group_id, const uint32_t in_group_id, contig_t& ctg, ZSTD_DCtx* zstd_ctx) -{ - CSegment segment(ss_base(archive_version, group_id), in_archive, nullptr, compression_params.pack_cardinality, compression_params.min_match_len, false, archive_version); - - if (group_id < no_raw_groups) - return segment.get_raw(in_group_id, ctg, zstd_ctx); - else - return segment.get(in_group_id, ctg, zstd_ctx); -} - -// 
******************************************************************************************* -void CAGCDecompressorLibrary::GetCmdLines(vector>& _cmd_lines) -{ - _cmd_lines.clear(); - - if (working_mode != working_mode_t::decompression) - return; - - collection_desc->get_cmd_lines(_cmd_lines); -} - -// ******************************************************************************************* -void CAGCDecompressorLibrary::GetParams(uint32_t& _kmer_length, uint32_t& _min_match_len, uint32_t& _pack_cardinality) -{ - if (working_mode != working_mode_t::decompression) - return; - - _kmer_length = kmer_length; - _min_match_len = min_match_len; - _pack_cardinality = pack_cardinality; -} - -// ******************************************************************************************* -void CAGCDecompressorLibrary::GetReferenceSample(string& ref_name) -{ - ref_name.clear(); - - if (working_mode != working_mode_t::decompression) - return; - - collection_desc->get_reference_name(ref_name); -} - -// ******************************************************************************************* -bool CAGCDecompressorLibrary::Close() -{ - bool r = true; - - if (working_mode == working_mode_t::none) - r = false; - else if (working_mode == working_mode_t::decompression) - r = close_decompression(); - - working_mode = working_mode_t::none; - - return r; -} - -// EOF +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "agc_decompressor_lib.h" +#include + +// ******************************************************************************************* +CAGCDecompressorLibrary::CAGCDecompressorLibrary(bool _is_app_mode) : CAGCBasic() +{ + is_app_mode = _is_app_mode; +} + +// ******************************************************************************************* +CAGCDecompressorLibrary::~CAGCDecompressorLibrary() +{ + if (working_mode == working_mode_t::decompression) + close_decompression(); +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::ListSamples(vector& v_sample_names) +{ + return collection_desc->get_samples_list(v_sample_names); +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::ListContigs(const string& sample_name, vector& v_contig_names) +{ + return collection_desc->get_contig_list_in_sample(sample_name, v_contig_names); +} + +// ******************************************************************************************* +void CAGCDecompressorLibrary::GetFileTypeInfo(map& _m_file_type_info) +{ + _m_file_type_info = m_file_type_info; +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::IsOpened() +{ + return working_mode != working_mode_t::none; +} + +// ******************************************************************************************* +int32_t CAGCDecompressorLibrary::GetNoSamples() +{ + return static_cast(collection_desc->get_no_samples()); +} + +// ******************************************************************************************* 
+int32_t CAGCDecompressorLibrary::GetNoContigs(const string& sample_name) +{ + return collection_desc->get_no_contigs(sample_name); +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::analyze_contig_query(const string& query, string& sample, name_range_t& name_range) +{ + smatch sm; + + name_range.from = -1; + name_range.to = -1; + + sample.clear(); + + if (regex_match(query, sm, re_csr)) + { + name_range.name = sm[1].str(); + sample = sm[2].str(); + name_range.from = atoll(sm[3].str().c_str()); + name_range.to = atoll(sm[4].str().c_str()); + } + else if (regex_match(query, sm, re_cs)) + { + name_range.name = sm[1].str(); + sample = sm[2].str(); + } + else if (regex_match(query, sm, re_cr)) + { + name_range.name = sm[1].str(); + name_range.from = atoll(sm[2].str().c_str()); + name_range.to = atoll(sm[3].str().c_str()); + } + else if (regex_match(query, sm, re_c)) + { + name_range.name = sm[1].str(); + } + else + { + return false; + } + + return true; +} + +// ******************************************************************************************* +int CAGCDecompressorLibrary::GetContigString(const string& sample_name, const string& contig_name, const int start, const int end, string& contig_data) +{ + if (working_mode != working_mode_t::decompression) + return -1; + + uint32_t id = 0; + vector contig_desc; + string det_sample_name = sample_name; + + if (sample_name.empty()) + { + auto v_cand_samples = collection_desc->get_samples_for_contig(contig_name); + if (v_cand_samples.size() == 0) + return -1; + if (v_cand_samples.size() > 1) + return -2; + + det_sample_name = v_cand_samples.front(); + } + + string full_contig_name = contig_name; + + if (!collection_desc->get_contig_desc(det_sample_name, full_contig_name, contig_desc)) + return -1; + + contig_task_t task{ id++, "", name_range_t(full_contig_name, start, end), contig_desc }; + contig_t ctg; + + decompress_contig(task, nullptr, 
ctg); + + contig_data.clear(); + contig_data.reserve(ctg.size()); + for (auto& c : ctg) + contig_data.push_back(cnv_num[static_cast(c)]); + + return 0; +} + +// ******************************************************************************************* +int64_t CAGCDecompressorLibrary::GetContigLength(const string& sample_name, const string& contig_name) +{ + vector contig_desc; + string det_sample_name = sample_name; + + if (sample_name.empty()) + { + auto v_cand_samples = collection_desc->get_samples_for_contig(contig_name); + if (v_cand_samples.size() == 0) + return -1; + if (v_cand_samples.size() > 1) + return -2; + + det_sample_name = v_cand_samples.front(); + } + + string full_contig_name = contig_name; + + if (!collection_desc->get_contig_desc(det_sample_name, full_contig_name, contig_desc)) + return -1; + + int64_t len = 0; + for (auto& x : contig_desc) + len += x.raw_length; + + return len - (contig_desc.size() - 1) * kmer_length; +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::decompress_contig(contig_task_t& contig_desc, ZSTD_DCtx* zstd_ctx, contig_t& ctg, bool fast) +{ + name_range_t &contig_name_range = contig_desc.name_range; + vector v_segments_loc; + + bool need_free_zstd = false; + + if (!zstd_ctx) + { + zstd_ctx = ZSTD_createDCtx(); + need_free_zstd = true; + } + + int64_t from = contig_name_range.from; + int64_t to = contig_name_range.to; + int64_t curr_pos = 0; + + if (from < 0 && to < 0) + { + from = 0; + to = 0x7fffffffffffffffu; + } + else + { + if (from < 0) + { + if (is_app_mode) + cerr << "Warning: Start of range (" + to_string(from) + ") is below 0, so changed to 0\n"; + from = 0; + contig_name_range.from = 0; + } + if (to < 0) + { + if (is_app_mode) + cerr << "Warning: End of range (" + to_string(to) + ") is below 0, so changed to max value\n"; + to = 0x7fffffffffffffffu; + contig_name_range.to = 0x7fffffffffffffffu; + } + if (from > to) + { + if 
(is_app_mode) + cerr << "Warning: End of range (" + to_string(to) + ") is prior to start of range (" + to_string(from) + ") so changed to whole contig\n"; + + from = 0; + to = 0x7fffffffffffffffu; + contig_name_range.from = -1; + contig_name_range.to = -1; + } + } + + v_segments_loc.reserve(contig_desc.segments.size()); + + for (auto seg : contig_desc.segments) + { + int32_t seg_len = seg.raw_length; + + if (curr_pos + seg_len < from) + { + from -= seg_len - kmer_length; + to -= seg_len - kmer_length; + continue; + } + else if (curr_pos > to) + break; + + if(!fast) + decompress_segment(seg.group_id, seg.in_group_id, ctg, zstd_ctx); + else + decompress_segment_fast(seg.group_id, seg.in_group_id, ctg, zstd_ctx); + + if (seg.is_rev_comp) + reverse_complement(ctg); + + v_segments_loc.emplace_back(move(ctg)); + + curr_pos += seg_len - kmer_length; + } + + if (!v_segments_loc.empty()) + { + size_t req_size = 0; + for (uint32_t j = 0; j < v_segments_loc.size(); ++j) + req_size += v_segments_loc[j].size(); + + ctg.clear(); + ctg.reserve(req_size); + ctg.insert(ctg.end(), v_segments_loc.front().begin(), v_segments_loc.front().end()); + + for (uint32_t j = 1; j < v_segments_loc.size(); ++j) + if (v_segments_loc[j].size() < compression_params.kmer_length) + { + if (is_app_mode) + cerr << "Corrupted archive!" 
<< endl; + } + else + { + ctg.insert(ctg.end(), v_segments_loc[j].begin() + compression_params.kmer_length, v_segments_loc[j].end()); + v_segments_loc[j].clear(); + v_segments_loc[j].shrink_to_fit(); + } + + if (ctg.size() > (uint64_t)to + 1) + ctg.resize((uint64_t)to + 1); + + if (from != 0) + ctg.erase(ctg.begin(), ctg.begin() + from); + } + else + ctg.clear(); + + if (need_free_zstd) + ZSTD_freeDCtx(zstd_ctx); + + return true; +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::decompress_contig_streaming(contig_task_t& contig_desc, ZSTD_DCtx* zstd_ctx, CStreamWrapper& stream_wrapper, bool fast) +{ + name_range_t &contig_name_range = contig_desc.name_range; +// vector v_segments_loc; + + bool need_free_zstd = false; + + if (!zstd_ctx) + { + zstd_ctx = ZSTD_createDCtx(); + need_free_zstd = true; + } + + int64_t from = contig_name_range.from; + int64_t to = contig_name_range.to; + int64_t curr_pos = 0; + + if (from < 0 && to < 0) + { + from = 0; + to = 0x7fffffffffffffffu; + } + else + { + if (from < 0) + { + if (is_app_mode) + cerr << "Warning: Start of range (" + to_string(from) + ") is below 0, so changed to 0\n"; + from = 0; + contig_name_range.from = 0; + } + if (to < 0) + { + if (is_app_mode) + cerr << "Warning: End of range (" + to_string(to) + ") is below 0, so changed to max value\n"; + to = 0x7fffffffffffffffu; + contig_name_range.to = 0x7fffffffffffffffu; + } + if (from > to) + { + if (is_app_mode) + cerr << "Warning: End of range (" + to_string(to) + ") is prior to start of range (" + to_string(from) + ") so changed to whole contig\n"; + + from = 0; + to = 0x7fffffffffffffffu; + contig_name_range.from = -1; + contig_name_range.to = -1; + } + } + +// v_segments_loc.reserve(contig_desc.segments.size()); + contig_t ctg; + bool first_processed = false; + + contig_t local_ctg; + + for (auto seg : contig_desc.segments) + { + int32_t seg_len = seg.raw_length; + + if (curr_pos + 
seg_len < from) + { + from -= seg_len - kmer_length; + to -= seg_len - kmer_length; + continue; + } + else if (curr_pos > to) + break; + + if(!fast) + decompress_segment(seg.group_id, seg.in_group_id, ctg, zstd_ctx); + else + decompress_segment_fast(seg.group_id, seg.in_group_id, ctg, zstd_ctx); + + if (seg.is_rev_comp) + reverse_complement(ctg); + + if (first_processed) + { + if (ctg.size() < compression_params.kmer_length) + { + if (is_app_mode) + cerr << "Corrupted archive!" << endl; + } + else + local_ctg.assign(ctg.begin() + compression_params.kmer_length, ctg.end()); +// stream_wrapper.append(ctg.begin() + compression_params.kmer_length, ctg.end()); + } + else +// stream_wrapper.append(ctg.begin() + from, ctg.end()); + local_ctg.assign(ctg.begin(), ctg.end()); + + if (local_ctg.size() > (uint64_t)to + 1) + local_ctg.resize((uint64_t)to + 1); + + stream_wrapper.append(local_ctg.begin() + from, local_ctg.end()); + + curr_pos += seg_len - kmer_length; + first_processed = true; + } + + if (need_free_zstd) + ZSTD_freeDCtx(zstd_ctx); + + stream_wrapper.complete_contig(); + + return true; +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::Open(const string& _archive_fn, const bool _prefetch_archive) +{ + if (working_mode != working_mode_t::none) + return false; + + in_archive_name = _archive_fn; + prefetch_archive = _prefetch_archive; + + working_mode = working_mode_t::decompression; + + if (!load_file_type_info(in_archive_name)) + return false; + + if (archive_version < 3000) + { + if (!load_metadata()) + return false; + } + else if (archive_version < 4000) + { + if (!load_metadata() || + !dynamic_pointer_cast(collection_desc)->set_archives(in_archive, nullptr, 1, pack_cardinality, segment_size, kmer_length)) + return false; + } + + return true; +} + +// ******************************************************************************************* +bool 
CAGCDecompressorLibrary::close_decompression() +{ + if (working_mode != working_mode_t::decompression) + return false; + + return true; +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::decompress_segment(const uint32_t group_id, const uint32_t in_group_id, contig_t& ctg, ZSTD_DCtx* zstd_ctx) +{ + CSegment segment(ss_base(archive_version, group_id), in_archive, nullptr, compression_params.pack_cardinality, compression_params.min_match_len, false, archive_version); + + if (group_id < no_raw_groups) + return segment.get_raw(in_group_id, ctg, zstd_ctx); + else + return segment.get(in_group_id, ctg, zstd_ctx); +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::decompress_segment_fast(const uint32_t group_id, const uint32_t in_group_id, contig_t& ctg, ZSTD_DCtx* zstd_ctx) +{ + shared_ptr segment; + + { + shared_lock lck(mtx_segment); + + auto p = v_segment.find(group_id); + + if (p != v_segment.end()) + segment = p->second; + } + + if (segment == nullptr) + { + unique_lock lck(mtx_segment); + + auto p = v_segment.find(group_id); // can happen that other thread will add this segment just before this lock + + if (p != v_segment.end()) + segment = p->second; + else + { + segment = make_shared(ss_base(archive_version, group_id), in_archive, nullptr, compression_params.pack_cardinality, compression_params.min_match_len, false, archive_version, true); + v_segment[group_id] = segment; + } + } + + if (group_id < no_raw_groups) + return segment->get_raw_locked(in_group_id, ctg, zstd_ctx); + else + return segment->get_locked(in_group_id, ctg, zstd_ctx); +} + +// ******************************************************************************************* +void CAGCDecompressorLibrary::GetCmdLines(vector>& _cmd_lines) +{ + _cmd_lines.clear(); + + if (working_mode != working_mode_t::decompression) + return; + + 
collection_desc->get_cmd_lines(_cmd_lines); +} + +// ******************************************************************************************* +void CAGCDecompressorLibrary::GetParams(uint32_t& _kmer_length, uint32_t& _min_match_len, uint32_t& _pack_cardinality, uint32_t &_segment_size) +{ + if (working_mode != working_mode_t::decompression) + return; + + _kmer_length = kmer_length; + _min_match_len = min_match_len; + _pack_cardinality = pack_cardinality; + _segment_size = segment_size; +} + +// ******************************************************************************************* +void CAGCDecompressorLibrary::GetReferenceSample(string& ref_name) +{ + ref_name.clear(); + + if (working_mode != working_mode_t::decompression) + return; + + collection_desc->get_reference_name(ref_name); +} + +// ******************************************************************************************* +bool CAGCDecompressorLibrary::Close() +{ + bool r = true; + + if (working_mode == working_mode_t::none) + r = false; + else if (working_mode == working_mode_t::decompression) + r = close_decompression(); + + working_mode = working_mode_t::none; + + return r; +} + +// ******************************************************************************************* +void CAGCDecompressorLibrary::CNumAlphaConverter::convert_to_alpha(contig_t& ctg) +{ + size_t size = ctg.size(); + size_t i = size % 8; + + switch (i) + { + case 7: ctg[6] = cnv_num[ctg[6]]; + case 6: ctg[5] = cnv_num[ctg[5]]; + case 5: ctg[4] = cnv_num[ctg[4]]; + case 4: ctg[3] = cnv_num[ctg[3]]; + case 3: ctg[2] = cnv_num[ctg[2]]; + case 2: ctg[1] = cnv_num[ctg[1]]; + case 1: ctg[0] = cnv_num[ctg[0]]; + } + + for (; i < size; i += 8) + { + ctg[i] = cnv_num[ctg[i]]; + ctg[i + 1] = cnv_num[ctg[i + 1]]; + ctg[i + 2] = cnv_num[ctg[i + 2]]; + ctg[i + 3] = cnv_num[ctg[i + 3]]; + ctg[i + 4] = cnv_num[ctg[i + 4]]; + ctg[i + 5] = cnv_num[ctg[i + 5]]; + ctg[i + 6] = cnv_num[ctg[i + 6]]; + ctg[i + 7] = cnv_num[ctg[i + 7]]; + } +} + 
+// ******************************************************************************************* +size_t CAGCDecompressorLibrary::CNumAlphaConverter::convert_and_split_into_lines(contig_t& ctg, contig_t& working_space, uint32_t line_len, uint32_t no_symbols_in_non_complete_line, bool append_eol) +{ + if (ctg.empty()) + return 0; + + size_t dest_size = ctg.size() + (ctg.size() + line_len - 1) / line_len + 2; + working_space.resize(dest_size); + + auto p = ctg.data(); + auto q = working_space.data(); + + size_t to_save = ctg.size(); + + if (no_symbols_in_non_complete_line) + { + while (to_save && no_symbols_in_non_complete_line++ < line_len) + { + *q++ = cnv_num[*p++]; + to_save--; + } + *q++ = '\n'; + } + + if (!to_save) + { + working_space.resize(q - working_space.data()); + + if (!append_eol) + working_space.pop_back(); + + std::swap(ctg, working_space); + + return no_symbols_in_non_complete_line; + } + + for (; to_save > line_len; to_save -= line_len) + { + uint32_t i; + + switch (i = line_len % 8) + { + case 7: *q++ = cnv_num[*p++]; [[fallthrough]]; + case 6: *q++ = cnv_num[*p++]; [[fallthrough]]; + case 5: *q++ = cnv_num[*p++]; [[fallthrough]]; + case 4: *q++ = cnv_num[*p++]; [[fallthrough]]; + case 3: *q++ = cnv_num[*p++]; [[fallthrough]]; + case 2: *q++ = cnv_num[*p++]; [[fallthrough]]; + case 1: *q++ = cnv_num[*p++]; + } + + for (; i < line_len; i += 8) + { + *q++ = cnv_num[*p++]; + *q++ = cnv_num[*p++]; + *q++ = cnv_num[*p++]; + *q++ = cnv_num[*p++]; + *q++ = cnv_num[*p++]; + *q++ = cnv_num[*p++]; + *q++ = cnv_num[*p++]; + *q++ = cnv_num[*p++]; + } + + *q++ = '\n'; + } + + size_t r = to_save; + + if (to_save) + { + while (to_save--) + *q++ = cnv_num[*p++]; + *q++ = '\n'; + } + + assert(q <= working_space.data() + working_space.size()); + working_space.resize(q - working_space.data()); + + if (!append_eol) + working_space.pop_back(); + + std::swap(ctg, working_space); + + return r; // No of symbols in last line +} + +// EOF diff --git 
a/src/core/agc_decompressor_lib.h b/src/common/agc_decompressor_lib.h similarity index 51% rename from src/core/agc_decompressor_lib.h rename to src/common/agc_decompressor_lib.h index 26da29e..abab1d3 100644 --- a/src/core/agc_decompressor_lib.h +++ b/src/common/agc_decompressor_lib.h @@ -1,125 +1,205 @@ -#ifndef _AGC_DECOMPRESSOR_LIB_H -#define _AGC_DECOMPRESSOR_LIB_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include "../core/agc_basic.h" - -// ******************************************************************************************* -// Class supporting only decompression of AGC files - library version -class CAGCDecompressorLibrary : public CAGCBasic -{ -protected: - const uint8_t cnv_num[128] = { - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - 'A', 'C', 'G', 'T', 'N', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'U', - ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', - ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', - ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', - ' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30, - 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30, - ' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30, - 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30 - }; - - // ******************************************************************************************* - struct name_range_t - { - string name; - int64_t from; - int64_t to; - - name_range_t(const string _name = "", const int64_t _from = -1, const int64_t 
_to = -1) : - name(_name), from(_from), to(_to) - {} - - string str() const - { - if (from >= 0 && to >= 0) - return name + ":" + to_string(from) + "-" + to_string(to); - else - return name; - } - }; - - const regex re_csr = regex("(.+)@(.+):(.+)-(.+)"); - const regex re_cs = regex("(.+)@(.+)"); - const regex re_cr = regex("(.+):(.+)-(.+)"); - const regex re_c = regex("(.+)"); - - struct sample_contig_data_t { - string sample_name; - string contig_name; - contig_t contig_data; - - sample_contig_data_t() = default; - sample_contig_data_t(string _sample_name, string _contig_name, contig_t _contig_data) : - sample_name(_sample_name), contig_name(_contig_name), contig_data(_contig_data) {} - - sample_contig_data_t(const sample_contig_data_t&) = default; - sample_contig_data_t(sample_contig_data_t&&) = default; - sample_contig_data_t& operator=(const sample_contig_data_t&) = default; - }; - - struct contig_task_t { - size_t priority; - string sample_name; - name_range_t name_range; - vector segments; - - contig_task_t() = default; - contig_task_t(const size_t _priority, const string _sample_name, const name_range_t _name_range, const vector& _segments) : - priority(_priority), sample_name(_sample_name), name_range(_name_range), segments(_segments) {}; - contig_task_t(const contig_task_t&) = default; - contig_task_t(contig_task_t&&) = default; - contig_task_t& operator=(const contig_task_t&) = default; - }; - - unique_ptr> q_contig_tasks; - unique_ptr> pq_contigs_to_save; - - void convert_to_alpha(contig_t& ctg); - - bool analyze_contig_query(const string& query, string& sample, name_range_t& name_range); - bool decompress_segment(const uint32_t group_id, const uint32_t in_group_id, contig_t& ctg, ZSTD_DCtx* zstd_ctx); - - bool decompress_contig(contig_task_t& task, ZSTD_DCtx *zstd_ctx, contig_t& ctg); - - bool close_decompression(); - -public: - CAGCDecompressorLibrary(bool _is_app_mode); - ~CAGCDecompressorLibrary(); - - bool Open(const string& _archive_fn, const bool 
_prefetch_archive = false); - - void GetCmdLines(vector>& _cmd_lines); - void GetParams(uint32_t& kmer_length, uint32_t& min_match_len, uint32_t& pack_cardinality); - void GetReferenceSample(string& ref_name); - - bool Close(); - - int GetContigString(const string& sample_name, const string& contig_name, const int start, const int end, string& contig_data); - int64_t GetContigLength(const string& sample_name, const string& contig_name); - - bool ListSamples(vector& v_sample_names); - bool ListContigs(const string& sample_name, vector& v_contig_names); - int32_t GetNoSamples(); - int32_t GetNoContigs(const string& sample_name); - - void GetFileTypeInfo(map& _m_file_type_info); - - bool IsOpened(); -}; - -// EOF +#ifndef _AGC_DECOMPRESSOR_LIB_H +#define _AGC_DECOMPRESSOR_LIB_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include "../common/agc_basic.h" + +// ******************************************************************************************* +// Class supporting only decompression of AGC files - library version +class CAGCDecompressorLibrary : public CAGCBasic +{ +protected: + class CNumAlphaConverter + { + static const inline uint8_t cnv_num[128] = { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + 'A', 'C', 'G', 'T', 'N', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'U', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 
9, 30, 10, 4, 30, + 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30, + ' ', 0, 11, 1, 12, 30, 30, 2, 13, 30, 30, 9, 30, 10, 4, 30, + 30, 30, 5, 7, 3, 15, 14, 8, 30, 6, 30, 30, 30, 30, 30, 30 + }; + + public: + CNumAlphaConverter() = default; + + static void convert_to_alpha(contig_t& ctg); + static size_t convert_and_split_into_lines(contig_t& ctg, contig_t& working_space, uint32_t line_len, uint32_t no_symbols_in_non_complete_line = 0, bool append_eol = true); + }; + + + // ******************************************************************************************* + struct name_range_t + { + string name; + int64_t from; + int64_t to; + + name_range_t(const string _name = "", const int64_t _from = -1, const int64_t _to = -1) : + name(_name), from(_from), to(_to) + {} + + name_range_t(const name_range_t&) = default; + name_range_t(name_range_t&&) noexcept = default; + + name_range_t& operator=(const name_range_t&) = default; + name_range_t& operator=(name_range_t&&) noexcept = default; + + string str() const + { + if (from >= 0 && to >= 0) + return name + ":" + to_string(from) + "-" + to_string(to); + else + return name; + } + }; + + class CStreamWrapper + { +// size_t buffer_size = 1 << 20; + size_t buffer_size = 1024; + FILE* stream = nullptr; + contig_t buffer; + contig_t working_space; + uint32_t line_length; + size_t no_symbols_in_last_line = 0; + + CNumAlphaConverter num_alpha_converter; + + bool store_buffer() + { + if (line_length == 0) + num_alpha_converter.convert_to_alpha(buffer); + else + no_symbols_in_last_line = num_alpha_converter.convert_and_split_into_lines(buffer, working_space, line_length, no_symbols_in_last_line, false); + + return fwrite(buffer.data(), 1, buffer.size(), stream) == buffer.size(); + } + + public: + CStreamWrapper() = delete; + + CStreamWrapper(FILE* stream, uint32_t line_length, int gzip_level) : + stream(stream), + line_length(line_length)//, +// gzip_compressor(gzip_level) + {} + + bool start_contig(const string& 
name) + { + string to_save = ">" + name + "\n"; + + no_symbols_in_last_line = 0; + + return fwrite(to_save.c_str(), 1, to_save.size(), stream) == to_save.size(); + } + + bool complete_contig() + { + return putc('\n', stream) != EOF; + } + + template + bool append(Iter first, Iter last) + { + if (!stream) + return false; + + buffer.assign(first, last); + + return store_buffer(); + } + }; + + const regex re_csr = regex("(.+)@(.+):(.+)-(.+)"); + const regex re_cs = regex("(.+)@(.+)"); + const regex re_cr = regex("(.+):(.+)-(.+)"); + const regex re_c = regex("(.+)"); + + struct sample_contig_data_t { + string sample_name; + string contig_name; + contig_t contig_data; + + sample_contig_data_t() = default; + sample_contig_data_t(const string &_sample_name, const string &_contig_name, const contig_t &_contig_data) : + sample_name(_sample_name), contig_name(_contig_name), contig_data(_contig_data) {} + + sample_contig_data_t(const string &_sample_name, const string &_contig_name, contig_t &&_contig_data) : + sample_name(_sample_name), contig_name(_contig_name), contig_data(move(_contig_data)) {} + + sample_contig_data_t(const sample_contig_data_t&) = default; + sample_contig_data_t(sample_contig_data_t&&) = default; + sample_contig_data_t& operator=(const sample_contig_data_t&) = default; + sample_contig_data_t& operator=(sample_contig_data_t&&) = default; + }; + + struct contig_task_t { + size_t priority; + string sample_name; + name_range_t name_range; + vector segments; + + contig_task_t() = default; + contig_task_t(const size_t _priority, const string _sample_name, const name_range_t _name_range, const vector& _segments) : + priority(_priority), sample_name(_sample_name), name_range(_name_range), segments(_segments) {}; + contig_task_t(const contig_task_t&) = default; + contig_task_t(contig_task_t&&) = default; + contig_task_t& operator=(const contig_task_t&) = default; + }; + + unique_ptr> q_contig_tasks; + unique_ptr> pq_contigs_to_save; + + shared_mutex mtx_segment; 
+ map> v_segment; + + bool analyze_contig_query(const string& query, string& sample, name_range_t& name_range); + bool decompress_segment(const uint32_t group_id, const uint32_t in_group_id, contig_t& ctg, ZSTD_DCtx* zstd_ctx); + bool decompress_segment_fast(const uint32_t group_id, const uint32_t in_group_id, contig_t& ctg, ZSTD_DCtx* zstd_ctx); + + bool decompress_contig(contig_task_t& task, ZSTD_DCtx *zstd_ctx, contig_t& ctg, bool fast = false); + bool decompress_contig_streaming(contig_task_t& task, ZSTD_DCtx *zstd_ctx, CStreamWrapper& stream_wrapper, bool fast = false); + + bool close_decompression(); + +public: + CAGCDecompressorLibrary(bool _is_app_mode); + ~CAGCDecompressorLibrary(); + + bool Open(const string& _archive_fn, const bool _prefetch_archive = false); + + void GetCmdLines(vector>& _cmd_lines); + void GetParams(uint32_t& kmer_length, uint32_t& min_match_len, uint32_t& pack_cardinality, uint32_t& _segment_size); + void GetReferenceSample(string& ref_name); + + bool Close(); + + int GetContigString(const string& sample_name, const string& contig_name, const int start, const int end, string& contig_data); + int64_t GetContigLength(const string& sample_name, const string& contig_name); + + bool ListSamples(vector& v_sample_names); + bool ListContigs(const string& sample_name, vector& v_contig_names); + int32_t GetNoSamples(); + int32_t GetNoContigs(const string& sample_name); + + void GetFileTypeInfo(map& _m_file_type_info); + + bool IsOpened(); +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/archive.cpp b/src/common/archive.cpp similarity index 95% rename from src/core/archive.cpp rename to src/common/archive.cpp index 485d6f1..0ab78d3 100644 --- a/src/core/archive.cpp +++ b/src/common/archive.cpp @@ -1,540 +1,557 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "../core/archive.h" -#include "defs.h" - -#include -#include - -#ifndef _WIN32 -#define my_fseek fseek -#define my_ftell ftell -#else -#define my_fseek _fseeki64 -#define my_ftell _ftelli64 -#endif - -// ******************************************************************************************* -CArchive::CArchive(const bool _input_mode, const size_t _io_buffer_size, const string& _lazy_prefix) -{ - input_mode = _input_mode; - io_buffer_size = _io_buffer_size; - - if(input_mode) // Ignore lazy_prefix in output mode - lazy_prefix = _lazy_prefix; -} - -// ******************************************************************************************* -CArchive::~CArchive() -{ - Close(); -} - -// ******************************************************************************************* -bool CArchive::Open(const string &file_name) -{ - lock_guard lck(mtx); - - if (f_in.IsOpened()) - f_in.Close(); - if (f_out.IsOpened()) - f_out.Close(); - - if (input_mode) - f_in.Open(file_name, io_buffer_size); - else - f_out.Open(file_name); - - if (!f_in.IsOpened() && !f_out.IsOpened()) - return false; - - if (input_mode) - deserialize(); - - f_offset = 0; - - return true; -} - -// ******************************************************************************************* -bool CArchive::Close() -{ - lock_guard lck(mtx); - - if (!f_in.IsOpened() && !f_out.IsOpened()) - return false; - - if (input_mode) - f_in.Close(); - else - { - flush_out_buffers(); - serialize(); - f_out.Close(); - } - - return true; -} - -// ******************************************************************************************* -/*size_t CArchive::write_fixed(const uint64_t x) -{ - f_out.WriteUInt(x, 8); - - return 8; -} - -// 
******************************************************************************************* -size_t CArchive::write(const uint64_t _x) -{ - int no_bytes = 0; - uint64_t x = _x; - - for (size_t tmp = x; tmp; tmp >>= 8) - ++no_bytes; - - f_out.Put(no_bytes); - - for (int i = no_bytes; i; --i) - f_out.Put((x >> ((i - 1) * 8)) & 0xff); - - return no_bytes + 1; -}*/ - -// ******************************************************************************************* -size_t CArchive::write(const string &s) -{ - f_out.Write(s); - f_out.Put(0); - - return s.size() + 1; -} - -// ******************************************************************************************* -size_t CArchive::read(string& s) -{ - s.clear(); - - while (true) - { - int c = f_in.Get(); - if (c == EOF) - return 0; - - if (c == 0) - return s.size() + 1; - - s.push_back((char)c); - } - - return 0; -} - -// ******************************************************************************************* -bool CArchive::serialize() -{ - size_t footer_size = 0; - - // Store stram part offsets - footer_size += write(v_streams.size()); - - for (auto& stream : v_streams) - { - size_t p = footer_size; - - footer_size += write(stream.stream_name); - footer_size += write(stream.parts.size()); - footer_size += write(stream.raw_size); - - for (auto& part : stream.parts) - { - footer_size += write(part.offset); - footer_size += write(part.size); - } - - stream.packed_size += footer_size - p; - } - - write_fixed(footer_size); - - return true; -} - -// ******************************************************************************************* -bool CArchive::deserialize() -{ - size_t footer_size; - size_t file_size = f_in.FileSize(); - - f_in.Seek(file_size - 8ull); - read_fixed(footer_size); - - f_in.Seek(file_size -(size_t)(8 + footer_size)); - - // Read stream part offsets - size_t n_streams; - read(n_streams); - - v_streams.resize(n_streams, stream_t()); - - rm_streams.reserve(2 * n_streams); - - for (size_t i = 0; i < 
n_streams; ++i) - { - auto& stream_second = v_streams[i]; - - read(stream_second.stream_name); - read(stream_second.cur_id); - read(stream_second.raw_size); - - stream_second.parts.resize(stream_second.cur_id); - for (size_t j = 0; j < stream_second.cur_id; ++j) - { - read(stream_second.parts[j].offset); - read(stream_second.parts[j].size); - } - - stream_second.cur_id = 0; - - if(!is_lazy_str(stream_second.stream_name)) - rm_streams[stream_second.stream_name] = i; - } - - f_in.Seek(0); - - return true; -} - -// ******************************************************************************************* -int CArchive::RegisterStream(const string &stream_name) -{ - lock_guard lck(mtx); - - // Before adding new stream check if stream_name is already registered - auto p = rm_streams.find(stream_name); - if (p != rm_streams.end()) - return (int)p->second; - - int id = (int) v_streams.size(); - - v_streams.emplace_back(stream_t()); - - v_streams[id].cur_id = 0; - v_streams[id].stream_name = stream_name; - v_streams[id].raw_size = 0; - v_streams[id].packed_size = 0; - v_streams[id].packed_data_size = 0; - - rm_streams[stream_name] = id; - - return id; -} - -// ******************************************************************************************* -int CArchive::get_stream_id(const string& stream_name) -{ - if (is_lazy_str(stream_name)) - de_lazy(); - - auto p = rm_streams.find(stream_name); - if (p != rm_streams.end()) - return (int)p->second; - - return -1; -} - -// ******************************************************************************************* -int CArchive::GetStreamId(const string &stream_name) -{ - lock_guard lck(mtx); - - return get_stream_id(stream_name); -} - -// ******************************************************************************************* -bool CArchive::add_part(const int stream_id, const vector& v_data, const uint64_t metadata) -{ - v_streams[stream_id].parts.push_back(part_t(f_offset, v_data.size())); - - f_offset += 
write(metadata); - f_out.Write(v_data.data(), v_data.size()); - - f_offset += v_data.size(); - - v_streams[stream_id].packed_size += f_offset - v_streams[stream_id].parts.back().offset; - v_streams[stream_id].packed_data_size += v_data.size(); - - return true; -} - -// ******************************************************************************************* -bool CArchive::AddPart(const int stream_id, const vector &v_data, const uint64_t metadata) -{ - lock_guard lck(mtx); - - return add_part(stream_id, v_data, metadata); -} - -// ******************************************************************************************* -int CArchive::AddPartPrepare(const int stream_id) -{ - lock_guard lck(mtx); - - v_streams[stream_id].parts.push_back(part_t(0, 0)); - - return static_cast(v_streams[stream_id].parts.size()) - 1; -} - -// ******************************************************************************************* -bool CArchive::AddPartComplete(const int stream_id, const int part_id, const vector& v_data, const uint64_t metadata) -{ - lock_guard lck(mtx); - - v_streams[stream_id].parts[part_id] = part_t(f_offset, v_data.size()); - - f_offset += write(metadata); - f_out.Write(v_data.data(), v_data.size()); - - f_offset += v_data.size(); - - v_streams[stream_id].packed_size += f_offset - v_streams[stream_id].parts[part_id].offset; - v_streams[stream_id].packed_data_size += v_data.size(); - - return true; -} - -// ******************************************************************************************* -bool CArchive::AddPartBuffered(const int stream_id, const vector& v_data, const uint64_t metadata) -{ - lock_guard lck(mtx); - - m_buffer[stream_id].emplace_back(v_data, metadata); - - return true; -} - -// ******************************************************************************************* -bool CArchive::flush_out_buffers() -{ - for (auto& x : m_buffer) - for (auto& y : x.second) - add_part(x.first, y.first, y.second); - - m_buffer.clear(); - - return true; 
-} - -// ******************************************************************************************* -bool CArchive::FlushOutBuffers() -{ - lock_guard lck(mtx); - - return flush_out_buffers(); -} - -// ******************************************************************************************* -void CArchive::SetRawSize(const int stream_id, const size_t raw_size) -{ - lock_guard lck(mtx); - - v_streams[stream_id].raw_size = raw_size; -} - -// ******************************************************************************************* -size_t CArchive::GetRawSize(const int stream_id) -{ - lock_guard lck(mtx); - - return v_streams[stream_id].raw_size; -} - -// ******************************************************************************************* -bool CArchive::get_part(const int stream_id, vector& v_data, uint64_t& metadata) -{ - auto& p = v_streams[stream_id]; - - if (p.cur_id >= p.parts.size()) - return false; - - v_data.resize(p.parts[p.cur_id].size); - - f_in.Seek(p.parts[p.cur_id].offset); - - if (p.parts[p.cur_id].size != 0) - read(metadata); - else - { - metadata = 0; - p.cur_id++; - return true; - } - - f_in.Read(v_data.data(), p.parts[p.cur_id].size); - - p.cur_id++; - - return true; -} - -// ******************************************************************************************* -bool CArchive::GetPart(const int stream_id, vector &v_data, uint64_t &metadata) -{ - lock_guard lck(mtx); - - return get_part(stream_id, v_data, metadata); -} - -// ******************************************************************************************* -pair CArchive::GetPart(const string& stream_name, vector &v_data, uint64_t &metadata) -{ - lock_guard lck(mtx); - - int stream_id = get_stream_id(stream_name); - - if (stream_id < 0) - return make_pair(-1, false); - - return make_pair(stream_id, get_part(stream_id, v_data, metadata)); -} - -// ******************************************************************************************* -tuple CArchive::GetParts( - const 
string& stream_name1, vector& v_data1, uint64_t& metadata1, - const string& stream_name2, vector& v_data2, uint64_t& metadata2) -{ - lock_guard lck(mtx); - - bool res1 = false; - bool res2 = false; - - int stream_id1 = get_stream_id(stream_name1); - int stream_id2 = get_stream_id(stream_name2); - - if (stream_id1 >= 0) - res1 = get_part(stream_id1, v_data1, metadata1); - - if (stream_id2 >= 0) - res2 = get_part(stream_id2, v_data2, metadata2); - - return make_tuple(stream_id1, res1, stream_id2, res2); -} - -// ******************************************************************************************* -bool CArchive::get_part(const int stream_id, const int part_id, vector& v_data, uint64_t& metadata) -{ - auto& p = v_streams[stream_id]; - - if ((size_t)part_id >= p.parts.size()) - return false; - - v_data.resize(p.parts[part_id].size); - - f_in.Seek(p.parts[part_id].offset); - - if (p.parts[part_id].size != 0) - read(metadata); - else - { - metadata = 0; - return true; - } - - f_in.Read(v_data.data(), p.parts[part_id].size); - - return true; -} - -// ******************************************************************************************* -bool CArchive::GetPart(const int stream_id, const int part_id, vector &v_data, uint64_t &metadata) -{ - lock_guard lck(mtx); - - return get_part(stream_id, part_id, v_data, metadata); -} - -// ******************************************************************************************* -pair CArchive::GetPart(const string& stream_name, const int part_id, vector &v_data, uint64_t &metadata) -{ - lock_guard lck(mtx); - - int stream_id = get_stream_id(stream_name); - - if (stream_id < 0) - return make_pair(-1, false); - - return make_pair(stream_id, get_part(stream_id, part_id, v_data, metadata)); -} - -// ******************************************************************************************* -tuple CArchive::GetParts( - const string& stream_name1, const int part_id1, vector& v_data1, uint64_t& metadata1, - const string& 
stream_name2, const int part_id2, vector& v_data2, uint64_t& metadata2) -{ - lock_guard lck(mtx); - - bool res1 = false; - bool res2 = false; - - int stream_id1 = get_stream_id(stream_name1); - int stream_id2 = get_stream_id(stream_name2); - - if (stream_id1 >= 0) - res1 = get_part(stream_id1, part_id1, v_data1, metadata1); - - if (stream_id2 >= 0) - res2 = get_part(stream_id2, part_id2, v_data2, metadata2); - - return make_tuple(stream_id1, res1, stream_id2, res2); -} - -// ******************************************************************************************* -size_t CArchive::GetNoStreams() -{ - lock_guard lck(mtx); - - return v_streams.size(); -} - -// ******************************************************************************************* -size_t CArchive::GetNoParts(const int stream_id) -{ - lock_guard lck(mtx); - - if (stream_id < 0 || (size_t)stream_id >= v_streams.size()) - return 0; - - return v_streams[stream_id].parts.size(); -} - -// ******************************************************************************************* -size_t CArchive::GetStreamPackedSize(const int stream_id) -{ - lock_guard lck(mtx); - - if (stream_id < 0 || stream_id >= static_cast(v_streams.size())) - return 0; - - return v_streams[stream_id].packed_size; -} - -// ******************************************************************************************* -size_t CArchive::GetStreamPackedDataSize(const int stream_id) -{ - lock_guard lck(mtx); - - if (stream_id < 0 || stream_id >= static_cast(v_streams.size())) - return 0; - - return v_streams[stream_id].packed_data_size; -} - -// EOF +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "archive.h" +#include "defs.h" + +#include +#include + +#ifndef _WIN32 +#define my_fseek fseek +#define my_ftell ftell +#else +#define my_fseek _fseeki64 +#define my_ftell _ftelli64 +#endif + +// ******************************************************************************************* +CArchive::CArchive(const bool _input_mode, const size_t _io_buffer_size, const string& _lazy_prefix) +{ + input_mode = _input_mode; + io_buffer_size = _io_buffer_size; + + if(input_mode) // Ignore lazy_prefix in output mode + lazy_prefix = _lazy_prefix; +} + +// ******************************************************************************************* +CArchive::~CArchive() +{ + Close(); +} + +// ******************************************************************************************* +bool CArchive::Open(const string &file_name) +{ + lock_guard lck(mtx); + + if (f_in.IsOpened()) + f_in.Close(); + if (f_out.IsOpened()) + f_out.Close(); + + if (input_mode) + f_in.Open(file_name, io_buffer_size); + else + f_out.Open(file_name); + + if (!f_in.IsOpened() && !f_out.IsOpened()) + return false; + + if (input_mode) + deserialize(); + + f_offset = 0; + + return true; +} + +// ******************************************************************************************* +bool CArchive::Close() +{ + lock_guard lck(mtx); + + if (!f_in.IsOpened() && !f_out.IsOpened()) + return false; + + if (input_mode) + f_in.Close(); + else + { + flush_out_buffers(); + serialize(); + f_out.Close(); + } + + return true; +} + +// ******************************************************************************************* +/*size_t CArchive::write_fixed(const uint64_t x) +{ + f_out.WriteUInt(x, 8); + + return 8; +} + +// 
******************************************************************************************* +size_t CArchive::write(const uint64_t _x) +{ + int no_bytes = 0; + uint64_t x = _x; + + for (size_t tmp = x; tmp; tmp >>= 8) + ++no_bytes; + + f_out.Put(no_bytes); + + for (int i = no_bytes; i; --i) + f_out.Put((x >> ((i - 1) * 8)) & 0xff); + + return no_bytes + 1; +}*/ + +// ******************************************************************************************* +size_t CArchive::write(const string &s) +{ + f_out.Write(s); + f_out.Put(0); + + return s.size() + 1; +} + +// ******************************************************************************************* +size_t CArchive::read(string& s) +{ + s.clear(); + + while (true) + { + int c = f_in.Get(); + if (c == EOF) + return 0; + + if (c == 0) + return s.size() + 1; + + s.push_back((char)c); + } + + return 0; +} + +// ******************************************************************************************* +bool CArchive::serialize() +{ + size_t footer_size = 0; + + // Store stram part offsets + footer_size += write(v_streams.size()); + + for (auto& stream : v_streams) + { + size_t p = footer_size; + + footer_size += write(stream.stream_name); + footer_size += write(stream.parts.size()); + footer_size += write(stream.raw_size); + + for (auto& part : stream.parts) + { + footer_size += write(part.offset); + footer_size += write(part.size); + } + + stream.packed_size += footer_size - p; + } + + write_fixed(footer_size); + + return true; +} + +// ******************************************************************************************* +bool CArchive::deserialize() +{ + size_t footer_size; + size_t file_size = f_in.FileSize(); + + f_in.Seek(file_size - 8ull); + read_fixed(footer_size); + + f_in.Seek(file_size -(size_t)(8 + footer_size)); + + // Read stream part offsets + size_t n_streams; + read(n_streams); + + v_streams.resize(n_streams, stream_t()); + + rm_streams.reserve(2 * n_streams); + + for (size_t i = 0; i < 
n_streams; ++i) + { + auto& stream_second = v_streams[i]; + + read(stream_second.stream_name); + read(stream_second.cur_id); + read(stream_second.raw_size); + + stream_second.parts.resize(stream_second.cur_id); + for (size_t j = 0; j < stream_second.cur_id; ++j) + { + read(stream_second.parts[j].offset); + read(stream_second.parts[j].size); + } + + stream_second.cur_id = 0; + + if(!is_lazy_str(stream_second.stream_name)) + rm_streams[stream_second.stream_name] = i; + } + + f_in.Seek(0); + + return true; +} + +// ******************************************************************************************* +int CArchive::register_stream(const string& stream_name) +{ + // Before adding new stream check if stream_name is already registered + auto p = rm_streams.find(stream_name); + if (p != rm_streams.end()) + return (int)p->second; + + int id = (int)v_streams.size(); + + v_streams.emplace_back(stream_t()); + + v_streams[id].cur_id = 0; + v_streams[id].stream_name = stream_name; + v_streams[id].raw_size = 0; + v_streams[id].packed_size = 0; + v_streams[id].packed_data_size = 0; + + rm_streams[stream_name] = id; + + return id; +} + +// ******************************************************************************************* +int CArchive::RegisterStream(const string &stream_name) +{ + lock_guard lck(mtx); + + return register_stream(stream_name); +} + +// ******************************************************************************************* +pair CArchive::RegisterStreams(const string& stream_name1, const string& stream_name2) +{ + lock_guard lck(mtx); + + int id1 = register_stream(stream_name1); + int id2 = register_stream(stream_name2); + + return make_pair(id1, id2); +} + +// ******************************************************************************************* +int CArchive::get_stream_id(const string& stream_name) +{ + if (is_lazy_str(stream_name)) + de_lazy(); + + auto p = rm_streams.find(stream_name); + if (p != rm_streams.end()) + return (int)p->second; 
+ + return -1; +} + +// ******************************************************************************************* +int CArchive::GetStreamId(const string &stream_name) +{ + lock_guard lck(mtx); + + return get_stream_id(stream_name); +} + +// ******************************************************************************************* +bool CArchive::add_part(const int stream_id, const vector& v_data, const uint64_t metadata) +{ + v_streams[stream_id].parts.push_back(part_t(f_offset, v_data.size())); + + f_offset += write(metadata); + f_out.Write(v_data.data(), v_data.size()); + + f_offset += v_data.size(); + + v_streams[stream_id].packed_size += f_offset - v_streams[stream_id].parts.back().offset; + v_streams[stream_id].packed_data_size += v_data.size(); + + return true; +} + +// ******************************************************************************************* +bool CArchive::AddPart(const int stream_id, const vector &v_data, const uint64_t metadata) +{ + lock_guard lck(mtx); + + return add_part(stream_id, v_data, metadata); +} + +// ******************************************************************************************* +int CArchive::AddPartPrepare(const int stream_id) +{ + lock_guard lck(mtx); + + v_streams[stream_id].parts.push_back(part_t(0, 0)); + + return static_cast(v_streams[stream_id].parts.size()) - 1; +} + +// ******************************************************************************************* +bool CArchive::AddPartComplete(const int stream_id, const int part_id, const vector& v_data, const uint64_t metadata) +{ + lock_guard lck(mtx); + + v_streams[stream_id].parts[part_id] = part_t(f_offset, v_data.size()); + + f_offset += write(metadata); + f_out.Write(v_data.data(), v_data.size()); + + f_offset += v_data.size(); + + v_streams[stream_id].packed_size += f_offset - v_streams[stream_id].parts[part_id].offset; + v_streams[stream_id].packed_data_size += v_data.size(); + + return true; +} + +// 
******************************************************************************************* +bool CArchive::AddPartBuffered(const int stream_id, const vector& v_data, const uint64_t metadata) +{ + lock_guard lck(mtx); + + m_buffer[stream_id].emplace_back(v_data, metadata); + + return true; +} + +// ******************************************************************************************* +bool CArchive::flush_out_buffers() +{ + for (auto& x : m_buffer) + for (auto& y : x.second) + add_part(x.first, y.first, y.second); + + m_buffer.clear(); + + return true; +} + +// ******************************************************************************************* +bool CArchive::FlushOutBuffers() +{ + lock_guard lck(mtx); + + return flush_out_buffers(); +} + +// ******************************************************************************************* +void CArchive::SetRawSize(const int stream_id, const size_t raw_size) +{ + lock_guard lck(mtx); + + v_streams[stream_id].raw_size = raw_size; +} + +// ******************************************************************************************* +size_t CArchive::GetRawSize(const int stream_id) +{ + lock_guard lck(mtx); + + return v_streams[stream_id].raw_size; +} + +// ******************************************************************************************* +bool CArchive::get_part(const int stream_id, vector& v_data, uint64_t& metadata) +{ + auto& p = v_streams[stream_id]; + + if (p.cur_id >= p.parts.size()) + return false; + + v_data.resize(p.parts[p.cur_id].size); + + f_in.Seek(p.parts[p.cur_id].offset); + + if (p.parts[p.cur_id].size != 0) + read(metadata); + else + { + metadata = 0; + p.cur_id++; + return true; + } + + f_in.Read(v_data.data(), p.parts[p.cur_id].size); + + p.cur_id++; + + return true; +} + +// ******************************************************************************************* +bool CArchive::GetPart(const int stream_id, vector &v_data, uint64_t &metadata) +{ + lock_guard lck(mtx); + + return 
get_part(stream_id, v_data, metadata); +} + +// ******************************************************************************************* +pair CArchive::GetPart(const string& stream_name, vector &v_data, uint64_t &metadata) +{ + lock_guard lck(mtx); + + int stream_id = get_stream_id(stream_name); + + if (stream_id < 0) + return make_pair(-1, false); + + return make_pair(stream_id, get_part(stream_id, v_data, metadata)); +} + +// ******************************************************************************************* +tuple CArchive::GetParts( + const string& stream_name1, vector& v_data1, uint64_t& metadata1, + const string& stream_name2, vector& v_data2, uint64_t& metadata2) +{ + lock_guard lck(mtx); + + bool res1 = false; + bool res2 = false; + + int stream_id1 = get_stream_id(stream_name1); + int stream_id2 = get_stream_id(stream_name2); + + if (stream_id1 >= 0) + res1 = get_part(stream_id1, v_data1, metadata1); + + if (stream_id2 >= 0) + res2 = get_part(stream_id2, v_data2, metadata2); + + return make_tuple(stream_id1, res1, stream_id2, res2); +} + +// ******************************************************************************************* +bool CArchive::get_part(const int stream_id, const int part_id, vector& v_data, uint64_t& metadata) +{ + auto& p = v_streams[stream_id]; + + if ((size_t)part_id >= p.parts.size()) + return false; + + v_data.resize(p.parts[part_id].size); + + f_in.Seek(p.parts[part_id].offset); + + if (p.parts[part_id].size != 0) + read(metadata); + else + { + metadata = 0; + return true; + } + + f_in.Read(v_data.data(), p.parts[part_id].size); + + return true; +} + +// ******************************************************************************************* +bool CArchive::GetPart(const int stream_id, const int part_id, vector &v_data, uint64_t &metadata) +{ + lock_guard lck(mtx); + + return get_part(stream_id, part_id, v_data, metadata); +} + +// 
******************************************************************************************* +pair CArchive::GetPart(const string& stream_name, const int part_id, vector &v_data, uint64_t &metadata) +{ + lock_guard lck(mtx); + + int stream_id = get_stream_id(stream_name); + + if (stream_id < 0) + return make_pair(-1, false); + + return make_pair(stream_id, get_part(stream_id, part_id, v_data, metadata)); +} + +// ******************************************************************************************* +tuple CArchive::GetParts( + const string& stream_name1, const int part_id1, vector& v_data1, uint64_t& metadata1, + const string& stream_name2, const int part_id2, vector& v_data2, uint64_t& metadata2) +{ + lock_guard lck(mtx); + + bool res1 = false; + bool res2 = false; + + int stream_id1 = get_stream_id(stream_name1); + int stream_id2 = get_stream_id(stream_name2); + + if (stream_id1 >= 0) + res1 = get_part(stream_id1, part_id1, v_data1, metadata1); + + if (stream_id2 >= 0) + res2 = get_part(stream_id2, part_id2, v_data2, metadata2); + + return make_tuple(stream_id1, res1, stream_id2, res2); +} + +// ******************************************************************************************* +size_t CArchive::GetNoStreams() +{ + lock_guard lck(mtx); + + return v_streams.size(); +} + +// ******************************************************************************************* +size_t CArchive::GetNoParts(const int stream_id) +{ + lock_guard lck(mtx); + + if (stream_id < 0 || (size_t)stream_id >= v_streams.size()) + return 0; + + return v_streams[stream_id].parts.size(); +} + +// ******************************************************************************************* +size_t CArchive::GetStreamPackedSize(const int stream_id) +{ + lock_guard lck(mtx); + + if (stream_id < 0 || stream_id >= static_cast(v_streams.size())) + return 0; + + return v_streams[stream_id].packed_size; +} + +// 
******************************************************************************************* +size_t CArchive::GetStreamPackedDataSize(const int stream_id) +{ + lock_guard lck(mtx); + + if (stream_id < 0 || stream_id >= static_cast(v_streams.size())) + return 0; + + return v_streams[stream_id].packed_data_size; +} + +// EOF diff --git a/src/core/archive.h b/src/common/archive.h similarity index 94% rename from src/core/archive.h rename to src/common/archive.h index e958e52..cb26dbb 100644 --- a/src/core/archive.h +++ b/src/common/archive.h @@ -1,207 +1,209 @@ -#ifndef _ARCHIVE_H -#define _ARCHIVE_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../core/io.h" -#include "../core/utils.h" - -using namespace std; - -class CArchive -{ - bool input_mode; - CInFile f_in; - COutFile f_out; - size_t io_buffer_size; - - size_t f_offset; - - struct part_t{ - size_t offset; - size_t size; - - part_t() : offset(0), size(0) - {}; - - part_t(size_t _offset, size_t _size) : offset(_offset), size(_size) - {}; - }; - - typedef struct { - string stream_name; - size_t cur_id; - size_t raw_size; - size_t packed_size; - size_t packed_data_size; - vector parts; - } stream_t; - - map, uint64_t>>> m_buffer; - - vector v_streams; - unordered_map rm_streams; - string lazy_prefix; - - mutex mtx; - - bool serialize(); - bool deserialize(); - - // ******************************************************************************************* - inline bool is_lazy_str(const string& str) - { - if (!input_mode) - return 
false; - - if (lazy_prefix.empty()) - return false; - - if (str.size() <= lazy_prefix.size()) - return false; - - const char* p = str.c_str(); - const char* q = lazy_prefix.c_str(); - - while (*p == *q) - ++p, ++q; - - return *q == 0; - } - - // ******************************************************************************************* - void de_lazy() - { - rm_streams.reserve(2 * v_streams.size()); - - for(size_t i = 0; i < v_streams.size(); ++i) - if(is_lazy_str(v_streams[i].stream_name)) - rm_streams[v_streams[i].stream_name] = i; - - lazy_prefix.clear(); - } - - // ******************************************************************************************* - template - size_t write_fixed(const T x) - { - f_out.WriteUInt(static_cast(x), 8); - - return 8; - } - - // ******************************************************************************************* - template - size_t write(const T _x) - { - int no_bytes = 0; - uint64_t x = static_cast(_x); - - for (size_t tmp = x; tmp; tmp >>= 8) - ++no_bytes; - - f_out.Put(no_bytes); - - for (int i = no_bytes; i; --i) - f_out.Put((x >> ((i - 1) * 8)) & 0xff); - - return no_bytes + 1; - } - - // ******************************************************************************************* - size_t write(const string &s); - - // ******************************************************************************************* - template - size_t read_fixed(T& x) - { - x = static_cast(f_in.ReadUInt(8)); - - return 8; - } - - // ******************************************************************************************* - size_t read(string& s); - - // ******************************************************************************************* - template - size_t read(T& x) - { - int no_bytes = f_in.Get(); - - x = 0; - - for (int i = 0; i < no_bytes; ++i) - { - x <<= 8; - x += static_cast(f_in.Get()); - } - - return no_bytes + 1; - } - - // ******************************************************************************************* - 
bool add_part(const int stream_id, const vector& v_data, const uint64_t metadata); - bool flush_out_buffers(); - int get_stream_id(const string& stream_name); - bool get_part(const int stream_id, vector& v_data, uint64_t& metadata); - bool get_part(const int stream_id, const int part_id, vector& v_data, uint64_t& metadata); - -public: - CArchive(const bool _input_mode, const size_t _io_buffer_size = 64 << 20, const string& _lazy_prefix = ""); - ~CArchive(); - - bool Open(const string &file_name); - bool Close(); - - int RegisterStream(const string &stream_name); - int GetStreamId(const string &stream_name); - - size_t GetStreamPackedSize(const int stream_id); - size_t GetStreamPackedDataSize(const int stream_id); - - bool AddPart(const int stream_id, const vector& v_data, const uint64_t metadata = 0); - int AddPartPrepare(const int stream_id); - bool AddPartComplete(const int stream_id, const int part_id, const vector& v_data, const uint64_t metadata = 0); - bool AddPartBuffered(const int stream_id, const vector& v_data, const uint64_t metadata = 0); - - bool FlushOutBuffers(); - - bool GetPart(const int stream_id, vector &v_data, uint64_t &metadata); - bool GetPart(const int stream_id, const int part_id, vector& v_data, uint64_t& metadata); - - pair GetPart(const string &stream_name, vector &v_data, uint64_t &metadata); - pair GetPart(const string& stream_name, const int part_id, vector &v_data, uint64_t &metadata); - - tuple GetParts( - const string &stream_name1, vector &v_data1, uint64_t &metadata1, - const string& stream_name2, vector& v_data2, uint64_t& metadata2); - tuple GetParts( - const string& stream_name1, const int part_id1, vector &v_data1, uint64_t &metadata1, - const string& stream_name2, const int part_id2, vector &v_data2, uint64_t &metadata2); - - void SetRawSize(const int stream_id, const size_t raw_size); - size_t GetRawSize(const int stream_id); - - size_t GetNoStreams(); - size_t GetNoParts(const int stream_id); -}; - -// EOF +#ifndef 
_ARCHIVE_H +#define _ARCHIVE_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../common/io.h" +#include "../common/utils.h" + +using namespace std; + +class CArchive +{ + bool input_mode; + CInFile f_in; + COutFile f_out; + size_t io_buffer_size; + + size_t f_offset; + + struct part_t{ + size_t offset; + size_t size; + + part_t() : offset(0), size(0) + {}; + + part_t(size_t _offset, size_t _size) : offset(_offset), size(_size) + {}; + }; + + typedef struct { + string stream_name; + size_t cur_id; + size_t raw_size; + size_t packed_size; + size_t packed_data_size; + vector parts; + } stream_t; + + map, uint64_t>>> m_buffer; + + vector v_streams; + unordered_map rm_streams; + string lazy_prefix; + + mutex mtx; + + bool serialize(); + bool deserialize(); + + // ******************************************************************************************* + inline bool is_lazy_str(const string& str) + { + if (!input_mode) + return false; + + if (lazy_prefix.empty()) + return false; + + if (str.size() <= lazy_prefix.size()) + return false; + + const char* p = str.c_str(); + const char* q = lazy_prefix.c_str(); + + while (*p == *q) + ++p, ++q; + + return *q == 0; + } + + // ******************************************************************************************* + void de_lazy() + { + rm_streams.reserve(2 * v_streams.size()); + + for(size_t i = 0; i < v_streams.size(); ++i) + if(is_lazy_str(v_streams[i].stream_name)) + rm_streams[v_streams[i].stream_name] = i; + + lazy_prefix.clear(); + } + + 
// ******************************************************************************************* + template + size_t write_fixed(const T x) + { + f_out.WriteUInt(static_cast(x), 8); + + return 8; + } + + // ******************************************************************************************* + template + size_t write(const T _x) + { + int no_bytes = 0; + uint64_t x = static_cast(_x); + + for (size_t tmp = x; tmp; tmp >>= 8) + ++no_bytes; + + f_out.Put(no_bytes); + + for (int i = no_bytes; i; --i) + f_out.Put((x >> ((i - 1) * 8)) & 0xff); + + return no_bytes + 1; + } + + // ******************************************************************************************* + size_t write(const string &s); + + // ******************************************************************************************* + template + size_t read_fixed(T& x) + { + x = static_cast(f_in.ReadUInt(8)); + + return 8; + } + + // ******************************************************************************************* + size_t read(string& s); + + // ******************************************************************************************* + template + size_t read(T& x) + { + int no_bytes = f_in.Get(); + + x = 0; + + for (int i = 0; i < no_bytes; ++i) + { + x <<= 8; + x += static_cast(f_in.Get()); + } + + return no_bytes + 1; + } + + // ******************************************************************************************* + bool add_part(const int stream_id, const vector& v_data, const uint64_t metadata); + bool flush_out_buffers(); + int get_stream_id(const string& stream_name); + bool get_part(const int stream_id, vector& v_data, uint64_t& metadata); + bool get_part(const int stream_id, const int part_id, vector& v_data, uint64_t& metadata); + int register_stream(const string& stream_name); + +public: + CArchive(const bool _input_mode, const size_t _io_buffer_size = 64 << 20, const string& _lazy_prefix = ""); + ~CArchive(); + + bool Open(const string &file_name); + bool Close(); + + int 
RegisterStream(const string &stream_name); + pair RegisterStreams(const string &stream_name1, const string& stream_name2); + int GetStreamId(const string &stream_name); + + size_t GetStreamPackedSize(const int stream_id); + size_t GetStreamPackedDataSize(const int stream_id); + + bool AddPart(const int stream_id, const vector& v_data, const uint64_t metadata = 0); + int AddPartPrepare(const int stream_id); + bool AddPartComplete(const int stream_id, const int part_id, const vector& v_data, const uint64_t metadata = 0); + bool AddPartBuffered(const int stream_id, const vector& v_data, const uint64_t metadata = 0); + + bool FlushOutBuffers(); + + bool GetPart(const int stream_id, vector &v_data, uint64_t &metadata); + bool GetPart(const int stream_id, const int part_id, vector& v_data, uint64_t& metadata); + + pair GetPart(const string &stream_name, vector &v_data, uint64_t &metadata); + pair GetPart(const string& stream_name, const int part_id, vector &v_data, uint64_t &metadata); + + tuple GetParts( + const string &stream_name1, vector &v_data1, uint64_t &metadata1, + const string& stream_name2, vector& v_data2, uint64_t& metadata2); + tuple GetParts( + const string& stream_name1, const int part_id1, vector &v_data1, uint64_t &metadata1, + const string& stream_name2, const int part_id2, vector &v_data2, uint64_t &metadata2); + + void SetRawSize(const int stream_id, const size_t raw_size); + size_t GetRawSize(const int stream_id); + + size_t GetNoStreams(); + size_t GetNoParts(const int stream_id); +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/collection.cpp b/src/common/collection.cpp similarity index 92% rename from src/core/collection.cpp rename to src/common/collection.cpp index e726734..c2bd192 100644 --- a/src/core/collection.cpp +++ b/src/common/collection.cpp @@ -1,59 +1,59 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT 
license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include -#include "../core/collection.h" - -#include - -// ******************************************************************************************* -string CCollection::extract_contig_name(const string& s) -{ - string::const_iterator p; - - for (p = s.begin(); p != s.end(); ++p) - if ((*p < '0') && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) - break; - - return string(s.begin(), p); -} - -// ******************************************************************************************* -bool CCollection::is_equal_sample_contig(const pair& x, const pair& y) -{ - return x.first == y.first && extract_contig_name(x.second) == extract_contig_name(y.second); -} - -// ******************************************************************************************* -void CCollection::add_cmd_line(const string &cmd) -{ - lock_guard lck(mtx); - - auto tc = time(nullptr); - char tmp[64]; - string s_time; - - if(strftime(tmp, sizeof(tmp), "%A %c", std::gmtime(&tc))) - s_time = tmp; - - cmd_lines.emplace_back(cmd, s_time); -} - -// ******************************************************************************************* -void CCollection::get_cmd_lines(vector>& _cmd_lines) -{ - lock_guard lck(mtx); - - _cmd_lines = cmd_lines; -} - -// EOF +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include +#include "collection.h" + +#include + +// ******************************************************************************************* +string CCollection::extract_contig_name(const string& s) +{ + string::const_iterator p; + + for (p = s.begin(); p != s.end(); ++p) + if ((*p < '0') && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) + break; + + return string(s.begin(), p); +} + +// ******************************************************************************************* +bool CCollection::is_equal_sample_contig(const pair& x, const pair& y) +{ + return x.first == y.first && extract_contig_name(x.second) == extract_contig_name(y.second); +} + +// ******************************************************************************************* +void CCollection::add_cmd_line(const string &cmd) +{ + lock_guard lck(mtx); + + auto tc = time(nullptr); + char tmp[64]; + string s_time; + + if(strftime(tmp, sizeof(tmp), "%A %c", std::gmtime(&tc))) + s_time = tmp; + + cmd_lines.emplace_back(cmd, s_time); +} + +// ******************************************************************************************* +void CCollection::get_cmd_lines(vector>& _cmd_lines) +{ + lock_guard lck(mtx); + + _cmd_lines = cmd_lines; +} + +// EOF diff --git a/src/core/collection.h b/src/common/collection.h similarity index 94% rename from src/core/collection.h rename to src/common/collection.h index 1865bf1..152fd4a 100644 --- a/src/core/collection.h +++ b/src/common/collection.h @@ -1,250 +1,251 @@ -#ifndef _COLLECTION_H -#define _COLLECTION_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under 
MIT license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include -#include -#include -#include -#include "../core/utils.h" -#include - -using namespace std; -using namespace std::chrono; - -// ******************************************************************************************* -struct segment_desc_t -{ - uint32_t group_id; - uint32_t in_group_id; - bool is_rev_comp; - uint32_t raw_length; - - segment_desc_t() : - group_id(~0u), in_group_id(~0u), is_rev_comp(false), raw_length(0) - {} - - segment_desc_t(const uint32_t _group_id, const uint32_t _in_group_id, const bool _is_rev_comp, const uint32_t _raw_length) : - group_id(_group_id), in_group_id(_in_group_id), is_rev_comp(_is_rev_comp), raw_length(_raw_length) - {} -}; - -// ******************************************************************************************* -struct pair_segment_desc_t -{ - segment_desc_t first; - segment_desc_t second; - bool contains_second; - - pair_segment_desc_t(segment_desc_t _first, segment_desc_t _second = segment_desc_t{}, bool _contains_second = false) : - first(_first), second(_second), contains_second(_contains_second) - {} -}; - -// ******************************************************************************************* -struct contig_info_t -{ - string sample_name; - string contig_name; - uint32_t id; - uint32_t no_seg; - - contig_info_t(string _sample_name, string _contig_name, uint32_t _id, uint32_t _no_seg) : - sample_name(_sample_name), contig_name(_contig_name), id(_id), no_seg(_no_seg) - {}; -}; - -// ******************************************************************************************* -struct segments_to_place_t { - string sample_name; - string contig_name; - uint32_t seg_part_no; - int group_id; - int 
in_group_id; - bool is_rev_comp; - uint32_t data_size; - - segments_to_place_t(string _sample_name, string _contig_name, uint32_t _seg_part_no, int _group_id, int _in_group_id, bool _is_rev_comp, uint32_t _data_size) : - sample_name(_sample_name), - contig_name(_contig_name), - seg_part_no(_seg_part_no), - group_id(_group_id), - in_group_id(_in_group_id), - is_rev_comp(_is_rev_comp), - data_size(_data_size) {} - - segments_to_place_t() = default; - segments_to_place_t(const segments_to_place_t&) = default; -}; - -// ******************************************************************************************* -using sample_desc_t = vector>>; - -// ******************************************************************************************* -class CCollection -{ -protected: - mutex mtx; - - const uint32_t thr_1 = 1u << 7; - const uint32_t thr_2 = thr_1 + (1u << 14); - const uint32_t thr_3 = thr_2 + (1u << 21); - const uint32_t thr_4 = thr_3 + (1u << 28); - const uint8_t pref_1 = 0; - const uint8_t pref_2 = 0b10000000u; - const uint8_t pref_3 = 0b11000000u; - const uint8_t pref_4 = 0b11100000u; - const uint8_t pref_5 = 0b11110000u; - const uint8_t mask_1 = 0b10000000u; - const uint8_t mask_2 = 0b11000000u; - const uint8_t mask_3 = 0b11100000u; - const uint8_t mask_4 = 0b11110000u; - - const uint8_t pref_arr[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 5}; - - ZSTD_DCtx* zstd_dctx = nullptr; - - vector> cmd_lines; - - void append(vector& data, const string& str) - { - data.insert(data.end(), str.begin(), str.end()); - data.emplace_back(0); - } - - void append(vector& data, uint32_t num) - { - if (num < thr_1) - data.emplace_back(pref_1 + num); - else if (num < thr_2) - { - num -= thr_1; - data.emplace_back(pref_2 + (num >> 8)); - data.emplace_back(num & 0xffu); - } - else if (num < thr_3) - { - num -= thr_2; - data.emplace_back(pref_3 + (num >> 16)); - data.emplace_back((num >> 8) & 0xffu); - data.emplace_back(num & 0xffu); - } - else if (num < thr_4) - { - num -= 
thr_3; - data.emplace_back(pref_4 + (num >> 24)); - data.emplace_back((num >> 16) & 0xffu); - data.emplace_back((num >> 8) & 0xffu); - data.emplace_back(num & 0xffu); - } - else - { - num -= thr_4; - data.emplace_back(pref_5); - data.emplace_back((num >> 24) & 0xffu); - data.emplace_back((num >> 16) & 0xffu); - data.emplace_back((num >> 8) & 0xffu); - data.emplace_back(num & 0xffu); - } - } - - void read(uint8_t*& p, string& str) - { - auto q = p; - while (*q) - ++q; - - str.assign((char*)p, q - p); - - p = q + 1; - } - - void read(uint8_t*& p, uint32_t& num) - { - if ((*p & mask_1) == pref_1) - num = *p++ - pref_1; - else if ((*p & mask_2) == pref_2) - { - num = ((uint32_t)p[0] << 8) + p[1] + thr_1 - (pref_2 << 8); - p += 2; - } - else if ((*p & mask_3) == pref_3) - { - num = ((uint32_t) p[0] << 16) + ((uint32_t) p[1] << 8) + p[2] + thr_2 - (pref_3 << 16); - p += 3; - } - else if ((*p & mask_4) == pref_4) - { - num = ((uint32_t)p[0] << 24) + ((uint32_t)p[1] << 16) + ((uint32_t)p[2] << 8) + p[3] + thr_3 - (pref_4 << 24); - p += 4; - } - else - { - p++; // skip pref_5 - num = *p++; - num <<= 8; num += *p++; - num <<= 8; num += *p++; - num <<= 8; num += *p++; - num += thr_4; - } - } - - void read_fixed32(vector::iterator& p, uint32_t& num) - { - num = 0; - - for (int i = 0; i < 4; ++i) - num += ((uint32_t)p[i]) << (8 * i); - - p += 4; - } - - void skip(uint8_t*& p) - { - auto x = pref_arr[*p >> 4]; - p += x; - } - - string extract_contig_name(const string& s); - bool is_equal_sample_contig(const pair& x, const pair& y); - -public: - CCollection() {}; - virtual ~CCollection() { - if (zstd_dctx) - ZSTD_freeDCtx(zstd_dctx); - }; - - virtual bool register_sample_contig(const string& sample_name, const string& contig_name) = 0; - - virtual void add_segment_placed(const string& sample_name, const string& contig_name, const uint32_t place, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length) = 0; - virtual bool 
get_reference_name(string& reference_name) = 0; - virtual bool get_samples_list(vector& v_samples) = 0; - virtual bool get_contig_list_in_sample(const string& sample_name, vector& v_contig_names) = 0; - - virtual bool get_sample_desc(const string& sample_name, vector>>& sample_desc) = 0; - virtual bool get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc) = 0; - - virtual bool is_contig_desc(const string& sample_name, const string& contig_name) = 0; - virtual vector get_samples_for_contig(const string& contig_name) = 0; - - void add_cmd_line(const string &cmd); - void get_cmd_lines(vector>& _cmd_lines); - - virtual size_t get_no_samples() = 0; - virtual int32_t get_no_contigs(const string& sample_name) = 0; -}; - -// EOF +#ifndef _COLLECTION_H +#define _COLLECTION_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include +#include +#include +#include +#include "../common/utils.h" +#include + +using namespace std; +using namespace std::chrono; + +// ******************************************************************************************* +struct segment_desc_t +{ + uint32_t group_id; + uint32_t in_group_id; + bool is_rev_comp; + uint32_t raw_length; + + segment_desc_t() : + group_id(~0u), in_group_id(~0u), is_rev_comp(false), raw_length(0) + {} + + segment_desc_t(const uint32_t _group_id, const uint32_t _in_group_id, const bool _is_rev_comp, const uint32_t _raw_length) : + group_id(_group_id), in_group_id(_in_group_id), is_rev_comp(_is_rev_comp), raw_length(_raw_length) + {} +}; + +// 
******************************************************************************************* +struct pair_segment_desc_t +{ + segment_desc_t first; + segment_desc_t second; + bool contains_second; + + pair_segment_desc_t(segment_desc_t _first, segment_desc_t _second = segment_desc_t{}, bool _contains_second = false) : + first(_first), second(_second), contains_second(_contains_second) + {} +}; + +// ******************************************************************************************* +struct contig_info_t +{ + string sample_name; + string contig_name; + uint32_t id; + uint32_t no_seg; + + contig_info_t(string _sample_name, string _contig_name, uint32_t _id, uint32_t _no_seg) : + sample_name(_sample_name), contig_name(_contig_name), id(_id), no_seg(_no_seg) + {}; +}; + +// ******************************************************************************************* +struct segments_to_place_t { + string sample_name; + string contig_name; + uint32_t seg_part_no; + int group_id; + int in_group_id; + bool is_rev_comp; + uint32_t data_size; + + segments_to_place_t(string _sample_name, string _contig_name, uint32_t _seg_part_no, int _group_id, int _in_group_id, bool _is_rev_comp, uint32_t _data_size) : + sample_name(_sample_name), + contig_name(_contig_name), + seg_part_no(_seg_part_no), + group_id(_group_id), + in_group_id(_in_group_id), + is_rev_comp(_is_rev_comp), + data_size(_data_size) {} + + segments_to_place_t() = default; + segments_to_place_t(const segments_to_place_t&) = default; +}; + +// ******************************************************************************************* +using sample_desc_t = vector>>; + +// ******************************************************************************************* +class CCollection +{ +protected: + mutex mtx; + + const uint32_t thr_1 = 1u << 7; + const uint32_t thr_2 = thr_1 + (1u << 14); + const uint32_t thr_3 = thr_2 + (1u << 21); + const uint32_t thr_4 = thr_3 + (1u << 28); + const uint8_t pref_1 = 0; + const 
uint8_t pref_2 = 0b10000000u; + const uint8_t pref_3 = 0b11000000u; + const uint8_t pref_4 = 0b11100000u; + const uint8_t pref_5 = 0b11110000u; + const uint8_t mask_1 = 0b10000000u; + const uint8_t mask_2 = 0b11000000u; + const uint8_t mask_3 = 0b11100000u; + const uint8_t mask_4 = 0b11110000u; + + const uint8_t pref_arr[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 5}; + + ZSTD_DCtx* zstd_dctx = nullptr; + + vector> cmd_lines; + + void append(vector& data, const string& str) + { + data.insert(data.end(), str.begin(), str.end()); + data.emplace_back(0); + } + + void append(vector& data, uint32_t num) + { + if (num < thr_1) + data.emplace_back(pref_1 + num); + else if (num < thr_2) + { + num -= thr_1; + data.emplace_back(pref_2 + (num >> 8)); + data.emplace_back(num & 0xffu); + } + else if (num < thr_3) + { + num -= thr_2; + data.emplace_back(pref_3 + (num >> 16)); + data.emplace_back((num >> 8) & 0xffu); + data.emplace_back(num & 0xffu); + } + else if (num < thr_4) + { + num -= thr_3; + data.emplace_back(pref_4 + (num >> 24)); + data.emplace_back((num >> 16) & 0xffu); + data.emplace_back((num >> 8) & 0xffu); + data.emplace_back(num & 0xffu); + } + else + { + num -= thr_4; + data.emplace_back(pref_5); + data.emplace_back((num >> 24) & 0xffu); + data.emplace_back((num >> 16) & 0xffu); + data.emplace_back((num >> 8) & 0xffu); + data.emplace_back(num & 0xffu); + } + } + + void read(uint8_t*& p, string& str) + { + auto q = p; + while (*q) + ++q; + + str.assign((char*)p, q - p); + + p = q + 1; + } + + void read(uint8_t*& p, uint32_t& num) + { + if ((*p & mask_1) == pref_1) + num = *p++ - pref_1; + else if ((*p & mask_2) == pref_2) + { + num = ((uint32_t)p[0] << 8) + p[1] + thr_1 - (pref_2 << 8); + p += 2; + } + else if ((*p & mask_3) == pref_3) + { + num = ((uint32_t) p[0] << 16) + ((uint32_t) p[1] << 8) + p[2] + thr_2 - (pref_3 << 16); + p += 3; + } + else if ((*p & mask_4) == pref_4) + { + num = ((uint32_t)p[0] << 24) + ((uint32_t)p[1] << 16) + ((uint32_t)p[2] 
<< 8) + p[3] + thr_3 - (pref_4 << 24); + p += 4; + } + else + { + p++; // skip pref_5 + num = *p++; + num <<= 8; num += *p++; + num <<= 8; num += *p++; + num <<= 8; num += *p++; + num += thr_4; + } + } + + void read_fixed32(vector::iterator& p, uint32_t& num) + { + num = 0; + + for (int i = 0; i < 4; ++i) + num += ((uint32_t)p[i]) << (8 * i); + + p += 4; + } + + void skip(uint8_t*& p) + { + auto x = pref_arr[*p >> 4]; + p += x; + } + + string extract_contig_name(const string& s); + bool is_equal_sample_contig(const pair& x, const pair& y); + +public: + CCollection() {}; + virtual ~CCollection() { + if (zstd_dctx) + ZSTD_freeDCtx(zstd_dctx); + }; + + virtual bool register_sample_contig(const string& sample_name, const string& contig_name) = 0; + + virtual void add_segment_placed(const string& sample_name, const string& contig_name, const uint32_t place, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length) = 0; + virtual void add_segments_placed(vector& segments_to_place) = 0; + virtual bool get_reference_name(string& reference_name) = 0; + virtual bool get_samples_list(vector& v_samples, bool sorted = true) = 0; + virtual bool get_contig_list_in_sample(const string& sample_name, vector& v_contig_names) = 0; + + virtual bool get_sample_desc(const string& sample_name, vector>>& sample_desc) = 0; + virtual bool get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc) = 0; + + virtual bool is_contig_desc(const string& sample_name, const string& contig_name) = 0; + virtual vector get_samples_for_contig(const string& contig_name) = 0; + + void add_cmd_line(const string &cmd); + void get_cmd_lines(vector>& _cmd_lines); + + virtual size_t get_no_samples() = 0; + virtual int32_t get_no_contigs(const string& sample_name) = 0; +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/collection_v1.cpp b/src/common/collection_v1.cpp similarity index 95% rename from 
src/core/collection_v1.cpp rename to src/common/collection_v1.cpp index fef7f7f..1b3528e 100644 --- a/src/core/collection_v1.cpp +++ b/src/common/collection_v1.cpp @@ -1,529 +1,545 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "../core/collection_v1.h" - -// ******************************************************************************************* -void CCollection_V1::serialize(vector& data, bool store_date_time) -{ - data.clear(); - - append(data, (uint32_t)col.size()); - - auto col_order = get_sample_original_order(); - - for (auto& sample_name : col_order) - { - auto& sample = col[sample_name]; - - append(data, sample_name); - append(data, (uint32_t)sample.size()); - - for (auto& contig : sample) - { - append(data, contig.first); - append(data, (uint32_t)contig.second.size()); - - int32_t prev_group_id = 0; - int32_t prev_in_group_id = 0; - int32_t prev_raw_length = 0; - - for (auto& seg : contig.second) - { - uint32_t e_group_id = (uint32_t)zigzag_encode((int32_t)seg.group_id - prev_group_id); - uint32_t e_in_group_id = (uint32_t)zigzag_encode((int32_t)seg.in_group_id - prev_in_group_id); - uint32_t e_raw_length = (uint32_t)zigzag_encode((int32_t)seg.raw_length - prev_raw_length); - - append(data, e_group_id); - append(data, e_in_group_id); - append(data, e_raw_length); - append(data, (uint32_t)seg.is_rev_comp); - - prev_group_id = seg.group_id; - prev_in_group_id = seg.in_group_id; - prev_raw_length = seg.raw_length; - } - } - } - - append(data, (uint32_t)cmd_lines.size()); - - for (auto& cmd : cmd_lines) - { - append(data, cmd.first); - if (store_date_time) 
- append(data, cmd.second); - else - append(data, ""); - } -} - -// ******************************************************************************************* -bool CCollection_V1::deserialize(vector& data) -{ - uint8_t* p = data.data(); - - col.clear(); - - uint32_t no_samples; - string sample_name; - string contig_name; - - read(p, no_samples); - - v_sample_name.reserve(no_samples); - - for (uint32_t i = 0; i < no_samples; ++i) - { - read(p, sample_name); - - v_sample_name.emplace_back(sample_name); - - uint32_t no_contigs; - read(p, no_contigs); - - col[sample_name].resize(no_contigs); - - uint32_t sample_id = (uint32_t)sample_ids.size(); - sample_ids[sample_name] = sample_id; - - for (uint32_t j = 0; j < no_contigs; ++j) - { - read(p, contig_name); - uint32_t no_seg; - read(p, no_seg); - - string short_contig_name = extract_contig_name(contig_name); - - contig_ids_no_seg[make_pair(sample_name, short_contig_name)] = make_pair(j, no_seg); - - mm_contig2sample.emplace(short_contig_name, sample_name); - - auto& q = col[sample_name][j]; - q.first = contig_name; - auto& q_second = q.second; - - uint32_t e_group_id; - uint32_t e_in_group_id; - uint32_t e_raw_length; - uint32_t e_orientation; - - int32_t prev_group_id = 0; - int32_t prev_in_group_id = 0; - int32_t prev_raw_length = 0; - - q_second.reserve(no_seg); - - for (uint32_t k = 0; k < no_seg; ++k) - { - read(p, e_group_id); - read(p, e_in_group_id); - read(p, e_raw_length); - read(p, e_orientation); - - uint32_t c_group_id = (uint32_t)((int32_t)prev_group_id + zigzag_decode(e_group_id)); - uint32_t c_in_group_id = (uint32_t)((int32_t)prev_in_group_id + zigzag_decode(e_in_group_id)); - uint32_t c_raw_length = (uint32_t)((int32_t)prev_raw_length + zigzag_decode(e_raw_length)); - - q_second.emplace_back(c_group_id, c_in_group_id, (bool)e_orientation, c_raw_length); - - prev_group_id = c_group_id; - prev_in_group_id = c_in_group_id; - prev_raw_length = c_raw_length; - } - } - } - - uint32_t no_cmds; - - read(p, 
no_cmds); - cmd_lines.clear(); - - cmd_lines.resize(no_cmds); - - for (uint32_t i = 0; i < no_cmds; ++i) - { - read(p, cmd_lines[i].first); - read(p, cmd_lines[i].second); - } - - maps_built = true; - - return true; -} - -// ******************************************************************************************* -vector CCollection_V1::get_sample_original_order() -{ - vector vec(sample_ids.size()); - - for (auto& x : sample_ids) - vec[x.second] = x.first; - - return vec; -} - -// ******************************************************************************************* -bool CCollection_V1::register_sample_contig(const string& sample_name, const string& contig_name) -{ - lock_guard lck(mtx); - - string short_contig_name = extract_contig_name(contig_name); - string stored_sample_name = sample_name; - - if (sample_name.empty()) - stored_sample_name = short_contig_name; - - auto q = sample_ids.find(stored_sample_name); - if (q == sample_ids.end()) - { - uint32_t sample_id = (uint32_t)sample_ids.size(); - sample_ids[stored_sample_name] = sample_id; - } - - auto p = contig_ids_no_seg.find(make_pair(stored_sample_name, short_contig_name)); - if (p == contig_ids_no_seg.end()) - { - uint32_t contig_id = (uint32_t)col[stored_sample_name].size(); - contig_ids_no_seg[make_pair(stored_sample_name, short_contig_name)] = make_pair(contig_id, 0); - col[stored_sample_name].emplace_back(contig_name, vector()); - - mm_contig2sample.emplace(short_contig_name, stored_sample_name); - - return true; - } - else - return false; // The pair sample_name:contig_name is not unique -} - -// ******************************************************************************************* -vector& CCollection_V1::add_segment_basic(const string& sample_name, const string& contig_name, - const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length) -{ - uint32_t contig_id; - - string short_contig_name = extract_contig_name(contig_name); - string 
stored_sample_name = sample_name; - - if (sample_name.empty()) - stored_sample_name = short_contig_name; - - auto p_contig_ids = contig_ids_no_seg.find(make_pair(stored_sample_name, short_contig_name)); - if (p_contig_ids != contig_ids_no_seg.end()) - contig_id = p_contig_ids->second.first; - else - { - contig_id = (uint32_t)col[stored_sample_name].size(); - contig_ids_no_seg[make_pair(stored_sample_name, short_contig_name)] = make_pair(contig_id, 0); - col[stored_sample_name].emplace_back(contig_name, vector()); - - mm_contig2sample.emplace(short_contig_name, stored_sample_name); - } - - return col[stored_sample_name][contig_id].second; -} - -// ******************************************************************************************* -void CCollection_V1::add_segment_placed(const string& sample_name, const string& contig_name, const uint32_t place, - const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length) -{ - lock_guard lck(mtx); - - auto& vec = add_segment_basic(sample_name, contig_name, group_id, in_group_id, is_rev_comp, raw_length); - - if (place >= vec.size()) - vec.resize(place + 1, segment_desc_t(555555555, 555555555, true, 555555555)); - - vec[place] = segment_desc_t(group_id, in_group_id, is_rev_comp, raw_length); -} - -// ******************************************************************************************* -bool CCollection_V1::get_reference_name(string& reference_name) -{ - lock_guard lck(mtx); - - if (v_sample_name.empty()) - return false; - - reference_name = v_sample_name.front(); - - return true; -} - -// ******************************************************************************************* -bool CCollection_V1::get_samples_list(vector& v_samples) -{ - lock_guard lck(mtx); - - v_samples.clear(); - - for (auto& p : col) - v_samples.emplace_back(p.first); - - return true; -} - -// ******************************************************************************************* -bool 
CCollection_V1::get_contig_list_in_sample(const string& sample_name, vector& v_contig_names) -{ - lock_guard lck(mtx); - - v_contig_names.clear(); - - auto p = col.find(sample_name); - if (p == col.end()) - return false; - - for (auto q : p->second) - v_contig_names.emplace_back(q.first); - - return true; -} - -// ******************************************************************************************* -bool CCollection_V1::get_sample_desc(const string& sample_name, vector>>& sample_desc) -{ - lock_guard lck(mtx); - - auto p = col.find(sample_name); - if (p == col.end()) - return false; - - if (p->second.front().second.empty()) - decompress_sample_details(sample_ids[sample_name]); - - sample_desc = p->second; - - return true; -} - -// ******************************************************************************************* -bool CCollection_V1::get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc) -{ - lock_guard lck(mtx); - - string short_contig_name = extract_contig_name(contig_name); - - uint32_t contig_id; - - if (maps_built) - { - auto p_contig_id = contig_ids_no_seg.find(make_pair(sample_name, short_contig_name)); - if (p_contig_id == contig_ids_no_seg.end()) - return false; - - contig_id = p_contig_id->second.first; - } - else - { - auto p_contig_info = find_if(v_contig_info.begin(), v_contig_info.end(), [sample_name, short_contig_name](auto& x) { - return sample_name == x.sample_name && short_contig_name == x.contig_name; }); - - if (p_contig_info == v_contig_info.end()) - return false; - - contig_id = p_contig_info->id; - } - - if (col[sample_name][contig_id].second.empty()) - decompress_sample_details(sample_ids[sample_name]); - - contig_desc = col[sample_name][contig_id].second; - - // Reconstruct full contig name - contig_name = col[sample_name][contig_id].first; - - return true; -} - -// ******************************************************************************************* -bool 
CCollection_V1::is_contig_desc(const string& sample_name, const string& contig_name) -{ - lock_guard lck(mtx); - - string short_contig_name = extract_contig_name(contig_name); - - if (maps_built) - return contig_ids_no_seg.find(make_pair(sample_name, short_contig_name)) != contig_ids_no_seg.end(); - else - { - for (auto p = v_contig_info.begin(); p != v_contig_info.end(); ++p) - if (p->contig_name == contig_name && p->sample_name == sample_name) - return true; - - return false; - } -} - -// ******************************************************************************************* -vector CCollection_V1::get_samples_for_contig(const string& contig_name) -{ - vector vs; - - if (maps_built) - { - auto pq = mm_contig2sample.equal_range(extract_contig_name(contig_name)); - - for (auto p = pq.first; p != pq.second; ++p) - vs.emplace_back(p->second); - } - else - { - for (auto p = v_contig_info.begin(); p != v_contig_info.end(); ++p) - if (p->contig_name == contig_name) - vs.emplace_back(p->sample_name); - } - - return vs; -} - -// ******************************************************************************************* -size_t CCollection_V1::get_no_samples() -{ - lock_guard lck(mtx); - - return col.size(); -} - -// ******************************************************************************************* -int32_t CCollection_V1::get_no_contigs(const string& sample_name) -{ - lock_guard lck(mtx); - - auto p = col.find(sample_name); - if (p == col.end()) - return -1; - - return static_cast(p->second.size()); -} - -// ******************************************************************************************* -void CCollection_V1::decompress_sample_details(uint32_t i_sample) -{ - if (!zstd_dctx) - zstd_dctx = ZSTD_createDCtx(); - - uint32_t i_part = i_sample / details_batch_size; - - vector v_tmp; - - v_tmp.resize(v_zstd_batches[i_part].second); - ZSTD_decompressDCtx(zstd_dctx, v_tmp.data(), v_tmp.size(), v_zstd_batches[i_part].first.data(), 
v_zstd_batches[i_part].first.size()); - - v_zstd_batches[i_part].first.clear(); - v_zstd_batches[i_part].first.shrink_to_fit(); - v_zstd_batches[i_part].second = 0; - - uint8_t* p = v_tmp.data(); - - vector v_p_sample; - v_p_sample.reserve(details_batch_size); - - for (uint32_t i = i_part * details_batch_size; i < (i_part + 1) * details_batch_size && i < col.size(); ++i) - v_p_sample.push_back(col.find(v_sample_name[i])); - - if (maps_built) - { - for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) - for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) - { - uint32_t no_seg = contig_ids_no_seg[make_pair((*p_sam)->first, extract_contig_name(p_ctg->first))].second; - p_ctg->second.resize(no_seg); - } - } - else - { - auto p_contig_info = find_if(v_contig_info.begin(), v_contig_info.end(), [&v_p_sample](auto& x) { - return v_p_sample.front()->first == x.sample_name; }); - - for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) - for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg, ++p_contig_info) - { - uint32_t no_seg = p_contig_info->no_seg; - - p_ctg->second.resize(no_seg); - } - } - - for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) - for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) - deserialize_contig_details_group_id(p, p_ctg->second); - - for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) - for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) - deserialize_contig_details_in_group_id(p, p_ctg->second); - - for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) - for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) - deserialize_contig_details_raw_length(p, p_ctg->second); - - for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) - for (auto p_ctg = 
(*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) - deserialize_contig_details_orientation(p, p_ctg->second); -} - -// ******************************************************************************************* -void CCollection_V1::deserialize_contig_details_group_id(uint8_t*& p, vector& contig_segments) -{ - uint32_t e_group_id; - int32_t prev_group_id = 0; - - for (uint32_t k = 0; k < contig_segments.size(); ++k) - { - read(p, e_group_id); - uint32_t c_group_id = (uint32_t)zigzag_decode(e_group_id, prev_group_id); - contig_segments[k].group_id = c_group_id; - prev_group_id = c_group_id; - } -} - -// ******************************************************************************************* -void CCollection_V1::deserialize_contig_details_in_group_id(uint8_t*& p, vector& contig_segments) -{ - uint32_t e_in_group_id; - int32_t prev_in_group_id = 0; - - for (uint32_t k = 0; k < contig_segments.size(); ++k) - { - read(p, e_in_group_id); - uint32_t c_in_group_id = (uint32_t)zigzag_decode(e_in_group_id, prev_in_group_id); - contig_segments[k].in_group_id = c_in_group_id; - prev_in_group_id = c_in_group_id; - } -} - -// ******************************************************************************************* -void CCollection_V1::deserialize_contig_details_raw_length(uint8_t*& p, vector& contig_segments) -{ - uint32_t e_raw_length; - int32_t prev_raw_length = 0; - - for (uint32_t k = 0; k < contig_segments.size(); ++k) - { - read(p, e_raw_length); - uint32_t c_raw_length = (uint32_t)zigzag_decode(e_raw_length, prev_raw_length); - contig_segments[k].raw_length = c_raw_length; - prev_raw_length = c_raw_length; - } -} - -// ******************************************************************************************* -void CCollection_V1::deserialize_contig_details_orientation(uint8_t*& p, vector& contig_segments) -{ - uint32_t e_orientation; - - for (uint32_t k = 0; k < contig_segments.size(); ++k) - { - read(p, e_orientation); - 
contig_segments[k].is_rev_comp = (bool)e_orientation; - } -} - -// EOF +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "collection_v1.h" + +// ******************************************************************************************* +void CCollection_V1::serialize(vector& data, bool store_date_time) +{ + data.clear(); + + append(data, (uint32_t)col.size()); + + auto col_order = get_sample_original_order(); + + for (auto& sample_name : col_order) + { + auto& sample = col[sample_name]; + + append(data, sample_name); + append(data, (uint32_t)sample.size()); + + for (auto& contig : sample) + { + append(data, contig.first); + append(data, (uint32_t)contig.second.size()); + + int32_t prev_group_id = 0; + int32_t prev_in_group_id = 0; + int32_t prev_raw_length = 0; + + for (auto& seg : contig.second) + { + uint32_t e_group_id = (uint32_t)zigzag_encode((int32_t)seg.group_id - prev_group_id); + uint32_t e_in_group_id = (uint32_t)zigzag_encode((int32_t)seg.in_group_id - prev_in_group_id); + uint32_t e_raw_length = (uint32_t)zigzag_encode((int32_t)seg.raw_length - prev_raw_length); + + append(data, e_group_id); + append(data, e_in_group_id); + append(data, e_raw_length); + append(data, (uint32_t)seg.is_rev_comp); + + prev_group_id = seg.group_id; + prev_in_group_id = seg.in_group_id; + prev_raw_length = seg.raw_length; + } + } + } + + append(data, (uint32_t)cmd_lines.size()); + + for (auto& cmd : cmd_lines) + { + append(data, cmd.first); + if (store_date_time) + append(data, cmd.second); + else + append(data, ""); + } +} + +// 
******************************************************************************************* +bool CCollection_V1::deserialize(vector& data) +{ + uint8_t* p = data.data(); + + col.clear(); + + uint32_t no_samples; + string sample_name; + string contig_name; + + read(p, no_samples); + + v_sample_name.reserve(no_samples); + + for (uint32_t i = 0; i < no_samples; ++i) + { + read(p, sample_name); + + v_sample_name.emplace_back(sample_name); + + uint32_t no_contigs; + read(p, no_contigs); + + col[sample_name].resize(no_contigs); + + uint32_t sample_id = (uint32_t)sample_ids.size(); + sample_ids[sample_name] = sample_id; + + for (uint32_t j = 0; j < no_contigs; ++j) + { + read(p, contig_name); + uint32_t no_seg; + read(p, no_seg); + + string short_contig_name = extract_contig_name(contig_name); + + contig_ids_no_seg[make_pair(sample_name, short_contig_name)] = make_pair(j, no_seg); + + mm_contig2sample.emplace(short_contig_name, sample_name); + + auto& q = col[sample_name][j]; + q.first = contig_name; + auto& q_second = q.second; + + uint32_t e_group_id; + uint32_t e_in_group_id; + uint32_t e_raw_length; + uint32_t e_orientation; + + int32_t prev_group_id = 0; + int32_t prev_in_group_id = 0; + int32_t prev_raw_length = 0; + + q_second.reserve(no_seg); + + for (uint32_t k = 0; k < no_seg; ++k) + { + read(p, e_group_id); + read(p, e_in_group_id); + read(p, e_raw_length); + read(p, e_orientation); + + uint32_t c_group_id = (uint32_t)((int32_t)prev_group_id + zigzag_decode(e_group_id)); + uint32_t c_in_group_id = (uint32_t)((int32_t)prev_in_group_id + zigzag_decode(e_in_group_id)); + uint32_t c_raw_length = (uint32_t)((int32_t)prev_raw_length + zigzag_decode(e_raw_length)); + + q_second.emplace_back(c_group_id, c_in_group_id, (bool)e_orientation, c_raw_length); + + prev_group_id = c_group_id; + prev_in_group_id = c_in_group_id; + prev_raw_length = c_raw_length; + } + } + } + + uint32_t no_cmds; + + read(p, no_cmds); + cmd_lines.clear(); + + cmd_lines.resize(no_cmds); + + for 
(uint32_t i = 0; i < no_cmds; ++i) + { + read(p, cmd_lines[i].first); + read(p, cmd_lines[i].second); + } + + maps_built = true; + + return true; +} + +// ******************************************************************************************* +vector CCollection_V1::get_sample_original_order() +{ + vector vec(sample_ids.size()); + + for (auto& x : sample_ids) + vec[x.second] = x.first; + + return vec; +} + +// ******************************************************************************************* +bool CCollection_V1::register_sample_contig(const string& sample_name, const string& contig_name) +{ + lock_guard lck(mtx); + + string short_contig_name = extract_contig_name(contig_name); + string stored_sample_name = sample_name; + + if (sample_name.empty()) + stored_sample_name = short_contig_name; + + auto q = sample_ids.find(stored_sample_name); + if (q == sample_ids.end()) + { + uint32_t sample_id = (uint32_t)sample_ids.size(); + sample_ids[stored_sample_name] = sample_id; + } + + auto p = contig_ids_no_seg.find(make_pair(stored_sample_name, short_contig_name)); + if (p == contig_ids_no_seg.end()) + { + uint32_t contig_id = (uint32_t)col[stored_sample_name].size(); + contig_ids_no_seg[make_pair(stored_sample_name, short_contig_name)] = make_pair(contig_id, 0); + col[stored_sample_name].emplace_back(contig_name, vector()); + + mm_contig2sample.emplace(short_contig_name, stored_sample_name); + + return true; + } + else + return false; // The pair sample_name:contig_name is not unique +} + +// ******************************************************************************************* +vector& CCollection_V1::add_segment_basic(const string& sample_name, const string& contig_name, + const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length) +{ + uint32_t contig_id; + + string short_contig_name = extract_contig_name(contig_name); + string stored_sample_name = sample_name; + + if (sample_name.empty()) + 
stored_sample_name = short_contig_name; + + auto p_contig_ids = contig_ids_no_seg.find(make_pair(stored_sample_name, short_contig_name)); + if (p_contig_ids != contig_ids_no_seg.end()) + contig_id = p_contig_ids->second.first; + else + { + contig_id = (uint32_t)col[stored_sample_name].size(); + contig_ids_no_seg[make_pair(stored_sample_name, short_contig_name)] = make_pair(contig_id, 0); + col[stored_sample_name].emplace_back(contig_name, vector()); + + mm_contig2sample.emplace(short_contig_name, stored_sample_name); + } + + return col[stored_sample_name][contig_id].second; +} + +// ******************************************************************************************* +void CCollection_V1::add_segment_placed(const string& sample_name, const string& contig_name, const uint32_t place, + const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length) +{ + lock_guard lck(mtx); + + auto& vec = add_segment_basic(sample_name, contig_name, group_id, in_group_id, is_rev_comp, raw_length); + + if (place >= vec.size()) + vec.resize(place + 1, segment_desc_t(555555555, 555555555, true, 555555555)); + + vec[place] = segment_desc_t(group_id, in_group_id, is_rev_comp, raw_length); +} + +// ******************************************************************************************* +void CCollection_V1::add_segments_placed(vector& segments_to_place) +{ + lock_guard lck(mtx); + + for (auto& seg : segments_to_place) + { + auto& vec = add_segment_basic(seg.sample_name, seg.contig_name, seg.group_id, seg.in_group_id, seg.is_rev_comp, seg.data_size); + + if (seg.seg_part_no >= vec.size()) + vec.resize(seg.seg_part_no + 1, segment_desc_t(555555555, 555555555, true, 555555555)); + + vec[seg.seg_part_no] = segment_desc_t(seg.group_id, seg.in_group_id, seg.is_rev_comp, seg.data_size); + } +} + +// ******************************************************************************************* +bool CCollection_V1::get_reference_name(string& 
reference_name) +{ + lock_guard lck(mtx); + + if (v_sample_name.empty()) + return false; + + reference_name = v_sample_name.front(); + + return true; +} + +// ******************************************************************************************* +bool CCollection_V1::get_samples_list(vector& v_samples, bool sorted) +{ + lock_guard lck(mtx); + + v_samples.clear(); + + for (auto& p : col) + v_samples.emplace_back(p.first); + + return true; +} + +// ******************************************************************************************* +bool CCollection_V1::get_contig_list_in_sample(const string& sample_name, vector& v_contig_names) +{ + lock_guard lck(mtx); + + v_contig_names.clear(); + + auto p = col.find(sample_name); + if (p == col.end()) + return false; + + for (auto q : p->second) + v_contig_names.emplace_back(q.first); + + return true; +} + +// ******************************************************************************************* +bool CCollection_V1::get_sample_desc(const string& sample_name, vector>>& sample_desc) +{ + lock_guard lck(mtx); + + auto p = col.find(sample_name); + if (p == col.end()) + return false; + + if (p->second.front().second.empty()) + decompress_sample_details(sample_ids[sample_name]); + + sample_desc = p->second; + + return true; +} + +// ******************************************************************************************* +bool CCollection_V1::get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc) +{ + lock_guard lck(mtx); + + string short_contig_name = extract_contig_name(contig_name); + + uint32_t contig_id; + + if (maps_built) + { + auto p_contig_id = contig_ids_no_seg.find(make_pair(sample_name, short_contig_name)); + if (p_contig_id == contig_ids_no_seg.end()) + return false; + + contig_id = p_contig_id->second.first; + } + else + { + auto p_contig_info = find_if(v_contig_info.begin(), v_contig_info.end(), [sample_name, short_contig_name](auto& x) { + return sample_name == 
x.sample_name && short_contig_name == x.contig_name; }); + + if (p_contig_info == v_contig_info.end()) + return false; + + contig_id = p_contig_info->id; + } + + if (col[sample_name][contig_id].second.empty()) + decompress_sample_details(sample_ids[sample_name]); + + contig_desc = col[sample_name][contig_id].second; + + // Reconstruct full contig name + contig_name = col[sample_name][contig_id].first; + + return true; +} + +// ******************************************************************************************* +bool CCollection_V1::is_contig_desc(const string& sample_name, const string& contig_name) +{ + lock_guard lck(mtx); + + string short_contig_name = extract_contig_name(contig_name); + + if (maps_built) + return contig_ids_no_seg.find(make_pair(sample_name, short_contig_name)) != contig_ids_no_seg.end(); + else + { + for (auto p = v_contig_info.begin(); p != v_contig_info.end(); ++p) + if (p->contig_name == contig_name && p->sample_name == sample_name) + return true; + + return false; + } +} + +// ******************************************************************************************* +vector CCollection_V1::get_samples_for_contig(const string& contig_name) +{ + vector vs; + + if (maps_built) + { + auto pq = mm_contig2sample.equal_range(extract_contig_name(contig_name)); + + for (auto p = pq.first; p != pq.second; ++p) + vs.emplace_back(p->second); + } + else + { + for (auto p = v_contig_info.begin(); p != v_contig_info.end(); ++p) + if (p->contig_name == contig_name) + vs.emplace_back(p->sample_name); + } + + return vs; +} + +// ******************************************************************************************* +size_t CCollection_V1::get_no_samples() +{ + lock_guard lck(mtx); + + return col.size(); +} + +// ******************************************************************************************* +int32_t CCollection_V1::get_no_contigs(const string& sample_name) +{ + lock_guard lck(mtx); + + auto p = col.find(sample_name); + if (p == 
col.end()) + return -1; + + return static_cast(p->second.size()); +} + +// ******************************************************************************************* +void CCollection_V1::decompress_sample_details(uint32_t i_sample) +{ + if (!zstd_dctx) + zstd_dctx = ZSTD_createDCtx(); + + uint32_t i_part = i_sample / details_batch_size; + + vector v_tmp; + + v_tmp.resize(v_zstd_batches[i_part].second); + ZSTD_decompressDCtx(zstd_dctx, v_tmp.data(), v_tmp.size(), v_zstd_batches[i_part].first.data(), v_zstd_batches[i_part].first.size()); + + v_zstd_batches[i_part].first.clear(); + v_zstd_batches[i_part].first.shrink_to_fit(); + v_zstd_batches[i_part].second = 0; + + uint8_t* p = v_tmp.data(); + + vector v_p_sample; + v_p_sample.reserve(details_batch_size); + + for (uint32_t i = i_part * details_batch_size; i < (i_part + 1) * details_batch_size && i < col.size(); ++i) + v_p_sample.push_back(col.find(v_sample_name[i])); + + if (maps_built) + { + for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) + for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) + { + uint32_t no_seg = contig_ids_no_seg[make_pair((*p_sam)->first, extract_contig_name(p_ctg->first))].second; + p_ctg->second.resize(no_seg); + } + } + else + { + auto p_contig_info = find_if(v_contig_info.begin(), v_contig_info.end(), [&v_p_sample](auto& x) { + return v_p_sample.front()->first == x.sample_name; }); + + for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) + for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg, ++p_contig_info) + { + uint32_t no_seg = p_contig_info->no_seg; + + p_ctg->second.resize(no_seg); + } + } + + for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) + for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) + deserialize_contig_details_group_id(p, p_ctg->second); + + for (auto p_sam = v_p_sample.begin(); p_sam != 
v_p_sample.end(); ++p_sam) + for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) + deserialize_contig_details_in_group_id(p, p_ctg->second); + + for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) + for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) + deserialize_contig_details_raw_length(p, p_ctg->second); + + for (auto p_sam = v_p_sample.begin(); p_sam != v_p_sample.end(); ++p_sam) + for (auto p_ctg = (*p_sam)->second.begin(); p_ctg != (*p_sam)->second.end(); ++p_ctg) + deserialize_contig_details_orientation(p, p_ctg->second); +} + +// ******************************************************************************************* +void CCollection_V1::deserialize_contig_details_group_id(uint8_t*& p, vector& contig_segments) +{ + uint32_t e_group_id; + int32_t prev_group_id = 0; + + for (uint32_t k = 0; k < contig_segments.size(); ++k) + { + read(p, e_group_id); + uint32_t c_group_id = (uint32_t)zigzag_decode(e_group_id, prev_group_id); + contig_segments[k].group_id = c_group_id; + prev_group_id = c_group_id; + } +} + +// ******************************************************************************************* +void CCollection_V1::deserialize_contig_details_in_group_id(uint8_t*& p, vector& contig_segments) +{ + uint32_t e_in_group_id; + int32_t prev_in_group_id = 0; + + for (uint32_t k = 0; k < contig_segments.size(); ++k) + { + read(p, e_in_group_id); + uint32_t c_in_group_id = (uint32_t)zigzag_decode(e_in_group_id, prev_in_group_id); + contig_segments[k].in_group_id = c_in_group_id; + prev_in_group_id = c_in_group_id; + } +} + +// ******************************************************************************************* +void CCollection_V1::deserialize_contig_details_raw_length(uint8_t*& p, vector& contig_segments) +{ + uint32_t e_raw_length; + int32_t prev_raw_length = 0; + + for (uint32_t k = 0; k < contig_segments.size(); ++k) + { + read(p, e_raw_length); + uint32_t 
c_raw_length = (uint32_t)zigzag_decode(e_raw_length, prev_raw_length); + contig_segments[k].raw_length = c_raw_length; + prev_raw_length = c_raw_length; + } +} + +// ******************************************************************************************* +void CCollection_V1::deserialize_contig_details_orientation(uint8_t*& p, vector& contig_segments) +{ + uint32_t e_orientation; + + for (uint32_t k = 0; k < contig_segments.size(); ++k) + { + read(p, e_orientation); + contig_segments[k].is_rev_comp = (bool)e_orientation; + } +} + +// EOF diff --git a/src/core/collection_v1.h b/src/common/collection_v1.h similarity index 93% rename from src/core/collection_v1.h rename to src/common/collection_v1.h index 88d999c..5df54fe 100644 --- a/src/core/collection_v1.h +++ b/src/common/collection_v1.h @@ -1,68 +1,69 @@ -#ifndef _COLLECTION_V1_H -#define _COLLECTION_V1_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "collection.h" - -class CCollection_V1 : public CCollection -{ -protected: - uint32_t details_batch_size = 1; - - typedef map>>> col_t; - - vector, size_t>> v_zstd_batches; - - col_t col; - map, pair> contig_ids_no_seg; - multimap mm_contig2sample; - map sample_ids; - vector v_sample_name; - vector v_contig_info; - bool maps_built = false; - - vector get_sample_original_order(); - - void decompress_sample_details(uint32_t i_sample); - void deserialize_contig_details_group_id(uint8_t*& p, vector& contig_segments); - void deserialize_contig_details_in_group_id(uint8_t*& p, vector& contig_segments); - void deserialize_contig_details_raw_length(uint8_t*& p, vector& contig_segments); - void deserialize_contig_details_orientation(uint8_t*& p, vector& contig_segments); - - vector& add_segment_basic(const string& sample_name, const string& contig_name, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length); - -public: - CCollection_V1() : CCollection() {} - - virtual ~CCollection_V1() {}; - - - void serialize(vector& data, bool store_date_time); - bool deserialize(vector& data); - - virtual bool register_sample_contig(const string& sample_name, const string& contig_name); - virtual void add_segment_placed(const string& sample_name, const string& contig_name, const uint32_t place, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length); - virtual bool get_reference_name(string& reference_name); - virtual bool get_samples_list(vector& v_samples); - virtual bool get_contig_list_in_sample(const string& sample_name, vector& v_contig_names); - virtual bool get_sample_desc(const string& sample_name, 
vector>>& sample_desc); - virtual bool get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc); - - virtual bool is_contig_desc(const string& sample_name, const string& contig_name); - virtual vector get_samples_for_contig(const string& contig_name); - - virtual size_t get_no_samples(); - virtual int32_t get_no_contigs(const string& sample_name); -}; - -// EOF +#ifndef _COLLECTION_V1_H +#define _COLLECTION_V1_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "collection.h" + +class CCollection_V1 : public CCollection +{ +protected: + uint32_t details_batch_size = 1; + + typedef map>>> col_t; + + vector, size_t>> v_zstd_batches; + + col_t col; + map, pair> contig_ids_no_seg; + multimap mm_contig2sample; + map sample_ids; + vector v_sample_name; + vector v_contig_info; + bool maps_built = false; + + vector get_sample_original_order(); + + void decompress_sample_details(uint32_t i_sample); + void deserialize_contig_details_group_id(uint8_t*& p, vector& contig_segments); + void deserialize_contig_details_in_group_id(uint8_t*& p, vector& contig_segments); + void deserialize_contig_details_raw_length(uint8_t*& p, vector& contig_segments); + void deserialize_contig_details_orientation(uint8_t*& p, vector& contig_segments); + + vector& add_segment_basic(const string& sample_name, const string& contig_name, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length); + +public: + CCollection_V1() : CCollection() {} + + virtual ~CCollection_V1() {}; + + + void serialize(vector& data, bool store_date_time); 
+ bool deserialize(vector& data); + + virtual bool register_sample_contig(const string& sample_name, const string& contig_name); + virtual void add_segment_placed(const string& sample_name, const string& contig_name, const uint32_t place, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length); + virtual void add_segments_placed(vector& segments_to_place); + virtual bool get_reference_name(string& reference_name); + virtual bool get_samples_list(vector& v_samples, bool sorted = true); + virtual bool get_contig_list_in_sample(const string& sample_name, vector& v_contig_names); + virtual bool get_sample_desc(const string& sample_name, vector>>& sample_desc); + virtual bool get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc); + + virtual bool is_contig_desc(const string& sample_name, const string& contig_name); + virtual vector get_samples_for_contig(const string& contig_name); + + virtual size_t get_no_samples(); + virtual int32_t get_no_contigs(const string& sample_name); +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/collection_v2.cpp b/src/common/collection_v2.cpp similarity index 94% rename from src/core/collection_v2.cpp rename to src/common/collection_v2.cpp index a13d3d8..074fd9c 100644 --- a/src/core/collection_v2.cpp +++ b/src/common/collection_v2.cpp @@ -1,176 +1,176 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "../core/collection_v2.h" - -// ******************************************************************************************* -void CCollection_V2::serialize(vector& data_main, vector>& data_details, bool store_date_time, uint32_t _details_batch_size) -{ - data_main.clear(); - data_details.clear(); - - details_batch_size = _details_batch_size; - - auto col_order = get_sample_original_order(); - - append(data_main, details_batch_size); - append(data_main, (uint32_t)col.size()); - - data_details.resize((col_order.size() + (details_batch_size - 1)) / details_batch_size); - - array, 4> v_det; - size_t no_samples_in_batch = 0; - uint32_t i_details_part = 0; - - for (auto& sample_name : col_order) - { - auto& sample = col[sample_name]; - - append(data_main, sample_name); - append(data_main, (uint32_t)sample.size()); - - if (no_samples_in_batch == details_batch_size) - { - for (auto& v : v_det) - { - data_details[i_details_part].insert(data_details[i_details_part].end(), v.begin(), v.end()); - v.clear(); - } - - no_samples_in_batch = 0; - ++i_details_part; - } - - for (auto& contig : sample) - { - append(data_main, contig.first); - append(data_main, (uint32_t)contig.second.size()); - - int32_t prev_group_id = 0; - int32_t prev_in_group_id = 0; - int32_t prev_raw_length = 0; - - for (auto& seg : contig.second) - { - uint32_t e_group_id = (uint32_t)zigzag_encode(seg.group_id, prev_group_id); - uint32_t e_in_group_id = (uint32_t)zigzag_encode(seg.in_group_id, prev_in_group_id); - uint32_t e_raw_length = (uint32_t)zigzag_encode(seg.raw_length, prev_raw_length); - - append(v_det[0], e_group_id); - append(v_det[1], e_in_group_id); - append(v_det[2], e_raw_length); - append(v_det[3], 
(uint32_t)seg.is_rev_comp); - - prev_group_id = seg.group_id; - prev_in_group_id = seg.in_group_id; - prev_raw_length = seg.raw_length; - } - } - - ++no_samples_in_batch; - } - - for (auto& v : v_det) - data_details[i_details_part].insert(data_details[i_details_part].end(), v.begin(), v.end()); - - append(data_main, (uint32_t)cmd_lines.size()); - - for (auto& cmd : cmd_lines) - { - append(data_main, cmd.first); - if (store_date_time) - append(data_main, cmd.second); - else - append(data_main, ""); - } -} - -// ******************************************************************************************* -bool CCollection_V2::deserialize_main(vector& data_main, bool create_maps) -{ - uint8_t* p = data_main.data(); - - col.clear(); - - uint32_t no_samples; - string sample_name; - string contig_name; - - read(p, details_batch_size); - read(p, no_samples); - - v_sample_name.reserve(no_samples); - - for (uint32_t i = 0; i < no_samples; ++i) - { - read(p, sample_name); - - v_sample_name.emplace_back(sample_name); - - uint32_t no_contigs; - read(p, no_contigs); - - col[sample_name].resize(no_contigs); - auto& col_sample = col[sample_name]; - - uint32_t sample_id = (uint32_t)sample_ids.size(); - sample_ids[sample_name] = sample_id; - - for (uint32_t j = 0; j < no_contigs; ++j) - { - read(p, contig_name); - - uint32_t no_seg; - read(p, no_seg); - - string short_contig_name = extract_contig_name(contig_name); - - if (create_maps) - { - contig_ids_no_seg[make_pair(sample_name, short_contig_name)] = make_pair(j, no_seg); - mm_contig2sample.emplace(short_contig_name, sample_name); - } - else - v_contig_info.emplace_back(sample_name, short_contig_name, j, no_seg); - - col_sample[j].first = contig_name; - } - } - - uint32_t no_cmds; - - read(p, no_cmds); - cmd_lines.clear(); - - cmd_lines.resize(no_cmds); - - for (uint32_t i = 0; i < no_cmds; ++i) - { - read(p, cmd_lines[i].first); - read(p, cmd_lines[i].second); - } - - maps_built = create_maps; - - return true; -} - -// 
******************************************************************************************* -bool CCollection_V2::deserialize_details(vector& zstd_data_details, size_t raw_size, bool deserialize_details) -{ - v_zstd_batches.emplace_back(move(zstd_data_details), raw_size); - - if (deserialize_details) - decompress_sample_details((v_zstd_batches.size() - 1) * details_batch_size); - - return true; -} - -// EOF +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "collection_v2.h" + +// ******************************************************************************************* +void CCollection_V2::serialize(vector& data_main, vector>& data_details, bool store_date_time, uint32_t _details_batch_size) +{ + data_main.clear(); + data_details.clear(); + + details_batch_size = _details_batch_size; + + auto col_order = get_sample_original_order(); + + append(data_main, details_batch_size); + append(data_main, (uint32_t)col.size()); + + data_details.resize((col_order.size() + (details_batch_size - 1)) / details_batch_size); + + array, 4> v_det; + size_t no_samples_in_batch = 0; + uint32_t i_details_part = 0; + + for (auto& sample_name : col_order) + { + auto& sample = col[sample_name]; + + append(data_main, sample_name); + append(data_main, (uint32_t)sample.size()); + + if (no_samples_in_batch == details_batch_size) + { + for (auto& v : v_det) + { + data_details[i_details_part].insert(data_details[i_details_part].end(), v.begin(), v.end()); + v.clear(); + } + + no_samples_in_batch = 0; + ++i_details_part; + } + + for (auto& contig : sample) + { + append(data_main, contig.first); + 
append(data_main, (uint32_t)contig.second.size()); + + int32_t prev_group_id = 0; + int32_t prev_in_group_id = 0; + int32_t prev_raw_length = 0; + + for (auto& seg : contig.second) + { + uint32_t e_group_id = (uint32_t)zigzag_encode(seg.group_id, prev_group_id); + uint32_t e_in_group_id = (uint32_t)zigzag_encode(seg.in_group_id, prev_in_group_id); + uint32_t e_raw_length = (uint32_t)zigzag_encode(seg.raw_length, prev_raw_length); + + append(v_det[0], e_group_id); + append(v_det[1], e_in_group_id); + append(v_det[2], e_raw_length); + append(v_det[3], (uint32_t)seg.is_rev_comp); + + prev_group_id = seg.group_id; + prev_in_group_id = seg.in_group_id; + prev_raw_length = seg.raw_length; + } + } + + ++no_samples_in_batch; + } + + for (auto& v : v_det) + data_details[i_details_part].insert(data_details[i_details_part].end(), v.begin(), v.end()); + + append(data_main, (uint32_t)cmd_lines.size()); + + for (auto& cmd : cmd_lines) + { + append(data_main, cmd.first); + if (store_date_time) + append(data_main, cmd.second); + else + append(data_main, ""); + } +} + +// ******************************************************************************************* +bool CCollection_V2::deserialize_main(vector& data_main, bool create_maps) +{ + uint8_t* p = data_main.data(); + + col.clear(); + + uint32_t no_samples; + string sample_name; + string contig_name; + + read(p, details_batch_size); + read(p, no_samples); + + v_sample_name.reserve(no_samples); + + for (uint32_t i = 0; i < no_samples; ++i) + { + read(p, sample_name); + + v_sample_name.emplace_back(sample_name); + + uint32_t no_contigs; + read(p, no_contigs); + + col[sample_name].resize(no_contigs); + auto& col_sample = col[sample_name]; + + uint32_t sample_id = (uint32_t)sample_ids.size(); + sample_ids[sample_name] = sample_id; + + for (uint32_t j = 0; j < no_contigs; ++j) + { + read(p, contig_name); + + uint32_t no_seg; + read(p, no_seg); + + string short_contig_name = extract_contig_name(contig_name); + + if (create_maps) + 
{ + contig_ids_no_seg[make_pair(sample_name, short_contig_name)] = make_pair(j, no_seg); + mm_contig2sample.emplace(short_contig_name, sample_name); + } + else + v_contig_info.emplace_back(sample_name, short_contig_name, j, no_seg); + + col_sample[j].first = contig_name; + } + } + + uint32_t no_cmds; + + read(p, no_cmds); + cmd_lines.clear(); + + cmd_lines.resize(no_cmds); + + for (uint32_t i = 0; i < no_cmds; ++i) + { + read(p, cmd_lines[i].first); + read(p, cmd_lines[i].second); + } + + maps_built = create_maps; + + return true; +} + +// ******************************************************************************************* +bool CCollection_V2::deserialize_details(vector& zstd_data_details, size_t raw_size, bool deserialize_details) +{ + v_zstd_batches.emplace_back(move(zstd_data_details), raw_size); + + if (deserialize_details) + decompress_sample_details((v_zstd_batches.size() - 1) * details_batch_size); + + return true; +} + +// EOF diff --git a/src/core/collection_v2.h b/src/common/collection_v2.h similarity index 93% rename from src/core/collection_v2.h rename to src/common/collection_v2.h index 7ebe73b..6820c44 100644 --- a/src/core/collection_v2.h +++ b/src/common/collection_v2.h @@ -1,30 +1,30 @@ -#ifndef _COLLECTION_V2_H -#define _COLLECTION_V2_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "collection.h" -#include "collection_v1.h" - -class CCollection_V2 : public CCollection_V1 -{ - -public: - CCollection_V2() : CCollection_V1() {} - virtual ~CCollection_V2() {}; - - void serialize(vector& data_main, vector>& data_details, bool store_date_time, uint32_t _details_batch_size); - bool deserialize_main(vector& data_main, bool create_maps); - bool deserialize_details(vector& zstd_data_details, size_t raw_size, bool deserialize_details); -}; - -// EOF +#ifndef _COLLECTION_V2_H +#define _COLLECTION_V2_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "collection.h" +#include "collection_v1.h" + +class CCollection_V2 : public CCollection_V1 +{ + +public: + CCollection_V2() : CCollection_V1() {} + virtual ~CCollection_V2() {}; + + void serialize(vector& data_main, vector>& data_details, bool store_date_time, uint32_t _details_batch_size); + bool deserialize_main(vector& data_main, bool create_maps); + bool deserialize_details(vector& zstd_data_details, size_t raw_size, bool deserialize_details); +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/collection_v3.cpp b/src/common/collection_v3.cpp similarity index 95% rename from src/core/collection_v3.cpp rename to src/common/collection_v3.cpp index 5cdfe2a..e46ee27 100644 --- a/src/core/collection_v3.cpp +++ 
b/src/common/collection_v3.cpp @@ -1,989 +1,996 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "../core/collection_v3.h" -#include -#include - -// ******************************************************************************************* -bool CCollection_V3::set_archives(shared_ptr _in_archive, shared_ptr _out_archive, - uint32_t _no_threads, size_t _batch_size, uint32_t _segment_size, uint32_t _kmer_length) -{ - lock_guard lck(mtx); - - in_archive = _in_archive; - out_archive = _out_archive; - - batch_size = _batch_size; - segment_size = _segment_size; - no_threads = _no_threads; - kmer_length = _kmer_length; - - if (in_archive == nullptr) - return prepare_for_compression(); - else if (out_archive == nullptr) - return prepare_for_decompression(); - else - return prepare_for_appending_copy(); -} - -// ******************************************************************************************* -bool CCollection_V3::prepare_for_compression() -{ - collection_samples_id = out_archive->RegisterStream("collection-samples"); - collection_contig_id = out_archive->RegisterStream("collection-contigs"); - collection_details_id = out_archive->RegisterStream("collection-details"); - - return true; -} - -// ******************************************************************************************* -bool CCollection_V3::prepare_for_appending_copy() -{ -// auto in_collection_samples_id = in_archive->GetStreamId("collection-samples"); - auto in_collection_contig_id = in_archive->GetStreamId("collection-contigs"); - auto in_collection_details_id = 
in_archive->GetStreamId("collection-details"); - - collection_samples_id = out_archive->RegisterStream("collection-samples"); - collection_contig_id = out_archive->RegisterStream("collection-contigs"); - collection_details_id = out_archive->RegisterStream("collection-details"); - - load_batch_sample_names(); - - // in and out ids for collection-* must be the same! - - auto no_contig_batches = in_archive->GetNoParts(in_collection_contig_id); - - // Transfer all but the last one batch from in to out archive - vector data; - uint64_t meta; - - for (size_t i = 0; i < no_contig_batches - 1; ++i) - { - in_archive->GetPart(in_collection_contig_id, i, data, meta); - out_archive->AddPart(collection_contig_id, data, meta); - in_archive->GetPart(in_collection_details_id, i, data, meta); - out_archive->AddPart(collection_details_id, data, meta); - } - - return true; -} - -// ******************************************************************************************* -bool CCollection_V3::prepare_for_appending_load_last_batch() -{ - lock_guard lck(mtx); - - auto in_collection_contig_id = in_archive->GetStreamId("collection-contigs"); - auto in_collection_details_id = in_archive->GetStreamId("collection-details"); - - auto no_contig_batches = in_archive->GetNoParts(in_collection_contig_id); - - // Transfer all but the last one batch from in to out archive - vector data; - uint64_t meta; - - // Load last batch - load_batch_contig_names(no_contig_batches - 1); - load_batch_contig_details(no_contig_batches - 1); - - if (no_samples_in_last_batch == batch_size) - { - in_archive->GetPart(in_collection_contig_id, no_contig_batches - 1, data, meta); - out_archive->AddPart(collection_contig_id, data, meta); - in_archive->GetPart(in_collection_details_id, no_contig_batches - 1, data, meta); - out_archive->AddPart(collection_details_id, data, meta); - - clear_batch_contig(no_contig_batches - 1); - } - - return true; -} - -// 
******************************************************************************************* -bool CCollection_V3::prepare_for_decompression() -{ - collection_samples_id = in_archive->GetStreamId("collection-samples"); - collection_contig_id = in_archive->GetStreamId("collection-contigs"); - collection_details_id = in_archive->GetStreamId("collection-details"); - - load_batch_sample_names(); - - return true; -} - -// ******************************************************************************************* -void CCollection_V3::complete_serialization() -{ - lock_guard lck(mtx); - - store_batch_sample_names(); -} - -// ******************************************************************************************* -void CCollection_V3::zstd_compress(ZSTD_CCtx*& cctx, vector& v_input, vector& v_output, int level) -{ - if (cctx == nullptr) - { - cctx = ZSTD_createCCtx(); - } - - v_output.resize(ZSTD_compressBound(v_input.size())); - auto c_size = ZSTD_compressCCtx(cctx, v_output.data(), v_output.size(), v_input.data(), v_input.size(), level); - - v_output.resize(c_size); -} - -// ******************************************************************************************* -void CCollection_V3::zstd_decompress(ZSTD_DCtx*& dctx, vector& v_input, vector& v_output, size_t raw_size) -{ - if (dctx == nullptr) - dctx = ZSTD_createDCtx(); - - v_output.resize(raw_size); - ZSTD_decompressDCtx(dctx, v_output.data(), v_output.size(), v_input.data(), v_input.size()); -} - -// ******************************************************************************************* -void CCollection_V3::store_batch_sample_names() -{ - vector v_data, v_tmp; - - determine_collection_samples_id(); - - serialize_sample_names(v_tmp); - - zstd_compress(zstd_cctx_samples, v_tmp, v_data, 19); - - out_archive->AddPartBuffered(collection_samples_id, v_data, v_tmp.size()); -} - -// ******************************************************************************************* -void 
CCollection_V3::load_batch_sample_names() -{ - vector v_data, v_tmp; - uint64_t raw_size; - - determine_collection_samples_id(); - - in_archive->GetPart(collection_samples_id, v_tmp, raw_size); - - zstd_decompress(zstd_dctx_samples, v_tmp, v_data, raw_size); - - deserialize_sample_names(v_data); -} - -// ******************************************************************************************* -void CCollection_V3::store_batch_contig_names(uint32_t id_from, uint32_t id_to) -{ - vector v_data, v_tmp; - - determine_collection_contig_id(); - - serialize_contig_names(v_tmp, id_from, id_to); - - zstd_compress(zstd_cctx_contigs, v_tmp, v_data, 18); - - out_archive->AddPartBuffered(collection_contig_id, v_data, v_tmp.size()); -} - -// ******************************************************************************************* -void CCollection_V3::load_batch_contig_names(size_t id_batch) -{ - vector v_data, v_tmp; - uint64_t raw_size; - - if (unpacked_contig_data_batch_id >= 0 && unpacked_contig_data_batch_id != (int) id_batch) - clear_batch_contig(unpacked_contig_data_batch_id); - - determine_collection_contig_id(); - - in_archive->GetPart(collection_contig_id, id_batch, v_tmp, raw_size); - - zstd_decompress(zstd_dctx_contigs, v_tmp, v_data, raw_size); - - deserialize_contig_names(v_data, id_batch * batch_size); - - unpacked_contig_data_batch_id = id_batch; -} - -// ******************************************************************************************* -void CCollection_V3::clear_batch_contig(size_t id_batch) -{ - size_t to_batch_id = min(sample_desc.size(), (id_batch + 1) * batch_size); - - for (size_t i = id_batch * batch_size; i < to_batch_id; ++i) - { - sample_desc[i].contigs.clear(); - sample_desc[i].contigs.shrink_to_fit(); - } -} - -// ******************************************************************************************* -void CCollection_V3::store_batch_contig_details(uint32_t id_from, uint32_t id_to) -{ - array, 5> v_data; - array, 5> v_packed; - vector 
v_stream; - - determnine_collection_details_id(); - - serialize_contig_details(v_data, id_from, id_to); - - if (no_threads >= 4) - { - vector> v_fut; - v_fut.reserve(5); - - for (int i = 0; i < 5; ++i) - v_fut.emplace_back(async([&, i]() {zstd_compress(zstd_cctx_details[i], v_data[i], v_packed[i], 19); })); - - for (int i = 0; i < 5; ++i) - v_fut[i].wait(); - } - else - { - for (int i = 0; i < 5; ++i) - zstd_compress(zstd_cctx_details[i], v_data[i], v_packed[i], 19); - } - - for (int i = 0; i < 5; ++i) - { - append(v_stream, v_data[i].size()); - append(v_stream, v_packed[i].size()); - } - - for (int i = 0; i < 5; ++i) - v_stream.insert(v_stream.end(), v_packed[i].begin(), v_packed[i].end()); - - out_archive->AddPartBuffered(collection_details_id, v_stream, 0); -} - -// ******************************************************************************************* -void CCollection_V3::load_batch_contig_details(size_t id_batch) -{ - array, 5> v_data; - array, 5> v_packed; - vector v_stream; - - uint64_t aux; - - if (unpacked_contig_data_batch_id >= 0 && unpacked_contig_data_batch_id != (int) id_batch) - clear_batch_contig(unpacked_contig_data_batch_id); - - determnine_collection_details_id(); - - in_archive->GetPart(collection_details_id, id_batch, v_stream, aux); - - uint8_t* ptr = v_stream.data(); - array, 5> a_sizes; - - for (int i = 0; i < 5; ++i) - { - read(ptr, a_sizes[i].first); - read(ptr, a_sizes[i].second); - } - - for (int i = 0; i < 5; ++i) - { - v_packed[i].assign(ptr, ptr + a_sizes[i].second); - ptr += a_sizes[i].second; - } - - if (no_threads >= 4) - { - vector> v_fut; - v_fut.reserve(5); - - for (int i = 0; i < 5; ++i) - v_fut.emplace_back(async([&, i]() {zstd_decompress(zstd_dctx_details[i], v_packed[i], v_data[i], a_sizes[i].first); })); - - for (int i = 0; i < 5; ++i) - v_fut[i].wait(); - } - else - { - for (int i = 0; i < 5; ++i) - zstd_decompress(zstd_dctx_details[i], v_packed[i], v_data[i], a_sizes[i].first); - } - - 
deserialize_contig_details(v_data, id_batch * batch_size); - - unpacked_contig_data_batch_id = id_batch; -} - -// ******************************************************************************************* -void CCollection_V3::serialize_sample_names(vector& v_data) -{ - append(v_data, (uint32_t) sample_desc.size()); - - for (auto& x: sample_desc) - append(v_data, x.name); -} - -// ******************************************************************************************* -void CCollection_V3::deserialize_sample_names(vector& v_data) -{ - uint8_t* p = v_data.data(); - - uint32_t no_samples; - - read(p, no_samples); - - sample_desc.resize(no_samples); - - for (size_t i = 0; i < no_samples; ++i) - { - read(p, sample_desc[i].name); - sample_ids[sample_desc[i].name] = i; - } -} - -// ******************************************************************************************* -vector CCollection_V3::split_string(const string& s) -{ - auto p = s.begin(); - vector components; - - while (true) - { - auto q = find(p, s.end(), ' '); - components.push_back(string(p, q)); - if (q == s.end()) - break; - - p = q + 1; - } - - return components; -} - -// ******************************************************************************************* -string CCollection_V3::encode_split(vector& prev_split, vector& curr_split) -{ - string enc; - - for (size_t i = 0; i < curr_split.size(); ++i) - { - if (prev_split[i] == curr_split[i]) - enc.push_back(-127); // same component marker - else if (prev_split[i].size() != curr_split[i].size()) - enc.append(curr_split[i]); - else - { - signed char cnt = 0; - uint32_t cmp_len = curr_split[i].size(); - - auto p_ptr = prev_split[i].data(); - auto c_ptr = curr_split[i].data(); - - for (uint32_t j = 0; j < cmp_len; ++j) - { - if (p_ptr[j] == c_ptr[j]) - { - if (cnt == 100) - { - enc.push_back(-cnt); // repetition marker - cnt = 1; - } - else - ++cnt; - } - else - { - if (cnt) - { - enc.push_back(-cnt); // repetition marker - cnt = 0; - } - - 
enc.push_back(c_ptr[j]); - } - } - - if (cnt) - enc.push_back(-cnt); // repetition marker - } - - enc.push_back(' '); - } - - enc.pop_back(); // remove final space - - return enc; -} - -// ******************************************************************************************* -string CCollection_V3::decode_split(vector& prev_split, vector& curr_split) -{ - string dec; - string cmp; - - for (size_t i = 0; i < curr_split.size(); ++i) - { - if (curr_split[i].size() == 1 && (signed char) (curr_split[i].front()) == -127) // same component marker - { - dec.append(prev_split[i]); - curr_split[i] = prev_split[i]; - } - else - { - cmp.clear(); - auto p_ptr = prev_split[i].data(); - - for (signed char c : curr_split[i]) - { - if (c >= 0) - { - cmp.push_back(c); - ++p_ptr; - } - else - { - cmp.append(p_ptr, -c); - p_ptr += -c; - } - } - - dec.append(cmp); - curr_split[i] = move(cmp); - } - - dec.push_back(' '); - } - - dec.pop_back(); // remove final space - - return dec; -} - -// ******************************************************************************************* -void CCollection_V3::serialize_contig_names(vector& v_data, uint32_t id_from, uint32_t id_to) -{ - append(v_data, id_to - id_from); - - string p_name; - - vector sp_prev, sp_curr; - - for (auto p = sample_desc.begin() + id_from; p != sample_desc.begin() + id_to; ++p) - { - append(v_data, p->contigs.size()); - - vector prev_split; - vector curr_split; - - for (auto& x : p->contigs) - { - curr_split = split_string(x.name); - - if (curr_split.size() != prev_split.size()) - append(v_data, x.name); - else - append(v_data, encode_split(prev_split, curr_split)); - - prev_split = move(curr_split); - } - } -} - -// ******************************************************************************************* -void CCollection_V3::deserialize_contig_names(vector& v_data, size_t i_sample) -{ - uint8_t* p = v_data.data(); - - uint32_t no_samples_in_curr_batch; - uint32_t no_contigs_in_curr_sample; - - read(p, 
no_samples_in_curr_batch); - - for (size_t i = 0; i < no_samples_in_curr_batch; ++i) - { - read(p, no_contigs_in_curr_sample); - - auto& curr_sample = sample_desc[i_sample + i]; - - curr_sample.contigs.resize(no_contigs_in_curr_sample); - - vector prev_split; - vector curr_split; - string enc; - - for (size_t j = 0; j < no_contigs_in_curr_sample; ++j) - { - read(p, enc); - - curr_split = split_string(enc); - - if (curr_split.size() != prev_split.size()) - curr_sample.contigs[j].name = enc; - else - curr_sample.contigs[j].name = decode_split(prev_split, curr_split); - - prev_split = move(curr_split); - } - } - - // important only for appending mode - no_samples_in_last_batch = no_samples_in_curr_batch; -} - -// ******************************************************************************************* -void CCollection_V3::serialize_contig_details(array, 5>& v_data, uint32_t id_from, uint32_t id_to) -{ - append(v_data[0], id_to - id_from); - - clear_in_group_ids(); - - for (auto p = sample_desc.begin() + id_from; p != sample_desc.begin() + id_to; ++p) - { - append(v_data[0], p->contigs.size()); - - uint32_t pred_raw_length = segment_size + kmer_length; - - for (auto& x : p->contigs) - { - append(v_data[0], x.segments.size()); - - for (auto& seg : x.segments) - { - int prev_in_group_id = get_in_group_id(seg.group_id); - - uint32_t e_group_id = seg.group_id; - uint32_t e_in_group_id; - - if (prev_in_group_id == -1) - e_in_group_id = (int) seg.in_group_id; - else - { - if (seg.in_group_id == 0) - e_in_group_id = 0; - else if ((int)seg.in_group_id == prev_in_group_id + 1) - e_in_group_id = 1; - else - e_in_group_id = (uint32_t)zigzag_encode(seg.in_group_id, prev_in_group_id + 1) + 1u; - } - - uint32_t e_raw_length = (uint32_t)zigzag_encode(seg.raw_length, pred_raw_length); - - append(v_data[1], e_group_id); - append(v_data[2], e_in_group_id); - append(v_data[3], e_raw_length); - append(v_data[4], (uint32_t)seg.is_rev_comp); - - if ((int) seg.in_group_id > 
prev_in_group_id && seg.in_group_id > 0) - set_in_group_id(seg.group_id, seg.in_group_id); - } - } - } -} - -// ******************************************************************************************* -void CCollection_V3::deserialize_contig_details(array, 5>& v_data, size_t i_sample) -{ - array, 5> v_det; - - uint8_t* p = v_data[0].data(); - - uint32_t no_samples_in_curr_batch; - uint32_t no_contigs_in_curr_sample; - uint32_t no_segments_in_curr_contig; - size_t no_items = 0; - - read(p, no_samples_in_curr_batch); - - for (size_t i = 0; i < no_samples_in_curr_batch; ++i) - { - read(p, no_contigs_in_curr_sample); - - auto& curr_sample = sample_desc[i_sample + i]; - - curr_sample.contigs.resize(no_contigs_in_curr_sample); - - for (size_t j = 0; j < no_contigs_in_curr_sample; ++j) - { - read(p, no_segments_in_curr_contig); - - curr_sample.contigs[j].segments.resize(no_segments_in_curr_contig); - - no_items += no_segments_in_curr_contig; - } - } - - for(int i = 1; i < 5; ++i) - { - v_det[i].resize(no_items); - - p = v_data[i].data(); - - for (size_t j = 0; j < no_items; ++j) - read(p, v_det[i][j]); - } - - no_items = 0; - - clear_in_group_ids(); - - uint32_t pred_raw_length = segment_size + kmer_length; - - for (size_t i = 0; i < no_samples_in_curr_batch; ++i) - { - auto& curr_sample = sample_desc[i_sample + i]; - - for (size_t j = 0; j < curr_sample.contigs.size(); ++j) - { - auto& curr_contig = curr_sample.contigs[j]; - - for (size_t k = 0; k < curr_contig.segments.size(); ++k, ++no_items) - { - uint32_t c_group_id = v_det[1][no_items]; - - curr_contig.segments[k].group_id = c_group_id; - int prev_in_group_id = get_in_group_id(c_group_id); - - uint32_t e_in_group_id = v_det[2][no_items]; - uint32_t c_in_group_id; - - if (prev_in_group_id == -1) - c_in_group_id = e_in_group_id; - else - { - if (e_in_group_id == 0) - c_in_group_id = 0; - else if (e_in_group_id == 1) - c_in_group_id = prev_in_group_id + 1; - else - c_in_group_id = 
(uint32_t)zigzag_decode(e_in_group_id - 1u, prev_in_group_id + 1); - - } - - curr_contig.segments[k].in_group_id = c_in_group_id; - - uint32_t c_raw_length = (uint32_t)zigzag_decode(v_det[3][no_items], pred_raw_length); - curr_contig.segments[k].raw_length = c_raw_length; - - curr_contig.segments[k].is_rev_comp = (bool)v_det[4][no_items]; - - if ((int)c_in_group_id > prev_in_group_id && c_in_group_id > 0) - set_in_group_id(c_group_id, c_in_group_id); - } - } - } -} - -// ******************************************************************************************* -void CCollection_V3::store_contig_batch(uint32_t id_from, uint32_t id_to) -{ - lock_guard lck(mtx); - - if (no_threads > 1) - { - future fut_contigs = async([&]() {this->store_batch_contig_names(id_from, id_to); }); - store_batch_contig_details(id_from, id_to); - fut_contigs.wait(); - } - else - { - store_batch_contig_names(id_from, id_to); - store_batch_contig_details(id_from, id_to); - } - - for (auto p = sample_desc.begin() + id_from; p != sample_desc.begin() + id_to; ++p) - { - p->contigs.clear(); - p->contigs.shrink_to_fit(); - } -} - -// ******************************************************************************************* -bool CCollection_V3::register_sample_contig(const string& sample_name, const string& contig_name) -{ - string short_contig_name = extract_contig_name(contig_name); - string stored_sample_name = sample_name; - - lock_guard lck(mtx); - - if (sample_name.empty()) - stored_sample_name = short_contig_name; - - if (stored_sample_name != prev_sample_name) - { - auto q = sample_ids.find(stored_sample_name); - if (q != sample_ids.end()) - return false; // sample of the same name was already registered (prior to previous sample_name) - - uint32_t sample_id = (uint32_t)sample_ids.size(); - sample_ids[stored_sample_name] = sample_id; - sample_desc.emplace_back(stored_sample_name); - - prev_sample_name = stored_sample_name; - } - - 
sample_desc.back().contigs.emplace_back(contig_desc_t(contig_name)); - - return true; -} - -// ******************************************************************************************* -void CCollection_V3::reset_prev_sample_name() -{ - lock_guard lck(mtx); - - prev_sample_name.clear(); -} - -// ******************************************************************************************* -void CCollection_V3::add_segment_placed(const string& sample_name, const string& contig_name, const uint32_t place, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length) -{ - lock_guard lck(mtx); - - string stored_sample_name = sample_name; - - if (sample_name.empty()) - stored_sample_name = extract_contig_name(contig_name); - - if (placing_sample_name != stored_sample_name) - { - placing_sample_name = stored_sample_name; - placing_sample_id = sample_ids.find(stored_sample_name)->second; - } - - for (auto& x : sample_desc[placing_sample_id].contigs) - { - if (x.name == contig_name) - { - if (place >= x.segments.size()) - x.segments.resize(place + 1); - - x.segments[place] = segment_desc_t(group_id, in_group_id, is_rev_comp, raw_length); - - return; - } - } -} - -// ******************************************************************************************* -void CCollection_V3::add_segments_placed(vector& segments_to_place) -{ - lock_guard lck(mtx); - - for (const auto& desc : segments_to_place) - { - auto p = sample_ids.find(desc.sample_name); - - if (p == sample_ids.end()) - { - assert("Wrong sample name\n"); - return; - } - - for (auto& x : sample_desc[p->second].contigs) - { - if (x.name == desc.contig_name) - { - if (desc.seg_part_no >= x.segments.size()) - x.segments.resize(desc.seg_part_no + 1); - - x.segments[desc.seg_part_no] = segment_desc_t(desc.group_id, desc.in_group_id, desc.is_rev_comp, desc.data_size); - - break; - } - } - } -} -// 
******************************************************************************************* -bool CCollection_V3::get_reference_name(string& reference_name) -{ - lock_guard lck(mtx); - - if (sample_desc.empty()) - return false; - - reference_name = sample_desc.front().name; - - return true; -} - -// ******************************************************************************************* -bool CCollection_V3::get_samples_list(vector& v_samples) -{ - lock_guard lck(mtx); - - v_samples.clear(); - v_samples.reserve(sample_desc.size()); - - for (auto& x : sample_desc) - v_samples.emplace_back(x.name); - - sort(v_samples.begin(), v_samples.end()); - - return true; -} - -// ******************************************************************************************* -bool CCollection_V3::get_contig_list_in_sample(const string& sample_name, vector& v_contig_names) -{ - lock_guard lck(mtx); - - auto p = sample_ids.find(sample_name); - - if (p == sample_ids.end()) - return false; // Error: no such a sample - - if (sample_desc[p->second].contigs.empty()) - load_batch_contig_names(p->second / batch_size); - - v_contig_names.clear(); - v_contig_names.reserve(sample_desc[p->second].contigs.size()); - - for (auto& x : sample_desc[p->second].contigs) - v_contig_names.emplace_back(x.name); - - return true; -} - -// ******************************************************************************************* -bool CCollection_V3::get_sample_desc(const string& sample_name, vector>>& sample_desc_) -{ - lock_guard lck(mtx); - - sample_desc_.clear(); - - auto p = sample_ids.find(sample_name); - - if (p == sample_ids.end()) - return false; // Error: no such a sample - - if (sample_desc[p->second].contigs.empty()) - { - load_batch_contig_names(p->second / batch_size); - - load_batch_contig_details(p->second / batch_size); - } - - sample_desc_.reserve(sample_desc[p->second].contigs.size()); - - for (auto& x : sample_desc[p->second].contigs) - sample_desc_.emplace_back(x.name, x.segments); - 
- return true; -} - -// ******************************************************************************************* -bool CCollection_V3::get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc) -{ - lock_guard lck(mtx); - - string short_contig_name = extract_contig_name(contig_name); - - contig_desc.clear(); - - auto p = sample_ids.find(sample_name); - - if (p == sample_ids.end()) - return false; // Error: no such a sample - - if (sample_desc[p->second].contigs.empty()) - load_batch_contig_names(p->second / batch_size); - - if (sample_desc[p->second].contigs.empty() || sample_desc[p->second].contigs.front().segments.empty()) - load_batch_contig_details(p->second / batch_size); - - for (auto& x : sample_desc[p->second].contigs) - { - if (extract_contig_name(x.name) == short_contig_name) - { - contig_desc = x.segments; - contig_name = x.name; - return true; - } - } - - return false; -} - -// ******************************************************************************************* -bool CCollection_V3::is_contig_desc(const string& sample_name, const string& contig_name) -{ - lock_guard lck(mtx); - - string short_contig_name = extract_contig_name(contig_name); - - auto p = sample_ids.find(sample_name); - - if (p == sample_ids.end()) - return false; // Error: no such a sample - - if (sample_desc[p->second].contigs.empty()) - load_batch_contig_names(p->second / batch_size); - - for (auto& x : sample_desc[p->second].contigs) - if (extract_contig_name(x.name) == contig_name) - return true; - - return false; -} - -// ******************************************************************************************* -vector CCollection_V3::get_samples_for_contig(const string& contig_name) -{ - lock_guard lck(mtx); - - vector v_samples; - - string short_contig_name = extract_contig_name(contig_name); - - size_t no_batches = (sample_desc.size() + batch_size - 1) / batch_size; - - for (size_t i = 0; i < no_batches; ++i) - { - if (sample_desc[i * 
batch_size].contigs.empty()) - load_batch_contig_names(i); - - size_t to_batch_id = min(sample_desc.size(), (i + 1) * batch_size); - - for (size_t j = i * batch_size; j < to_batch_id; ++j) - { - for (auto& x : sample_desc[j].contigs) - if(extract_contig_name(x.name) == short_contig_name) - v_samples.emplace_back(sample_desc[j].name); - } - - clear_batch_contig(i); - } - - return v_samples; -} - -// ******************************************************************************************* -size_t CCollection_V3::get_no_samples() -{ - lock_guard lck(mtx); - - return sample_desc.size(); -} - -// ******************************************************************************************* -int32_t CCollection_V3::get_no_contigs(const string& sample_name) -{ - lock_guard lck(mtx); - - auto p = sample_ids.find(sample_name); - - if (p == sample_ids.end()) - return -1; // Error: no such a sample - - if (sample_desc[p->second].contigs.empty()) - load_batch_contig_names(p->second / batch_size); - - return (int32_t) sample_desc[p->second].contigs.size(); -} - -// EOF +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "collection_v3.h" +#include +#include + +// ******************************************************************************************* +bool CCollection_V3::set_archives(shared_ptr _in_archive, shared_ptr _out_archive, + uint32_t _no_threads, size_t _batch_size, uint32_t _segment_size, uint32_t _kmer_length) +{ + lock_guard lck(mtx); + + in_archive = _in_archive; + out_archive = _out_archive; + + batch_size = _batch_size; + segment_size = _segment_size; + no_threads = _no_threads; + kmer_length = _kmer_length; + + if (in_archive == nullptr) + return prepare_for_compression(); + else if (out_archive == nullptr) + return prepare_for_decompression(); + else + return prepare_for_appending_copy(); +} + +// ******************************************************************************************* +bool CCollection_V3::prepare_for_compression() +{ + collection_samples_id = out_archive->RegisterStream("collection-samples"); + collection_contig_id = out_archive->RegisterStream("collection-contigs"); + collection_details_id = out_archive->RegisterStream("collection-details"); + + return true; +} + +// ******************************************************************************************* +bool CCollection_V3::prepare_for_appending_copy() +{ +// auto in_collection_samples_id = in_archive->GetStreamId("collection-samples"); + auto in_collection_contig_id = in_archive->GetStreamId("collection-contigs"); + auto in_collection_details_id = in_archive->GetStreamId("collection-details"); + + collection_samples_id = out_archive->RegisterStream("collection-samples"); + collection_contig_id = out_archive->RegisterStream("collection-contigs"); + collection_details_id = 
out_archive->RegisterStream("collection-details"); + + load_batch_sample_names(); + + // in and out ids for collection-* must be the same! + + auto no_contig_batches = in_archive->GetNoParts(in_collection_contig_id); + + // Transfer all but the last one batch from in to out archive + vector data; + uint64_t meta; + + for (size_t i = 0; i < no_contig_batches - 1; ++i) + { + in_archive->GetPart(in_collection_contig_id, i, data, meta); + out_archive->AddPart(collection_contig_id, data, meta); + in_archive->GetPart(in_collection_details_id, i, data, meta); + out_archive->AddPart(collection_details_id, data, meta); + } + + return true; +} + +// ******************************************************************************************* +bool CCollection_V3::prepare_for_appending_load_last_batch() +{ + lock_guard lck(mtx); + + auto in_collection_contig_id = in_archive->GetStreamId("collection-contigs"); + auto in_collection_details_id = in_archive->GetStreamId("collection-details"); + + auto no_contig_batches = in_archive->GetNoParts(in_collection_contig_id); + + // Transfer all but the last one batch from in to out archive + vector data; + uint64_t meta; + + // Load last batch + load_batch_contig_names(no_contig_batches - 1); + load_batch_contig_details(no_contig_batches - 1); + + if (no_samples_in_last_batch == batch_size) + { + in_archive->GetPart(in_collection_contig_id, no_contig_batches - 1, data, meta); + out_archive->AddPart(collection_contig_id, data, meta); + in_archive->GetPart(in_collection_details_id, no_contig_batches - 1, data, meta); + out_archive->AddPart(collection_details_id, data, meta); + + clear_batch_contig(no_contig_batches - 1); + } + + return true; +} + +// ******************************************************************************************* +bool CCollection_V3::prepare_for_decompression() +{ + collection_samples_id = in_archive->GetStreamId("collection-samples"); + collection_contig_id = in_archive->GetStreamId("collection-contigs"); + 
collection_details_id = in_archive->GetStreamId("collection-details"); + + load_batch_sample_names(); + + return true; +} + +// ******************************************************************************************* +void CCollection_V3::complete_serialization() +{ + lock_guard lck(mtx); + + store_batch_sample_names(); +} + +// ******************************************************************************************* +void CCollection_V3::zstd_compress(ZSTD_CCtx*& cctx, vector& v_input, vector& v_output, int level) +{ + if (cctx == nullptr) + { + cctx = ZSTD_createCCtx(); + } + + v_output.resize(ZSTD_compressBound(v_input.size())); + auto c_size = ZSTD_compressCCtx(cctx, v_output.data(), v_output.size(), v_input.data(), v_input.size(), level); + + v_output.resize(c_size); +} + +// ******************************************************************************************* +void CCollection_V3::zstd_decompress(ZSTD_DCtx*& dctx, vector& v_input, vector& v_output, size_t raw_size) +{ + if (dctx == nullptr) + dctx = ZSTD_createDCtx(); + + v_output.resize(raw_size); + ZSTD_decompressDCtx(dctx, v_output.data(), v_output.size(), v_input.data(), v_input.size()); +} + +// ******************************************************************************************* +void CCollection_V3::store_batch_sample_names() +{ + vector v_data, v_tmp; + + determine_collection_samples_id(); + + serialize_sample_names(v_tmp); + + zstd_compress(zstd_cctx_samples, v_tmp, v_data, 19); + + out_archive->AddPartBuffered(collection_samples_id, v_data, v_tmp.size()); +} + +// ******************************************************************************************* +void CCollection_V3::load_batch_sample_names() +{ + vector v_data, v_tmp; + uint64_t raw_size; + + determine_collection_samples_id(); + + in_archive->GetPart(collection_samples_id, v_tmp, raw_size); + + zstd_decompress(zstd_dctx_samples, v_tmp, v_data, raw_size); + + deserialize_sample_names(v_data); +} + +// 
******************************************************************************************* +void CCollection_V3::store_batch_contig_names(uint32_t id_from, uint32_t id_to) +{ + vector v_data, v_tmp; + + determine_collection_contig_id(); + + serialize_contig_names(v_tmp, id_from, id_to); + + zstd_compress(zstd_cctx_contigs, v_tmp, v_data, 18); + + out_archive->AddPartBuffered(collection_contig_id, v_data, v_tmp.size()); +} + +// ******************************************************************************************* +void CCollection_V3::load_batch_contig_names(size_t id_batch) +{ + vector v_data, v_tmp; + uint64_t raw_size; + + if (unpacked_contig_data_batch_id >= 0 && unpacked_contig_data_batch_id != (int) id_batch) + clear_batch_contig(unpacked_contig_data_batch_id); + + determine_collection_contig_id(); + + in_archive->GetPart(collection_contig_id, id_batch, v_tmp, raw_size); + + zstd_decompress(zstd_dctx_contigs, v_tmp, v_data, raw_size); + + deserialize_contig_names(v_data, id_batch * batch_size); + + unpacked_contig_data_batch_id = id_batch; +} + +// ******************************************************************************************* +void CCollection_V3::clear_batch_contig(size_t id_batch) +{ + size_t to_batch_id = min(sample_desc.size(), (id_batch + 1) * batch_size); + + for (size_t i = id_batch * batch_size; i < to_batch_id; ++i) + { + sample_desc[i].contigs.clear(); + sample_desc[i].contigs.shrink_to_fit(); + } +} + +// ******************************************************************************************* +void CCollection_V3::store_batch_contig_details(uint32_t id_from, uint32_t id_to) +{ + array, 5> v_data; + array, 5> v_packed; + vector v_stream; + + determnine_collection_details_id(); + + serialize_contig_details(v_data, id_from, id_to); + + if (no_threads >= 4) + { + vector> v_fut; + v_fut.reserve(5); + + for (int i = 0; i < 5; ++i) + v_fut.emplace_back(async([&, i]() {zstd_compress(zstd_cctx_details[i], v_data[i], v_packed[i], 19); 
})); + + for (int i = 0; i < 5; ++i) + v_fut[i].wait(); + } + else + { + for (int i = 0; i < 5; ++i) + zstd_compress(zstd_cctx_details[i], v_data[i], v_packed[i], 19); + } + + for (int i = 0; i < 5; ++i) + { + append(v_stream, v_data[i].size()); + append(v_stream, v_packed[i].size()); + } + + for (int i = 0; i < 5; ++i) + v_stream.insert(v_stream.end(), v_packed[i].begin(), v_packed[i].end()); + + out_archive->AddPartBuffered(collection_details_id, v_stream, 0); +} + +// ******************************************************************************************* +void CCollection_V3::load_batch_contig_details(size_t id_batch) +{ + array, 5> v_data; + array, 5> v_packed; + vector v_stream; + + uint64_t aux; + + if (unpacked_contig_data_batch_id >= 0 && unpacked_contig_data_batch_id != (int) id_batch) + clear_batch_contig(unpacked_contig_data_batch_id); + + determnine_collection_details_id(); + + in_archive->GetPart(collection_details_id, id_batch, v_stream, aux); + + uint8_t* ptr = v_stream.data(); + array, 5> a_sizes; + + for (int i = 0; i < 5; ++i) + { + read(ptr, a_sizes[i].first); + read(ptr, a_sizes[i].second); + } + + for (int i = 0; i < 5; ++i) + { + v_packed[i].assign(ptr, ptr + a_sizes[i].second); + ptr += a_sizes[i].second; + } + + if (no_threads >= 4) + { + vector> v_fut; + v_fut.reserve(5); + + for (int i = 0; i < 5; ++i) + v_fut.emplace_back(async([&, i]() {zstd_decompress(zstd_dctx_details[i], v_packed[i], v_data[i], a_sizes[i].first); })); + + for (int i = 0; i < 5; ++i) + v_fut[i].wait(); + } + else + { + for (int i = 0; i < 5; ++i) + zstd_decompress(zstd_dctx_details[i], v_packed[i], v_data[i], a_sizes[i].first); + } + + deserialize_contig_details(v_data, id_batch * batch_size); + + unpacked_contig_data_batch_id = id_batch; +} + +// ******************************************************************************************* +void CCollection_V3::serialize_sample_names(vector& v_data) +{ + append(v_data, (uint32_t) sample_desc.size()); + + for (auto& 
x: sample_desc) + append(v_data, x.name); +} + +// ******************************************************************************************* +void CCollection_V3::deserialize_sample_names(vector& v_data) +{ + uint8_t* p = v_data.data(); + + uint32_t no_samples; + + read(p, no_samples); + + sample_desc.resize(no_samples); + + for (size_t i = 0; i < no_samples; ++i) + { + read(p, sample_desc[i].name); + sample_ids[sample_desc[i].name] = i; + } +} + +// ******************************************************************************************* +vector CCollection_V3::split_string(const string& s) +{ + auto p = s.begin(); + vector components; + + while (true) + { + auto q = find(p, s.end(), ' '); + components.push_back(string(p, q)); + if (q == s.end()) + break; + + p = q + 1; + } + + return components; +} + +// ******************************************************************************************* +string CCollection_V3::encode_split(vector& prev_split, vector& curr_split) +{ + string enc; + + for (size_t i = 0; i < curr_split.size(); ++i) + { + if (prev_split[i] == curr_split[i]) + enc.push_back(-127); // same component marker + else if (prev_split[i].size() != curr_split[i].size()) + enc.append(curr_split[i]); + else + { + signed char cnt = 0; + uint32_t cmp_len = curr_split[i].size(); + + auto p_ptr = prev_split[i].data(); + auto c_ptr = curr_split[i].data(); + + for (uint32_t j = 0; j < cmp_len; ++j) + { + if (p_ptr[j] == c_ptr[j]) + { + if (cnt == 100) + { + enc.push_back(-cnt); // repetition marker + cnt = 1; + } + else + ++cnt; + } + else + { + if (cnt) + { + enc.push_back(-cnt); // repetition marker + cnt = 0; + } + + enc.push_back(c_ptr[j]); + } + } + + if (cnt) + enc.push_back(-cnt); // repetition marker + } + + enc.push_back(' '); + } + + enc.pop_back(); // remove final space + + return enc; +} + +// ******************************************************************************************* +string CCollection_V3::decode_split(vector& prev_split, 
vector& curr_split) +{ + string dec; + string cmp; + + for (size_t i = 0; i < curr_split.size(); ++i) + { + if (curr_split[i].size() == 1 && (signed char) (curr_split[i].front()) == -127) // same component marker + { + dec.append(prev_split[i]); + curr_split[i] = prev_split[i]; + } + else + { + cmp.clear(); + auto p_ptr = prev_split[i].data(); + + for (signed char c : curr_split[i]) + { + if (c >= 0) + { + cmp.push_back(c); + ++p_ptr; + } + else + { + cmp.append(p_ptr, -c); + p_ptr += -c; + } + } + + dec.append(cmp); + curr_split[i] = move(cmp); + } + + dec.push_back(' '); + } + + dec.pop_back(); // remove final space + + return dec; +} + +// ******************************************************************************************* +void CCollection_V3::serialize_contig_names(vector& v_data, uint32_t id_from, uint32_t id_to) +{ + append(v_data, id_to - id_from); + + string p_name; + + vector sp_prev, sp_curr; + + for (auto p = sample_desc.begin() + id_from; p != sample_desc.begin() + id_to; ++p) + { + append(v_data, p->contigs.size()); + + vector prev_split; + vector curr_split; + + for (auto& x : p->contigs) + { + curr_split = split_string(x.name); + + if (curr_split.size() != prev_split.size()) + append(v_data, x.name); + else + append(v_data, encode_split(prev_split, curr_split)); + + prev_split = move(curr_split); + } + } +} + +// ******************************************************************************************* +void CCollection_V3::deserialize_contig_names(vector& v_data, size_t i_sample) +{ + uint8_t* p = v_data.data(); + + uint32_t no_samples_in_curr_batch; + uint32_t no_contigs_in_curr_sample; + + read(p, no_samples_in_curr_batch); + + for (size_t i = 0; i < no_samples_in_curr_batch; ++i) + { + read(p, no_contigs_in_curr_sample); + + auto& curr_sample = sample_desc[i_sample + i]; + + curr_sample.contigs.resize(no_contigs_in_curr_sample); + + vector prev_split; + vector curr_split; + string enc; + + for (size_t j = 0; j < 
no_contigs_in_curr_sample; ++j) + { + read(p, enc); + + curr_split = split_string(enc); + + if (curr_split.size() != prev_split.size()) + curr_sample.contigs[j].name = enc; + else + curr_sample.contigs[j].name = decode_split(prev_split, curr_split); + + prev_split = move(curr_split); + } + } + + // important only for appending mode + no_samples_in_last_batch = no_samples_in_curr_batch; +} + +// ******************************************************************************************* +void CCollection_V3::serialize_contig_details(array, 5>& v_data, uint32_t id_from, uint32_t id_to) +{ + append(v_data[0], id_to - id_from); + + clear_in_group_ids(); + + for (auto p = sample_desc.begin() + id_from; p != sample_desc.begin() + id_to; ++p) + { + append(v_data[0], p->contigs.size()); + + uint32_t pred_raw_length = segment_size + kmer_length; + + for (auto& x : p->contigs) + { + append(v_data[0], x.segments.size()); + + for (auto& seg : x.segments) + { + int prev_in_group_id = get_in_group_id(seg.group_id); + + uint32_t e_group_id = seg.group_id; + uint32_t e_in_group_id; + + if (prev_in_group_id == -1) + e_in_group_id = (int) seg.in_group_id; + else + { + if (seg.in_group_id == 0) + e_in_group_id = 0; + else if ((int)seg.in_group_id == prev_in_group_id + 1) + e_in_group_id = 1; + else + e_in_group_id = (uint32_t)zigzag_encode(seg.in_group_id, prev_in_group_id + 1) + 1u; + } + + uint32_t e_raw_length = (uint32_t)zigzag_encode(seg.raw_length, pred_raw_length); + + append(v_data[1], e_group_id); + append(v_data[2], e_in_group_id); + append(v_data[3], e_raw_length); + append(v_data[4], (uint32_t)seg.is_rev_comp); + + if ((int) seg.in_group_id > prev_in_group_id && seg.in_group_id > 0) + set_in_group_id(seg.group_id, seg.in_group_id); + } + } + } +} + +// ******************************************************************************************* +void CCollection_V3::deserialize_contig_details(array, 5>& v_data, size_t i_sample) +{ + array, 5> v_det; + + uint8_t* p = 
v_data[0].data(); + + uint32_t no_samples_in_curr_batch; + uint32_t no_contigs_in_curr_sample; + uint32_t no_segments_in_curr_contig; + size_t no_items = 0; + + read(p, no_samples_in_curr_batch); + + for (size_t i = 0; i < no_samples_in_curr_batch; ++i) + { + read(p, no_contigs_in_curr_sample); + + auto& curr_sample = sample_desc[i_sample + i]; + + curr_sample.contigs.resize(no_contigs_in_curr_sample); + + for (size_t j = 0; j < no_contigs_in_curr_sample; ++j) + { + read(p, no_segments_in_curr_contig); + + curr_sample.contigs[j].segments.resize(no_segments_in_curr_contig); + + no_items += no_segments_in_curr_contig; + } + } + + for(int i = 1; i < 5; ++i) + { + v_det[i].resize(no_items); + + p = v_data[i].data(); + + for (size_t j = 0; j < no_items; ++j) + read(p, v_det[i][j]); + } + + no_items = 0; + + clear_in_group_ids(); + + uint32_t pred_raw_length = segment_size + kmer_length; + + for (size_t i = 0; i < no_samples_in_curr_batch; ++i) + { + auto& curr_sample = sample_desc[i_sample + i]; + + for (size_t j = 0; j < curr_sample.contigs.size(); ++j) + { + auto& curr_contig = curr_sample.contigs[j]; + + for (size_t k = 0; k < curr_contig.segments.size(); ++k, ++no_items) + { + uint32_t c_group_id = v_det[1][no_items]; + + curr_contig.segments[k].group_id = c_group_id; + int prev_in_group_id = get_in_group_id(c_group_id); + + uint32_t e_in_group_id = v_det[2][no_items]; + uint32_t c_in_group_id; + + if (prev_in_group_id == -1) + c_in_group_id = e_in_group_id; + else + { + if (e_in_group_id == 0) + c_in_group_id = 0; + else if (e_in_group_id == 1) + c_in_group_id = prev_in_group_id + 1; + else + c_in_group_id = (uint32_t)zigzag_decode(e_in_group_id - 1u, prev_in_group_id + 1); + + } + + curr_contig.segments[k].in_group_id = c_in_group_id; + + uint32_t c_raw_length = (uint32_t)zigzag_decode(v_det[3][no_items], pred_raw_length); + curr_contig.segments[k].raw_length = c_raw_length; + + curr_contig.segments[k].is_rev_comp = (bool)v_det[4][no_items]; + + if 
((int)c_in_group_id > prev_in_group_id && c_in_group_id > 0) + set_in_group_id(c_group_id, c_in_group_id); + } + } + } +} + +// ******************************************************************************************* +void CCollection_V3::store_contig_batch(uint32_t id_from, uint32_t id_to) +{ + lock_guard lck(mtx); + + if (no_threads > 1) + { + future fut_contigs = async([&]() {this->store_batch_contig_names(id_from, id_to); }); + store_batch_contig_details(id_from, id_to); + fut_contigs.wait(); + } + else + { + store_batch_contig_names(id_from, id_to); + store_batch_contig_details(id_from, id_to); + } + + for (auto p = sample_desc.begin() + id_from; p != sample_desc.begin() + id_to; ++p) + { + p->contigs.clear(); + p->contigs.shrink_to_fit(); + } +} + +// ******************************************************************************************* +bool CCollection_V3::register_sample_contig(const string& sample_name, const string& contig_name) +{ + string short_contig_name = extract_contig_name(contig_name); + string stored_sample_name = sample_name; + + lock_guard lck(mtx); + + if (sample_name.empty()) + stored_sample_name = short_contig_name; + + if (stored_sample_name != prev_sample_name) + { + auto q = sample_ids.find(stored_sample_name); + if (q != sample_ids.end()) + return false; // sample of the same name was already registered (prior to previous sample_name) + + uint32_t sample_id = (uint32_t)sample_ids.size(); + sample_ids[stored_sample_name] = sample_id; + sample_desc.emplace_back(stored_sample_name); + + prev_sample_name = stored_sample_name; + } + + sample_desc.back().contigs.emplace_back(contig_desc_t(contig_name)); + + return true; +} + +// ******************************************************************************************* +void CCollection_V3::reset_prev_sample_name() +{ + lock_guard lck(mtx); + + prev_sample_name.clear(); +} + +// ******************************************************************************************* +void 
CCollection_V3::add_segment_placed(const string& sample_name, const string& contig_name, const uint32_t place, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length) +{ + lock_guard lck(mtx); + + string stored_sample_name = sample_name; + + if (sample_name.empty()) + stored_sample_name = extract_contig_name(contig_name); + + if (placing_sample_name != stored_sample_name) + { + placing_sample_name = stored_sample_name; + placing_sample_id = sample_ids.find(stored_sample_name)->second; + } + + for (auto& x : sample_desc[placing_sample_id].contigs) + { + if (x.name == contig_name) + { + if (place >= x.segments.size()) + x.segments.resize(place + 1); + + x.segments[place] = segment_desc_t(group_id, in_group_id, is_rev_comp, raw_length); + + return; + } + } +} + +// ******************************************************************************************* +void CCollection_V3::add_segments_placed(vector& segments_to_place) +{ + lock_guard lck(mtx); + + for (const auto& desc : segments_to_place) + { + string sample_name = desc.sample_name; + + if (sample_name.empty()) + sample_name = extract_contig_name(desc.contig_name); + + auto p = sample_ids.find(sample_name); + + if (p == sample_ids.end()) + { + assert("Wrong sample name\n"); + return; + } + + for (auto& x : sample_desc[p->second].contigs) + { + if (x.name == desc.contig_name) + { + if (desc.seg_part_no >= x.segments.size()) + x.segments.resize(desc.seg_part_no + 1); + + x.segments[desc.seg_part_no] = segment_desc_t(desc.group_id, desc.in_group_id, desc.is_rev_comp, desc.data_size); + + break; + } + } + } +} + +// ******************************************************************************************* +bool CCollection_V3::get_reference_name(string& reference_name) +{ + lock_guard lck(mtx); + + if (sample_desc.empty()) + return false; + + reference_name = sample_desc.front().name; + + return true; +} + +// 
******************************************************************************************* +bool CCollection_V3::get_samples_list(vector& v_samples, bool sorted) +{ + lock_guard lck(mtx); + + v_samples.clear(); + v_samples.reserve(sample_desc.size()); + + for (auto& x : sample_desc) + v_samples.emplace_back(x.name); + + if(sorted) + sort(v_samples.begin(), v_samples.end()); + + return true; +} + +// ******************************************************************************************* +bool CCollection_V3::get_contig_list_in_sample(const string& sample_name, vector& v_contig_names) +{ + lock_guard lck(mtx); + + auto p = sample_ids.find(sample_name); + + if (p == sample_ids.end()) + return false; // Error: no such a sample + + if (sample_desc[p->second].contigs.empty()) + load_batch_contig_names(p->second / batch_size); + + v_contig_names.clear(); + v_contig_names.reserve(sample_desc[p->second].contigs.size()); + + for (auto& x : sample_desc[p->second].contigs) + v_contig_names.emplace_back(x.name); + + return true; +} + +// ******************************************************************************************* +bool CCollection_V3::get_sample_desc(const string& sample_name, vector>>& sample_desc_) +{ + lock_guard lck(mtx); + + sample_desc_.clear(); + + auto p = sample_ids.find(sample_name); + + if (p == sample_ids.end()) + return false; // Error: no such a sample + + if (sample_desc[p->second].contigs.empty()) + { + load_batch_contig_names(p->second / batch_size); + + load_batch_contig_details(p->second / batch_size); + } + + sample_desc_.reserve(sample_desc[p->second].contigs.size()); + + for (auto& x : sample_desc[p->second].contigs) + sample_desc_.emplace_back(x.name, x.segments); + + return true; +} + +// ******************************************************************************************* +bool CCollection_V3::get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc) +{ + lock_guard lck(mtx); + + string 
short_contig_name = extract_contig_name(contig_name); + + contig_desc.clear(); + + auto p = sample_ids.find(sample_name); + + if (p == sample_ids.end()) + return false; // Error: no such a sample + + if (sample_desc[p->second].contigs.empty()) + load_batch_contig_names(p->second / batch_size); + + if (sample_desc[p->second].contigs.empty() || sample_desc[p->second].contigs.front().segments.empty()) + load_batch_contig_details(p->second / batch_size); + + for (auto& x : sample_desc[p->second].contigs) + { + if (extract_contig_name(x.name) == short_contig_name) + { + contig_desc = x.segments; + contig_name = x.name; + return true; + } + } + + return false; +} + +// ******************************************************************************************* +bool CCollection_V3::is_contig_desc(const string& sample_name, const string& contig_name) +{ + lock_guard lck(mtx); + + string short_contig_name = extract_contig_name(contig_name); + + auto p = sample_ids.find(sample_name); + + if (p == sample_ids.end()) + return false; // Error: no such a sample + + if (sample_desc[p->second].contigs.empty()) + load_batch_contig_names(p->second / batch_size); + + for (auto& x : sample_desc[p->second].contigs) + if (extract_contig_name(x.name) == contig_name) + return true; + + return false; +} + +// ******************************************************************************************* +vector CCollection_V3::get_samples_for_contig(const string& contig_name) +{ + lock_guard lck(mtx); + + vector v_samples; + + string short_contig_name = extract_contig_name(contig_name); + + size_t no_batches = (sample_desc.size() + batch_size - 1) / batch_size; + + for (size_t i = 0; i < no_batches; ++i) + { + if (sample_desc[i * batch_size].contigs.empty()) + load_batch_contig_names(i); + + size_t to_batch_id = min(sample_desc.size(), (i + 1) * batch_size); + + for (size_t j = i * batch_size; j < to_batch_id; ++j) + { + for (auto& x : sample_desc[j].contigs) + if(extract_contig_name(x.name) == 
short_contig_name) + v_samples.emplace_back(sample_desc[j].name); + } + + clear_batch_contig(i); + } + + return v_samples; +} + +// ******************************************************************************************* +size_t CCollection_V3::get_no_samples() +{ + lock_guard lck(mtx); + + return sample_desc.size(); +} + +// ******************************************************************************************* +int32_t CCollection_V3::get_no_contigs(const string& sample_name) +{ + lock_guard lck(mtx); + + auto p = sample_ids.find(sample_name); + + if (p == sample_ids.end()) + return -1; // Error: no such a sample + + if (sample_desc[p->second].contigs.empty()) + load_batch_contig_names(p->second / batch_size); + + return (int32_t) sample_desc[p->second].contigs.size(); +} + +// EOF diff --git a/src/core/collection_v3.h b/src/common/collection_v3.h similarity index 95% rename from src/core/collection_v3.h rename to src/common/collection_v3.h index bb79b11..135040d 100644 --- a/src/core/collection_v3.h +++ b/src/common/collection_v3.h @@ -1,264 +1,264 @@ -#ifndef _COLLECTION_V3_H -#define _COLLECTION_V3_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "collection.h" -#include "archive.h" - -class CCollection_V3 : public CCollection -{ - struct contig_desc_t { - string name; - vector segments; - - contig_desc_t() : name("") {}; - - contig_desc_t(const contig_desc_t& x) { - name = x.name; - segments = x.segments; - } - - contig_desc_t(contig_desc_t&& x) noexcept { - name = move(x.name); - segments = move(x.segments); - } - - contig_desc_t(const string& _name) : name(_name) {}; - - contig_desc_t& operator=(const contig_desc_t& x) - { - name = x.name; - segments = x.segments; - - return *this; - } - - contig_desc_t& operator=(contig_desc_t&& x) noexcept - { - name = move(x.name); - segments = move(x.segments); - - return *this; - } - }; - - struct sample_desc_t { - string name; - vector contigs; - - sample_desc_t() : name("") {}; - sample_desc_t(const string& _name, const vector& _contigs) : name(_name), contigs(_contigs) {}; - sample_desc_t(const sample_desc_t&x) { - name = x.name; - contigs = x.contigs; - } - - sample_desc_t(sample_desc_t&&x) noexcept { - name = move(x.name); - contigs = move(x.contigs); - } - sample_desc_t(const string& _name) : name(_name) {}; - - sample_desc_t& operator=(const sample_desc_t& x) { - name = x.name; - contigs = x.contigs; - - return *this; - } - - sample_desc_t& operator=(sample_desc_t&& x) noexcept { - name = move(x.name); - contigs = move(x.contigs); - - return *this; - } - }; - - ZSTD_CCtx* zstd_cctx_samples = nullptr; - ZSTD_CCtx* zstd_cctx_contigs = nullptr; - array zstd_cctx_details = { nullptr, nullptr, nullptr, nullptr, nullptr }; - - ZSTD_DCtx* zstd_dctx_samples = nullptr; - ZSTD_DCtx* zstd_dctx_contigs = nullptr; - array zstd_dctx_details = { nullptr, nullptr, nullptr, nullptr, nullptr }; - - 
unordered_map sample_ids; - vector sample_desc; - - int unpacked_contig_data_batch_id = -1; - - uint32_t no_threads; - - size_t batch_size; - uint32_t segment_size; - uint32_t kmer_length; - size_t no_samples_in_last_batch; - string prev_sample_name; - - string placing_sample_name; - uint32_t placing_sample_id; - - int collection_samples_id; - int collection_contig_id; - int collection_details_id; - - shared_ptr in_archive; - shared_ptr out_archive; - vector v_in_group_ids; - - void store_batch_sample_names(); - void store_batch_contig_names(uint32_t id_from, uint32_t id_to); - void store_batch_contig_details(uint32_t id_from, uint32_t id_to); - - void load_batch_sample_names(); - void load_batch_contig_names(size_t id_batch); - void load_batch_contig_details(size_t id_batch); - void clear_batch_contig(size_t id_batch); - - void serialize_sample_names(vector &v_data); - void serialize_contig_names(vector& v_data, uint32_t id_from, uint32_t id_to); - void serialize_contig_details(array, 5>& v_data, uint32_t id_from, uint32_t id_to); - - void deserialize_sample_names(vector& v_data); - void deserialize_contig_names(vector& v_data, size_t i_sample); - void deserialize_contig_details(array, 5>& v_data, size_t i_sample); - - bool prepare_for_compression(); - bool prepare_for_appending_copy(); - bool prepare_for_decompression(); - - void zstd_compress(ZSTD_CCtx*& cctx, vector& v_input, vector& v_output, int level); - void zstd_decompress(ZSTD_DCtx*& dctx, vector& v_input, vector& v_output, size_t raw_size); - - // Just check - int get_in_group_id(int pos) - { - if ((size_t) pos >= v_in_group_ids.size()) - return -1; - return v_in_group_ids[pos]; - } - - // Check but resize first if necessary - int read_in_group_id(int pos) - { - if ((size_t) pos >= v_in_group_ids.size()) - v_in_group_ids.resize((int)(pos * 1.2), -1); - - return v_in_group_ids[pos]; - } - - void set_in_group_id(int pos, int val) - { - if ((size_t) pos >= v_in_group_ids.size()) - 
v_in_group_ids.resize((int) (pos * 1.2) + 1, -1); - - v_in_group_ids[pos] = val; - } - - void clear_in_group_ids() - { - v_in_group_ids.clear(); - } - - void determine_collection_samples_id() - { - if (collection_samples_id >= 0) - return; - - if(out_archive != nullptr) - collection_samples_id = out_archive->GetStreamId("collection-samples"); - else - collection_samples_id = in_archive->GetStreamId("collection-samples"); - } - - void determine_collection_contig_id() - { - if (collection_contig_id >= 0) - return; - - if(out_archive != nullptr) - collection_contig_id = out_archive->GetStreamId("collection-contigs"); - else - collection_contig_id = in_archive->GetStreamId("collection-contigs"); - } - - void determnine_collection_details_id() - { - if (collection_details_id >= 0) - return; - - if(out_archive != nullptr) - collection_details_id = out_archive->GetStreamId("collection-details"); - else - collection_details_id = in_archive->GetStreamId("collection-details"); - } - - vector split_string(const string& s); - string encode_split(vector& prev_split, vector& curr_split); - string decode_split(vector& prev_split, vector& curr_split); - -public: - CCollection_V3() : CCollection() { - batch_size = 1ull << 20; - - no_threads = 1; - - collection_samples_id = -1; - collection_contig_id = -1; - collection_details_id = -1; - - placing_sample_id = ~0u; - placing_sample_name = ""; - - kmer_length = 0; - no_samples_in_last_batch = 0; - segment_size = 0; - } - virtual ~CCollection_V3() { - if (zstd_cctx_samples) ZSTD_freeCCtx(zstd_cctx_samples); - if (zstd_cctx_contigs) ZSTD_freeCCtx(zstd_cctx_contigs); - for(auto &x : zstd_cctx_details) - if (x) ZSTD_freeCCtx(x); - - if (zstd_dctx_samples) ZSTD_freeDCtx(zstd_dctx_samples); - if (zstd_dctx_contigs) ZSTD_freeDCtx(zstd_dctx_contigs); - for(auto &x : zstd_dctx_details) - if (x) ZSTD_freeDCtx(x); - }; - - bool set_archives(shared_ptr _in_archive, shared_ptr _out_archive, - uint32_t _no_threads, size_t _batch_size, uint32_t 
_segment_size, uint32_t _kmer_length); - - void complete_serialization(); - - bool prepare_for_appending_load_last_batch(); - - virtual bool register_sample_contig(const string& sample_name, const string& contig_name); - - void reset_prev_sample_name(); - virtual void add_segment_placed(const string &sample_name, const string& contig_name, const uint32_t place, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length); - void add_segments_placed(vector& segments_to_place); - virtual bool get_reference_name(string& reference_name); - virtual bool get_samples_list(vector& v_samples); - virtual bool get_contig_list_in_sample(const string& sample_name, vector& v_contig_names); - virtual bool get_sample_desc(const string& sample_name, vector>>& sample_desc_); - virtual bool get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc); - virtual bool is_contig_desc(const string& sample_name, const string& contig_name); - virtual vector get_samples_for_contig(const string& contig_name); - virtual size_t get_no_samples(); - virtual int32_t get_no_contigs(const string& sample_name); - - void store_contig_batch(uint32_t id_from, uint32_t id_to); -}; - -// EOF -#endif +#ifndef _COLLECTION_V3_H +#define _COLLECTION_V3_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "collection.h" +#include "archive.h" + +class CCollection_V3 : public CCollection +{ + struct contig_desc_t { + string name; + vector segments; + + contig_desc_t() : name("") {}; + + contig_desc_t(const contig_desc_t& x) { + name = x.name; + segments = x.segments; + } + + contig_desc_t(contig_desc_t&& x) noexcept { + name = move(x.name); + segments = move(x.segments); + } + + contig_desc_t(const string& _name) : name(_name) {}; + + contig_desc_t& operator=(const contig_desc_t& x) + { + name = x.name; + segments = x.segments; + + return *this; + } + + contig_desc_t& operator=(contig_desc_t&& x) noexcept + { + name = move(x.name); + segments = move(x.segments); + + return *this; + } + }; + + struct sample_desc_t { + string name; + vector contigs; + + sample_desc_t() : name("") {}; + sample_desc_t(const string& _name, const vector& _contigs) : name(_name), contigs(_contigs) {}; + sample_desc_t(const sample_desc_t&x) { + name = x.name; + contigs = x.contigs; + } + + sample_desc_t(sample_desc_t&&x) noexcept { + name = move(x.name); + contigs = move(x.contigs); + } + sample_desc_t(const string& _name) : name(_name) {}; + + sample_desc_t& operator=(const sample_desc_t& x) { + name = x.name; + contigs = x.contigs; + + return *this; + } + + sample_desc_t& operator=(sample_desc_t&& x) noexcept { + name = move(x.name); + contigs = move(x.contigs); + + return *this; + } + }; + + ZSTD_CCtx* zstd_cctx_samples = nullptr; + ZSTD_CCtx* zstd_cctx_contigs = nullptr; + array zstd_cctx_details = { nullptr, nullptr, nullptr, nullptr, nullptr }; + + ZSTD_DCtx* zstd_dctx_samples = nullptr; + ZSTD_DCtx* zstd_dctx_contigs = nullptr; + array zstd_dctx_details = { nullptr, nullptr, nullptr, nullptr, nullptr }; + + 
unordered_map sample_ids; + vector sample_desc; + + int unpacked_contig_data_batch_id = -1; + + uint32_t no_threads; + + size_t batch_size; + uint32_t segment_size; + uint32_t kmer_length; + size_t no_samples_in_last_batch; + string prev_sample_name; + + string placing_sample_name; + uint32_t placing_sample_id; + + int collection_samples_id; + int collection_contig_id; + int collection_details_id; + + shared_ptr in_archive; + shared_ptr out_archive; + vector v_in_group_ids; + + void store_batch_sample_names(); + void store_batch_contig_names(uint32_t id_from, uint32_t id_to); + void store_batch_contig_details(uint32_t id_from, uint32_t id_to); + + void load_batch_sample_names(); + void load_batch_contig_names(size_t id_batch); + void load_batch_contig_details(size_t id_batch); + void clear_batch_contig(size_t id_batch); + + void serialize_sample_names(vector &v_data); + void serialize_contig_names(vector& v_data, uint32_t id_from, uint32_t id_to); + void serialize_contig_details(array, 5>& v_data, uint32_t id_from, uint32_t id_to); + + void deserialize_sample_names(vector& v_data); + void deserialize_contig_names(vector& v_data, size_t i_sample); + void deserialize_contig_details(array, 5>& v_data, size_t i_sample); + + bool prepare_for_compression(); + bool prepare_for_appending_copy(); + bool prepare_for_decompression(); + + void zstd_compress(ZSTD_CCtx*& cctx, vector& v_input, vector& v_output, int level); + void zstd_decompress(ZSTD_DCtx*& dctx, vector& v_input, vector& v_output, size_t raw_size); + + // Just check + int get_in_group_id(int pos) + { + if ((size_t) pos >= v_in_group_ids.size()) + return -1; + return v_in_group_ids[pos]; + } + + // Check but resize first if necessary + int read_in_group_id(int pos) + { + if ((size_t) pos >= v_in_group_ids.size()) + v_in_group_ids.resize((int)(pos * 1.2), -1); + + return v_in_group_ids[pos]; + } + + void set_in_group_id(int pos, int val) + { + if ((size_t) pos >= v_in_group_ids.size()) + 
v_in_group_ids.resize((int) (pos * 1.2) + 1, -1); + + v_in_group_ids[pos] = val; + } + + void clear_in_group_ids() + { + v_in_group_ids.clear(); + } + + void determine_collection_samples_id() + { + if (collection_samples_id >= 0) + return; + + if(out_archive != nullptr) + collection_samples_id = out_archive->GetStreamId("collection-samples"); + else + collection_samples_id = in_archive->GetStreamId("collection-samples"); + } + + void determine_collection_contig_id() + { + if (collection_contig_id >= 0) + return; + + if(out_archive != nullptr) + collection_contig_id = out_archive->GetStreamId("collection-contigs"); + else + collection_contig_id = in_archive->GetStreamId("collection-contigs"); + } + + void determnine_collection_details_id() + { + if (collection_details_id >= 0) + return; + + if(out_archive != nullptr) + collection_details_id = out_archive->GetStreamId("collection-details"); + else + collection_details_id = in_archive->GetStreamId("collection-details"); + } + + vector split_string(const string& s); + string encode_split(vector& prev_split, vector& curr_split); + string decode_split(vector& prev_split, vector& curr_split); + +public: + CCollection_V3() : CCollection() { + batch_size = 1ull << 20; + + no_threads = 1; + + collection_samples_id = -1; + collection_contig_id = -1; + collection_details_id = -1; + + placing_sample_id = ~0u; + placing_sample_name = ""; + + kmer_length = 0; + no_samples_in_last_batch = 0; + segment_size = 0; + } + virtual ~CCollection_V3() { + if (zstd_cctx_samples) ZSTD_freeCCtx(zstd_cctx_samples); + if (zstd_cctx_contigs) ZSTD_freeCCtx(zstd_cctx_contigs); + for(auto &x : zstd_cctx_details) + if (x) ZSTD_freeCCtx(x); + + if (zstd_dctx_samples) ZSTD_freeDCtx(zstd_dctx_samples); + if (zstd_dctx_contigs) ZSTD_freeDCtx(zstd_dctx_contigs); + for(auto &x : zstd_dctx_details) + if (x) ZSTD_freeDCtx(x); + }; + + bool set_archives(shared_ptr _in_archive, shared_ptr _out_archive, + uint32_t _no_threads, size_t _batch_size, uint32_t 
_segment_size, uint32_t _kmer_length); + + void complete_serialization(); + + bool prepare_for_appending_load_last_batch(); + + virtual bool register_sample_contig(const string& sample_name, const string& contig_name); + + void reset_prev_sample_name(); + virtual void add_segment_placed(const string &sample_name, const string& contig_name, const uint32_t place, const uint32_t group_id, const uint32_t in_group_id, const bool is_rev_comp, const uint32_t raw_length); + void add_segments_placed(vector& segments_to_place); + virtual bool get_reference_name(string& reference_name); + virtual bool get_samples_list(vector& v_samples, bool sorted = true); + virtual bool get_contig_list_in_sample(const string& sample_name, vector& v_contig_names); + virtual bool get_sample_desc(const string& sample_name, vector>>& sample_desc_); + virtual bool get_contig_desc(const string& sample_name, string& contig_name, vector& contig_desc); + virtual bool is_contig_desc(const string& sample_name, const string& contig_name); + virtual vector get_samples_for_contig(const string& contig_name); + virtual size_t get_no_samples(); + virtual int32_t get_no_contigs(const string& sample_name); + + void store_contig_batch(uint32_t id_from, uint32_t id_to); +}; + +// EOF +#endif diff --git a/src/core/defs.h b/src/common/defs.h similarity index 89% rename from src/core/defs.h rename to src/common/defs.h index beab492..50be156 100644 --- a/src/core/defs.h +++ b/src/common/defs.h @@ -1,52 +1,54 @@ -#ifndef _DEFS_H -#define _DEFS_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include - -using namespace std; - -typedef vector contig_t; -typedef vector packed_block_t; - -const uint32_t AGC_VER_MAJOR = 3; -const uint32_t AGC_VER_MINOR = 1; -const uint32_t AGC_VER_BUGFIX = 0; -const string AGC_VER_BUILD = "20240312.1"s; - -const uint32_t AGC_FILE_MAJOR = 3; -const uint32_t AGC_FILE_MINOR = 0; - -const std::string AGC_VERSION = std::string("AGC (Assembled Genomes Compressor) v. ") + - to_string(AGC_VER_MAJOR) + "." + to_string(AGC_VER_MINOR) + "." + to_string(AGC_VER_BUGFIX) + - " [build " + AGC_VER_BUILD + "]"; - -#define IMPROVED_LZ_ENCODING - -#include - -#if defined(_MSC_VER) /* Visual Studio */ -#define REFRESH_FORCE_INLINE __forceinline -#define REFRESH_NO_INLINE __declspec(noinline) -#define ARCH_X64 -#elif defined(__GNUC__) -#define REFRESH_FORCE_INLINE __inline__ __attribute__((always_inline, unused)) -#define REFRESH_NO_INLINE __attribute__((noinline)) -#else -#define REFRESH_FORCE_INLINE -#define REFRESH_NO_INLINE -#endif - -// EOF +#ifndef _DEFS_H +#define _DEFS_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include + +using namespace std; + +typedef vector contig_t; +typedef vector packed_block_t; + +const uint32_t AGC_VER_MAJOR = 3; +const uint32_t AGC_VER_MINOR = 2; +const uint32_t AGC_VER_BUGFIX = 0; +const string AGC_VER_BUILD = "20241121.1"s; + +const uint32_t AGC_FILE_MAJOR = 3; +const uint32_t AGC_FILE_MINOR = 0; + +const std::string AGC_VERSION = std::string("AGC (Assembled Genomes Compressor) v. ") + + to_string(AGC_VER_MAJOR) + "." + to_string(AGC_VER_MINOR) + "." + to_string(AGC_VER_BUGFIX) + + " [build " + AGC_VER_BUILD + "]"; + +#define IMPROVED_LZ_ENCODING + +#define USE_INCREMENTING_BARRIERS + +#include + +#if defined(_MSC_VER) /* Visual Studio */ +#define REFRESH_FORCE_INLINE __forceinline +#define REFRESH_NO_INLINE __declspec(noinline) +#define ARCH_X64 +#elif defined(__GNUC__) +#define REFRESH_FORCE_INLINE __inline__ __attribute__((always_inline, unused)) +#define REFRESH_NO_INLINE __attribute__((noinline)) +#else +#define REFRESH_FORCE_INLINE +#define REFRESH_NO_INLINE +#endif + +// EOF #endif \ No newline at end of file diff --git a/src/core/io.h b/src/common/io.h similarity index 95% rename from src/core/io.h rename to src/common/io.h index accef10..4af3586 100644 --- a/src/core/io.h +++ b/src/common/io.h @@ -1,424 +1,424 @@ -#ifndef _IO_H -#define _IO_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include -#include -#include - -#include - -using namespace std; - -#ifndef _WIN32 -#define my_fseek fseek -#define my_ftell ftell -#else -#define my_fseek _fseeki64 -#define my_ftell _ftelli64 -#include -#include -#endif - -// ******************************************************************************************* -// Buffered input file -class CInFile -{ - size_t BUFFER_SIZE = 0; - - FILE *f; - uint8_t *buffer; - size_t buffer_pos; - size_t buffer_filled; - - size_t file_size; - size_t before_buffer_bytes; - -public: - // ******************************************************************************************* - CInFile() : f(nullptr), buffer(nullptr), buffer_pos(0), buffer_filled(0), file_size(0), before_buffer_bytes(0) - {}; - - // ******************************************************************************************* - ~CInFile() - { - if (f) - fclose(f); - if (buffer) - delete[] buffer; - } - - // ******************************************************************************************* - bool Open(const string &file_name, const size_t _BUFFER_SIZE = 128 << 20) - { - if (f) - return false; - - f = fopen(file_name.c_str(), "rb"); - if (!f) - return false; - - my_fseek(f, 0, SEEK_END); - file_size = my_ftell(f); - my_fseek(f, 0, SEEK_SET); - before_buffer_bytes = 0; - - if (_BUFFER_SIZE == ~0ull) - BUFFER_SIZE = file_size; - else - BUFFER_SIZE = _BUFFER_SIZE; - - buffer = new uint8_t[BUFFER_SIZE]; - - buffer_pos = 0; - - buffer_filled = fread(buffer, 1, BUFFER_SIZE, f); - - return true; - } - - // ******************************************************************************************* - bool Close() - { - if (f) - { - fclose(f); - f = nullptr; - } - if 
(buffer) - { - delete[] buffer; - buffer = nullptr; - } - - return true; - } - - // ******************************************************************************************* - bool IsOpened() - { - return f != nullptr; - } - - // ******************************************************************************************* - int Get() - { - if (buffer_pos < buffer_filled) - return buffer[buffer_pos++]; - - if (feof(f)) - return EOF; - - before_buffer_bytes += buffer_filled; - - buffer_filled = fread(buffer, 1, BUFFER_SIZE, f); - if (buffer_filled == 0) - return EOF; - - buffer_pos = 0; - - return buffer[buffer_pos++]; - } - - // ******************************************************************************************* - bool UnGet() - { - if (buffer_pos) - { - --buffer_pos; - return true; - } - - return false; - } - - // ******************************************************************************************* - uint64_t ReadUInt(const int no_bytes) - { - uint64_t x = 0; - uint64_t shift = 0; - - for (int i = 0; i < no_bytes; ++i) - { - uint64_t c = Get(); - x += c << shift; - shift += 8; - } - - return x; - } - - // ******************************************************************************************* - uint64_t ReadUIntVar() - { - uint64_t x = 0; - uint64_t c = Get(); - - if ((c >> 7) == 0) // [0, 0x8000) - { - x = c << 8; - x += (uint64_t)Get(); - } - else if ((c >> 6) == 0b10) // [0x8000, 0x400000) - { - x = (c & 0x3f) << 16; - x += (uint64_t) Get() << 8; - x += (uint64_t)Get(); - } - else if ((c >> 6) == 0b11) // [0x80000, 0x4000000) - { - x = (c & 0x3f) << 24; - x += (uint64_t)Get() << 16; - x += (uint64_t)Get() << 8; - x += (uint64_t)Get(); - } - - return x; - } - - // ******************************************************************************************* - void Read(uint8_t *ptr, size_t size) - { - if (before_buffer_bytes + buffer_pos + size > file_size) - size = file_size - (before_buffer_bytes + buffer_pos); - - size_t to_read = size; - - while 
(buffer_pos + to_read > BUFFER_SIZE) - { - memcpy(ptr, buffer + buffer_pos, BUFFER_SIZE - buffer_pos); - ptr += BUFFER_SIZE - buffer_pos; - to_read -= BUFFER_SIZE - buffer_pos; - - before_buffer_bytes += buffer_filled; - buffer_filled = fread(buffer, 1, BUFFER_SIZE, f); - buffer_pos = 0; - } - - memcpy(ptr, buffer + buffer_pos, to_read); - buffer_pos += to_read; - } - - // ******************************************************************************************* - bool Eof() const - { - return before_buffer_bytes + buffer_pos >= file_size; - } - - // ******************************************************************************************* - bool Seek(const size_t requested_pos) - { - if (requested_pos >= before_buffer_bytes && requested_pos < before_buffer_bytes + buffer_filled) - buffer_pos = requested_pos - before_buffer_bytes; - else - { - before_buffer_bytes = requested_pos; - my_fseek(f, requested_pos, SEEK_SET); - buffer_filled = fread(buffer, 1, BUFFER_SIZE, f); - buffer_pos = 0; - } - - return true; - } - - // ******************************************************************************************* - size_t FileSize() const - { - if (f) - return file_size; - else - return 0; - } - - // ******************************************************************************************* - size_t GetPos() const - { - return before_buffer_bytes + buffer_pos; - } -}; - -// ******************************************************************************************* -// Buffered output file -class COutFile -{ - size_t BUFFER_SIZE; - - FILE *f; - uint8_t *buffer; - size_t buffer_pos; - bool success; - bool use_stdout; - -public: - // ******************************************************************************************* - COutFile() : BUFFER_SIZE (8u << 20), f(nullptr), buffer(nullptr), buffer_pos(0), success(false), use_stdout(false) - {}; - - // ******************************************************************************************* - ~COutFile() - { - if (f) - 
Close(); - if (buffer) - delete[] buffer; - } - - // ******************************************************************************************* - bool Open(const string &file_name, const size_t _BUFFER_SIZE = 8 << 20) - { - if (f) - return false; - - use_stdout = file_name.empty(); - - if (use_stdout) - { - f = stdout; -#ifdef _WIN32 - _setmode(_fileno(f), _O_BINARY); -#endif - } - else - { - f = fopen(file_name.c_str(), "wb"); - if (!f) - return false; - } - - BUFFER_SIZE = _BUFFER_SIZE; - buffer = new uint8_t[BUFFER_SIZE]; - buffer_pos = 0; - success = true; - - return true; - } - - // ******************************************************************************************* - bool Close() - { - if (buffer_pos) - { - success &= fwrite(buffer, 1, buffer_pos, f) == buffer_pos; - buffer_pos = 0; - } - - fflush(f); - - if (f && !use_stdout) - { - fclose(f); - f = nullptr; - } - if (buffer) - { - delete[] buffer; - buffer = nullptr; - } - - return success; - } - - // ******************************************************************************************* - bool IsOpened() - { - return f != nullptr; - } - - // ******************************************************************************************* - void Put(const uint8_t c) - { - if (buffer_pos == BUFFER_SIZE) - { - success &= fwrite(buffer, 1, BUFFER_SIZE, f) == BUFFER_SIZE; - buffer_pos = 0; - } - - buffer[buffer_pos++] = c; - } - - // ******************************************************************************************* - void Write(const uint8_t *p, size_t n) - { - uint8_t *q = (uint8_t *)p; - - while (buffer_pos + n > BUFFER_SIZE) - { - size_t small_n = BUFFER_SIZE - buffer_pos; - memcpy(buffer + buffer_pos, q, small_n); - success &= fwrite(buffer, 1, BUFFER_SIZE, f) == BUFFER_SIZE; - - buffer_pos = 0; - n -= small_n; - q += small_n; - } - - memcpy(buffer+buffer_pos, q, n); - buffer_pos += n; - } - - // ******************************************************************************************* - void 
WriteUInt(const uint64_t _x, const int no_bytes) - { - uint64_t x = _x; - - for (int i = 0; i < no_bytes; ++i) - { - Put(x & 0xff); - x >>= 8; - } - } - - // [0, 0x8000) -> 0 [15-bit value] - // [0x8000, 0x400000) -> 10 [22-bit value] - // [0x400000, 0x40000000) -> 11 [30-bit value] - // Larger values are not supported! - void WriteUIntVar(const uint64_t x) - { - if (x < 0x8000ull) - { - Put((uint8_t) (x >> 8)); - Put((uint8_t) (x & 0xff)); - } - else if (x < 0x400000ull) - { - Put((uint8_t) (0x80 + (x >> 16))); - Put((uint8_t) ((x >> 8) & 0xff)); - Put((uint8_t) (x & 0xff)); - } - else if (x < 0x40000000ull) - { - Put((uint8_t) (0xc0 + (x >> 24))); - Put((uint8_t) ((x >> 16) & 0xff)); - Put((uint8_t) ((x >> 8) & 0xff)); - Put((uint8_t) (x & 0xff)); - } - else - cerr << "Too large value\n"; - } - - // ******************************************************************************************* - void Write(const string &s) - { - Write((uint8_t*)s.c_str(), s.size()); - } - - // ******************************************************************************************* - void Write(const string &s, const size_t start_pos, const size_t len) - { - Write((uint8_t*)s.c_str() + start_pos, len); - } -}; - -// EOF +#ifndef _IO_H +#define _IO_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include +#include +#include + +#include + +using namespace std; + +#ifndef _WIN32 +#define my_fseek fseek +#define my_ftell ftell +#else +#define my_fseek _fseeki64 +#define my_ftell _ftelli64 +#include +#include +#endif + +// ******************************************************************************************* +// Buffered input file +class CInFile +{ + size_t BUFFER_SIZE = 0; + + FILE *f; + uint8_t *buffer; + size_t buffer_pos; + size_t buffer_filled; + + size_t file_size; + size_t before_buffer_bytes; + +public: + // ******************************************************************************************* + CInFile() : f(nullptr), buffer(nullptr), buffer_pos(0), buffer_filled(0), file_size(0), before_buffer_bytes(0) + {}; + + // ******************************************************************************************* + ~CInFile() + { + if (f) + fclose(f); + if (buffer) + delete[] buffer; + } + + // ******************************************************************************************* + bool Open(const string &file_name, const size_t _BUFFER_SIZE = 128 << 20) + { + if (f) + return false; + + f = fopen(file_name.c_str(), "rb"); + if (!f) + return false; + + my_fseek(f, 0, SEEK_END); + file_size = my_ftell(f); + my_fseek(f, 0, SEEK_SET); + before_buffer_bytes = 0; + + if (_BUFFER_SIZE == ~0ull) + BUFFER_SIZE = file_size; + else + BUFFER_SIZE = _BUFFER_SIZE; + + buffer = new uint8_t[BUFFER_SIZE]; + + buffer_pos = 0; + + buffer_filled = fread(buffer, 1, BUFFER_SIZE, f); + + return true; + } + + // ******************************************************************************************* + bool Close() + { + if (f) + { + fclose(f); + f = nullptr; + } + if 
(buffer) + { + delete[] buffer; + buffer = nullptr; + } + + return true; + } + + // ******************************************************************************************* + bool IsOpened() + { + return f != nullptr; + } + + // ******************************************************************************************* + int Get() + { + if (buffer_pos < buffer_filled) + return buffer[buffer_pos++]; + + if (feof(f)) + return EOF; + + before_buffer_bytes += buffer_filled; + + buffer_filled = fread(buffer, 1, BUFFER_SIZE, f); + if (buffer_filled == 0) + return EOF; + + buffer_pos = 0; + + return buffer[buffer_pos++]; + } + + // ******************************************************************************************* + bool UnGet() + { + if (buffer_pos) + { + --buffer_pos; + return true; + } + + return false; + } + + // ******************************************************************************************* + uint64_t ReadUInt(const int no_bytes) + { + uint64_t x = 0; + uint64_t shift = 0; + + for (int i = 0; i < no_bytes; ++i) + { + uint64_t c = Get(); + x += c << shift; + shift += 8; + } + + return x; + } + + // ******************************************************************************************* + uint64_t ReadUIntVar() + { + uint64_t x = 0; + uint64_t c = Get(); + + if ((c >> 7) == 0) // [0, 0x8000) + { + x = c << 8; + x += (uint64_t)Get(); + } + else if ((c >> 6) == 0b10) // [0x8000, 0x400000) + { + x = (c & 0x3f) << 16; + x += (uint64_t) Get() << 8; + x += (uint64_t)Get(); + } + else if ((c >> 6) == 0b11) // [0x80000, 0x4000000) + { + x = (c & 0x3f) << 24; + x += (uint64_t)Get() << 16; + x += (uint64_t)Get() << 8; + x += (uint64_t)Get(); + } + + return x; + } + + // ******************************************************************************************* + void Read(uint8_t *ptr, size_t size) + { + if (before_buffer_bytes + buffer_pos + size > file_size) + size = file_size - (before_buffer_bytes + buffer_pos); + + size_t to_read = size; + + while 
(buffer_pos + to_read > BUFFER_SIZE) + { + memcpy(ptr, buffer + buffer_pos, BUFFER_SIZE - buffer_pos); + ptr += BUFFER_SIZE - buffer_pos; + to_read -= BUFFER_SIZE - buffer_pos; + + before_buffer_bytes += buffer_filled; + buffer_filled = fread(buffer, 1, BUFFER_SIZE, f); + buffer_pos = 0; + } + + memcpy(ptr, buffer + buffer_pos, to_read); + buffer_pos += to_read; + } + + // ******************************************************************************************* + bool Eof() const + { + return before_buffer_bytes + buffer_pos >= file_size; + } + + // ******************************************************************************************* + bool Seek(const size_t requested_pos) + { + if (requested_pos >= before_buffer_bytes && requested_pos < before_buffer_bytes + buffer_filled) + buffer_pos = requested_pos - before_buffer_bytes; + else + { + before_buffer_bytes = requested_pos; + my_fseek(f, requested_pos, SEEK_SET); + buffer_filled = fread(buffer, 1, BUFFER_SIZE, f); + buffer_pos = 0; + } + + return true; + } + + // ******************************************************************************************* + size_t FileSize() const + { + if (f) + return file_size; + else + return 0; + } + + // ******************************************************************************************* + size_t GetPos() const + { + return before_buffer_bytes + buffer_pos; + } +}; + +// ******************************************************************************************* +// Buffered output file +class COutFile +{ + size_t BUFFER_SIZE; + + FILE *f; + uint8_t *buffer; + size_t buffer_pos; + bool success; + bool use_stdout; + +public: + // ******************************************************************************************* + COutFile() : BUFFER_SIZE (8u << 20), f(nullptr), buffer(nullptr), buffer_pos(0), success(false), use_stdout(false) + {}; + + // ******************************************************************************************* + ~COutFile() + { + if (f) + 
Close(); + if (buffer) + delete[] buffer; + } + + // ******************************************************************************************* + bool Open(const string &file_name, const size_t _BUFFER_SIZE = 8 << 20) + { + if (f) + return false; + + use_stdout = file_name.empty(); + + if (use_stdout) + { + f = stdout; +#ifdef _WIN32 + _setmode(_fileno(f), _O_BINARY); +#endif + } + else + { + f = fopen(file_name.c_str(), "wb"); + if (!f) + return false; + } + + BUFFER_SIZE = _BUFFER_SIZE; + buffer = new uint8_t[BUFFER_SIZE]; + buffer_pos = 0; + success = true; + + return true; + } + + // ******************************************************************************************* + bool Close() + { + if (buffer_pos) + { + success &= fwrite(buffer, 1, buffer_pos, f) == buffer_pos; + buffer_pos = 0; + } + + fflush(f); + + if (f && !use_stdout) + { + fclose(f); + f = nullptr; + } + if (buffer) + { + delete[] buffer; + buffer = nullptr; + } + + return success; + } + + // ******************************************************************************************* + bool IsOpened() + { + return f != nullptr; + } + + // ******************************************************************************************* + void Put(const uint8_t c) + { + if (buffer_pos == BUFFER_SIZE) + { + success &= fwrite(buffer, 1, BUFFER_SIZE, f) == BUFFER_SIZE; + buffer_pos = 0; + } + + buffer[buffer_pos++] = c; + } + + // ******************************************************************************************* + void Write(const uint8_t *p, size_t n) + { + uint8_t *q = (uint8_t *)p; + + while (buffer_pos + n > BUFFER_SIZE) + { + size_t small_n = BUFFER_SIZE - buffer_pos; + memcpy(buffer + buffer_pos, q, small_n); + success &= fwrite(buffer, 1, BUFFER_SIZE, f) == BUFFER_SIZE; + + buffer_pos = 0; + n -= small_n; + q += small_n; + } + + memcpy(buffer+buffer_pos, q, n); + buffer_pos += n; + } + + // ******************************************************************************************* + void 
WriteUInt(const uint64_t _x, const int no_bytes) + { + uint64_t x = _x; + + for (int i = 0; i < no_bytes; ++i) + { + Put(x & 0xff); + x >>= 8; + } + } + + // [0, 0x8000) -> 0 [15-bit value] + // [0x8000, 0x400000) -> 10 [22-bit value] + // [0x400000, 0x40000000) -> 11 [30-bit value] + // Larger values are not supported! + void WriteUIntVar(const uint64_t x) + { + if (x < 0x8000ull) + { + Put((uint8_t) (x >> 8)); + Put((uint8_t) (x & 0xff)); + } + else if (x < 0x400000ull) + { + Put((uint8_t) (0x80 + (x >> 16))); + Put((uint8_t) ((x >> 8) & 0xff)); + Put((uint8_t) (x & 0xff)); + } + else if (x < 0x40000000ull) + { + Put((uint8_t) (0xc0 + (x >> 24))); + Put((uint8_t) ((x >> 16) & 0xff)); + Put((uint8_t) ((x >> 8) & 0xff)); + Put((uint8_t) (x & 0xff)); + } + else + cerr << "Too large value\n"; + } + + // ******************************************************************************************* + void Write(const string &s) + { + Write((uint8_t*)s.c_str(), s.size()); + } + + // ******************************************************************************************* + void Write(const string &s, const size_t start_pos, const size_t len) + { + Write((uint8_t*)s.c_str() + start_pos, len); + } +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/lz_diff.cpp b/src/common/lz_diff.cpp similarity index 79% rename from src/core/lz_diff.cpp rename to src/common/lz_diff.cpp index d3622e1..bd87598 100644 --- a/src/core/lz_diff.cpp +++ b/src/common/lz_diff.cpp @@ -1,1108 +1,948 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "../core/lz_diff.h" -#include -#include - -// ******************************************************************************************* -CLZDiffBase::CLZDiffBase(const uint32_t _min_match_len) -{ - min_match_len = _min_match_len; - key_len = min_match_len - hashing_step + 1u; - key_mask = ~0ull >> (64 - 2 * key_len); - short_ht_ver = false; - ht_mask = 0; - ht_size = 0; - index_ready = false; -} - -// ******************************************************************************************* -CLZDiffBase::~CLZDiffBase() -{ -} - -// ******************************************************************************************* -bool CLZDiffBase::SetMinMatchLen(const uint32_t _min_match_len) -{ - if (!reference.empty() || index_ready) - return false; - - min_match_len = _min_match_len; - key_len = min_match_len - hashing_step + 1u; - key_mask = ~0ull >> (64 - 2 * key_len); - - return true; -} - -//#define USE_REVCOMP_REFERENCE -//#define USE_REV_REFERENCE -// ******************************************************************************************* -void CLZDiffBase::prepare_gen(const contig_t& _reference) -{ - reference = _reference; - - reference.resize(reference.size() + key_len, invalid_symbol); - -#ifdef USE_REVCOMP_REFERENCE - reference.reserve(2 * reference.size()); - - for (auto p = _reference.rbegin(); p != _reference.rend(); ++p) - if (*p < 4) - reference.emplace_back(3 - *p); - else - reference.emplace_back(*p); - - for (int i = 0; i < key_len; ++i) - reference.emplace_back(invalid_symbol); -#endif - -#ifdef USE_REV_REFERENCE - reference.reserve(2 * reference.size()); - - for (auto p = _reference.rbegin(); p != _reference.rend(); ++p) - reference.emplace_back(*p); - - for (int i 
= 0; i < key_len; ++i) - reference.emplace_back(invalid_symbol); -#endif - - reference.shrink_to_fit(); -} - -// ******************************************************************************************* -void CLZDiffBase::prepare_index() -{ - ht_size = 0; - - uint32_t no_prev_valid = 0; - -#ifdef USE_SPARSE_HT - uint32_t cnt_mod = 0; - uint32_t key_len_mod = key_len % hashing_step; - - for (auto c : reference) - { - if (c < 4) - ++no_prev_valid; - else - no_prev_valid = 0; - - if (++cnt_mod == hashing_step) - cnt_mod = 0; - - if (cnt_mod == key_len_mod && no_prev_valid >= key_len) - ++ht_size; - } -#else - for (auto c : reference) - { - if (c < 4) - ++no_prev_valid; - else - no_prev_valid = 0; - - if (no_prev_valid >= key_len) - ++ht_size; - } -#endif - - ht_size = (uint64_t)(ht_size / max_load_factor); - - while (ht_size & (ht_size - 1)) - ht_size &= ht_size - 1; - - ht_size <<= 1; - - if (ht_size < 8) - ht_size = 8; - - ht_mask = ht_size - 1; - - if (short_ht_ver) - { - ht16.resize(ht_size, empty_key16); - make_index16(); - } - else - { - ht32.resize(ht_size, empty_key32); - make_index32(); - } - - index_ready = true; -} - -// ******************************************************************************************* -void CLZDiffBase::Prepare(const contig_t& _reference) -{ - short_ht_ver = _reference.size() / hashing_step < 65535; - - prepare_gen(_reference); -} - -// ******************************************************************************************* -void CLZDiffBase::GetCodingCostVector(const contig_t& text, vector& v_costs, const bool prefix_costs) -{ - if (!index_ready) - prepare_index(); - - v_costs.clear(); - v_costs.reserve(text.size()); - - uint32_t text_size = (uint32_t)text.size(); - MurMur64Hash mmh; - - uint32_t i = 0; - uint32_t pred_pos = 0; - - const uint8_t* text_ptr = text.data(); - -#ifdef USE_SPARSE_HT - uint32_t no_prev_literals = 0; -#endif - - uint64_t x; - uint64_t x_prev = ~0ull; - - for (; i + key_len < text_size; ) - { - if 
(x_prev != ~0ull && no_prev_literals > 0) - x = get_code_skip1(x_prev, text_ptr); - else - x = get_code(text_ptr); - x_prev = x; - - if (x == ~0ull) - { - uint32_t Nrun_len = get_Nrun_len(text_ptr, text_size - i); - - if (Nrun_len >= min_Nrun_len) - { - auto tc = coding_cost_Nrun(Nrun_len); - if (prefix_costs) - { - v_costs.emplace_back(tc); - v_costs.insert(v_costs.end(), Nrun_len - 1, 0); - } - else - { - v_costs.insert(v_costs.end(), Nrun_len - 1, 0); - v_costs.emplace_back(tc); - } - - text_ptr += Nrun_len; - i += Nrun_len; -#ifdef USE_SPARSE_HT - no_prev_literals = 0; -#endif - } - else - { - v_costs.emplace_back(1); - ++i; - ++pred_pos; - ++text_ptr; -#ifdef USE_SPARSE_HT - ++no_prev_literals; -#endif - } - - continue; - } - - uint64_t ht_pos = mmh(x) & ht_mask; - - uint32_t len_bck = 0; - uint32_t len_fwd = 0; - uint32_t match_pos = 0; - uint32_t max_len = text_size - i; - - if (short_ht_ver ? - !find_best_match16((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd) : - !find_best_match32((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd)) - { - v_costs.emplace_back(1); - ++i; - ++text_ptr; - ++pred_pos; -#ifdef USE_SPARSE_HT - ++no_prev_literals; -#endif - continue; - } - else - { -#ifdef USE_SPARSE_HT - if (len_bck) - { - for (uint32_t k = 0; k < len_bck; ++k) - v_costs.pop_back(); - match_pos -= len_bck; - pred_pos -= len_bck; - text_ptr -= len_bck; - i -= len_bck; - } -#endif - - auto tc = coding_cost_match(match_pos, len_bck + len_fwd, pred_pos); - - if (prefix_costs) - { - v_costs.emplace_back(tc); - v_costs.insert(v_costs.end(), len_bck + len_fwd - 1, 0); - } - else - { - v_costs.insert(v_costs.end(), len_bck + len_fwd - 1, 0); - v_costs.emplace_back(tc); - } - - pred_pos = match_pos + len_bck + len_fwd; - i += len_bck + len_fwd; - text_ptr += len_bck + len_fwd; - -#ifdef USE_SPARSE_HT - no_prev_literals = 0; -#endif - } - } - - for (; i < text_size; ++i) - v_costs.emplace_back(1); -} - 
-// ******************************************************************************************* -bool CLZDiffBase::find_best_match16(uint32_t ht_pos, const uint8_t* s, const uint32_t max_len, const uint32_t no_prev_literals, - uint32_t& ref_pos, uint32_t& len_bck, uint32_t& len_fwd) -{ - len_fwd = 0; - len_bck = 0; - - uint32_t min_to_update = min_match_len; - - const uint8_t* ref_ptr = reference.data(); - - for (uint32_t i = 0; i < max_no_tries; ++i) - { - if (ht16[ht_pos] == empty_key16) - break; - - uint32_t h_pos = ht16[ht_pos] * hashing_step; - const uint8_t* p = ref_ptr + h_pos; - - uint32_t f_len = compare_fwd((uint8_t*)s, (uint8_t*)p, max_len); - - if (f_len >= key_len) - { - uint32_t b_len = 0; - for (; b_len < min(no_prev_literals, h_pos); ++b_len) - if (*(s - b_len - 1) != *(p - b_len - 1)) - break; - - if (b_len + f_len > min_to_update) - { - len_bck = b_len; - len_fwd = f_len; - ref_pos = h_pos; - - min_to_update = b_len + f_len; - } - } - - ht_pos = (uint32_t)((ht_pos + 1u) & ht_mask); - } - - return len_bck + len_fwd >= min_match_len; -} - -// ******************************************************************************************* -bool CLZDiffBase::find_best_match32(uint32_t ht_pos, const uint8_t* s, const uint32_t max_len, const uint32_t no_prev_literals, - uint32_t& ref_pos, uint32_t& len_bck, uint32_t& len_fwd) -{ - len_fwd = 0; - len_bck = 0; - - uint32_t min_to_update = min_match_len; - - const uint8_t* ref_ptr = reference.data(); - - for (uint32_t i = 0; i < max_no_tries; ++i) - { - if (ht32[ht_pos] == empty_key32) - break; - - uint32_t h_pos = ht32[ht_pos] * hashing_step; - const uint8_t* p = ref_ptr + h_pos; - - uint32_t f_len = compare_fwd((uint8_t*)s, (uint8_t*)p, max_len); - - if (f_len >= key_len) - { - uint32_t b_len = 0; - for (; b_len < min(no_prev_literals, h_pos); ++b_len) - if (*(s - b_len - 1) != *(p - b_len - 1)) - break; - - if (b_len + f_len > min_to_update) - { - len_bck = b_len; - len_fwd = f_len; - ref_pos = h_pos; - - 
min_to_update = b_len + f_len; - } - } - - ht_pos = (uint32_t) ((ht_pos + 1u) & ht_mask); - } - - return len_bck + len_fwd >= min_match_len; -} - -// ******************************************************************************************* -void CLZDiffBase::encode_literal(const uint8_t c, contig_t& encoded) -{ - encoded.push_back('A' + c); -} - -// ******************************************************************************************* -void CLZDiffBase::encode_literal_diff(const uint8_t c, const uint8_t r, contig_t& encoded) -{ - if (r == 0 || (r > 3 || c > 3)) - encoded.push_back(c); - else - { - if (c < r) - encoded.push_back(3 - c); - else - encoded.push_back(c - r); - } -} - -// ******************************************************************************************* -void CLZDiffBase::encode_Nrun(const uint32_t len, contig_t &encoded) -{ - encoded.emplace_back(N_run_starter_code); // N-run start marker - append_int(encoded, len - min_Nrun_len); - encoded.emplace_back(N_code); // N-run stop marker -} - -// ******************************************************************************************* -uint32_t CLZDiffBase::coding_cost_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos) const -{ - uint32_t r; - int dif_pos = (int)ref_pos - (int)pred_pos; - - if (dif_pos >= 0) - r = int_len((uint32_t)dif_pos); - else - r = int_len((uint32_t)-dif_pos) + 1; - - r += int_len(len - min_match_len) + 2; - - return r; -} - -// ******************************************************************************************* -uint32_t CLZDiffBase::coding_cost_Nrun(const uint32_t len) const -{ - return 2 + int_len(len - min_Nrun_len); -} - -// ******************************************************************************************* -uint32_t CLZDiffBase::get_Nrun_len(const uint8_t* s, const uint32_t max_len) const -{ - if (*s != N_code || *(s + 1) != N_code || *(s + 2) != N_code) - return 0; - - uint32_t len; - for (len = 3; len < max_len && *(s + 
len) == N_code; ++len) - ; - - return len; -} - -// ******************************************************************************************* -bool CLZDiffBase::is_literal(const contig_t::const_iterator& p) const -{ - return (*p >= 'A' && *p <= 'A' + 20) || (*p == '!'); -} - -// ******************************************************************************************* -bool CLZDiffBase::is_Nrun(const contig_t::const_iterator& p) const -{ - return *p == N_run_starter_code; -} - -// ******************************************************************************************* -void CLZDiffBase::decode_literal(contig_t::const_iterator& p, uint8_t &c) -{ - if (*p == '!') - { - c = '!'; - ++p; - } - else - c = *p++ - 'A'; -} - -// ******************************************************************************************* -void CLZDiffBase::decode_Nrun(contig_t::const_iterator& p, uint32_t& len) -{ - int64_t raw_len; - - ++p; // prefix - read_int(p, raw_len); - ++p; // suffix - - len = (uint32_t) (raw_len + min_Nrun_len); -} - -// ******************************************************************************************* -uint64_t CLZDiffBase::get_code(const uint8_t* s) const -{ - uint64_t x = 0; - - uint32_t i = key_len % 4; - - switch (i) - { - case 3: - if (*s > 3) - return ~0ull; - x = (x << 2) + (uint64_t)*s++; - case 2: - if (*s > 3) - return ~0ull; - x = (x << 2) + (uint64_t)*s++; - case 1: - if (*s > 3) - return ~0ull; - x = (x << 2) + (uint64_t)*s++; - } - - for (; i < key_len; ) - { - if (*s > 3) - return ~0ull; - x = (x << 2) + (uint64_t) *s; - ++i; ++s; - - if (*s > 3) - return ~0ull; - x = (x << 2) + (uint64_t)*s; - ++i; ++s; - - if (*s > 3) - return ~0ull; - x = (x << 2) + (uint64_t)*s; - ++i; ++s; - - if (*s > 3) - return ~0ull; - x = (x << 2) + (uint64_t)*s; - ++i; ++s; - } - - return x; -} - -// ******************************************************************************************* -uint64_t CLZDiffBase::get_code_skip1(uint64_t code, const uint8_t* 
s) const -{ - s += key_len - 1; - - if (*s > 3) - return ~0ull; - - code = (code << 2) & key_mask; - - code += *s; - - return code; -} - -// ******************************************************************************************* -void CLZDiffBase::make_index16() -{ - uint32_t ref_size = (uint32_t)reference.size(); - MurMur64Hash mmh; - - const uint8_t* ptr = reference.data(); - -#ifdef USE_SPARSE_HT - for (uint32_t i = 0; i + key_len < ref_size; i += hashing_step, ptr += hashing_step) -#else - for (uint32_t i = 0; i + key_len < ref_size; ++i, ++ptr) -#endif - { - uint64_t x = get_code(ptr); - if (x == ~0ull) - continue; - uint64_t pos = mmh(x) & ht_mask; - - for (uint32_t j = 0; j < max_no_tries; ++j) - if (ht16[(pos + j) & ht_mask] == empty_key16) - { - ht16[(pos + j) & ht_mask] = i / hashing_step; - break; - } - } -} - -// ******************************************************************************************* -void CLZDiffBase::make_index32() -{ - uint32_t ref_size = (uint32_t)reference.size(); - MurMur64Hash mmh; - - const uint8_t* ptr = reference.data(); - -#ifdef USE_SPARSE_HT - for (uint32_t i = 0; i + key_len < ref_size; i += hashing_step, ptr += hashing_step) -#else - for (uint32_t i = 0; i + key_len < ref_size; ++i, ++ptr) -#endif - { - uint64_t x = get_code(ptr); - if (x == ~0ull) - continue; - uint64_t pos = mmh(x) & ht_mask; - - for (uint32_t j = 0; j < max_no_tries; ++j) - if (ht32[(pos + j) & ht_mask] == empty_key32) - { - ht32[(pos + j) & ht_mask] = i / hashing_step; - break; - } - } -} - -// ******************************************************************************************* -void CLZDiffBase::GetReference(contig_t& s) -{ - if (reference.empty()) - s.clear(); - else - s.assign(reference.begin(), reference.begin() + (reference.size() - key_len)); -} - - -// ******************************************************************************************* -// -// 
******************************************************************************************* -void CLZDiff_V1::encode_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos, contig_t& encoded) -{ - int dif_pos = (int)ref_pos - (int)pred_pos; - - append_int(encoded, dif_pos); - encoded.emplace_back(','); - append_int(encoded, len - min_match_len); - - encoded.emplace_back('.'); -} - -// ******************************************************************************************* -void CLZDiff_V1::decode_match(contig_t::const_iterator& p, uint32_t& ref_pos, uint32_t& len, uint32_t& pred_pos) -{ - int64_t raw_pos; - int64_t raw_len; - - read_int(p, raw_pos); - ++p; // ',' - - ref_pos = (uint32_t)(raw_pos + (int64_t)pred_pos); - - if (*p == '.') - len = ~0u; - else - { - read_int(p, raw_len); - len = (uint32_t)(raw_len + min_match_len); - } - - ++p; // '.' -} - -// ******************************************************************************************* -void CLZDiff_V1::Encode(const contig_t& text, contig_t& encoded) -{ - if (!index_ready) - prepare_index(); - - uint32_t text_size = (uint32_t)text.size(); - - encoded.clear(); - -#ifdef IMPROVED_LZ_ENCODING - if (text_size == reference.size() - key_len) - if (equal(text.begin(), text.end(), reference.begin())) - return; // equal sequences -#endif - - MurMur64Hash mmh; - - uint32_t i = 0; - uint32_t pred_pos = 0; - - - const uint8_t* text_ptr = text.data(); - -#ifdef USE_SPARSE_HT - uint32_t no_prev_literals = 0; -#endif - - for (; i + key_len < text_size; ) - { - uint64_t x = get_code(text_ptr); - - if (x == ~0ull) - { - uint32_t Nrun_len = get_Nrun_len(text_ptr, text_size - i); - - if (Nrun_len >= min_Nrun_len) - { - encode_Nrun(Nrun_len, encoded); - text_ptr += Nrun_len; - i += Nrun_len; -#ifdef USE_SPARSE_HT - no_prev_literals = 0; -#endif - } - else - { - encode_literal(*text_ptr, encoded); - - ++i; - ++pred_pos; - ++text_ptr; -#ifdef USE_SPARSE_HT - ++no_prev_literals; -#endif - } - - continue; - 
} - - uint64_t ht_pos = mmh(x) & ht_mask; - - uint32_t len_bck = 0; - uint32_t len_fwd = 0; - uint32_t match_pos = 0; - uint32_t max_len = text_size - i; - - if (short_ht_ver ? - !find_best_match16((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd) : - !find_best_match32((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd)) - { - encode_literal(*text_ptr, encoded); - - ++i; - ++text_ptr; - ++pred_pos; -#ifdef USE_SPARSE_HT - ++no_prev_literals; -#endif - continue; - } - else - { -#ifdef USE_SPARSE_HT - if (len_bck) - { - for (uint32_t k = 0; k < len_bck; ++k) - encoded.pop_back(); - match_pos -= len_bck; - pred_pos -= len_bck; - text_ptr -= len_bck; - i -= len_bck; - } -#endif - - encode_match(match_pos, len_bck + len_fwd, pred_pos, encoded); - - pred_pos = match_pos + len_bck + len_fwd; - i += len_bck + len_fwd; - text_ptr += len_bck + len_fwd; - -#ifdef USE_SPARSE_HT - no_prev_literals = 0; -#endif - } - } - - for (; i < text_size; ++i) - encode_literal(text[i], encoded); -} - -// ******************************************************************************************* -size_t CLZDiff_V1::Estimate(const contig_t& text, uint32_t bound) -{ - contig_t tmp; - - Encode(text, tmp); - - return tmp.size(); -} - -// ******************************************************************************************* -void CLZDiff_V1::Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded) -{ - uint8_t c; - uint32_t ref_pos, len; - uint32_t pred_pos = 0; - - decoded.clear(); - - for (auto p = encoded.begin(); p != encoded.end(); ) - { - if (is_literal(p)) - { - decode_literal(p, c); - decoded.emplace_back(c); - ++pred_pos; - } - else if (is_Nrun(p)) - { - decode_Nrun(p, len); - decoded.insert(decoded.end(), len, N_code); - } - else - { - decode_match(p, ref_pos, len, pred_pos); - decoded.insert(decoded.end(), reference.begin() + ref_pos, reference.begin() + ref_pos + len); - pred_pos = ref_pos + 
len; - } - } -} - - -// ******************************************************************************************* -// -// ******************************************************************************************* -void CLZDiff_V2::encode_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos, contig_t& encoded) -{ - int dif_pos = (int)ref_pos - (int)pred_pos; - - append_int(encoded, dif_pos); - if (len != ~0u) - { - encoded.emplace_back(','); - append_int(encoded, len - min_match_len); - } - - encoded.emplace_back('.'); -} - -// ******************************************************************************************* -void CLZDiff_V2::decode_match(contig_t::const_iterator& p, uint32_t& ref_pos, uint32_t& len, uint32_t& pred_pos) -{ - int64_t raw_pos; - int64_t raw_len; - - read_int(p, raw_pos); - ref_pos = (uint32_t)(raw_pos + (int64_t)pred_pos); - - if(*p == ',') - { - ++p; - read_int(p, raw_len); - len = (uint32_t)(raw_len + min_match_len); - ++p; // '.' - } - else - { - len = ~0u; - ++p; // '.' 
- } -} - -// ******************************************************************************************* -void CLZDiff_V2::Encode(const contig_t& text, contig_t& encoded) -{ - if (!index_ready) - prepare_index(); - - uint32_t text_size = (uint32_t)text.size(); - - encoded.clear(); - - if (text_size == reference.size() - key_len) - if (equal(text.begin(), text.end(), reference.begin())) - return; // equal sequences - - encoded.reserve(text.size() / 64); - - MurMur64Hash mmh; - - uint32_t i = 0; - uint32_t pred_pos = 0; - - const uint8_t* text_ptr = text.data(); - -#ifdef USE_SPARSE_HT - uint32_t no_prev_literals = 0; -#endif - - uint64_t x_prev = ~0ull; - uint64_t x; - - for (; i + key_len < text_size; ) - { - if (x_prev != ~0ull && no_prev_literals > 0) - x = get_code_skip1(x_prev, text_ptr); - else - x = get_code(text_ptr); - x_prev = x; - - if (x == ~0ull) - { - uint32_t Nrun_len = get_Nrun_len(text_ptr, text_size - i); - - if (Nrun_len >= min_Nrun_len) - { - encode_Nrun(Nrun_len, encoded); - text_ptr += Nrun_len; - i += Nrun_len; -#ifdef USE_SPARSE_HT - no_prev_literals = 0; -#endif - } - else - { - encode_literal(*text_ptr, encoded); - - ++i; - ++pred_pos; - ++text_ptr; -#ifdef USE_SPARSE_HT - ++no_prev_literals; -#endif - } - - continue; - } - - uint64_t ht_pos = mmh(x) & ht_mask; - - uint32_t len_bck = 0; - uint32_t len_fwd = 0; - uint32_t match_pos = 0; - uint32_t max_len = text_size - i; - - if (short_ht_ver ? 
- !find_best_match16((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd) : - !find_best_match32((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd)) - { - encode_literal(*text_ptr, encoded); - - ++i; - ++text_ptr; - ++pred_pos; -#ifdef USE_SPARSE_HT - ++no_prev_literals; -#endif - continue; - } - else - { -#ifdef USE_SPARSE_HT - if (len_bck) - { - for (uint32_t k = 0; k < len_bck; ++k) - encoded.pop_back(); - match_pos -= len_bck; - pred_pos -= len_bck; - text_ptr -= len_bck; - i -= len_bck; - } -#endif - - if (match_pos == pred_pos) - { - uint32_t e_size = encoded.size(); - for (uint32_t i = 1; i < e_size && i < match_pos; ++i) - { - if (encoded[e_size - i] < 'A' || encoded[e_size - i] > 'Z') - break; - if (encoded[e_size - i] - 'A' == reference[match_pos - i]) - encoded[e_size - i] = '!'; - } - } - - if (i + len_bck + len_fwd == text_size && match_pos + len_bck + len_fwd == reference.size() - key_len) // is match to end of sequence? 
- encode_match(match_pos, ~0u, pred_pos, encoded); - else - encode_match(match_pos, len_bck + len_fwd, pred_pos, encoded); - - pred_pos = match_pos + len_bck + len_fwd; - i += len_bck + len_fwd; - text_ptr += len_bck + len_fwd; - -#ifdef USE_SPARSE_HT - no_prev_literals = 0; -#endif - } - } - - for (; i < text_size; ++i) - encode_literal(text[i], encoded); -} - -// ******************************************************************************************* -void CLZDiff_V2::Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded) -{ - uint8_t c; - uint32_t ref_pos, len; - uint32_t pred_pos = 0; - - decoded.clear(); - - for (auto p = encoded.begin(); p != encoded.end(); ) - { - if (is_literal(p)) - { - decode_literal(p, c); - - if (c == '!') - c = reference[pred_pos]; - decoded.emplace_back(c); - ++pred_pos; - } - else if (is_Nrun(p)) - { - decode_Nrun(p, len); - decoded.insert(decoded.end(), len, N_code); - } - else - { - decode_match(p, ref_pos, len, pred_pos); - - if (len == ~0u) - len = reference.size() - ref_pos; - - decoded.insert(decoded.end(), reference.begin() + ref_pos, reference.begin() + ref_pos + len); - pred_pos = ref_pos + len; - } - } -} - -// ******************************************************************************************* -size_t CLZDiff_V2::Estimate(const contig_t& text, uint32_t bound) -{ - if (!index_ready) - prepare_index(); - - uint32_t text_size = (uint32_t)text.size(); - - uint32_t est_cost = 0; - - if (text_size == reference.size() - key_len) - if (equal(text.begin(), text.end(), reference.begin())) - return 0; // equal sequences - - MurMur64Hash mmh; - - uint32_t i = 0; - uint32_t pred_pos = 0; - - const uint8_t* text_ptr = text.data(); - -#ifdef USE_SPARSE_HT - uint32_t no_prev_literals = 0; -#endif - - uint64_t x_prev = ~0ull; - uint64_t x; - - for (; i + key_len < text_size; ) - { - if (est_cost > bound) - return est_cost; - - if (x_prev != ~0ull && no_prev_literals > 0) - x = get_code_skip1(x_prev, 
text_ptr); - else - x = get_code(text_ptr); - x_prev = x; - - if (x == ~0ull) - { - uint32_t Nrun_len = get_Nrun_len(text_ptr, text_size - i); - - if (Nrun_len >= min_Nrun_len) - { - est_cost += cost_Nrun(Nrun_len); - text_ptr += Nrun_len; - i += Nrun_len; -#ifdef USE_SPARSE_HT - no_prev_literals = 0; -#endif - } - else - { - ++est_cost; - - ++i; - ++pred_pos; - ++text_ptr; -#ifdef USE_SPARSE_HT - ++no_prev_literals; -#endif - } - - continue; - } - - uint64_t ht_pos = mmh(x) & ht_mask; - - uint32_t len_bck = 0; - uint32_t len_fwd = 0; - uint32_t match_pos = 0; - uint32_t max_len = text_size - i; - - if (short_ht_ver ? - !find_best_match16((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd) : - !find_best_match32((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd)) - { - ++est_cost; - - ++i; - ++text_ptr; - ++pred_pos; -#ifdef USE_SPARSE_HT - ++no_prev_literals; -#endif - continue; - } - else - { - if (i + len_bck + len_fwd == text_size && match_pos + len_bck + len_fwd == reference.size() - key_len) // is match to end of sequence? - est_cost += cost_match(match_pos, ~0u, pred_pos); - else - est_cost += cost_match(match_pos, len_bck + len_fwd, pred_pos); - - pred_pos = match_pos + len_bck + len_fwd; - i += len_bck + len_fwd; - text_ptr += len_bck + len_fwd; - -#ifdef USE_SPARSE_HT - no_prev_literals = 0; -#endif - } - } - - est_cost += text_size - i; - - return est_cost; -} - +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "lz_diff.h" +#include +#include + +// ******************************************************************************************* +CLZDiffBase::CLZDiffBase(const uint32_t _min_match_len) +{ + min_match_len = _min_match_len; + key_len = min_match_len - hashing_step + 1u; + key_mask = ~0ull >> (64 - 2 * key_len); + short_ht_ver = false; + ht_mask = 0; + ht_size = 0; + index_ready = false; +} + +// ******************************************************************************************* +CLZDiffBase::~CLZDiffBase() +{ +} + +// ******************************************************************************************* +bool CLZDiffBase::SetMinMatchLen(const uint32_t _min_match_len) +{ + if (!reference.empty() || index_ready) + return false; + + min_match_len = _min_match_len; + key_len = min_match_len - hashing_step + 1u; + key_mask = ~0ull >> (64 - 2 * key_len); + + return true; +} + +//#define USE_REVCOMP_REFERENCE +//#define USE_REV_REFERENCE +// ******************************************************************************************* +void CLZDiffBase::prepare_gen(const contig_t& _reference) +{ + reference = _reference; + + reference.resize(reference.size() + key_len, invalid_symbol); + +#ifdef USE_REVCOMP_REFERENCE + reference.reserve(2 * reference.size()); + + for (auto p = _reference.rbegin(); p != _reference.rend(); ++p) + if (*p < 4) + reference.emplace_back(3 - *p); + else + reference.emplace_back(*p); + + for (int i = 0; i < key_len; ++i) + reference.emplace_back(invalid_symbol); +#endif + +#ifdef USE_REV_REFERENCE + reference.reserve(2 * reference.size()); + + for (auto p = _reference.rbegin(); p != _reference.rend(); ++p) + reference.emplace_back(*p); + + for (int i = 0; i 
< key_len; ++i) + reference.emplace_back(invalid_symbol); +#endif + + reference.shrink_to_fit(); +} + +// ******************************************************************************************* +void CLZDiffBase::prepare_index() +{ + ht_size = 0; + + uint32_t no_prev_valid = 0; + +#ifdef USE_SPARSE_HT + uint32_t cnt_mod = 0; + uint32_t key_len_mod = key_len % hashing_step; + + for (auto c : reference) + { + if (c < 4) + ++no_prev_valid; + else + no_prev_valid = 0; + + if (++cnt_mod == hashing_step) + cnt_mod = 0; + + if (cnt_mod == key_len_mod && no_prev_valid >= key_len) + ++ht_size; + } +#else + for (auto c : reference) + { + if (c < 4) + ++no_prev_valid; + else + no_prev_valid = 0; + + if (no_prev_valid >= key_len) + ++ht_size; + } +#endif + + ht_size = (uint64_t)(ht_size / max_load_factor); + + while (ht_size & (ht_size - 1)) + ht_size &= ht_size - 1; + + ht_size <<= 1; + + if (ht_size < 8) + ht_size = 8; + + ht_mask = ht_size - 1; + + if (short_ht_ver) + { + ht16.resize(ht_size, empty_key16); + make_index16(); + } + else + { + ht32.resize(ht_size, empty_key32); + make_index32(); + } + + index_ready = true; +} + +// ******************************************************************************************* +void CLZDiffBase::Prepare(const contig_t& _reference) +{ + short_ht_ver = _reference.size() / hashing_step < 65535; + + prepare_gen(_reference); +} + +// ******************************************************************************************* +void CLZDiffBase::AssureIndex() +{ + if (!index_ready) + prepare_index(); +} + +// ******************************************************************************************* +void CLZDiffBase::GetCodingCostVector(const contig_t& text, vector& v_costs, const bool prefix_costs) const +{ + v_costs.clear(); + v_costs.reserve(text.size()); + + uint32_t text_size = (uint32_t)text.size(); + MurMur64Hash mmh; + + uint32_t i = 0; + uint32_t pred_pos = 0; + + const uint8_t* text_ptr = text.data(); + +//#ifdef 
USE_SPARSE_HT + uint32_t no_prev_literals = 0; +//#endif + + uint64_t x; + uint64_t x_prev = ~0ull; + + for (; i + key_len < text_size; ) + { + if (x_prev != ~0ull && no_prev_literals > 0) + x = get_code_skip1(x_prev, text_ptr); + else + x = get_code(text_ptr); + x_prev = x; + + if (x == ~0ull) + { + uint32_t Nrun_len = get_Nrun_len(text_ptr, text_size - i); + + if (Nrun_len >= min_Nrun_len) + { + auto tc = coding_cost_Nrun(Nrun_len); + if (prefix_costs) + { + v_costs.emplace_back(tc); + v_costs.insert(v_costs.end(), Nrun_len - 1, 0); + } + else + { + v_costs.insert(v_costs.end(), Nrun_len - 1, 0); + v_costs.emplace_back(tc); + } + + text_ptr += Nrun_len; + i += Nrun_len; +#ifdef USE_SPARSE_HT + no_prev_literals = 0; +#endif + } + else + { + v_costs.emplace_back(1); + ++i; + ++pred_pos; + ++text_ptr; +#ifdef USE_SPARSE_HT + ++no_prev_literals; +#endif + } + + continue; + } + + uint64_t ht_pos = mmh(x) & ht_mask; + + uint32_t len_bck = 0; + uint32_t len_fwd = 0; + uint32_t match_pos = 0; + uint32_t max_len = text_size - i; + + if (short_ht_ver ? 
+ !find_best_match16((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd) : + !find_best_match32((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd)) + { + v_costs.emplace_back(1); + ++i; + ++text_ptr; + ++pred_pos; +#ifdef USE_SPARSE_HT + ++no_prev_literals; +#endif + continue; + } + else + { +#ifdef USE_SPARSE_HT + if (len_bck) + { + for (uint32_t k = 0; k < len_bck; ++k) + v_costs.pop_back(); + match_pos -= len_bck; + pred_pos -= len_bck; + text_ptr -= len_bck; + i -= len_bck; + } +#endif + + auto tc = coding_cost_match(match_pos, len_bck + len_fwd, pred_pos); + + if (prefix_costs) + { + v_costs.emplace_back(tc); + v_costs.insert(v_costs.end(), len_bck + len_fwd - 1, 0); + } + else + { + v_costs.insert(v_costs.end(), len_bck + len_fwd - 1, 0); + v_costs.emplace_back(tc); + } + + pred_pos = match_pos + len_bck + len_fwd; + i += len_bck + len_fwd; + text_ptr += len_bck + len_fwd; + +#ifdef USE_SPARSE_HT + no_prev_literals = 0; +#endif + } + } + + for (; i < text_size; ++i) + v_costs.emplace_back(1); +} + +// ******************************************************************************************* +bool CLZDiffBase::find_best_match16(uint32_t ht_pos, const uint8_t* s, const uint32_t max_len, const uint32_t no_prev_literals, + uint32_t& ref_pos, uint32_t& len_bck, uint32_t& len_fwd) const +{ + len_fwd = 0; + len_bck = 0; + + uint32_t min_to_update = min_match_len; + + const uint8_t* ref_ptr = reference.data(); + + for (uint32_t i = 0; i < max_no_tries; ++i) + { + if (ht16[ht_pos] == empty_key16) + break; + + uint32_t h_pos = ((uint32_t) ht16[ht_pos]) * hashing_step; + const uint8_t* p = ref_ptr + h_pos; + + uint32_t f_len = compare_fwd((uint8_t*)s, (uint8_t*)p, max_len); + + if (f_len >= key_len) + { + uint32_t b_len = 0; + for (; b_len < min(no_prev_literals, h_pos); ++b_len) + if (*(s - b_len - 1) != *(p - b_len - 1)) + break; + + if (b_len + f_len > min_to_update) + { + len_bck = b_len; + len_fwd = 
f_len; + ref_pos = h_pos; + + min_to_update = b_len + f_len; + } + } + + ht_pos = (uint32_t)((ht_pos + 1u) & ht_mask); + } + + return len_bck + len_fwd >= min_match_len; +} + +// ******************************************************************************************* +bool CLZDiffBase::find_best_match32(uint32_t ht_pos, const uint8_t* s, const uint32_t max_len, const uint32_t no_prev_literals, + uint32_t& ref_pos, uint32_t& len_bck, uint32_t& len_fwd) const +{ + len_fwd = 0; + len_bck = 0; + + uint32_t min_to_update = min_match_len; + + const uint8_t* ref_ptr = reference.data(); + + for (uint32_t i = 0; i < max_no_tries; ++i) + { + if (ht32[ht_pos] == empty_key32) + break; + + uint32_t h_pos = ht32[ht_pos] * hashing_step; + const uint8_t* p = ref_ptr + h_pos; + + uint32_t f_len = compare_fwd((uint8_t*)s, (uint8_t*)p, max_len); + + if (f_len >= key_len) + { + uint32_t b_len = 0; + for (; b_len < min(no_prev_literals, h_pos); ++b_len) + if (*(s - b_len - 1) != *(p - b_len - 1)) + break; + + if (b_len + f_len > min_to_update) + { + len_bck = b_len; + len_fwd = f_len; + ref_pos = h_pos; + + min_to_update = b_len + f_len; + } + } + + ht_pos = (uint32_t) ((ht_pos + 1u) & ht_mask); + } + + return len_bck + len_fwd >= min_match_len; +} + +// ******************************************************************************************* +void CLZDiffBase::make_index16() +{ + uint32_t ref_size = (uint32_t)reference.size(); + MurMur64Hash mmh; + + const uint8_t* ptr = reference.data(); + +#ifdef USE_SPARSE_HT + for (uint32_t i = 0; i + key_len < ref_size; i += hashing_step, ptr += hashing_step) +#else + for (uint32_t i = 0; i + key_len < ref_size; ++i, ++ptr) +#endif + { + uint64_t x = get_code(ptr); + if (x == ~0ull) + continue; + uint64_t pos = mmh(x) & ht_mask; + + for (uint32_t j = 0; j < max_no_tries; ++j) + if (ht16[(pos + j) & ht_mask] == empty_key16) + { + ht16[(pos + j) & ht_mask] = i / hashing_step; + break; + } + } +} + +// 
******************************************************************************************* +void CLZDiffBase::make_index32() +{ + uint32_t ref_size = (uint32_t)reference.size(); + MurMur64Hash mmh; + + const uint8_t* ptr = reference.data(); + +#ifdef USE_SPARSE_HT + for (uint32_t i = 0; i + key_len < ref_size; i += hashing_step, ptr += hashing_step) +#else + for (uint32_t i = 0; i + key_len < ref_size; ++i, ++ptr) +#endif + { + uint64_t x = get_code(ptr); + if (x == ~0ull) + continue; + uint64_t pos = mmh(x) & ht_mask; + + for (uint32_t j = 0; j < max_no_tries; ++j) + if (ht32[(pos + j) & ht_mask] == empty_key32) + { + ht32[(pos + j) & ht_mask] = i / hashing_step; + break; + } + } +} + +// ******************************************************************************************* +void CLZDiffBase::GetReference(contig_t& s) +{ + if (reference.empty()) + s.clear(); + else + s.assign(reference.begin(), reference.begin() + (reference.size() - key_len)); +} + + +// ******************************************************************************************* +// +// ******************************************************************************************* +void CLZDiff_V1::encode_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos, contig_t& encoded) +{ + int dif_pos = (int)ref_pos - (int)pred_pos; + + append_int(encoded, dif_pos); + encoded.emplace_back(','); + append_int(encoded, len - min_match_len); + + encoded.emplace_back('.'); +} + +// ******************************************************************************************* +void CLZDiff_V1::decode_match(contig_t::const_iterator& p, uint32_t& ref_pos, uint32_t& len, uint32_t& pred_pos) +{ + int64_t raw_pos; + int64_t raw_len; + + read_int(p, raw_pos); + ++p; // ',' + + ref_pos = (uint32_t)(raw_pos + (int64_t)pred_pos); + + if (*p == '.') + len = ~0u; + else + { + read_int(p, raw_len); + len = (uint32_t)(raw_len + min_match_len); + } + + ++p; // '.' 
+} + +// ******************************************************************************************* +void CLZDiff_V1::Encode(const contig_t& text, contig_t& encoded) +{ + if (!index_ready) + prepare_index(); + + uint32_t text_size = (uint32_t)text.size(); + + encoded.clear(); + +#ifdef IMPROVED_LZ_ENCODING + if (text_size == reference.size() - key_len) + if (equal(text.begin(), text.end(), reference.begin())) + return; // equal sequences +#endif + + MurMur64Hash mmh; + + uint32_t i = 0; + uint32_t pred_pos = 0; + + const uint8_t* text_ptr = text.data(); + +//#ifdef USE_SPARSE_HT + uint32_t no_prev_literals = 0; +//#endif + + for (; i + key_len < text_size; ) + { + uint64_t x = get_code(text_ptr); + + if (x == ~0ull) + { + uint32_t Nrun_len = get_Nrun_len(text_ptr, text_size - i); + + if (Nrun_len >= min_Nrun_len) + { + encode_Nrun(Nrun_len, encoded); + text_ptr += Nrun_len; + i += Nrun_len; +#ifdef USE_SPARSE_HT + no_prev_literals = 0; +#endif + } + else + { + encode_literal(*text_ptr, encoded); + + ++i; + ++pred_pos; + ++text_ptr; +#ifdef USE_SPARSE_HT + ++no_prev_literals; +#endif + } + + continue; + } + + uint64_t ht_pos = mmh(x) & ht_mask; + + uint32_t len_bck = 0; + uint32_t len_fwd = 0; + uint32_t match_pos = 0; + uint32_t max_len = text_size - i; + + if (short_ht_ver ? 
+ !find_best_match16((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd) : + !find_best_match32((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd)) + { + encode_literal(*text_ptr, encoded); + + ++i; + ++text_ptr; + ++pred_pos; +#ifdef USE_SPARSE_HT + ++no_prev_literals; +#endif + continue; + } + else + { +#ifdef USE_SPARSE_HT + if (len_bck) + { + for (uint32_t k = 0; k < len_bck; ++k) + encoded.pop_back(); + match_pos -= len_bck; + pred_pos -= len_bck; + text_ptr -= len_bck; + i -= len_bck; + } +#endif + + encode_match(match_pos, len_bck + len_fwd, pred_pos, encoded); + + pred_pos = match_pos + len_bck + len_fwd; + i += len_bck + len_fwd; + text_ptr += len_bck + len_fwd; + +#ifdef USE_SPARSE_HT + no_prev_literals = 0; +#endif + } + } + + for (; i < text_size; ++i) + encode_literal(text[i], encoded); +} + +// ******************************************************************************************* +size_t CLZDiff_V1::Estimate(const contig_t& text, uint32_t bound) +{ + contig_t tmp; + + Encode(text, tmp); + + return tmp.size(); +} + +// ******************************************************************************************* +void CLZDiff_V1::Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded) +{ + uint8_t c; + uint32_t ref_pos, len; + uint32_t pred_pos = 0; + + decoded.clear(); + + for (auto p = encoded.begin(); p != encoded.end(); ) + { + if (is_literal(p)) + { + decode_literal(p, c); + decoded.emplace_back(c); + ++pred_pos; + } + else if (is_Nrun(p)) + { + decode_Nrun(p, len); + decoded.insert(decoded.end(), len, N_code); + } + else + { + decode_match(p, ref_pos, len, pred_pos); + decoded.insert(decoded.end(), reference.begin() + ref_pos, reference.begin() + ref_pos + len); + pred_pos = ref_pos + len; + } + } +} + + +// ******************************************************************************************* +// +// 
******************************************************************************************* +void CLZDiff_V2::encode_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos, contig_t& encoded) +{ + int dif_pos = (int)ref_pos - (int)pred_pos; + + append_int(encoded, dif_pos); + if (len != ~0u) + { + encoded.emplace_back(','); + append_int(encoded, len - min_match_len); + } + + encoded.emplace_back('.'); +} + +// ******************************************************************************************* +void CLZDiff_V2::decode_match(contig_t::const_iterator& p, uint32_t& ref_pos, uint32_t& len, uint32_t& pred_pos) +{ + int64_t raw_pos; + int64_t raw_len; + + read_int(p, raw_pos); + ref_pos = (uint32_t)(raw_pos + (int64_t)pred_pos); + + if(*p == ',') + { + ++p; + read_int(p, raw_len); + len = (uint32_t)(raw_len + min_match_len); + ++p; // '.' + } + else + { + len = ~0u; + ++p; // '.' + } +} + +// ******************************************************************************************* +void CLZDiff_V2::Encode(const contig_t& text, contig_t& encoded) +{ + if (!index_ready) + prepare_index(); + + uint32_t text_size = (uint32_t)text.size(); + + encoded.clear(); + + if (text_size == reference.size() - key_len) + if (equal(text.begin(), text.end(), reference.begin())) + return; // equal sequences + + encoded.reserve(text.size() / 64); + + MurMur64Hash mmh; + + uint32_t i = 0; + uint32_t pred_pos = 0; + + const uint8_t* text_ptr = text.data(); + +//#ifdef USE_SPARSE_HT + uint32_t no_prev_literals = 0; +//#endif + + uint64_t x_prev = ~0ull; + uint64_t x; + + for (; i + key_len < text_size; ) + { + if (x_prev != ~0ull && no_prev_literals > 0) + x = get_code_skip1(x_prev, text_ptr); + else + x = get_code(text_ptr); + x_prev = x; + + if (x == ~0ull) + { + uint32_t Nrun_len = get_Nrun_len(text_ptr, text_size - i); + + if (Nrun_len >= min_Nrun_len) + { + encode_Nrun(Nrun_len, encoded); + text_ptr += Nrun_len; + i += Nrun_len; +#ifdef USE_SPARSE_HT + 
no_prev_literals = 0; +#endif + } + else + { + encode_literal(*text_ptr, encoded); + + ++i; + ++pred_pos; + ++text_ptr; +#ifdef USE_SPARSE_HT + ++no_prev_literals; +#endif + } + + continue; + } + + uint64_t ht_pos = mmh(x) & ht_mask; + + uint32_t len_bck = 0; + uint32_t len_fwd = 0; + uint32_t match_pos = 0; + uint32_t max_len = text_size - i; + + if (short_ht_ver ? + !find_best_match16((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd) : + !find_best_match32((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd)) + { + encode_literal(*text_ptr, encoded); + + ++i; + ++text_ptr; + ++pred_pos; +#ifdef USE_SPARSE_HT + ++no_prev_literals; +#endif + continue; + } + else + { +#ifdef USE_SPARSE_HT + if (len_bck) + { + for (uint32_t k = 0; k < len_bck; ++k) + encoded.pop_back(); + match_pos -= len_bck; + pred_pos -= len_bck; + text_ptr -= len_bck; + i -= len_bck; + } +#endif + + if (match_pos == pred_pos) + { + uint32_t e_size = encoded.size(); + for (uint32_t i = 1; i < e_size && i < match_pos; ++i) + { + if (encoded[e_size - i] < 'A' || encoded[e_size - i] > 'Z') + break; + if (encoded[e_size - i] - 'A' == reference[match_pos - i]) + encoded[e_size - i] = '!'; + } + } + + if (i + len_bck + len_fwd == text_size && match_pos + len_bck + len_fwd == reference.size() - key_len) // is match to end of sequence? 
+ encode_match(match_pos, ~0u, pred_pos, encoded); + else + encode_match(match_pos, len_bck + len_fwd, pred_pos, encoded); + + pred_pos = match_pos + len_bck + len_fwd; + i += len_bck + len_fwd; + text_ptr += len_bck + len_fwd; + +#ifdef USE_SPARSE_HT + no_prev_literals = 0; +#endif + } + } + + for (; i < text_size; ++i) + encode_literal(text[i], encoded); +} + +// ******************************************************************************************* +void CLZDiff_V2::Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded) +{ + uint8_t c; + uint32_t ref_pos, len; + uint32_t pred_pos = 0; + + decoded.clear(); + + for (auto p = encoded.begin(); p != encoded.end(); ) + { + if (is_literal(p)) + { + decode_literal(p, c); + + if (c == '!') + c = reference[pred_pos]; + decoded.emplace_back(c); + ++pred_pos; + } + else if (is_Nrun(p)) + { + decode_Nrun(p, len); + decoded.insert(decoded.end(), len, N_code); + } + else + { + decode_match(p, ref_pos, len, pred_pos); + + if (len == ~0u) + len = reference.size() - ref_pos; + + decoded.insert(decoded.end(), reference.begin() + ref_pos, reference.begin() + ref_pos + len); + pred_pos = ref_pos + len; + } + } +} + +// ******************************************************************************************* +size_t CLZDiff_V2::Estimate(const contig_t& text, uint32_t bound) +{ + if (!index_ready) + prepare_index(); + + uint32_t text_size = (uint32_t)text.size(); + + uint32_t est_cost = 0; + + if (text_size == reference.size() - key_len) + if (equal(text.begin(), text.end(), reference.begin())) + return 0; // equal sequences + + MurMur64Hash mmh; + + uint32_t i = 0; + uint32_t pred_pos = 0; + + const uint8_t* text_ptr = text.data(); + +//#ifdef USE_SPARSE_HT + uint32_t no_prev_literals = 0; +//#endif + + uint64_t x_prev = ~0ull; + uint64_t x; + + for (; i + key_len < text_size; ) + { + if (est_cost > bound) + return est_cost; + + if (x_prev != ~0ull && no_prev_literals > 0) + x = get_code_skip1(x_prev, 
text_ptr); + else + x = get_code(text_ptr); + x_prev = x; + + if (x == ~0ull) + { + uint32_t Nrun_len = get_Nrun_len(text_ptr, text_size - i); + + if (Nrun_len >= min_Nrun_len) + { + est_cost += cost_Nrun(Nrun_len); + text_ptr += Nrun_len; + i += Nrun_len; +#ifdef USE_SPARSE_HT + no_prev_literals = 0; +#endif + } + else + { + ++est_cost; + + ++i; + ++pred_pos; + ++text_ptr; +#ifdef USE_SPARSE_HT + ++no_prev_literals; +#endif + } + + continue; + } + + uint64_t ht_pos = mmh(x) & ht_mask; + + uint32_t len_bck = 0; + uint32_t len_fwd = 0; + uint32_t match_pos = 0; + uint32_t max_len = text_size - i; + + if (short_ht_ver ? + !find_best_match16((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd) : + !find_best_match32((uint32_t)ht_pos, text_ptr, max_len, no_prev_literals, match_pos, len_bck, len_fwd)) + { + ++est_cost; + + ++i; + ++text_ptr; + ++pred_pos; +#ifdef USE_SPARSE_HT + ++no_prev_literals; +#endif + continue; + } + else + { + if (i + len_bck + len_fwd == text_size && match_pos + len_bck + len_fwd == reference.size() - key_len) // is match to end of sequence? + est_cost += cost_match(match_pos, ~0u, pred_pos); + else + est_cost += cost_match(match_pos, len_bck + len_fwd, pred_pos); + + pred_pos = match_pos + len_bck + len_fwd; + i += len_bck + len_fwd; + text_ptr += len_bck + len_fwd; + +#ifdef USE_SPARSE_HT + no_prev_literals = 0; +#endif + } + } + + est_cost += text_size - i; + + return est_cost; +} + // EOL \ No newline at end of file diff --git a/src/core/lz_diff.h b/src/common/lz_diff.h similarity index 61% rename from src/core/lz_diff.h rename to src/common/lz_diff.h index 6485edc..40aecdc 100644 --- a/src/core/lz_diff.h +++ b/src/common/lz_diff.h @@ -1,281 +1,439 @@ -#ifndef _LZ_DIFF_H -#define _LZ_DIFF_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include -#include "../core/utils.h" - -using namespace std; - -#define USE_SPARSE_HT - -// ******************************************************************************************* -class CLZDiffBase -{ -protected: - const uint32_t empty_key32 = ~0u; - const uint32_t empty_key16 = (uint16_t) ~0u; - const double max_load_factor = 0.7; - const uint32_t max_no_tries = 64; - const uint8_t invalid_symbol = 31; - const uint8_t N_code = 4; - const uint8_t N_run_starter_code = 30; - const uint32_t min_Nrun_len = 4; - -#ifdef USE_SPARSE_HT - const uint32_t hashing_step = 4; -#else - const uint32_t hashing_step = 1; -#endif - - contig_t reference; - vector ht32; - vector ht16; - uint64_t ht_size; - uint64_t ht_mask; - uint32_t key_len; - uint64_t key_mask; - uint32_t min_match_len; - bool short_ht_ver; - bool index_ready; - - void make_index16(); - void make_index32(); - - uint64_t get_code(const uint8_t* s) const; - uint64_t get_code_skip1(uint64_t code, const uint8_t* s) const; - uint32_t get_Nrun_len(const uint8_t* s, const uint32_t max_len) const; - - void encode_literal(const uint8_t c, contig_t& encoded); - void encode_literal_diff(const uint8_t c, const uint8_t r, contig_t& encoded); - void encode_Nrun(const uint32_t len, contig_t& encoded); - - uint32_t coding_cost_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos) const; - uint32_t coding_cost_Nrun(const uint32_t len) const; - uint32_t int_len(const uint32_t x) const - { - if (x < 10) return 1; - if (x < 100) return 2; - if (x < 1000) return 3; - if (x < 10000) return 4; - if (x < 100000) return 5; - if (x < 1000000) return 6; - if (x < 10000000) return 7; - if (x < 100000000) return 8; - if (x < 
1000000000) return 9; - return 10; - } - - bool is_literal(const contig_t::const_iterator& p) const; - bool is_Nrun(const contig_t::const_iterator& p) const; - void decode_literal(contig_t::const_iterator& p, uint8_t &c); - void decode_Nrun(contig_t::const_iterator& p, uint32_t& len); - - bool find_best_match16(uint32_t ht_pos, const uint8_t *s, const uint32_t max_len, const uint32_t no_prev_literals, - uint32_t& ref_pos, uint32_t& len_bck, uint32_t& len_fwd); - bool find_best_match32(uint32_t ht_pos, const uint8_t *s, const uint32_t max_len, const uint32_t no_prev_literals, - uint32_t& ref_pos, uint32_t& len_bck, uint32_t& len_fwd); - inline void append_int(contig_t& text, int64_t x) - { - if (x == 0) - { - text.emplace_back('0'); - return; - } - - if (x < 0) - { - text.push_back('-'); - x = -x; - } - - char tmp[16]; - char* p = tmp + 16; - - for (; x; x /= 10) - { - *--p = (uint8_t)('0' + (x % 10)); - x /= 10; - if (!x) - break; - *--p = (uint8_t)('0' + (x % 10)); - } - - size_t i = text.size(); - text.resize(text.size() + (tmp + 16 - p)); - - for (; p != tmp + 16; ++p, ++i) - text[i] = *p; - - text.insert(text.end(), p, tmp + 16); - } - - void read_int(contig_t::const_iterator& p, int64_t &x) - { - bool is_neg = false; - x = 0; - - if (*p == '-') - { - is_neg = true; - ++p; - } - - while (*p >= '0' && *p <= '9') - x = x * 10 + (int64_t)(*p++ - '0'); - - if (is_neg) - x = -x; - } - - uint32_t compare_fwd(uint8_t* p, uint8_t* q, uint32_t max_len) - { - uint32_t len = 0; - - switch (max_len % 4) - { - case 3: - if (*p++ != *q++) - return len; - ++len; - [[fallthrough]]; - case 2: - if (*p++ != *q++) - return len; - ++len; - [[fallthrough]]; - case 1: - if (*p++ != *q++) - return len; - ++len; - } - - for (; len < max_len; len += 4) - { - if (*p++ != *q++) - return len; - if (*p++ != *q++) - return len+1; - if (*p++ != *q++) - return len+2; - if (*p++ != *q++) - return len+3; - } - - return len; - } - - void prepare_gen(const contig_t& _reference); - void 
prepare_index(); - -public: - CLZDiffBase(const uint32_t _min_match_len = 18); - virtual ~CLZDiffBase(); - - bool SetMinMatchLen(const uint32_t _min_match_len = 18); - void Prepare(const contig_t& _reference); - - virtual void Encode(const contig_t& text, contig_t&encoded) = 0; - virtual void Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded) = 0; - - virtual size_t Estimate(const contig_t& text, uint32_t bound = 0) = 0; - - void GetReference(contig_t& s); - void GetCodingCostVector(const contig_t& text, vector &v_costs, const bool prefix_costs); -}; - -// ******************************************************************************************* -class CLZDiff_V1 : public CLZDiffBase -{ - void encode_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos, contig_t& encoded); - void decode_match(contig_t::const_iterator& p, uint32_t& ref_pos, uint32_t& len, uint32_t& pred_pos); - - -public: - CLZDiff_V1(const uint32_t _min_match_len = 18) : CLZDiffBase(_min_match_len) - {} - - virtual ~CLZDiff_V1() {}; - - virtual void Encode(const contig_t& text, contig_t& encoded); - virtual void Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded); - - virtual size_t Estimate(const contig_t& text, uint32_t bound = 0); -}; - -// ******************************************************************************************* -class CLZDiff_V2 : public CLZDiffBase -{ - void encode_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos, contig_t& encoded); - void decode_match(contig_t::const_iterator& p, uint32_t& ref_pos, uint32_t& len, uint32_t& pred_pos); - - uint32_t int_len(int x) - { - if (x >= 0) - return uint_len((uint32_t)x); - else - return 1 + uint_len((uint32_t)-x); - } - - uint32_t uint_len(uint32_t x) - { - if (x < 10) - return 1; - if (x < 100) - return 2; - if (x < 1000) - return 3; - if (x < 10000) - return 4; - if (x < 100000) - return 5; - if (x < 1000000) - return 6; - if (x < 
10000000) - return 7; - return 8; - } - - uint32_t cost_Nrun(uint32_t x) { - return 2 + uint_len(x); - } - - uint32_t cost_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos) - { - int dif_pos = (int)ref_pos - (int)pred_pos; - - uint32_t r = int_len(dif_pos); - - if (len != ~0u) - r += 1 + uint_len(len - min_match_len); - - ++r; - - return r; - } - -public: - CLZDiff_V2(const uint32_t _min_match_len = 18) : CLZDiffBase(_min_match_len) - {} - - virtual ~CLZDiff_V2() {}; - - virtual void Encode(const contig_t& text, contig_t& encoded); - virtual void Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded); - - virtual size_t Estimate(const contig_t& text, uint32_t bound = ~0u); -}; - -// EOF +#ifndef _LZ_DIFF_H +#define _LZ_DIFF_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include +#include "../common/utils.h" + +#include + +using namespace std; + +#define USE_SPARSE_HT + +// ******************************************************************************************* +class CLZDiffBase +{ +protected: + const uint32_t empty_key32 = ~0u; + const uint16_t empty_key16 = ~(uint16_t) 0u; + const double max_load_factor = 0.7; + const uint32_t max_no_tries = 64; + const uint8_t invalid_symbol = 31; + const uint8_t N_code = 4; + const uint8_t N_run_starter_code = 30; + const uint32_t min_Nrun_len = 4; + +#ifdef USE_SPARSE_HT + const uint32_t hashing_step = 4; +#else + const uint32_t hashing_step = 1; +#endif + + contig_t reference; + vector ht32; + vector ht16; + uint64_t ht_size; + uint64_t ht_mask; + uint32_t 
key_len; + uint64_t key_mask; + uint32_t min_match_len; + bool short_ht_ver; + bool index_ready; + + void make_index16(); + void make_index32(); + + uint64_t get_code(const uint8_t* s) const + { + uint64_t x = 0; + + uint32_t i = key_len % 4; + + switch (i) + { + case 3: + if (*s > 3) + return ~0ull; + x = (x << 2) + (uint64_t)*s++; + [[fallthrough]]; + case 2: + if (*s > 3) + return ~0ull; + x = (x << 2) + (uint64_t)*s++; + [[fallthrough]]; + case 1: + if (*s > 3) + return ~0ull; + x = (x << 2) + (uint64_t)*s++; + } + + for (; i < key_len; ) + { + if (*s > 3) + return ~0ull; + x = (x << 2) + (uint64_t)*s; + ++i; ++s; + + if (*s > 3) + return ~0ull; + x = (x << 2) + (uint64_t)*s; + ++i; ++s; + + if (*s > 3) + return ~0ull; + x = (x << 2) + (uint64_t)*s; + ++i; ++s; + + if (*s > 3) + return ~0ull; + x = (x << 2) + (uint64_t)*s; + ++i; ++s; + } + + return x; + }; + + uint64_t get_code_skip1(uint64_t code, const uint8_t* s) const + { + s += key_len - 1; + + if (*s > 3) + return ~0ull; + + code = (code << 2) & key_mask; + + code += *s; + + return code; + } + + uint32_t get_Nrun_len(const uint8_t* s, const uint32_t max_len) const + { + if (*s != N_code || *(s + 1) != N_code || *(s + 2) != N_code) + return 0; + + uint32_t len; + for (len = 3; len < max_len && *(s + len) == N_code; ++len) + ; + + return len; + } + + void encode_literal(const uint8_t c, contig_t& encoded) const + { + encoded.push_back('A' + c); + } + + void encode_literal_diff(const uint8_t c, const uint8_t r, contig_t& encoded) const + { + if (r == 0 || (r > 3 || c > 3)) + encoded.push_back(c); + else + { + if (c < r) + encoded.push_back(3 - c); + else + encoded.push_back(c - r); + } + } + + void encode_Nrun(const uint32_t len, contig_t& encoded) const + { + encoded.emplace_back(N_run_starter_code); // N-run start marker + append_int(encoded, len - min_Nrun_len); + encoded.emplace_back(N_code); // N-run stop marker + } + + uint32_t coding_cost_match(const uint32_t ref_pos, const uint32_t len, const 
uint32_t pred_pos) const + { + uint32_t r; + int dif_pos = (int)ref_pos - (int)pred_pos; + + if (dif_pos >= 0) + r = int_len((uint32_t)dif_pos); + else + r = int_len((uint32_t)-dif_pos) + 1; + + r += int_len(len - min_match_len) + 2; + + return r; + } + + uint32_t coding_cost_Nrun(const uint32_t len) const + { + return 2 + int_len(len - min_Nrun_len); + } + + uint32_t int_len(const uint32_t x) const + { + if (x < 10) return 1; + if (x < 100) return 2; + if (x < 1000) return 3; + if (x < 10000) return 4; + if (x < 100000) return 5; + if (x < 1000000) return 6; + if (x < 10000000) return 7; + if (x < 100000000) return 8; + if (x < 1000000000) return 9; + return 10; + } + + bool is_literal(const contig_t::const_iterator& p) const + { + return (*p >= 'A' && *p <= 'A' + 20) || (*p == '!'); + } + + bool is_Nrun(const contig_t::const_iterator& p) const + { + return *p == N_run_starter_code; + } + + void decode_literal(contig_t::const_iterator& p, uint8_t &c) const + { + if (*p == '!') + { + c = '!'; + ++p; + } + else + c = *p++ - 'A'; + } + + void decode_Nrun(contig_t::const_iterator& p, uint32_t& len) const + { + int64_t raw_len; + + ++p; // prefix + read_int(p, raw_len); + ++p; // suffix + + len = (uint32_t)(raw_len + min_Nrun_len); + } + + bool find_best_match16(uint32_t ht_pos, const uint8_t *s, const uint32_t max_len, const uint32_t no_prev_literals, + uint32_t& ref_pos, uint32_t& len_bck, uint32_t& len_fwd) const; + bool find_best_match32(uint32_t ht_pos, const uint8_t *s, const uint32_t max_len, const uint32_t no_prev_literals, + uint32_t& ref_pos, uint32_t& len_bck, uint32_t& len_fwd) const; + inline void append_int(contig_t& text, int64_t x) const + { + if (x == 0) + { + text.emplace_back('0'); + return; + } + + if (x < 0) + { + text.push_back('-'); + x = -x; + } + + char tmp[16]; + char* p = tmp + 16; + + for (; x; x /= 10) + { + *--p = (uint8_t)('0' + (x % 10)); + x /= 10; + if (!x) + break; + *--p = (uint8_t)('0' + (x % 10)); + } + + size_t i = text.size(); + 
text.resize(text.size() + (tmp + 16 - p)); + + for (; p != tmp + 16; ++p, ++i) + text[i] = *p; + + text.insert(text.end(), p, tmp + 16); + } + + void read_int(contig_t::const_iterator& p, int64_t &x) const + { + bool is_neg = false; + x = 0; + + if (*p == '-') + { + is_neg = true; + ++p; + } + + while (*p >= '0' && *p <= '9') + x = x * 10 + (int64_t)(*p++ - '0'); + + if (is_neg) + x = -x; + } + + uint32_t compare_fwd(uint8_t* p, uint8_t* q, uint32_t max_len) const + { + return (uint32_t)refresh::matching_length(p, q, max_len); + +#if 0 + uint32_t len = 0; + + auto p0 = p; + + switch (max_len % 4) + { + case 3: + if (*p++ != *q++) + return len; + ++len; + [[fallthrough]]; + case 2: + if (*p++ != *q++) + return len; + ++len; + [[fallthrough]]; + case 1: + if (*p++ != *q++) + return len; + ++len; + } + + for (; len < max_len; len += 4) + { +/* if (*p++ != *q++) + return len; + if (*p++ != *q++) + return len+1; + if (*p++ != *q++) + return len+2; + if (*p++ != *q++) + return len+3;*/ + int inc = *p == *q; p += inc; q += inc; + inc = *p == *q; p += inc; q += inc; + inc = *p == *q; p += inc; q += inc; + inc = *p == *q; p += inc; q += inc; + + if (!inc) + break; + } + +// return len; + + return p - p0; +#endif + } + + void prepare_gen(const contig_t& _reference); + void prepare_index(); + +public: + CLZDiffBase(const uint32_t _min_match_len = 18); + virtual ~CLZDiffBase(); + + bool SetMinMatchLen(const uint32_t _min_match_len = 18); + void Prepare(const contig_t& _reference); + + virtual void Encode(const contig_t& text, contig_t&encoded) = 0; + virtual void Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded) = 0; + + virtual size_t Estimate(const contig_t& text, uint32_t bound = 0) = 0; + + void AssureIndex(); + + void GetReference(contig_t& s); + void GetCodingCostVector(const contig_t& text, vector &v_costs, const bool prefix_costs) const; +}; + +// ******************************************************************************************* 
+class CLZDiff_V1 : public CLZDiffBase +{ + void encode_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos, contig_t& encoded); + void decode_match(contig_t::const_iterator& p, uint32_t& ref_pos, uint32_t& len, uint32_t& pred_pos); + + +public: + CLZDiff_V1(const uint32_t _min_match_len = 18) : CLZDiffBase(_min_match_len) + {} + + virtual ~CLZDiff_V1() {}; + + virtual void Encode(const contig_t& text, contig_t& encoded); + virtual void Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded); + + virtual size_t Estimate(const contig_t& text, uint32_t bound = 0); +}; + +// ******************************************************************************************* +class CLZDiff_V2 : public CLZDiffBase +{ + void encode_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos, contig_t& encoded); + void decode_match(contig_t::const_iterator& p, uint32_t& ref_pos, uint32_t& len, uint32_t& pred_pos); + + uint32_t int_len(int x) const + { + if (x >= 0) + return uint_len((uint32_t)x); + else + return 1 + uint_len((uint32_t)-x); + } + + uint32_t uint_len(uint32_t x) const + { + if (x < 10) + return 1; + if (x < 100) + return 2; + if (x < 1000) + return 3; + if (x < 10000) + return 4; + if (x < 100000) + return 5; + if (x < 1000000) + return 6; + if (x < 10000000) + return 7; + return 8; + } + + uint32_t cost_Nrun(uint32_t x) const + { + return 2 + uint_len(x); + } + + uint32_t cost_match(const uint32_t ref_pos, const uint32_t len, const uint32_t pred_pos) const + { + int dif_pos = (int)ref_pos - (int)pred_pos; + + uint32_t r = int_len(dif_pos); + + if (len != ~0u) + r += 1 + uint_len(len - min_match_len); + + ++r; + + return r; + } + +public: + CLZDiff_V2(const uint32_t _min_match_len = 18) : CLZDiffBase(_min_match_len) + {} + + virtual ~CLZDiff_V2() {}; + + virtual void Encode(const contig_t& text, contig_t& encoded); + virtual void Decode(const contig_t& reference, const contig_t& encoded, contig_t& decoded); + 
+ virtual size_t Estimate(const contig_t& text, uint32_t bound = ~0u); +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/queue.h b/src/common/queue.h similarity index 89% rename from src/core/queue.h rename to src/common/queue.h index 3869a7c..9508228 100644 --- a/src/core/queue.h +++ b/src/common/queue.h @@ -1,431 +1,470 @@ -#ifndef _QUEUE_H -#define _QUEUE_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include - -using namespace std; - -// ******************************************************************************************* -// Multithreading queue with registering mechanism: -// * The queue can report whether it is in wainitng for new data state or there will be no new data -template class CBoundedQueue -{ - typedef list> queue_t; - - queue_t q; - bool is_completed; - int n_producers; - uint32_t n_elements; - size_t current_cost; - size_t max_cost; - - mutable mutex mtx; // The mutex to synchronise on - condition_variable cv_queue_empty; - condition_variable cv_queue_full; - -public: - typename queue_t::iterator q_it; - - // ******************************************************************************************* - CBoundedQueue(const int _n_producers, const size_t _max_cost) - { - max_cost = _max_cost; - - Restart(_n_producers); - }; - - // ******************************************************************************************* - ~CBoundedQueue() - {}; - - // ******************************************************************************************* - void Restart(const int _n_producers) - { - unique_lock lck(mtx); - 
- is_completed = false; - n_producers = _n_producers; - n_elements = 0; - current_cost = 0; - } - - // ******************************************************************************************* - bool IsEmpty() - { - lock_guard lck(mtx); - return n_elements == 0; - } - - // ******************************************************************************************* - bool IsCompleted() - { - lock_guard lck(mtx); - - return n_elements == 0 && n_producers == 0; - } - - // ******************************************************************************************* - void MarkCompleted() - { - lock_guard lck(mtx); - n_producers--; - - if (!n_producers) - cv_queue_empty.notify_all(); - } - - // ******************************************************************************************* - void Push(const T data, const size_t cost) - { - unique_lock lck(mtx); - cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); - - bool was_empty = n_elements == 0; - q.emplace_back(data, cost); - ++n_elements; - current_cost += cost; - - if (was_empty) - cv_queue_empty.notify_all(); - } - - // ******************************************************************************************* - void Emplace(T&& data, const size_t cost) - { - unique_lock lck(mtx); - cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); - - bool was_empty = n_elements == 0; - q.emplace_back(move(data), cost); - ++n_elements; - current_cost += cost; - - if (was_empty) - cv_queue_empty.notify_all(); - } - - // ******************************************************************************************* - bool Pop(T& data) - { - unique_lock lck(mtx); - cv_queue_empty.wait(lck, [this] {return !this->q.empty() || !this->n_producers; }); - - if (n_elements == 0) - return false; - - data = move(q.front().first); - size_t cost = q.front().second; - - q.pop_front(); - --n_elements; - current_cost -= cost; - - if (n_elements == 0) - cv_queue_empty.notify_all(); - - cv_queue_full.notify_all(); - - 
return true; - } - - // ******************************************************************************************* - pair GetSize() - { - unique_lock lck(mtx); - - return make_pair(n_elements, current_cost); - } -}; - -// ******************************************************************************************* -template class CBoundedPQueue -{ - typedef multimap, T> queue_t; - - queue_t q; - bool is_completed; - int n_producers; - uint32_t n_elements; - size_t current_cost; - size_t max_cost; - - mutable mutex mtx; - condition_variable cv_queue_empty; - condition_variable cv_queue_full; - -public: - typename queue_t::iterator q_it; - - // ******************************************************************************************* - CBoundedPQueue(const int _n_producers, const size_t _max_cost) - { - current_cost = 0; - max_cost = _max_cost; - - Restart(_n_producers); - }; - - // ******************************************************************************************* - ~CBoundedPQueue() - {}; - - // ******************************************************************************************* - void Restart(const int _n_producers) - { - unique_lock lck(mtx); - - is_completed = false; - n_producers = _n_producers; - n_elements = 0; - } - - // ******************************************************************************************* - bool IsEmpty() - { - lock_guard lck(mtx); - return n_elements == 0; - } - - // ******************************************************************************************* - bool IsCompleted() - { - lock_guard lck(mtx); - - return n_elements == 0 && n_producers == 0; - } - - // ******************************************************************************************* - void MarkCompleted() - { - lock_guard lck(mtx); - n_producers--; - - if (!n_producers) - cv_queue_empty.notify_all(); - } - - // ******************************************************************************************* - void Push(const T data, const size_t priority, 
const size_t cost) - { - unique_lock lck(mtx); - cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); - - bool was_empty = n_elements == 0; - q.emplace(make_pair, data); - ++n_elements; - current_cost += cost; - - if (was_empty) - cv_queue_empty.notify_all(); -// cv_queue_empty.notify_one(); - } - - // ******************************************************************************************* - void Emplace(T&& data, const size_t priority, const size_t cost) - { - unique_lock lck(mtx); - cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); - - bool was_empty = n_elements == 0; - q.emplace(make_pair(priority, cost), move(data)); - ++n_elements; - current_cost += cost; - - if (was_empty) - cv_queue_empty.notify_all(); -// cv_queue_empty.notify_one(); - } - - // ******************************************************************************************* - bool PopLarge(T& data) - { - unique_lock lck(mtx); - cv_queue_empty.wait(lck, [this] {return !this->q.empty() || !this->n_producers; }); - - if (n_elements == 0) - return false; - - data = move(q.rbegin()->second); - size_t cost = q.rbegin()->first.second; - - // q.pop(); - q.erase(--q.end()); - --n_elements; - current_cost -= cost; - - if (n_elements == 0) - cv_queue_empty.notify_all(); - - cv_queue_full.notify_all(); - - return true; - } - - // ******************************************************************************************* - bool PopSmall(T& data) - { - unique_lock lck(mtx); - cv_queue_empty.wait(lck, [this] {return !this->q.empty() || !this->n_producers; }); - - if (n_elements == 0) - return false; - - data = move(q.begin()->second); - size_t cost = q.begin()->first.second; - - q.erase(q.begin()); - --n_elements; - current_cost -= cost; - - if (n_elements == 0) - cv_queue_empty.notify_all(); - - cv_queue_full.notify_all(); - - return true; - } - - // ******************************************************************************************* - pair GetSize() - { - 
unique_lock lck(mtx); - - return make_pair(n_elements, current_cost); - } -}; - -// ******************************************************************************************* -// Multithreading queue with registering mechanism: -// * The queue can report whether it is in wainitng for new data state or there will be no new data -template class CPriorityQueue -{ - typedef map queue_t; - - queue_t q; - bool is_completed; - int n_producers; - uint32_t n_elements; - size_t current_priority; - - mutable mutex mtx; // The mutex to synchronise on - condition_variable cv_queue_empty; - -public: - typename queue_t::iterator q_it; - - // ******************************************************************************************* - CPriorityQueue(const int _n_producers) - { - current_priority = 0; - - Restart(_n_producers); - }; - - // ******************************************************************************************* - ~CPriorityQueue() - {}; - - // ******************************************************************************************* - void Restart(const int _n_producers) - { - unique_lock lck(mtx); - - is_completed = false; - n_producers = _n_producers; - n_elements = 0; - current_priority = 0; - } - - // ******************************************************************************************* - bool IsEmpty() - { - lock_guard lck(mtx); - return n_elements == 0; - } - - // ******************************************************************************************* - bool IsCompleted() - { - lock_guard lck(mtx); - - return n_elements == 0 && n_producers == 0; - } - - // ******************************************************************************************* - void MarkCompleted() - { - lock_guard lck(mtx); - n_producers--; - - if (!n_producers) - cv_queue_empty.notify_all(); - } - - // ******************************************************************************************* - void Push(const T data, const size_t priority) - { - unique_lock lck(mtx); - - 
bool was_empty = n_elements == 0; - q.emplace(priority, data); - ++n_elements; - -// if (was_empty) - cv_queue_empty.notify_all(); - } - - // ******************************************************************************************* - void Emplace(const size_t priority, T&& data) - { - unique_lock lck(mtx); - -// bool was_empty = n_elements == 0; - q.emplace(priority, move(data)); - ++n_elements; - - cv_queue_empty.notify_all(); - } - - // ******************************************************************************************* - bool Pop(T& data) - { - unique_lock lck(mtx); - cv_queue_empty.wait(lck, [this] {return (!this->q.empty() && current_priority == q.begin()->first) || !this->n_producers; }); - - if (n_elements == 0) - return false; - - data = move(q.begin()->second); - - q.erase(q.begin()); - --n_elements; - ++current_priority; - - cv_queue_empty.notify_all(); - - return true; - } - - // ******************************************************************************************* - size_t GetSize() - { - unique_lock lck(mtx); - - return n_elements; - } -}; - -// EOF +#ifndef _QUEUE_H +#define _QUEUE_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include + +using namespace std; + +// ******************************************************************************************* +// Multithreading queue with registering mechanism: +// * The queue can report whether it is in wainitng for new data state or there will be no new data +template class CBoundedQueue +{ + typedef list> queue_t; + + queue_t q; + bool is_completed; + int n_producers; + uint32_t n_elements; + size_t current_cost; + size_t max_cost; + + mutable mutex mtx; // The mutex to synchronise on + condition_variable cv_queue_empty; + condition_variable cv_queue_full; + +public: + typename queue_t::iterator q_it; + + // ******************************************************************************************* + CBoundedQueue(const int _n_producers, const size_t _max_cost) + { + max_cost = _max_cost; + + Restart(_n_producers); + }; + + // ******************************************************************************************* + ~CBoundedQueue() + {}; + + // ******************************************************************************************* + void Restart(const int _n_producers) + { + unique_lock lck(mtx); + + is_completed = false; + n_producers = _n_producers; + n_elements = 0; + current_cost = 0; + } + + // ******************************************************************************************* + bool IsEmpty() + { + lock_guard lck(mtx); + return n_elements == 0; + } + + // ******************************************************************************************* + bool IsCompleted() + { + lock_guard lck(mtx); + + return n_elements == 0 && n_producers == 0; + } + + // 
******************************************************************************************* + void MarkCompleted() + { + lock_guard lck(mtx); + n_producers--; + + if (!n_producers) + cv_queue_empty.notify_all(); + } + + // ******************************************************************************************* + void Push(const T data, const size_t cost) + { + unique_lock lck(mtx); + cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); + + bool was_empty = n_elements == 0; + q.emplace_back(data, cost); + ++n_elements; + current_cost += cost; + + if (was_empty) + cv_queue_empty.notify_all(); + } + + // ******************************************************************************************* + void Emplace(T&& data, const size_t cost) + { + unique_lock lck(mtx); + cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); + + bool was_empty = n_elements == 0; + q.emplace_back(move(data), cost); + ++n_elements; + current_cost += cost; + + if (was_empty) + cv_queue_empty.notify_all(); + } + + // ******************************************************************************************* + bool Pop(T& data) + { + unique_lock lck(mtx); + cv_queue_empty.wait(lck, [this] {return !this->q.empty() || !this->n_producers; }); + + if (n_elements == 0) + return false; + + data = move(q.front().first); + size_t cost = q.front().second; + + q.pop_front(); + --n_elements; + current_cost -= cost; + + if (n_elements == 0) + cv_queue_empty.notify_all(); + + cv_queue_full.notify_all(); + + return true; + } + + // ******************************************************************************************* + pair GetSize() + { + unique_lock lck(mtx); + + return make_pair(n_elements, current_cost); + } +}; + +// ******************************************************************************************* +template class CBoundedPQueue +{ + typedef multimap, T> queue_t; + + queue_t q; + bool is_completed; + int n_producers; + uint32_t n_elements; + size_t 
current_cost; + size_t max_cost; + + mutable mutex mtx; + condition_variable cv_queue_empty; + condition_variable cv_queue_full; + +public: + typename queue_t::iterator q_it; + + enum class result_t { empty, completed, normal }; + + // ******************************************************************************************* + CBoundedPQueue(const int _n_producers, const size_t _max_cost) + { + current_cost = 0; + max_cost = _max_cost; + + Restart(_n_producers); + }; + + // ******************************************************************************************* + ~CBoundedPQueue() + {}; + + // ******************************************************************************************* + void Restart(const int _n_producers) + { + unique_lock lck(mtx); + + is_completed = false; + n_producers = _n_producers; + n_elements = 0; + } + + // ******************************************************************************************* + bool IsEmpty() + { + lock_guard lck(mtx); + return n_elements == 0; + } + + // ******************************************************************************************* + bool IsCompleted() + { + lock_guard lck(mtx); + + return n_elements == 0 && n_producers == 0; + } + + // ******************************************************************************************* + void MarkCompleted() + { + lock_guard lck(mtx); + n_producers--; + + if (!n_producers) + cv_queue_empty.notify_all(); + } + + // ******************************************************************************************* + void Push(const T data, const size_t priority, const size_t cost) + { + unique_lock lck(mtx); + cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); + + bool was_empty = n_elements == 0; + q.emplace(make_pair, data); + ++n_elements; + current_cost += cost; + + if (was_empty) + cv_queue_empty.notify_all(); +// cv_queue_empty.notify_one(); + } + + // ******************************************************************************************* + 
void Emplace(T&& data, const size_t priority, const size_t cost) + { + unique_lock lck(mtx); + cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); + + bool was_empty = n_elements == 0; + q.emplace(make_pair(priority, cost), move(data)); + ++n_elements; + current_cost += cost; + + if (was_empty) + cv_queue_empty.notify_all(); +// cv_queue_empty.notify_one(); + } + + // ******************************************************************************************* + void EmplaceNoLock(T&& data, const size_t priority, const size_t cost) + { +// unique_lock lck(mtx); +// cv_queue_full.wait(lck, [this] {return current_cost < max_cost; }); + + bool was_empty = n_elements == 0; + q.emplace(make_pair(priority, cost), move(data)); + ++n_elements; + current_cost += cost; + + if (was_empty) + cv_queue_empty.notify_all(); +// cv_queue_empty.notify_one(); + } + + // ******************************************************************************************* + void EmplaceManyNoCost(T&& data, const size_t priority, size_t n_items) + { + unique_lock lck(mtx); + +// bool was_empty = n_elements == 0; + for(size_t i = 0; i < n_items; ++i) + q.emplace(make_pair(priority, 0), move(data)); + n_elements += n_items; + + cv_queue_empty.notify_all(); + } + + // ******************************************************************************************* +// bool PopLarge(T& data) + CBoundedPQueue::result_t PopLarge(T& data) + { + unique_lock lck(mtx); + cv_queue_empty.wait(lck, [this] {return !this->q.empty() || !this->n_producers; }); + + if (n_elements == 0) + { + return n_producers ? 
result_t::empty : result_t::completed; + } +// return false; + +// data = move(q.rbegin()->second); + + data.swap(q.rbegin()->second); + + size_t cost = q.rbegin()->first.second; + + // q.pop(); + q.erase(--q.end()); + --n_elements; + current_cost -= cost; + + if (n_elements == 0) + cv_queue_empty.notify_all(); + + cv_queue_full.notify_all(); + +// return true; + return result_t::normal; + } + + // ******************************************************************************************* + bool PopSmall(T& data) + { + unique_lock lck(mtx); + cv_queue_empty.wait(lck, [this] {return !this->q.empty() || !this->n_producers; }); + + if (n_elements == 0) + return false; + + data = move(q.begin()->second); + size_t cost = q.begin()->first.second; + + q.erase(q.begin()); + --n_elements; + current_cost -= cost; + + if (n_elements == 0) + cv_queue_empty.notify_all(); + + cv_queue_full.notify_all(); + + return true; + } + + // ******************************************************************************************* + pair GetSize() + { + unique_lock lck(mtx); + + return make_pair(n_elements, current_cost); + } +}; + +// ******************************************************************************************* +// Multithreading queue with registering mechanism: +// * The queue can report whether it is in wainitng for new data state or there will be no new data +template class CPriorityQueue +{ + typedef map queue_t; + + queue_t q; + bool is_completed; + int n_producers; + uint32_t n_elements; + size_t current_priority; + + mutable mutex mtx; // The mutex to synchronise on + condition_variable cv_queue_empty; + +public: + typename queue_t::iterator q_it; + + // ******************************************************************************************* + CPriorityQueue(const int _n_producers) + { + current_priority = 0; + + Restart(_n_producers); + }; + + // ******************************************************************************************* + ~CPriorityQueue() + {}; 
+ + // ******************************************************************************************* + void Restart(const int _n_producers) + { + unique_lock lck(mtx); + + is_completed = false; + n_producers = _n_producers; + n_elements = 0; + current_priority = 0; + } + + // ******************************************************************************************* + bool IsEmpty() + { + lock_guard lck(mtx); + return n_elements == 0; + } + + // ******************************************************************************************* + bool IsCompleted() + { + lock_guard lck(mtx); + + return n_elements == 0 && n_producers == 0; + } + + // ******************************************************************************************* + void MarkCompleted() + { + lock_guard lck(mtx); + n_producers--; + + if (!n_producers) + cv_queue_empty.notify_all(); + } + + // ******************************************************************************************* + void Push(const T data, const size_t priority) + { + unique_lock lck(mtx); + + bool was_empty = n_elements == 0; + q.emplace(priority, data); + ++n_elements; + +// if (was_empty) + cv_queue_empty.notify_all(); + } + + // ******************************************************************************************* + void Emplace(const size_t priority, T&& data) + { + unique_lock lck(mtx); + +// bool was_empty = n_elements == 0; + q.emplace(priority, move(data)); + ++n_elements; + + cv_queue_empty.notify_all(); + } + + // ******************************************************************************************* + bool Pop(T& data) + { + unique_lock lck(mtx); + cv_queue_empty.wait(lck, [this] {return (!this->q.empty() && current_priority == q.begin()->first) || !this->n_producers; }); + + if (n_elements == 0) + return false; + + data = move(q.begin()->second); + + q.erase(q.begin()); + --n_elements; + ++current_priority; + + cv_queue_empty.notify_all(); + + return true; + } + + // 
******************************************************************************************* + size_t GetSize() + { + unique_lock lck(mtx); + + return n_elements; + } +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/segment.cpp b/src/common/segment.cpp similarity index 54% rename from src/core/segment.cpp rename to src/common/segment.cpp index e18fa04..1c54b87 100644 --- a/src/core/segment.cpp +++ b/src/common/segment.cpp @@ -1,425 +1,579 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "../core/segment.h" - -// ******************************************************************************************* -uint32_t CSegment::add_raw(const contig_t& s, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx) -{ - lock_guard lck(mtx); - - if (internal_state == internal_state_t::packed) - unpack(zstd_dctx); - - if (v_raw.size() == contigs_in_pack) - { - store_in_archive(v_raw, zstd_cctx); - v_raw.clear(); - } - - ++no_seqs; - v_raw.emplace_back(s); - - return (uint32_t) (no_seqs - 1u); -} - -// ******************************************************************************************* -uint32_t CSegment::add(const contig_t& s, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx) -{ - lock_guard lck(mtx); - - if (internal_state == internal_state_t::packed) - unpack(zstd_dctx); - - if (no_seqs == 0) - { - lz_diff->Prepare(s); - - store_in_archive(s, zstd_cctx); - - ref_size = s.size() + 1; - } - else - { - if (v_lzp.size() == contigs_in_pack) - { - store_in_archive(v_lzp, zstd_cctx); - v_lzp.clear(); - } - - contig_t delta; - - lz_diff->Encode(s, delta); - -#ifdef 
IMPROVED_LZ_ENCODING - if (delta.empty()) // same sequence as reference - return 0; -#endif - - auto p = find(v_lzp.begin(), v_lzp.end(), delta); - - if (p != v_lzp.end()) - return no_seqs - distance(p, v_lzp.end()); - - seq_size += s.size() + 1; - packed_size += delta.size() + 1; - - v_lzp.emplace_back(move(delta)); - } - - ++no_seqs; - - return no_seqs - 1u; -} - -// ******************************************************************************************* -uint64_t CSegment::estimate(const contig_t& s, uint32_t bound, ZSTD_DCtx* zstd_dctx) -{ - lock_guard lck(mtx); - - if (internal_state == internal_state_t::packed) - unpack(zstd_dctx); - - if (ref_size == 0) - return 0; - else - return lz_diff->Estimate(s, bound); -} - -// ******************************************************************************************* -void CSegment::get_coding_cost(const contig_t& s, vector& v_costs, const bool prefix_costs, ZSTD_DCtx* zstd_dctx) -{ - lock_guard lck(mtx); - - if (internal_state == internal_state_t::packed) - unpack(zstd_dctx); - - if (ref_size == 0) - return; - else - lz_diff->GetCodingCostVector(s, v_costs, prefix_costs); -} - -// ******************************************************************************************* -size_t CSegment::get_ref_size() -{ - return ref_size; -} - -// ******************************************************************************************* -void CSegment::finish(ZSTD_CCtx* zstd_ctx) -{ - if (!v_lzp.empty()) - store_in_archive(v_lzp, zstd_ctx); - if (!v_raw.empty()) - store_in_archive(v_raw, zstd_ctx); - if (!packed_delta.empty()) - store_compressed_delta_in_archive(); -} - -// ******************************************************************************************* -bool CSegment::get_raw(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx) -{ - // Retrive pack of delta-coded contigs - vector raw_seq; - vector pack_raw_seq; - vector zstd_raw_seq; - uint64_t raw_seq_size; - - int part_id = id_seq / contigs_in_pack; - int 
seq_in_part_id = id_seq % contigs_in_pack; - - tie(stream_id_delta, ignore) = in_archive->GetPart(name + ss_delta_ext(archive_version), part_id, zstd_raw_seq, raw_seq_size); - - if (raw_seq_size == 0) - pack_raw_seq = move(zstd_raw_seq); - else - { - pack_raw_seq.resize(raw_seq_size); - ZSTD_decompressDCtx(zstd_ctx, pack_raw_seq.data(), pack_raw_seq.size(), zstd_raw_seq.data(), zstd_raw_seq.size()); - } - - // Retrive the requested delta-coded contig - uint32_t b_pos = 0; - uint32_t e_pos = 0; - int cnt = 0; - - for (uint32_t i = 0; i < pack_raw_seq.size(); ++i) - { - if (pack_raw_seq[i] == contig_separator) - { - ++cnt; - if (cnt == seq_in_part_id) - b_pos = i + 1; - else if (cnt == seq_in_part_id + 1) - { - e_pos = i; - break; - } - } - } - - ctg.assign(pack_raw_seq.begin() + b_pos, pack_raw_seq.begin() + e_pos); - - return true; -} - -// ******************************************************************************************* -bool CSegment::get(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx) -{ - // Retrive reference contig - vector ref_seq; - vector zstd_ref_seq; - - vector zstd_delta_seq; - uint64_t delta_seq_size; - - uint64_t ref_seq_size; - int part_id = (id_seq - 1) / contigs_in_pack; - - tie(stream_id_ref, ignore, stream_id_delta, ignore) = in_archive->GetParts( - name + ss_ref_ext(archive_version), 0, zstd_ref_seq, ref_seq_size, - name + ss_delta_ext(archive_version), part_id, zstd_delta_seq, delta_seq_size); - - if (ref_seq_size == 0) - ref_seq = move(zstd_ref_seq); // No compression - else - { - ref_seq.resize(ref_seq_size); - - if (zstd_ref_seq.back() == 0) - ZSTD_decompressDCtx(zstd_ctx, ref_seq.data(), ref_seq.size(), zstd_ref_seq.data(), zstd_ref_seq.size() - 1u); - else - { - vector v_tuples; - v_tuples.resize(ref_seq_size+1); - - auto output_size = ZSTD_decompressDCtx(zstd_ctx, v_tuples.data(), v_tuples.size(), zstd_ref_seq.data(), zstd_ref_seq.size() - 1u); - - v_tuples.resize(output_size); - tuples2bytes(v_tuples, ref_seq); - } - 
} - - if (id_seq == 0) - { - ctg = move(ref_seq); - return true; - } - - // Retrive pack of delta-coded contigs - vector delta_seq; - - uint8_t* pack_delta_seq; - bool need_deallocate_pack_delta_seq = false; - - int seq_in_part_id = (id_seq - 1) % contigs_in_pack; - - if (delta_seq_size == 0) - { - pack_delta_seq = zstd_delta_seq.data(); - delta_seq_size = zstd_delta_seq.size(); - } - else - { - pack_delta_seq = new uint8_t[delta_seq_size]; - need_deallocate_pack_delta_seq = true; - ZSTD_decompressDCtx(zstd_ctx, pack_delta_seq, delta_seq_size, zstd_delta_seq.data(), zstd_delta_seq.size()); - } - - if (contigs_in_pack > 1) - { - // Retrive the requested delta-coded contig - uint32_t b_pos = 0; - uint32_t e_pos = 0; - int cnt = 0; - - for (uint32_t i = 0; i < delta_seq_size; ++i) - { - if (pack_delta_seq[i] == contig_separator) - { - ++cnt; - if (cnt == seq_in_part_id) - b_pos = i + 1; - else if (cnt == seq_in_part_id + 1) - { - e_pos = i; - break; - } - } - } - - delta_seq.assign(pack_delta_seq + b_pos, pack_delta_seq + e_pos); - } - else - delta_seq.assign(pack_delta_seq, pack_delta_seq + delta_seq_size - 1); - - // LZ decode delta-encoded contig - ctg.clear(); - lz_diff->Decode(ref_seq, delta_seq, ctg); - - if (need_deallocate_pack_delta_seq) - delete[] pack_delta_seq; - - return true; -} - -// ******************************************************************************************* -void CSegment::appending_init() -{ - if (internal_state != internal_state_t::none) - return; - - // Retrive reference contig - contig_t ref_seq; - - int in_stream_id_ref = in_archive->GetStreamId(name + ss_ref_ext(archive_version)); - int in_stream_id_delta = in_archive->GetStreamId(name + ss_delta_ext(archive_version)); - - int out_stream_id_ref = -1; - int out_stream_id_delta = -1; - - if(in_stream_id_ref >= 0) - out_stream_id_ref = out_archive->RegisterStream(name + ss_ref_ext(archive_version)); - if(in_stream_id_delta >= 0) - out_stream_id_delta = 
out_archive->RegisterStream(name + ss_delta_ext(archive_version)); - - // Copy of all parts except last one - if (in_stream_id_ref >= 0) - { - in_archive->GetPart(in_stream_id_ref, packed_ref_seq, raw_ref_seq_size); - out_archive->AddPart(out_stream_id_ref, packed_ref_seq, raw_ref_seq_size); - ref_transferred = true; - no_seqs = 1; - } - else - { - packed_ref_seq.clear(); - no_seqs = 0; - } - - if (in_stream_id_delta >= 0) - { - vector tmp_data; - uint64_t tmp_meta; - - uint32_t no_parts = (uint32_t) in_archive->GetNoParts(in_stream_id_delta); - for (uint32_t i = 0; i + 1 < no_parts; ++i) - { - in_archive->GetPart(in_stream_id_delta, tmp_data, tmp_meta); - out_archive->AddPart(out_stream_id_delta, tmp_data, tmp_meta); - no_seqs += contigs_in_pack; - } - - in_archive->GetPart(in_stream_id_delta, packed_delta, raw_delta_size); - } - - internal_state = internal_state_t::packed; - - stream_id_ref = out_stream_id_ref; - stream_id_delta = out_stream_id_delta; -} - -// ******************************************************************************************* -void CSegment::clear() -{ - lock_guard lck(mtx); - - no_seqs = 0; - ref_size = 0; - seq_size = packed_size = 0; - v_raw.clear(); - v_lzp.clear(); - - internal_state = internal_state_t::normal; -} - -// ******************************************************************************************* -uint64_t CSegment::get_no_seqs() -{ - lock_guard lck(mtx); - - return no_seqs; -} - -// ******************************************************************************************* -void CSegment::unpack(ZSTD_DCtx* zstd_ctx) -{ - if (!packed_ref_seq.empty()) - { - contig_t ref_seq; - - if (raw_ref_seq_size == 0) - ref_seq = move(packed_ref_seq); // No compression - else - { - vector v_tuples; - - v_tuples.resize(raw_ref_seq_size + 1); - - auto output_size = ZSTD_decompressDCtx(zstd_ctx, v_tuples.data(), v_tuples.size(), packed_ref_seq.data(), packed_ref_seq.size() - 1u); - - v_tuples.resize(output_size); - - if 
(packed_ref_seq.back() == 1) // marker (tuples to bytes conversion needed) - tuples2bytes(v_tuples, ref_seq); - else - ref_seq.swap(v_tuples); - } - - packed_ref_seq.clear(); - packed_ref_seq.shrink_to_fit(); - - lz_diff->Prepare(ref_seq); - ref_size = ref_seq.size() + 1; - } - - if (!packed_delta.empty()) - { - contig_t delta_seq; - - if (raw_delta_size == 0) - delta_seq = move(packed_delta); - else - { - delta_seq.resize(raw_delta_size); - ZSTD_decompressDCtx(zstd_ctx, delta_seq.data(), delta_seq.size(), packed_delta.data(), packed_delta.size()); - } - - packed_delta.clear(); - packed_delta.shrink_to_fit(); - - v_lzp.clear(); - - // Retrive the requested delta-coded contig - uint32_t b_pos = 0; - - if (contigs_in_pack > 1) - { - for (uint32_t i = 0; i < delta_seq.size(); ++i) - if (delta_seq[i] == contig_separator) - { - v_lzp.emplace_back(delta_seq.begin() + b_pos, delta_seq.begin() + i); - b_pos = i + 1; - } - } - else - v_lzp.emplace_back(delta_seq.begin(), delta_seq.begin() + (delta_seq.size() - 1)); - - no_seqs += (uint32_t) v_lzp.size(); - - if (ref_size == 0) // There is no reference sequence so the deltas are in fact raw sequences - swap(v_raw, v_lzp); - } - - internal_state = internal_state_t::normal; -} - -// EOF +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "segment.h" + +// ******************************************************************************************* +uint32_t CSegment::add_raw(const contig_t& s, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx) +{ + lock_guard lck(mtx); + + if (internal_state == internal_state_t::packed) + unpack(zstd_dctx); + + if (v_raw.size() == contigs_in_pack) + { + store_in_archive(v_raw, zstd_cctx); + v_raw.clear(); + } + + ++no_seqs; + v_raw.emplace_back(s); + + return (uint32_t) (no_seqs - 1u); +} + +// ******************************************************************************************* +uint32_t CSegment::add(const contig_t& s, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx) +{ + lock_guard lck(mtx); + + if (internal_state == internal_state_t::packed) + unpack(zstd_dctx); + + if (no_seqs == 0) + { + lz_diff->Prepare(s); + + store_in_archive(s, zstd_cctx); + + ref_size = s.size() + 1; + } + else + { + if (v_lzp.size() == contigs_in_pack) + { + store_in_archive(v_lzp, zstd_cctx); + v_lzp.clear(); + } + + contig_t delta; + + lz_diff->Encode(s, delta); + +#ifdef IMPROVED_LZ_ENCODING + if (delta.empty()) // same sequence as reference + return 0; +#endif + + auto p = find(v_lzp.begin(), v_lzp.end(), delta); + + if (p != v_lzp.end()) + return no_seqs - distance(p, v_lzp.end()); + + seq_size += s.size() + 1; + packed_size += delta.size() + 1; + + v_lzp.emplace_back(move(delta)); + } + + ++no_seqs; + + return no_seqs - 1u; +} + +// ******************************************************************************************* +uint64_t CSegment::estimate(const contig_t& s, uint32_t bound, ZSTD_DCtx* zstd_dctx) +{ + if (ref_size == 0) + return 0; + + { + lock_guard lck(mtx); + + if (internal_state == 
internal_state_t::packed) + unpack(zstd_dctx); + + lz_diff->AssureIndex(); + } + + return lz_diff->Estimate(s, bound); +} + +// ******************************************************************************************* +void CSegment::get_coding_cost(const contig_t& s, vector& v_costs, const bool prefix_costs, ZSTD_DCtx* zstd_dctx) +{ + if (ref_size == 0) + return; + + { + // lock_guard lck(mtx); + lock_guard lck(mtx); + + if (internal_state == internal_state_t::packed) + unpack(zstd_dctx); + lz_diff->AssureIndex(); + } + + lz_diff->GetCodingCostVector(s, v_costs, prefix_costs); +} + +// ******************************************************************************************* +size_t CSegment::get_ref_size() const +{ + return ref_size; +} + +// ******************************************************************************************* +void CSegment::finish(ZSTD_CCtx* zstd_ctx) +{ + if (!v_lzp.empty()) + store_in_archive(v_lzp, zstd_ctx); + if (!v_raw.empty()) + store_in_archive(v_raw, zstd_ctx); + if (!packed_delta.empty()) + store_compressed_delta_in_archive(); +} + +// ******************************************************************************************* +bool CSegment::get_raw(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx) +{ + // Retrive pack of raw contigs +// vector pack_raw_seq; + + int part_id = id_seq / contigs_in_pack; + int seq_in_part_id = id_seq % contigs_in_pack; + + uint8_t* pack_raw_seq = nullptr; + contig_t buf; + size_t pack_raw_seq_size = 0; + + vector zstd_raw_seq; + uint64_t raw_seq_size; + + if (!fast) + { + tie(stream_id_delta, ignore) = in_archive->GetPart(name + ss_delta_ext(archive_version), part_id, zstd_raw_seq, raw_seq_size); + + if (raw_seq_size == 0) + { + pack_raw_seq = zstd_raw_seq.data(); + pack_raw_seq_size = zstd_raw_seq.size(); + } + else + { + buf.resize(raw_seq_size); + ZSTD_decompressDCtx(zstd_ctx, buf.data(), buf.size(), zstd_raw_seq.data(), zstd_raw_seq.size()); + pack_raw_seq = buf.data(); + 
pack_raw_seq_size = buf.size(); + } + } + else + { + auto p_raw = pf_packed_raw_seq.find(part_id); + + if (p_raw == pf_packed_raw_seq.end()) + { + tie(stream_id_delta, ignore) = in_archive->GetPart(name + ss_delta_ext(archive_version), part_id, zstd_raw_seq, raw_seq_size); + + if (pf_packed_raw_seq.size() >= pf_max_size) + pf_packed_raw_seq.erase(pf_packed_raw_seq.begin()); + + tie(p_raw, ignore) = pf_packed_raw_seq.insert(make_pair(part_id, vector())); + + if (raw_seq_size == 0) + p_raw->second = move(zstd_raw_seq); + else + { + p_raw->second.resize(raw_seq_size); + ZSTD_decompressDCtx(zstd_ctx, p_raw->second.data(), p_raw->second.size(), zstd_raw_seq.data(), zstd_raw_seq.size()); + } + } + + pack_raw_seq = p_raw->second.data(); + pack_raw_seq_size = p_raw->second.size(); + } + + // Retrive the requested delta-coded contig + uint32_t b_pos = 0; + uint32_t e_pos = 0; + int cnt = 0; + + for (uint32_t i = 0; i < pack_raw_seq_size; ++i) + { + if (pack_raw_seq[i] == contig_separator) + { + ++cnt; + if (cnt == seq_in_part_id) + b_pos = i + 1; + else if (cnt == seq_in_part_id + 1) + { + e_pos = i; + break; + } + } + } + + ctg.assign(pack_raw_seq + b_pos, pack_raw_seq + e_pos); + + return true; +} + +// ******************************************************************************************* +bool CSegment::get(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx) +{ + // Retrive reference contig +// contig_t ref_seq; + vector zstd_ref_seq; + + vector zstd_delta_seq; + uint64_t delta_seq_size = 0; + + uint64_t ref_seq_size = 0; + int part_id = (id_seq - 1) / contigs_in_pack; + + if (!fast) + { + tie(stream_id_ref, ignore, stream_id_delta, ignore) = in_archive->GetParts( + name + ss_ref_ext(archive_version), 0, zstd_ref_seq, ref_seq_size, + name + ss_delta_ext(archive_version), part_id, zstd_delta_seq, delta_seq_size); + } + else + { + if(ref_seq.empty()) + tie(stream_id_ref, ignore) = in_archive->GetPart(name + ss_ref_ext(archive_version), 0, zstd_ref_seq, 
ref_seq_size); + + auto p_delta = pf_packed_delta_seq.find(part_id); + if (p_delta == pf_packed_delta_seq.end()) + { + tie(stream_id_delta, ignore) = in_archive->GetPart(name + ss_delta_ext(archive_version), part_id, zstd_delta_seq, delta_seq_size); + if (pf_packed_delta_seq.size() >= pf_max_size) + pf_packed_delta_seq.erase(pf_packed_delta_seq.begin()); + } + } + + if (ref_seq.empty()) + { + if (ref_seq_size == 0) + ref_seq = move(zstd_ref_seq); // No compression + else + { + ref_seq.resize(ref_seq_size); + + if (zstd_ref_seq.back() == 0) + ZSTD_decompressDCtx(zstd_ctx, ref_seq.data(), ref_seq.size(), zstd_ref_seq.data(), zstd_ref_seq.size() - 1u); + else + { + vector v_tuples; + v_tuples.resize(ref_seq_size + 1); + + auto output_size = ZSTD_decompressDCtx(zstd_ctx, v_tuples.data(), v_tuples.size(), zstd_ref_seq.data(), zstd_ref_seq.size() - 1u); + + v_tuples.resize(output_size); + tuples2bytes(v_tuples, ref_seq); + } + } + } + + if (id_seq == 0) + { + if (!fast) + { + ctg = move(ref_seq); + ref_seq.clear(); + ref_seq.shrink_to_fit(); + } + else + ctg = ref_seq; + + return true; + } + + uint8_t* pack_delta_seq; + bool need_deallocate_pack_delta_seq = false; + auto p_delta = pf_packed_delta_seq.begin(); + + if (!fast) + { + if (delta_seq_size == 0) + { + pack_delta_seq = zstd_delta_seq.data(); + delta_seq_size = zstd_delta_seq.size(); + } + else + { + pack_delta_seq = new uint8_t[delta_seq_size]; + need_deallocate_pack_delta_seq = true; + ZSTD_decompressDCtx(zstd_ctx, pack_delta_seq, delta_seq_size, zstd_delta_seq.data(), zstd_delta_seq.size()); + } + } + else + { + p_delta = pf_packed_delta_seq.find(part_id); + + if (p_delta == pf_packed_delta_seq.end()) + { + tie(p_delta, ignore) = pf_packed_delta_seq.insert(make_pair(part_id, make_pair(vector(), vector()))); + + if (delta_seq_size == 0) + p_delta->second.first = zstd_delta_seq; + else + { + p_delta->second.first.resize(delta_seq_size); + ZSTD_decompressDCtx(zstd_ctx, p_delta->second.first.data(), delta_seq_size, 
zstd_delta_seq.data(), zstd_delta_seq.size()); + } + + auto& sep_pos = p_delta->second.second; + + pack_delta_seq = p_delta->second.first.data(); + delta_seq_size = p_delta->second.first.size(); + + if (contigs_in_pack == 1) + { + sep_pos.emplace_back(0); + sep_pos.emplace_back(delta_seq_size); + } + else + { + sep_pos.emplace_back(0); + + for (uint32_t i = 0; i < delta_seq_size; ++i) + if (pack_delta_seq[i] == contig_separator) + sep_pos.emplace_back(i + 1); + } + } + else + { + pack_delta_seq = p_delta->second.first.data(); + delta_seq_size = p_delta->second.first.size(); + } + } + + // Retrive pack of delta-coded contigs + contig_t delta_seq; + int seq_in_part_id = (id_seq - 1) % contigs_in_pack; + + if (!fast) + { + if (contigs_in_pack > 1) + { + // Retrive the requested delta-coded contig + uint32_t b_pos = 0; + uint32_t e_pos = 0; + int cnt = 0; + + for (uint32_t i = 0; i < delta_seq_size; ++i) + { + if (pack_delta_seq[i] == contig_separator) + { + ++cnt; + if (cnt == seq_in_part_id) + b_pos = i + 1; + else if (cnt == seq_in_part_id + 1) + { + e_pos = i; + break; + } + } + } + + delta_seq.assign(pack_delta_seq + b_pos, pack_delta_seq + e_pos); + } + else + delta_seq.assign(pack_delta_seq, pack_delta_seq + delta_seq_size - 1); + } + else + delta_seq.assign(pack_delta_seq + p_delta->second.second[seq_in_part_id], pack_delta_seq + p_delta->second.second[seq_in_part_id + 1] - 1); + + // LZ decode delta-encoded contig + ctg.clear(); + lz_diff->Decode(ref_seq, delta_seq, ctg); + + if (need_deallocate_pack_delta_seq) + delete[] pack_delta_seq; + + if (!fast) + { + ref_seq.clear(); + ref_seq.shrink_to_fit(); + } + + return true; +} + +// ******************************************************************************************* +bool CSegment::get_raw_locked(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx) +{ + lock_guard lck(mtx); + + return get_raw(id_seq, ctg, zstd_ctx); +} + +// 
******************************************************************************************* +bool CSegment::get_locked(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx) +{ + lock_guard lck(mtx); + + return get(id_seq, ctg, zstd_ctx); +} + +// ******************************************************************************************* +void CSegment::appending_init() +{ + if (internal_state != internal_state_t::none) + return; + + // Retrive reference contig + contig_t ref_seq; + + int in_stream_id_ref = in_archive->GetStreamId(name + ss_ref_ext(archive_version)); + int in_stream_id_delta = in_archive->GetStreamId(name + ss_delta_ext(archive_version)); + + int out_stream_id_ref = -1; + int out_stream_id_delta = -1; + + if(in_stream_id_ref >= 0) + out_stream_id_ref = out_archive->RegisterStream(name + ss_ref_ext(archive_version)); + if(in_stream_id_delta >= 0) + out_stream_id_delta = out_archive->RegisterStream(name + ss_delta_ext(archive_version)); + + // Copy of all parts except last one + if (in_stream_id_ref >= 0) + { + in_archive->GetPart(in_stream_id_ref, packed_ref_seq, raw_ref_seq_size); + out_archive->AddPart(out_stream_id_ref, packed_ref_seq, raw_ref_seq_size); + ref_transferred = true; + no_seqs = 1; + } + else + { + packed_ref_seq.clear(); + no_seqs = 0; + } + + if (in_stream_id_delta >= 0) + { + vector tmp_data; + uint64_t tmp_meta; + + uint32_t no_parts = (uint32_t) in_archive->GetNoParts(in_stream_id_delta); + for (uint32_t i = 0; i + 1 < no_parts; ++i) + { + in_archive->GetPart(in_stream_id_delta, tmp_data, tmp_meta); + out_archive->AddPart(out_stream_id_delta, tmp_data, tmp_meta); + no_seqs += contigs_in_pack; + } + + in_archive->GetPart(in_stream_id_delta, packed_delta, raw_delta_size); + } + + internal_state = internal_state_t::packed; + + stream_id_ref = out_stream_id_ref; + stream_id_delta = out_stream_id_delta; +} + +// ******************************************************************************************* +void CSegment::clear() +{ 
+ lock_guard lck(mtx); + + no_seqs = 0; + ref_size = 0; + seq_size = packed_size = 0; + v_raw.clear(); + v_lzp.clear(); + + internal_state = internal_state_t::normal; +} + +// ******************************************************************************************* +uint64_t CSegment::get_no_seqs() +{ + lock_guard lck(mtx); + + return no_seqs; +} + +// ******************************************************************************************* +void CSegment::unpack(ZSTD_DCtx* zstd_ctx) +{ + bool empty_ctx = zstd_ctx == nullptr; + + if (!packed_ref_seq.empty()) + { + contig_t ref_seq; + + if (raw_ref_seq_size == 0) + ref_seq = move(packed_ref_seq); // No compression + else + { + vector v_tuples; + + v_tuples.resize(raw_ref_seq_size + 1); + + if (zstd_ctx == nullptr) + zstd_ctx = ZSTD_createDCtx(); + + auto output_size = ZSTD_decompressDCtx(zstd_ctx, v_tuples.data(), v_tuples.size(), packed_ref_seq.data(), packed_ref_seq.size() - 1u); + + v_tuples.resize(output_size); + + if (packed_ref_seq.back() == 1) // marker (tuples to bytes conversion needed) + tuples2bytes(v_tuples, ref_seq); + else + ref_seq.swap(v_tuples); + } + + packed_ref_seq.clear(); + packed_ref_seq.shrink_to_fit(); + + lz_diff->Prepare(ref_seq); + ref_size = ref_seq.size() + 1; + } + + if (!packed_delta.empty()) + { + contig_t delta_seq; + + if (raw_delta_size == 0) + delta_seq = move(packed_delta); + else + { + if (zstd_ctx == nullptr) + zstd_ctx = ZSTD_createDCtx(); + + delta_seq.resize(raw_delta_size); + ZSTD_decompressDCtx(zstd_ctx, delta_seq.data(), delta_seq.size(), packed_delta.data(), packed_delta.size()); + } + + packed_delta.clear(); + packed_delta.shrink_to_fit(); + + v_lzp.clear(); + + // Retrive the requested delta-coded contig + uint32_t b_pos = 0; + + if (contigs_in_pack > 1) + { + for (uint32_t i = 0; i < delta_seq.size(); ++i) + if (delta_seq[i] == contig_separator) + { + v_lzp.emplace_back(delta_seq.begin() + b_pos, delta_seq.begin() + i); + b_pos = i + 1; + } + } + else + 
v_lzp.emplace_back(delta_seq.begin(), delta_seq.begin() + (delta_seq.size() - 1)); + + no_seqs += (uint32_t) v_lzp.size(); + + if (ref_size == 0) // There is no reference sequence so the deltas are in fact raw sequences + swap(v_raw, v_lzp); + } + + if (empty_ctx && zstd_ctx != nullptr) + ZSTD_freeDCtx(zstd_ctx); + + internal_state = internal_state_t::normal; +} + +// EOF diff --git a/src/core/segment.h b/src/common/segment.h similarity index 93% rename from src/core/segment.h rename to src/common/segment.h index 658e6f1..56f063b 100644 --- a/src/core/segment.h +++ b/src/common/segment.h @@ -1,329 +1,342 @@ -#ifndef _SEGMENT_H -#define _SEGMENT_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. -// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include -#include -#include -#include -#include -#include -#include "../core/lz_diff.h" -#include "../core/archive.h" -#include "../core/defs.h" - -using namespace std; - -class CSegment -{ - enum class internal_state_t {none, normal, packed}; - - const uint8_t contig_separator = 0xffu; - - string name; - shared_ptr in_archive; - shared_ptr out_archive; - uint32_t contigs_in_pack; - uint32_t min_match_len; - bool concatenated_genomes; - uint32_t archive_version; - - int stream_id_ref; - int stream_id_delta; - - internal_state_t internal_state; - - packed_block_t packed_ref_seq; - packed_block_t packed_delta; - uint64_t raw_ref_seq_size = 0; - uint64_t raw_delta_size = 0; - bool ref_transferred = false; - - unique_ptr lz_diff; - - uint32_t no_seqs; - vector v_lzp; - -public: - vector v_raw; -private: - - uint64_t ref_size; - uint64_t seq_size; - uint64_t packed_size; - mutex mtx; - - // 
******************************************************************************************* - void bytes2tuples(const vector& v_bytes, vector& v_tuples) - { - uint8_t me = 0; - - if(!v_bytes.empty()) - me = *max_element(v_bytes.begin(), v_bytes.end()); - - if (me < 4) - bytes2tuples_impl<4, 4>(v_bytes, v_tuples); - else if (me < 6) - bytes2tuples_impl<3, 6>(v_bytes, v_tuples); - else if (me < 16) - bytes2tuples_impl<2, 16>(v_bytes, v_tuples); - else - { - v_tuples = v_bytes; - v_tuples.emplace_back(0x10u); - } - } - - // ******************************************************************************************* - void tuples2bytes(const vector& v_tuples, vector& v_bytes) - { - uint8_t marker = v_tuples.back(); - uint8_t no_bytes = marker >> 4; - uint8_t trailing_bytes = marker & 0xf; - uint32_t output_size = (uint32_t)((v_tuples.size() - 2) * no_bytes + trailing_bytes); - - v_bytes.reserve(output_size); - - if (no_bytes == 4) - tuples2bytes_impl<4, 4>(v_tuples, v_bytes, output_size); - else if (no_bytes == 3) - tuples2bytes_impl<3, 6>(v_tuples, v_bytes, output_size); - else if (no_bytes == 2) - tuples2bytes_impl<2, 16>(v_tuples, v_bytes, output_size); - else - v_bytes.assign(v_tuples.begin(), v_tuples.begin() + v_tuples.size() - 1u); - } - - // ******************************************************************************************* - template - void bytes2tuples_impl(const vector& v_bytes, vector& v_tuples) - { - v_tuples.reserve((v_bytes.size() - NO_BYTES - 1u) / NO_BYTES + 1); - - size_t i; - uint8_t c; - - for (i = 0; i + NO_BYTES <= v_bytes.size(); i += NO_BYTES) - { - c = 0; - - for (uint32_t j = 0; j < NO_BYTES; ++j) - c = c * MULT + v_bytes[i + j]; - - v_tuples.emplace_back(c); - } - - for (c = 0; i < v_bytes.size(); ++i) - c = c * MULT + v_bytes[i]; - - v_tuples.emplace_back(c); - - v_tuples.emplace_back((NO_BYTES << 4) + (v_bytes.size() % NO_BYTES)); // marker for decoding - } - - // 
******************************************************************************************* - template - void tuples2bytes_impl(const vector& v_tuples, vector& v_bytes, const uint32_t output_size) - { - uint32_t i, j; - - v_bytes.resize(output_size); - - for (i = j = 0; j + NO_BYTES <= output_size; ++i, j += NO_BYTES) - { - uint8_t c = v_tuples[i]; - - for (int k = NO_BYTES - 1u; k >= 0; --k) - { - v_bytes[(size_t) j + k] = c % MULT; - c /= MULT; - } - } - - uint8_t c = v_tuples[i]; - - uint32_t n = output_size % NO_BYTES; - - if(n) - for (int k = n-1u; k >= 0; --k) - { - v_bytes[(size_t) j + k] = c % MULT; - c /= MULT; - } - } - - // ******************************************************************************************* - void add_to_archive(const int stream_id, const contig_t& data, const int compression_level, ZSTD_CCtx* zstd_ctx) - { - size_t a_size = ZSTD_compressBound(data.size()); - uint8_t *packed = new uint8_t[a_size+1u]; - uint32_t packed_size = (uint32_t) ZSTD_compressCCtx(zstd_ctx, (void *) packed, a_size, data.data(), data.size(), compression_level); - packed[packed_size] = 0; // ZSTD compression marker - plain (0) - - if(packed_size + 1u < (uint32_t) data.size()) - { - vector v_packed(packed, packed + packed_size + 1); - out_archive->AddPartBuffered(stream_id, v_packed, data.size()); - } - else - { - out_archive->AddPartBuffered(stream_id, data, 0); - } - - delete[] packed; - } - - // ******************************************************************************************* - void add_to_archive_tuples(const int stream_id, const contig_t& data, const int compression_level, ZSTD_CCtx* zstd_ctx) - { - vector v_tuples; - - bytes2tuples(data, v_tuples); - - size_t a_size = ZSTD_compressBound(v_tuples.size()); - uint8_t *packed = new uint8_t[a_size+1u]; - uint32_t packed_size = (uint32_t) ZSTD_compressCCtx(zstd_ctx, (void *) packed, a_size, v_tuples.data(), v_tuples.size(), compression_level); - packed[packed_size] = 1; // ZSTD compression marker - 
tuples (1) - - if(packed_size + 1u < (uint32_t) data.size()) - { - vector v_packed(packed, packed + packed_size + 1); - out_archive->AddPartBuffered(stream_id, v_packed, data.size()); - } - else - { - out_archive->AddPartBuffered(stream_id, data, 0); - } - - delete[] packed; - } - - // ******************************************************************************************* - void store_in_archive(const contig_t& data, ZSTD_CCtx* zstd_ctx) - { - string stream_name = name + ss_ref_ext(archive_version); - - stream_id_ref = out_archive->RegisterStream(stream_name); - - double best_frac = 0.0; - double frac_limit = 0.5; - - for (uint32_t i = 4; i < 32; ++i) - { - uint32_t cnt = 0; - uint32_t cur_size = 0; - - for (uint32_t j = 0; (size_t) j + i < data.size(); ++j) - { - cnt += data[j] == data[(size_t) j + i]; - cur_size += data[j] < 4; // exclude non-ACGT from counting - } - - double frac = 0.0; - if (cur_size) - frac = (double)cnt / cur_size; - - if (frac > best_frac) - { - best_frac = frac; - - if (best_frac >= frac_limit) - break; - } - } - - if (best_frac < 0.5) - add_to_archive_tuples(stream_id_ref, data, 13, zstd_ctx); - else - add_to_archive(stream_id_ref, data, 19, zstd_ctx); - } - - // ******************************************************************************************* - void store_in_archive(const vector& v_data, ZSTD_CCtx* zstd_ctx) - { - contig_t pack; - - size_t res_size = v_data.size(); - for (const auto& x : v_data) - res_size += x.size(); - - pack.reserve(res_size); - - for (auto& x : v_data) - { - pack.insert(pack.end(), x.begin(), x.end()); - pack.push_back(contig_separator); - } - - if (stream_id_delta < 0) - stream_id_delta = out_archive->RegisterStream(name + ss_delta_ext(archive_version)); - - add_to_archive(stream_id_delta, pack, 17, zstd_ctx); - } - - // ******************************************************************************************* - void store_compressed_delta_in_archive() - { - if (stream_id_delta < 0) - { - string 
stream_name = name + ss_delta_ext(archive_version); - stream_id_delta = out_archive->RegisterStream(stream_name); - } - - out_archive->AddPartBuffered(stream_id_delta, packed_delta, raw_delta_size); - } - - void unpack(ZSTD_DCtx* zstd_ctx); - -public: - // ******************************************************************************************* - CSegment(const string &_name, shared_ptr _in_archive, shared_ptr _out_archive, - const uint32_t _contigs_in_pack, const uint32_t _min_match_len, const bool _concatenated_genomes, uint32_t _archive_version) : - name(_name), in_archive(_in_archive), out_archive(_out_archive), - contigs_in_pack(_contigs_in_pack), min_match_len(_min_match_len), concatenated_genomes(_concatenated_genomes), archive_version(_archive_version), - no_seqs(0), ref_size(0), seq_size(0), packed_size(0) - { - stream_id_ref = -1; - stream_id_delta = -1; - internal_state = internal_state_t::none; - - if (_archive_version < 2000) - lz_diff = make_unique(); - else - lz_diff = make_unique(); - - lz_diff->SetMinMatchLen(min_match_len); - }; - - ~CSegment() - { - } - - uint32_t add_raw(const contig_t& s, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx); - uint32_t add(const contig_t& s, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx); - uint64_t estimate(const contig_t& s, uint32_t bound, ZSTD_DCtx* zstd_dctx); - - void get_coding_cost(const contig_t& s, vector &v_costs, const bool prefix_costs, ZSTD_DCtx* zstd_dctx); - - void finish(ZSTD_CCtx* zstd_ctx); - bool get_raw(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx); - bool get(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx); - void clear(); - uint64_t get_no_seqs(); - - size_t get_ref_size(); - - void appending_init(); -}; - -// EOF +#ifndef _SEGMENT_H +#define _SEGMENT_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. 
+// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include +#include +#include +#include +#include +#include +#include +#include "../common/lz_diff.h" +#include "../common/archive.h" +#include "../common/defs.h" + +using namespace std; + +class CSegment +{ + enum class internal_state_t {none, normal, packed}; + + const uint8_t contig_separator = 0xffu; + + string name; + shared_ptr in_archive; + shared_ptr out_archive; + uint32_t contigs_in_pack; + uint32_t min_match_len; + bool concatenated_genomes; + uint32_t archive_version; + bool fast; + + int stream_id_ref; + int stream_id_delta; + + internal_state_t internal_state; + + packed_block_t packed_ref_seq; + packed_block_t packed_delta; + uint64_t raw_ref_seq_size = 0; + uint64_t raw_delta_size = 0; + bool ref_transferred = false; + + unique_ptr lz_diff; + + uint32_t no_seqs; + vector v_lzp; + + contig_t ref_seq; + map, vector>> pf_packed_delta_seq; + map> pf_packed_raw_seq; + const size_t pf_max_size = 2; + +public: + vector v_raw; +private: + + uint64_t ref_size; + uint64_t seq_size; + uint64_t packed_size; + mutex mtx; + + // ******************************************************************************************* + void bytes2tuples(const vector& v_bytes, vector& v_tuples) + { + uint8_t me = 0; + + if(!v_bytes.empty()) + me = *max_element(v_bytes.begin(), v_bytes.end()); + + if (me < 4) + bytes2tuples_impl<4, 4>(v_bytes, v_tuples); + else if (me < 6) + bytes2tuples_impl<3, 6>(v_bytes, v_tuples); + else if (me < 16) + bytes2tuples_impl<2, 16>(v_bytes, v_tuples); + else + { + v_tuples = v_bytes; + v_tuples.emplace_back(0x10u); + } + } + + // ******************************************************************************************* + void tuples2bytes(const vector& v_tuples, vector& 
v_bytes) + { + uint8_t marker = v_tuples.back(); + uint8_t no_bytes = marker >> 4; + uint8_t trailing_bytes = marker & 0xf; + uint32_t output_size = (uint32_t)((v_tuples.size() - 2) * no_bytes + trailing_bytes); + + v_bytes.reserve(output_size); + + if (no_bytes == 4) + tuples2bytes_impl<4, 4>(v_tuples, v_bytes, output_size); + else if (no_bytes == 3) + tuples2bytes_impl<3, 6>(v_tuples, v_bytes, output_size); + else if (no_bytes == 2) + tuples2bytes_impl<2, 16>(v_tuples, v_bytes, output_size); + else + v_bytes.assign(v_tuples.begin(), v_tuples.begin() + v_tuples.size() - 1u); + } + + // ******************************************************************************************* + template + void bytes2tuples_impl(const vector& v_bytes, vector& v_tuples) + { + v_tuples.reserve((v_bytes.size() - NO_BYTES - 1u) / NO_BYTES + 1); + + size_t i; + uint8_t c; + + for (i = 0; i + NO_BYTES <= v_bytes.size(); i += NO_BYTES) + { + c = 0; + + for (uint32_t j = 0; j < NO_BYTES; ++j) + c = c * MULT + v_bytes[i + j]; + + v_tuples.emplace_back(c); + } + + for (c = 0; i < v_bytes.size(); ++i) + c = c * MULT + v_bytes[i]; + + v_tuples.emplace_back(c); + + v_tuples.emplace_back((NO_BYTES << 4) + (v_bytes.size() % NO_BYTES)); // marker for decoding + } + + // ******************************************************************************************* + template + void tuples2bytes_impl(const vector& v_tuples, vector& v_bytes, const uint32_t output_size) + { + uint32_t i, j; + + v_bytes.resize(output_size); + + for (i = j = 0; j + NO_BYTES <= output_size; ++i, j += NO_BYTES) + { + uint8_t c = v_tuples[i]; + + for (int k = NO_BYTES - 1u; k >= 0; --k) + { + v_bytes[(size_t) j + k] = c % MULT; + c /= MULT; + } + } + + uint8_t c = v_tuples[i]; + + uint32_t n = output_size % NO_BYTES; + + if(n) + for (int k = n-1u; k >= 0; --k) + { + v_bytes[(size_t) j + k] = c % MULT; + c /= MULT; + } + } + + // ******************************************************************************************* + void 
add_to_archive(const int stream_id, const contig_t& data, const int compression_level, ZSTD_CCtx* zstd_ctx) + { + size_t a_size = ZSTD_compressBound(data.size()); + uint8_t *packed = new uint8_t[a_size+1u]; + uint32_t packed_size = (uint32_t) ZSTD_compressCCtx(zstd_ctx, (void *) packed, a_size, data.data(), data.size(), compression_level); + packed[packed_size] = 0; // ZSTD compression marker - plain (0) + + if(packed_size + 1u < (uint32_t) data.size()) + { + vector v_packed(packed, packed + packed_size + 1); + out_archive->AddPartBuffered(stream_id, v_packed, data.size()); + } + else + { + out_archive->AddPartBuffered(stream_id, data, 0); + } + + delete[] packed; + } + + // ******************************************************************************************* + void add_to_archive_tuples(const int stream_id, const contig_t& data, const int compression_level, ZSTD_CCtx* zstd_ctx) + { + vector v_tuples; + + bytes2tuples(data, v_tuples); + + size_t a_size = ZSTD_compressBound(v_tuples.size()); + uint8_t *packed = new uint8_t[a_size+1u]; + uint32_t packed_size = (uint32_t) ZSTD_compressCCtx(zstd_ctx, (void *) packed, a_size, v_tuples.data(), v_tuples.size(), compression_level); + packed[packed_size] = 1; // ZSTD compression marker - tuples (1) + + if(packed_size + 1u < (uint32_t) data.size()) + { + vector v_packed(packed, packed + packed_size + 1); + out_archive->AddPartBuffered(stream_id, v_packed, data.size()); + } + else + { + out_archive->AddPartBuffered(stream_id, data, 0); + } + + delete[] packed; + } + + // ******************************************************************************************* + void store_in_archive(const contig_t& data, ZSTD_CCtx* zstd_ctx) + { + string stream_name = name + ss_ref_ext(archive_version); + + stream_id_ref = out_archive->RegisterStream(stream_name); + + double best_frac = 0.0; + double frac_limit = 0.5; + + for (uint32_t i = 4; i < 32; ++i) + { + uint32_t cnt = 0; + uint32_t cur_size = 0; + + for (uint32_t j = 0; 
(size_t) j + i < data.size(); ++j) + { + cnt += data[j] == data[(size_t) j + i]; + cur_size += data[j] < 4; // exclude non-ACGT from counting + } + + double frac = 0.0; + if (cur_size) + frac = (double)cnt / cur_size; + + if (frac > best_frac) + { + best_frac = frac; + + if (best_frac >= frac_limit) + break; + } + } + + if (best_frac < 0.5) + add_to_archive_tuples(stream_id_ref, data, 13, zstd_ctx); + else + add_to_archive(stream_id_ref, data, 19, zstd_ctx); + } + + // ******************************************************************************************* + void store_in_archive(const vector& v_data, ZSTD_CCtx* zstd_ctx) + { + contig_t pack; + + size_t res_size = v_data.size(); + for (const auto& x : v_data) + res_size += x.size(); + + res_size += v_data.size() + 1; + + pack.reserve(res_size); + + for (auto& x : v_data) + { + pack.insert(pack.end(), x.begin(), x.end()); + pack.push_back(contig_separator); + } + + if (stream_id_delta < 0) + stream_id_delta = out_archive->RegisterStream(name + ss_delta_ext(archive_version)); + + add_to_archive(stream_id_delta, pack, 17, zstd_ctx); + } + + // ******************************************************************************************* + void store_compressed_delta_in_archive() + { + if (stream_id_delta < 0) + { + string stream_name = name + ss_delta_ext(archive_version); + stream_id_delta = out_archive->RegisterStream(stream_name); + } + + out_archive->AddPartBuffered(stream_id_delta, packed_delta, raw_delta_size); + } + + void unpack(ZSTD_DCtx* zstd_ctx); + +public: + // ******************************************************************************************* + CSegment(const string &_name, shared_ptr _in_archive, shared_ptr _out_archive, + const uint32_t _contigs_in_pack, const uint32_t _min_match_len, const bool _concatenated_genomes, uint32_t _archive_version, bool fast = false) : + name(_name), in_archive(_in_archive), out_archive(_out_archive), + contigs_in_pack(_contigs_in_pack), 
min_match_len(_min_match_len), concatenated_genomes(_concatenated_genomes), archive_version(_archive_version), fast(fast), + no_seqs(0), ref_size(0), seq_size(0), packed_size(0) + { + stream_id_ref = -1; + stream_id_delta = -1; + internal_state = internal_state_t::none; + + if (_archive_version < 2000) + lz_diff = make_unique(); + else + lz_diff = make_unique(); + + lz_diff->SetMinMatchLen(min_match_len); + }; + + ~CSegment() + { + } + + uint32_t add_raw(const contig_t& s, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx); + uint32_t add(const contig_t& s, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx); + uint64_t estimate(const contig_t& s, uint32_t bound, ZSTD_DCtx* zstd_dctx); + + void get_coding_cost(const contig_t& s, vector &v_costs, const bool prefix_costs, ZSTD_DCtx* zstd_dctx); + + void finish(ZSTD_CCtx* zstd_ctx); + bool get_raw(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx); + bool get(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx); + + bool get_raw_locked(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx); + bool get_locked(const uint32_t id_seq, contig_t& ctg, ZSTD_DCtx* zstd_ctx); + + void clear(); + uint64_t get_no_seqs(); + + size_t get_ref_size() const; + + void appending_init(); +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/utils.cpp b/src/common/utils.cpp similarity index 92% rename from src/core/utils.cpp rename to src/common/utils.cpp index 008a4e5..95f5f4d 100644 --- a/src/core/utils.cpp +++ b/src/common/utils.cpp @@ -1,104 +1,104 @@ -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "../core/utils.h" -#include - -// ******************************************************************************************* -string int_to_hex(uint32_t n) -{ - const char dig[] = "0123456789ABCDEF"; - - string res; - - do - { - res.push_back(dig[n & 0xfu]); - n /= 16; - } while (n); - - res.reserve(); - - return res; -} - -// ******************************************************************************************* -string int_to_base64(uint32_t n) -{ - const char dig[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_#"; - - string res; - - do - { - res.push_back(dig[n & 0x3fu]); - n /= 64; - } while (n); - - res.reserve(); - - return res; -} - -// ******************************************************************************************* -string ss_prefix(uint32_t archive_version) -{ - if (archive_version < 3000) - return "seg-"; - else - return "x"; -} - -// ******************************************************************************************* -string ss_base(uint32_t archive_version, uint32_t n) -{ - if (archive_version < 3000) - return "seg-" + to_string(n); - else - return "x" + int_to_base64(n); -} - -// ******************************************************************************************* -string ss_ref_name(uint32_t archive_version, uint32_t n) -{ - if (archive_version < 3000) - return "seg-" + to_string(n) + "-ref"; - else - return "x" + int_to_base64(n) + "r"; -} - -// ******************************************************************************************* -string ss_delta_name(uint32_t archive_version, uint32_t n) -{ - if (archive_version < 3000) - return "seg-" + to_string(n) + "-delta"; - else - return "x" + int_to_base64(n) + "d"; -} - -// 
******************************************************************************************* -string ss_ref_ext(uint32_t archive_version) -{ - if (archive_version < 3000) - return "-ref"; - else - return "r"; -} - -// ******************************************************************************************* -string ss_delta_ext(uint32_t archive_version) -{ - if (archive_version < 3000) - return "-delta"; - else - return "d"; -} - -// EOF +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "utils.h" +#include + +// ******************************************************************************************* +string int_to_hex(uint32_t n) +{ + const char dig[] = "0123456789ABCDEF"; + + string res; + + do + { + res.push_back(dig[n & 0xfu]); + n /= 16; + } while (n); + +// res.reserve(); + + return res; +} + +// ******************************************************************************************* +string int_to_base64(uint32_t n) +{ + const char dig[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_#"; + + string res; + + do + { + res.push_back(dig[n & 0x3fu]); + n /= 64; + } while (n); + +// res.reserve(); + + return res; +} + +// ******************************************************************************************* +string ss_prefix(uint32_t archive_version) +{ + if (archive_version < 3000) + return "seg-"; + else + return "x"; +} + +// ******************************************************************************************* +string ss_base(uint32_t archive_version, uint32_t n) +{ + if (archive_version < 3000) + return "seg-" + to_string(n); + 
else + return "x" + int_to_base64(n); +} + +// ******************************************************************************************* +string ss_ref_name(uint32_t archive_version, uint32_t n) +{ + if (archive_version < 3000) + return "seg-" + to_string(n) + "-ref"; + else + return "x" + int_to_base64(n) + "r"; +} + +// ******************************************************************************************* +string ss_delta_name(uint32_t archive_version, uint32_t n) +{ + if (archive_version < 3000) + return "seg-" + to_string(n) + "-delta"; + else + return "x" + int_to_base64(n) + "d"; +} + +// ******************************************************************************************* +string ss_ref_ext(uint32_t archive_version) +{ + if (archive_version < 3000) + return "-ref"; + else + return "r"; +} + +// ******************************************************************************************* +string ss_delta_ext(uint32_t archive_version) +{ + if (archive_version < 3000) + return "-delta"; + else + return "d"; +} + +// EOF diff --git a/src/core/utils.h b/src/common/utils.h similarity index 60% rename from src/core/utils.h rename to src/common/utils.h index 9103318..26cd0db 100644 --- a/src/core/utils.h +++ b/src/common/utils.h @@ -1,532 +1,333 @@ -#ifndef _UTILS_H -#define _UTILS_H - -// ******************************************************************************************* -// This file is a part of AGC software distributed under MIT license. 
-// The homepage of the AGC project is https://github.com/refresh-bio/agc -// -// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li -// -// Version: 3.1 -// Date : 2024-03-12 -// ******************************************************************************************* - -#include "../core/defs.h" -#include -#if defined(ARCH_X64) -#include -#elif defined(ARCH_ARM) -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -#include -#include - -// ********************************************************************************** -template -constexpr T dna_code(const T x) -{ - switch (x) - { - case (T) 'A': return (T) 0; break; - case (T) 'C': return (T) 1; break; - case (T) 'G': return (T) 2; break; - case (T) 'T': return (T) 3; break; - } - - return (T)4; -} - -// ********************************************************************************** -template -constexpr T reverse_complement(const T x) -{ - switch (x) - { - case dna_code('A'): return (T)dna_code('T'); break; - case dna_code('C'): return (T)dna_code('G'); break; - case dna_code('G'): return (T)dna_code('C'); break; - case dna_code('T'): return (T)dna_code('A'); break; - } - - return (T) 4; -} - -// ********************************************************************************** -template -constexpr T complement(const T x) -{ - switch (x) - { - case dna_code('A'): return (T)dna_code('T'); break; - case dna_code('C'): return (T)dna_code('G'); break; - case dna_code('G'): return (T)dna_code('C'); break; - case dna_code('T'): return (T)dna_code('A'); break; - } - - return (T)4; -} - -// ********************************************************************************** -constexpr uint8_t reverse_complement_alhpa(const uint8_t x) -{ - switch (x) - { - case 'A': return 'T'; break; - case 'C': return 'G'; break; - case 'G': return 'C'; break; - case 'T': return 'A'; break; - } - - return 'N'; -} - -// 
********************************************************************************** -template -constexpr uint32_t pop_count(T x) -{ - uint32_t r = 0; - - for (; x; ++r) - x &= x - 1; - - return r; -} - -// ********************************************************************************** -template -constexpr bool is_power_2(const T x) -{ - return (x & (x - (T)1)) == 0; -} - -// ********************************************************************************** -constexpr uint64_t ilog2(uint64_t x) -{ - uint64_t r = 0; - - for (; x; ++r) - x >>= 1; - - return r; -} - -// ********************************************************************************** -constexpr uint64_t no_bytes(uint64_t x) -{ - uint64_t r = 1; - - x >>= 8; - - for (; x; ++r) - x >>= 8; - - return r; -} - -// ********************************************************************************** -constexpr uint64_t zigzag_encode(int64_t x) -{ - if (x >= 0) - return (uint64_t)(2 * x); - else - return (uint64_t)(2 * (-x) - 1); -} - -// ********************************************************************************** -constexpr int64_t zigzag_decode(uint64_t x) -{ - if (x & 1) - return -(int64_t) (x + 1) / 2; - else - return (int64_t)(x / 2); -} - -// ********************************************************************************** -constexpr uint64_t zigzag_encode(uint64_t x_curr, uint64_t x_prev) -{ - if (x_curr < x_prev) - return 2 * (x_prev - x_curr) - 1u; - - if (x_curr < 2 * x_prev) - return 2 * (x_curr - x_prev); - - return x_curr; -} - -// ********************************************************************************** -constexpr uint64_t zigzag_decode(uint64_t x_val, uint64_t x_prev) -{ - if (x_val >= 2 * x_prev) - return x_val; - - if (x_val & 1) -// return (2 * x_prev - x_val - 1u) / 2; - return (2 * x_prev - x_val) / 2; // optimization (-1 is unnecessary due to /2) - - return (x_val + 2 * x_prev) / 2; -} - -// 
***************************************************************************************** -string ss_prefix(uint32_t archive_version); -string ss_base(uint32_t archive_version, uint32_t n); -string ss_ref_name(uint32_t archive_version, uint32_t n); -string ss_delta_name(uint32_t archive_version, uint32_t n); -string ss_ref_ext(uint32_t archive_version); -string ss_delta_ext(uint32_t archive_version); -string int_to_hex(uint32_t n); -string int_to_base64(uint32_t n); - - -// ***************************************************************************************** -// -class CBarrier -{ -public: - CBarrier(const CBarrier&) = delete; - CBarrier& operator=(const CBarrier&) = delete; - explicit CBarrier(unsigned int count) : - m_count(count), m_generation(0), - m_count_reset_value(count) - { - } - void arrive_and_wait() - { - std::unique_lock< std::mutex > lock(m_mutex); - unsigned int gen = m_generation; - if (--m_count == 0) - { - m_generation++; - m_count = m_count_reset_value; - m_cond.notify_all(); - return; - } - -// m_cond.wait(lock, [&] {return gen != m_generation; }); - while (gen == m_generation) - m_cond.wait(lock); - } -private: - std::mutex m_mutex; - std::condition_variable m_cond; - unsigned int m_count; - unsigned int m_generation; - unsigned int m_count_reset_value; -}; - -// ***************************************************************************************** -// -class CAtomicBarrier -{ -public: - CAtomicBarrier(const CAtomicBarrier&) = delete; - CAtomicBarrier& operator=(const CAtomicBarrier&) = delete; - explicit CAtomicBarrier(int32_t count) : - a_count(count - 1), a_generation(0), - count_reset_value(count - 1) - { - } - - void arrive_and_wait() - { - int32_t old_generation = a_generation; - - if (!a_count.fetch_sub(1, memory_order_relaxed)) - { - a_count = count_reset_value; - ++a_generation; - } - - while (a_generation == old_generation) - ; - } -private: - atomic a_count; - atomic a_generation; - int32_t count_reset_value; -}; - -// 
********************************************************************************** -struct MurMur32Hash -{ - std::size_t operator()(uint32_t h) const noexcept - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; - } -}; - -// ********************************************************************************** -// MurMurHash3 -struct MurMur64Hash -{ - std::size_t operator()(size_t h) const noexcept - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccdL; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53L; - h ^= h >> 33; - - return h; - } -}; - -// ********************************************************************************** -// MurMurHash3 for pair -struct MurMurPair64Hash -{ - std::size_t operator()(const std::pair& x) const noexcept - { - std::size_t h = x.first; - - h ^= h >> 33; - h *= 0xff51afd7ed558ccdL; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53L; - h ^= h >> 33; - - h ^= x.second; - - h ^= h >> 33; - h *= 0xff51afd7ed558ccdL; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53L; - h ^= h >> 33; - - return h; - } -}; - -// ********************************************************************************** -/// MurMurHash3 for strings (simple implementation) -struct MurMurStringsHash -{ - // Based on https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -private: - static uint64_t load64(const char* &p) - { - uint64_t x = (uint64_t)(*p++); - x <<= 8; x += (uint64_t)(*p++); - x <<= 8; x += (uint64_t)(*p++); - x <<= 8; x += (uint64_t)(*p++); - x <<= 8; x += (uint64_t)(*p++); - x <<= 8; x += (uint64_t)(*p++); - x <<= 8; x += (uint64_t)(*p++); - x <<= 8; x += (uint64_t)(*p++); - - return x; - } - - static inline uint64_t rotl64(uint64_t x, int8_t r) - { - return (x << r) | (x >> (64 - r)); - } - - static inline uint64_t fmix64(uint64_t k) - { - k ^= k >> 33; - k *= 0xff51afd7ed558ccdull; - k ^= k >> 33; - k *= 0xc4ceb9fe1a85ec53ull; - k ^= k >> 33; - - return k; - } - -public: - std::size_t operator()(const std::string& s) 
const - { - uint64_t h1 = 0; - uint64_t h2 = 0; - - const std::size_t n_blocks = s.size() / 16; - - const uint64_t c1 = 0x87c37b91114253d5ull; - const uint64_t c2 = 0x4cf5ad432745937full; - - const char* data = s.c_str(); - - for (std::size_t i = 0; i < s.size() / 16; i++) - { - uint64_t k1 = load64(data); - uint64_t k2 = load64(data); - - k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1; - - h1 = rotl64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729; - - k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2; - - h2 = rotl64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5; - } - - std::size_t tail = s.size() % 16; - - uint64_t k1 = 0; - uint64_t k2 = 0; - - switch (tail & 15) - { - case 15: k2 ^= ((uint64_t)data[14]) << 48; [[fallthrough]]; - case 14: k2 ^= ((uint64_t)data[13]) << 40; [[fallthrough]]; - case 13: k2 ^= ((uint64_t)data[12]) << 32; [[fallthrough]]; - case 12: k2 ^= ((uint64_t)data[11]) << 24; [[fallthrough]]; - case 11: k2 ^= ((uint64_t)data[10]) << 16; [[fallthrough]]; - case 10: k2 ^= ((uint64_t)data[9]) << 8; [[fallthrough]]; - case 9: k2 ^= ((uint64_t)data[8]) << 0; - k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2; - [[fallthrough]]; - case 8: k1 ^= ((uint64_t)data[7]) << 56; [[fallthrough]]; - case 7: k1 ^= ((uint64_t)data[6]) << 48; [[fallthrough]]; - case 6: k1 ^= ((uint64_t)data[5]) << 40; [[fallthrough]]; - case 5: k1 ^= ((uint64_t)data[4]) << 32; [[fallthrough]]; - case 4: k1 ^= ((uint64_t)data[3]) << 24; [[fallthrough]]; - case 3: k1 ^= ((uint64_t)data[2]) << 16; [[fallthrough]]; - case 2: k1 ^= ((uint64_t)data[1]) << 8; [[fallthrough]]; - case 1: k1 ^= ((uint64_t)data[0]) << 0; - k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1; - }; - - h1 ^= (uint64_t)s.size(); h2 ^= (uint64_t)s.size(); - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return h1 ^ h2; - } -}; - -// ********************************************************************************** -struct hash_pair { - template - size_t 
operator()(const pair& x) const - { - return hash{}(x.first) ^ hash{}(x.second); - } -}; - -template <> -struct std::hash> -{ - std::size_t operator()(const pair& k) const - { - using std::size_t; - using std::hash; - - return (hash()(k.first)) ^ (hash()(k.second)); - } -}; - -// ********************************************************************************** -class bloom_set_t { -// const uint32_t no_hashes = 2; - const uint32_t no_hashes = 3; - - MurMur64Hash mmh; - - vector arr; - - size_t no_elements; - size_t allocated; - size_t mask; - uint32_t mask_shift; - - uint64_t normalize_size(uint64_t size) - { - size *= no_hashes; - size *= 2; - - while (size & (size - 1)) - size &= size - 1; - - return max((uint64_t) 256, size * 2); - } - - void allocate(size_t size) - { - arr.clear(); - no_elements = 0; - - allocated = normalize_size(size); - - arr.resize(allocated / 64, 0); - - mask_shift = 6 * no_hashes; - mask = (allocated / 64 - 1) << mask_shift; - } - - void insert_impl(uint64_t x) - { - uint64_t h = mmh(x); - - uint64_t pos = (h & mask) >> mask_shift; - - arr[pos] |= (1ull << (h & 63)) | (1ull << ((h >> 6) & 63)) | (1ull << ((h >> 12) & 63)); -// arr[pos] |= (1ull << (h & 63)) | (1ull << ((h >> 6) & 63)); - - ++no_elements; - } - -public: - bloom_set_t(size_t size = 64) - { - allocate(size); - } - - void resize(size_t size) - { - allocate(size); - } - - template - void insert(Iter begin, Iter end) - { - for (auto p = begin; p != end; ++p) - insert_impl(*p); - } - - void insert(uint64_t x) - { - insert_impl(x); - } - - bool check(uint64_t x) - { - uint64_t h = mmh(x); - - uint64_t pos = (h & mask) >> mask_shift; - - return (arr[pos] & (1ull << (h & 63))) && (arr[pos] & (1ull << ((h >> 6) & 63))) && (arr[pos] & (1ull << ((h >> 12) & 63))); -// return (arr[pos] & (1ull << (h & 63))) && (arr[pos] & (1ull << ((h >> 6) & 63))); - } - - double filling_factor() - { - return (double)no_hashes * no_elements / allocated; - } -}; - -// EOF +#ifndef _UTILS_H +#define 
_UTILS_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "../common/defs.h" +#include +#if defined(ARCH_X64) +#include +#elif defined(ARCH_ARM) +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include +#include + +// ********************************************************************************** +template +constexpr T dna_code(const T x) +{ + switch (x) + { + case (T) 'A': return (T) 0; break; + case (T) 'C': return (T) 1; break; + case (T) 'G': return (T) 2; break; + case (T) 'T': return (T) 3; break; + } + + return (T)4; +} + +// ********************************************************************************** +template +constexpr T reverse_complement(const T x) +{ + switch (x) + { + case dna_code('A'): return (T)dna_code('T'); break; + case dna_code('C'): return (T)dna_code('G'); break; + case dna_code('G'): return (T)dna_code('C'); break; + case dna_code('T'): return (T)dna_code('A'); break; + } + + return (T) 4; +} + +// ********************************************************************************** +template +constexpr T complement(const T x) +{ + switch (x) + { + case dna_code('A'): return (T)dna_code('T'); break; + case dna_code('C'): return (T)dna_code('G'); break; + case dna_code('G'): return (T)dna_code('C'); break; + case dna_code('T'): return (T)dna_code('A'); break; + } + + return (T)4; +} + +// ********************************************************************************** +constexpr uint8_t reverse_complement_alhpa(const uint8_t x) +{ + switch (x) + { + case 'A': 
return 'T'; break; + case 'C': return 'G'; break; + case 'G': return 'C'; break; + case 'T': return 'A'; break; + } + + return 'N'; +} + +// ********************************************************************************** +constexpr uint64_t zigzag_encode(int64_t x) +{ + if (x >= 0) + return (uint64_t)(2 * x); + else + return (uint64_t)(2 * (-x) - 1); +} + +// ********************************************************************************** +constexpr int64_t zigzag_decode(uint64_t x) +{ + if (x & 1) + return -(int64_t) (x + 1) / 2; + else + return (int64_t)(x / 2); +} + +// ********************************************************************************** +constexpr uint64_t zigzag_encode(uint64_t x_curr, uint64_t x_prev) +{ + if (x_curr < x_prev) + return 2 * (x_prev - x_curr) - 1u; + + if (x_curr < 2 * x_prev) + return 2 * (x_curr - x_prev); + + return x_curr; +} + +// ********************************************************************************** +constexpr uint64_t zigzag_decode(uint64_t x_val, uint64_t x_prev) +{ + if (x_val >= 2 * x_prev) + return x_val; + + if (x_val & 1) +// return (2 * x_prev - x_val - 1u) / 2; + return (2 * x_prev - x_val) / 2; // optimization (-1 is unnecessary due to /2) + + return (x_val + 2 * x_prev) / 2; +} + +// ***************************************************************************************** +string ss_prefix(uint32_t archive_version); +string ss_base(uint32_t archive_version, uint32_t n); +string ss_ref_name(uint32_t archive_version, uint32_t n); +string ss_delta_name(uint32_t archive_version, uint32_t n); +string ss_ref_ext(uint32_t archive_version); +string ss_delta_ext(uint32_t archive_version); +string int_to_hex(uint32_t n); +string int_to_base64(uint32_t n); + +// ********************************************************************************** +struct MurMur32Hash +{ + std::size_t operator()(uint32_t h) const noexcept + { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + 
+ return h; + } +}; + +// ********************************************************************************** +// MurMurHash3 +struct MurMur64Hash +{ + std::size_t operator()(size_t h) const noexcept + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccdL; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53L; + h ^= h >> 33; + + return h; + } +}; + +/*class RapidHash { +public: + RapidHash() { +// key = { 0xa0761d6478bd642full, 0xe7037ed1a0b428dbull, 0x8ebc6af09c88c6e3ull, 0x589965cc75374cc3ull }; + } + + std::size_t operator()(size_t input) const { +// return mix(input ^ key[0], key[1]); + return mix(input ^ key0, key1); + } + +private: + const size_t key0 = 0xa0761d6478bd642full; + const size_t key1 = 0xe7037ed1a0b428dbull; +// std::array key; + + static uint64_t mix(uint64_t a, uint64_t b) { + uint64_t result = (a ^ (a >> 30)) * b; + return result ^ (result >> 27); + } +}; +using MurMur64Hash = RapidHash;*/ + +// ********************************************************************************** +// MurMurHash3 for pair +struct MurMurPair64Hash +{ + std::size_t operator()(const std::pair& x) const noexcept + { + std::size_t h = x.first; + + h ^= h >> 33; + h *= 0xff51afd7ed558ccdL; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53L; + h ^= h >> 33; + + h ^= x.second; + + h ^= h >> 33; + h *= 0xff51afd7ed558ccdL; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53L; + h ^= h >> 33; + + return h; + } +}; + +// ********************************************************************************** +/// MurMurHash3 for strings (simple implementation) +struct MurMurStringsHash +{ + // Based on https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp +private: + static uint64_t load64(const char* &p) + { + uint64_t x = (uint64_t)(*p++); + x <<= 8; x += (uint64_t)(*p++); + x <<= 8; x += (uint64_t)(*p++); + x <<= 8; x += (uint64_t)(*p++); + x <<= 8; x += (uint64_t)(*p++); + x <<= 8; x += (uint64_t)(*p++); + x <<= 8; x += (uint64_t)(*p++); + x <<= 8; x += (uint64_t)(*p++); + + return x; + } + + static 
inline uint64_t rotl64(uint64_t x, int8_t r) + { + return (x << r) | (x >> (64 - r)); + } + + static inline uint64_t fmix64(uint64_t k) + { + k ^= k >> 33; + k *= 0xff51afd7ed558ccdull; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53ull; + k ^= k >> 33; + + return k; + } + +public: + std::size_t operator()(const std::string& s) const + { + uint64_t h1 = 0; + uint64_t h2 = 0; + +// const std::size_t n_blocks = s.size() / 16; + + const uint64_t c1 = 0x87c37b91114253d5ull; + const uint64_t c2 = 0x4cf5ad432745937full; + + const char* data = s.c_str(); + + for (std::size_t i = 0; i < s.size() / 16; i++) + { + uint64_t k1 = load64(data); + uint64_t k2 = load64(data); + + k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= k1; + + h1 = rotl64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2; + + h2 = rotl64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5; + } + + std::size_t tail = s.size() % 16; + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch (tail & 15) + { + case 15: k2 ^= ((uint64_t)data[14]) << 48; [[fallthrough]]; + case 14: k2 ^= ((uint64_t)data[13]) << 40; [[fallthrough]]; + case 13: k2 ^= ((uint64_t)data[12]) << 32; [[fallthrough]]; + case 12: k2 ^= ((uint64_t)data[11]) << 24; [[fallthrough]]; + case 11: k2 ^= ((uint64_t)data[10]) << 16; [[fallthrough]]; + case 10: k2 ^= ((uint64_t)data[9]) << 8; [[fallthrough]]; + case 9: k2 ^= ((uint64_t)data[8]) << 0; + k2 *= c2; k2 = rotl64(k2, 33); k2 *= c1; h2 ^= k2; + [[fallthrough]]; + case 8: k1 ^= ((uint64_t)data[7]) << 56; [[fallthrough]]; + case 7: k1 ^= ((uint64_t)data[6]) << 48; [[fallthrough]]; + case 6: k1 ^= ((uint64_t)data[5]) << 40; [[fallthrough]]; + case 5: k1 ^= ((uint64_t)data[4]) << 32; [[fallthrough]]; + case 4: k1 ^= ((uint64_t)data[3]) << 24; [[fallthrough]]; + case 3: k1 ^= ((uint64_t)data[2]) << 16; [[fallthrough]]; + case 2: k1 ^= ((uint64_t)data[1]) << 8; [[fallthrough]]; + case 1: k1 ^= ((uint64_t)data[0]) << 0; + k1 *= c1; k1 = rotl64(k1, 31); k1 *= c2; h1 ^= 
k1; + }; + + h1 ^= (uint64_t)s.size(); h2 ^= (uint64_t)s.size(); + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return h1 ^ h2; + } +}; + +// EOF #endif \ No newline at end of file diff --git a/src/core/agc_compressor.cpp b/src/core/agc_compressor.cpp index 1be2fcb..09a7675 100644 --- a/src/core/agc_compressor.cpp +++ b/src/core/agc_compressor.cpp @@ -4,17 +4,22 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* #include #include +#include +#include #include -#include "../core/agc_compressor.h" -#include "../core/agc_decompressor.h" +#include "agc_compressor.h" +#include "agc_decompressor.h" #include +#include + +#include #ifdef _DEBUG #define NO_RADULS @@ -64,6 +69,12 @@ CAGCCompressor::~CAGCCompressor() close_compression(1); else if (working_mode == working_mode_t::appending) close_compression(1); + + if (zstd_dctx_for_fallback) + { + ZSTD_freeDCtx(zstd_dctx_for_fallback); + zstd_dctx_for_fallback = nullptr; + } } // ******************************************************************************************* @@ -368,6 +379,51 @@ void CAGCCompressor::appending_init() sort(term.second.begin(), term.second.end()); } +// ******************************************************************************************* +void CAGCCompressor::add_fallback_kmers(vector::iterator first, vector::iterator last) +{ + lock_guard lck(mtx_fallback_map); + + vector> empty_vec; + + for (auto p = first; p != last; ++p) + if(fallback_filter(*p)) + map_fallback_minimizers.emplace(*p, empty_vec); +} + +// ******************************************************************************************* +void CAGCCompressor::add_fallback_mapping(uint64_t splitter1, uint64_t splitter2, vector>& cand_fallback_kmers) +{ + lock_guard lck(mtx_fallback_map); + + pair 
sp_pair_dir{ splitter1, splitter2 }; + pair sp_pair_rc{ splitter2, splitter1 }; + + for (auto x : cand_fallback_kmers) + { + auto& mfm_kd = map_fallback_minimizers[x.first]; + auto& to_add = x.second ? sp_pair_dir : sp_pair_rc; + + if (count(mfm_kd.begin(), mfm_kd.end(), to_add) == 0) + mfm_kd.emplace_back(to_add); + } +} + +// ******************************************************************************************* +void CAGCCompressor::add_fallback_mapping(uint64_t splitter1, uint64_t splitter2, uint64_t kmer, bool is_dir_oriented) +{ +// lock_guard lck(mtx_fallback_map); + + pair sp_pair_dir{ splitter1, splitter2 }; + pair sp_pair_rc{ splitter2, splitter1 }; + + auto& mfm_kd = map_fallback_minimizers[kmer]; + auto& to_add = is_dir_oriented ? sp_pair_dir : sp_pair_rc; + + if (count(mfm_kd.begin(), mfm_kd.end(), to_add) == 0) + mfm_kd.emplace_back(to_add); +} + // ******************************************************************************************* bool CAGCCompressor::determine_splitters(const string& reference_file_name, const size_t segment_size, const uint32_t no_threads) { @@ -445,6 +501,8 @@ bool CAGCCompressor::determine_splitters(const string& reference_file_name, cons auto v_begin = v_candidate_kmers.begin() + v_candidate_kmers_offset; auto v_end = v_candidate_kmers.end(); + add_fallback_kmers(v_begin, v_end); + gio.Close(); if (!gio.Open(reference_file_name, false)) @@ -458,6 +516,8 @@ bool CAGCCompressor::determine_splitters(const string& reference_file_name, cons pq_contigs_raw = make_unique>(1, 4ull << 30); vv_splitters.resize(no_threads); + vv_fallback_minimizers.resize(no_threads); + start_splitter_finding_threads(v_threads, no_threads, v_begin, v_end, vv_splitters); while (gio.ReadContigRaw(id, contig)) @@ -564,6 +624,8 @@ bool CAGCCompressor::count_kmers(vector>>& v_contig remove_non_singletons(v_candidate_kmers, v_duplicated_kmers, v_candidate_kmers_offset); + add_fallback_kmers(v_candidate_kmers.begin() + v_candidate_kmers_offset, 
v_candidate_kmers.end()); + if (verbosity > 1 && is_app_mode) cerr << "No. of singletons: " << v_candidate_kmers.size() - v_candidate_kmers_offset << endl; @@ -697,13 +759,18 @@ void CAGCCompressor::start_kmer_collecting_threads(vector &v_threads, co } // ******************************************************************************************* -void CAGCCompressor::find_splitters_in_contig(contig_t& ctg, const vector::iterator v_begin, const vector::iterator v_end, vector& v_splitters) +void CAGCCompressor::find_splitters_in_contig(contig_t& ctg, const vector::iterator v_begin, const vector::iterator v_end, vector& v_splitters, vector> &v_fallbacks) { // Initialization to large value to add 1st candidate k-mer uint64_t current_len = segment_size; vector v_recent_kmers; CKmer kmer(kmer_length, kmer_mode_t::canonical); + uint64_t prev_splitter = ~0ull; + vector> fallback_kmers_in_segment; + + MurMur64Hash mmh; + kmer.Reset(); for (auto x : ctg) @@ -718,6 +785,9 @@ void CAGCCompressor::find_splitters_in_contig(contig_t& ctg, const vector= segment_size) { uint64_t d = kmer.data(); @@ -725,6 +795,13 @@ void CAGCCompressor::find_splitters_in_contig(contig_t& ctg, const vector{prev_splitter, d, x.first, (uint64_t)x.second}); + + fallback_kmers_in_segment.clear(); + prev_splitter = d; + current_len = 0; kmer.Reset(); v_recent_kmers.clear(); @@ -741,6 +818,8 @@ void CAGCCompressor::find_splitters_in_contig(contig_t& ctg, const vector{prev_splitter, * p, x.first, x.second}); break; } } @@ -764,6 +843,7 @@ void CAGCCompressor::build_candidate_kmers_from_archive(const uint32_t n_t) count_kmers(v_contig_data, n_t); vv_splitters.resize(n_t); + vv_fallback_minimizers.resize(n_t); } // ******************************************************************************************* @@ -778,18 +858,21 @@ void CAGCCompressor::start_splitter_finding_threads(vector& v_threads, c uint32_t thread_id = i; - while (!pq_contigs_raw->IsCompleted()) + while (true) { contig_t task; - if 
(!pq_contigs_raw->PopLarge(task)) + auto q_res = pq_contigs_raw->PopLarge(task); + + if (q_res == CBoundedPQueue::result_t::empty) continue; + else if (q_res == CBoundedPQueue::result_t::completed) + break; preprocess_raw_contig(task); - find_splitters_in_contig(task, v_begin, v_end, v_splitters[thread_id]); + find_splitters_in_contig(task, v_begin, v_end, v_splitters[thread_id], vv_fallback_minimizers[thread_id]); } - }); } @@ -871,13 +954,11 @@ void CAGCCompressor::preprocess_raw_contig(contig_t& ctg) void CAGCCompressor::register_segments(uint32_t n_t) { buffered_seg_part.sort_known(n_t); + uint32_t no_new = buffered_seg_part.process_new(); for (uint32_t i = 0; i < no_new; ++i) - { - out_archive->RegisterStream(ss_ref_name(archive_version, no_segments + i)); - out_archive->RegisterStream(ss_delta_name(archive_version, no_segments + i)); - } + out_archive->RegisterStreams(ss_ref_name(archive_version, no_segments + i), ss_delta_name(archive_version, no_segments + i)); no_segments += no_new; @@ -900,6 +981,12 @@ void CAGCCompressor::store_segments(ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx) uint64_t kmer1; uint64_t kmer2; + const size_t max_buff_size = 32; + + vector buffered_coll_insertions; + + int no_parts = buffered_seg_part.get_no_parts(); + while (true) { int block_group_id = buffered_seg_part.get_vec_id(); @@ -908,13 +995,11 @@ void CAGCCompressor::store_segments(ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx) if (block_group_id < 0) break; - for(int group_id = block_group_id; group_id > block_group_id - 10; --group_id) - if(!buffered_seg_part.is_empty_part(group_id)) + for (int group_id = block_group_id; group_id > block_group_id - CBufferedSegPart::part_id_step; --group_id) + { + if (!buffered_seg_part.is_empty_part(group_id)) while (buffered_seg_part.get_part(group_id, kmer1, kmer2, sample_name, contig_name, seg_data, is_rev_comp, seg_part_no)) { -/* if (contig_name == "cluster3_contig_100") - cout << "!";*/ - if (v_segments[group_id] == nullptr) { 
v_segments[group_id] = make_shared(ss_base(archive_version, group_id), nullptr, out_archive, pack_cardinality, min_match_len, concatenated_genomes, archive_version); @@ -949,9 +1034,58 @@ void CAGCCompressor::store_segments(ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx) else in_group_id = v_segments[group_id]->add(seg_data, zstd_cctx, zstd_dctx); - collection_desc->add_segment_placed(sample_name, contig_name, seg_part_no, group_id, in_group_id, is_rev_comp, (uint32_t)seg_data.size()); + // collection_desc->add_segment_placed(sample_name, contig_name, seg_part_no, group_id, in_group_id, is_rev_comp, (uint32_t)seg_data.size()); + if (buffered_coll_insertions.size() == max_buff_size) + { + collection_desc->add_segments_placed(buffered_coll_insertions); + buffered_coll_insertions.clear(); + } + + buffered_coll_insertions.emplace_back(sample_name, contig_name, seg_part_no, group_id, in_group_id, is_rev_comp, (uint32_t)seg_data.size()); } + } } + + collection_desc->add_segments_placed(buffered_coll_insertions); +} + +// ******************************************************************************************* +void CAGCCompressor::prepare_compressing_stuctures(const uint32_t n_t) +{ + v_cctx.clear(); + v_cctx.reserve(n_t); + v_dctx.clear(); + v_dctx.reserve(n_t); + + for (uint32_t i = 0; i < n_t; ++i) + { + v_cctx.emplace_back(ZSTD_createCCtx()); + v_dctx.emplace_back(ZSTD_createDCtx()); + } +} + +// ******************************************************************************************* +void CAGCCompressor::release_compressing_stuctures() +{ + for (auto x : v_cctx) + ZSTD_freeCCtx(x); + for (auto x : v_dctx) + ZSTD_freeDCtx(x); + + v_cctx.clear(); + v_dctx.clear(); +} + +// ******************************************************************************************* +void CAGCCompressor::compressing_stage1_job(uint32_t thread_id, uint32_t n_t) +{ + +} + +// ******************************************************************************************* +void 
CAGCCompressor::compressing_stage2_job(uint32_t thread_id, uint32_t n_t) +{ + } // ******************************************************************************************* @@ -965,15 +1099,17 @@ void CAGCCompressor::start_compressing_threads(vector& v_threads, my_bar v_threads.emplace_back([&, i, n_t]() { auto zstd_cctx = ZSTD_createCCtx(); auto zstd_dctx = ZSTD_createDCtx(); - uint32_t thread_id = i; - while (!pq_contigs_desc_working->IsCompleted()) + while(true) { task_t task; - if (!pq_contigs_desc_working->PopLarge(task)) + auto q_res = pq_contigs_desc_working->PopLarge(task); + if (q_res == CBoundedPQueue::result_t::empty) continue; + else if (q_res == CBoundedPQueue::result_t::completed) + break; if (get<0>(task) == contig_processing_stage_t::registration) { @@ -981,6 +1117,16 @@ void CAGCCompressor::start_compressing_threads(vector& v_threads, my_bar bar.arrive_and_wait(); if (thread_id == 0) register_segments(n_t); + +// if((n_t == 1 && thread_id == 0) || (thread_id == 1)) + if(thread_id == 0) + for (auto& v_fallback_minimizers : vv_fallback_minimizers) + { + for (auto& x : v_fallback_minimizers) + add_fallback_mapping(x[0], x[1], x[2], (bool)x[3]); + v_fallback_minimizers.clear(); + } + bar.arrive_and_wait(); store_segments(zstd_cctx, zstd_dctx); @@ -1041,11 +1187,11 @@ void CAGCCompressor::start_compressing_threads(vector& v_threads, my_bar if (get<0>(task) == contig_processing_stage_t::new_splitters) { bar.arrive_and_wait(); - if (thread_id == 0) - { + + auto bloom_insert = [&] { // Add new splitters - for(auto &v : vv_splitters) - { + for (auto& v : vv_splitters) + { for (auto& x : v) { hs_splitters.insert_fast(x); @@ -1060,20 +1206,30 @@ void CAGCCompressor::start_compressing_threads(vector& v_threads, my_bar bloom_splitters.resize((uint64_t)(hs_splitters.size() / 0.25)); bloom_splitters.insert(hs_splitters.begin(), hs_splitters.end()); } + }; + + if (thread_id == 0) + { + if (n_t == 1) + bloom_insert(); for (auto& x : v_raw_contigs) { auto cost = 
get<2>(x).size(); - pq_contigs_desc_aux->Emplace(make_tuple(contig_processing_stage_t::hard_contigs, get<0>(x), get<1>(x), move(get<2>(x))), 1, cost); + // No other thread operates at the moment + pq_contigs_desc_aux->EmplaceNoLock(make_tuple(contig_processing_stage_t::hard_contigs, get<0>(x), get<1>(x), move(get<2>(x))), 1, cost); } v_raw_contigs.clear(); - for(uint32_t i = 0; i < n_t; ++i) - pq_contigs_desc_aux->Emplace(make_tuple(contig_processing_stage_t::registration, "", "", contig_t()), 0, 0); + pq_contigs_desc_aux->EmplaceManyNoCost(make_tuple(contig_processing_stage_t::registration, "", "", contig_t()), 0, n_t); pq_contigs_desc_working = pq_contigs_desc_aux; } + else if (thread_id == 1) + { + bloom_insert(); + } bar.arrive_and_wait(); @@ -1087,18 +1243,22 @@ void CAGCCompressor::start_compressing_threads(vector& v_threads, my_bar size_t ctg_size = get<3>(task).size(); - if (compress_contig(get<0>(task), get<1>(task), get<2>(task), get<3>(task), zstd_cctx, zstd_dctx, thread_id)) + if (compress_contig(get<0>(task), get<1>(task), get<2>(task), get<3>(task), zstd_cctx, zstd_dctx, thread_id, bar)) { - processed_bases += ctg_size; + auto old_pb = processed_bases.fetch_add(ctg_size); + auto new_pb = old_pb + ctg_size; - if (verbosity > 0 && is_app_mode) + if (verbosity > 0 && is_app_mode && old_pb / 10'000'000 != new_pb / 10'000'000) + { cerr << "Compressed: " + to_string(processed_bases / 1'000'000) + " Mb\r"; - fflush(stdout); + fflush(stdout); + } } else { lock_guard lck(mtx_raw_contigs); v_raw_contigs.emplace_back(get<1>(task), get<2>(task), move(get<3>(task))); +// v_raw_contigs.emplace_back(get<1>(task), get<2>(task), get<3>(task)); } get<3>(task).clear(); @@ -1113,7 +1273,7 @@ void CAGCCompressor::start_compressing_threads(vector& v_threads, my_bar // ******************************************************************************************* pair_segment_desc_t CAGCCompressor::add_segment(const string& sample_name, const string& contig_name, uint32_t 
seg_part_no, - contig_t segment, CKmer kmer_front, CKmer kmer_back, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx) + contig_t &&segment, CKmer kmer_front, CKmer kmer_back, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx, uint32_t thread_id, my_barrier& bar) { pair pk, pk2(~0ull, ~0ull); contig_t segment_rc; @@ -1127,7 +1287,17 @@ pair_segment_desc_t CAGCCompressor::add_segment(const string& sample_name, const { // No terminal splitters present - pk = make_pair(~0ull, ~0ull); + if (fallback_filter) // Try fallback minimizers procedure + { +// tie(pk, store_rc) = find_cand_segment_using_fallback_minimizers(segment, 2); + tie(pk, store_rc) = find_cand_segment_using_fallback_minimizers(segment, 1); +// tie(pk, store_rc) = find_cand_segment_using_fallback_minimizers(segment, (uint64_t) (segment.size() * fallback_frac * 0.2)); + + if(pk != pk_empty && store_rc) + reverse_complement_copy(segment, segment_rc); + } + else + pk = pk_empty; } else if (kmer_front.is_full() && kmer_back.is_full()) { @@ -1147,7 +1317,22 @@ pair_segment_desc_t CAGCCompressor::add_segment(const string& sample_name, const CKmer kmer = kmer_front; reverse_complement_copy(segment, segment_rc); - tie(pk, store_rc) = find_cand_segment_with_one_splitter(kmer, segment, segment_rc, zstd_dctx); + tie(pk, store_rc) = find_cand_segment_with_one_splitter(kmer, segment, segment_rc, zstd_dctx, bar); + + if (pk.first == ~0ull || pk.second == ~0ull) + { + auto pk_alt = pk; + bool store_rc_alt = false; + + tie(pk_alt, store_rc_alt) = find_cand_segment_using_fallback_minimizers(segment, 5); +// tie(pk_alt, store_rc_alt) = find_cand_segment_using_fallback_minimizers(segment, (uint64_t)(segment.size() * fallback_frac * 0.1)); + + if (pk_alt != pk_empty) + { + pk = pk_alt; + store_rc = store_rc_alt; + } + } } else if (kmer_back.is_full()) { @@ -1156,8 +1341,23 @@ pair_segment_desc_t CAGCCompressor::add_segment(const string& sample_name, const reverse_complement_copy(segment, segment_rc); bool store_dir; - tie(pk, store_dir) = 
find_cand_segment_with_one_splitter(kmer, segment_rc, segment, zstd_dctx); + tie(pk, store_dir) = find_cand_segment_with_one_splitter(kmer, segment_rc, segment, zstd_dctx, bar); store_rc = !store_dir; + + if (pk.first == ~0ull || pk.second == ~0ull) + { + auto pk_alt = pk; + bool store_dir_alt = false; + + tie(pk_alt, store_dir_alt) = find_cand_segment_using_fallback_minimizers(segment_rc, 5); +// tie(pk_alt, store_dir_alt) = find_cand_segment_using_fallback_minimizers(segment_rc, (uint64_t)(segment.size() * fallback_frac * 0.1)); + + if (pk_alt != pk_empty) + { + pk = pk_alt; + store_rc = !store_dir_alt; + } + } } auto p = map_segments.find(pk); @@ -1190,7 +1390,7 @@ pair_segment_desc_t CAGCCompressor::add_segment(const string& sample_name, const kmer2.swap_dir_rc(); } - auto split_match = find_cand_segment_with_missing_middle_splitter(kmer1, kmer2, use_rc ? segment_rc : segment, use_rc ? segment : segment_rc, zstd_dctx); + auto split_match = find_cand_segment_with_missing_middle_splitter(kmer1, kmer2, use_rc ? segment_rc : segment, use_rc ? 
segment : segment_rc, zstd_dctx, bar); if (split_match.first != ~0ull) { @@ -1257,7 +1457,26 @@ pair_segment_desc_t CAGCCompressor::add_segment(const string& sample_name, const p = map_segments.find(pk); } - + + if (p == map_segments.end() && fallback_filter) // Try fallback minimizers procedure + { + pair pk_fb; + bool store_rc_fb; + + tie(pk_fb, store_rc_fb) = find_cand_segment_using_fallback_minimizers(segment, 2); +// tie(pk_fb, store_rc_fb) = find_cand_segment_using_fallback_minimizers(segment, (uint64_t)(segment.size() * fallback_frac * 0.05)); + + if (pk_fb != pk_empty) + { + pk = pk_fb; + store_rc = store_rc_fb; + p = map_segments.find(pk); + + if (store_rc) + reverse_complement_copy(segment, segment_rc); + } + } + uint32_t segment_size = (uint32_t) segment.size(); uint32_t segment2_size = (uint32_t) segment2.size(); @@ -1280,15 +1499,18 @@ pair_segment_desc_t CAGCCompressor::add_segment(const string& sample_name, const } // ******************************************************************************************* -pair CAGCCompressor::find_cand_segment_with_missing_middle_splitter(CKmer kmer_front, CKmer kmer_back, contig_t& segment_dir, contig_t& segment_rc, ZSTD_DCtx* zstd_dctx) +pair CAGCCompressor::find_cand_segment_with_missing_middle_splitter(CKmer kmer_front, CKmer kmer_back, contig_t& segment_dir, contig_t& segment_rc, ZSTD_DCtx* zstd_dctx, my_barrier& bar) { - vector shared_splitters; - - shared_splitters.resize(map_segments_terminators[kmer_front.data()].size()); - auto p_front = map_segments_terminators.find(kmer_front.data()); auto p_back = map_segments_terminators.find(kmer_back.data()); + if (p_front == map_segments_terminators.end() || p_back == map_segments_terminators.end()) + return make_pair(~0ull, 0); + + vector shared_splitters; + + shared_splitters.resize(min(p_front->second.size(), p_back->second.size())); + auto p_shared = set_intersection( p_front->second.begin(), p_front->second.end(), p_back->second.begin(), 
p_back->second.end(), @@ -1304,33 +1526,79 @@ pair CAGCCompressor::find_cand_segment_with_missing_middle_s vector v_costs1, v_costs2; +// v_costs1.reserve(segment_dir.size()); +// v_costs2.reserve(segment_dir.size()); + auto segment_id1 = map_segments[minmax(kmer_front.data(), middle)]; auto segment_id2 = map_segments[minmax(middle, kmer_back.data())]; - auto seg1 = v_segments[segment_id1]; auto seg2 = v_segments[segment_id2]; - if (kmer_front.data() < middle) - seg1->get_coding_cost(segment_dir, v_costs1, true, zstd_dctx); - else - { - seg1->get_coding_cost(segment_rc, v_costs1, false, zstd_dctx); - reverse(v_costs1.begin(), v_costs1.end()); - } + auto seg1_run = [&] { + if (kmer_front.data() < middle) + seg1->get_coding_cost(segment_dir, v_costs1, true, zstd_dctx); + else + { + seg1->get_coding_cost(segment_rc, v_costs1, false, zstd_dctx); + reverse(v_costs1.begin(), v_costs1.end()); + } + + partial_sum(v_costs1.begin(), v_costs1.end(), v_costs1.begin()); + }; + + auto seg2_run = [&](ZSTD_DCtx *loc_zstd_dctx) { +/* if (middle < kmer_back.data()) + { + seg2->get_coding_cost(segment_dir, v_costs2, false, loc_zstd_dctx); + reverse(v_costs2.begin(), v_costs2.end()); + } + else + seg2->get_coding_cost(segment_rc, v_costs2, true, loc_zstd_dctx); + + partial_sum(v_costs2.begin(), v_costs2.end(), v_costs2.begin()); + + reverse(v_costs2.begin(), v_costs2.end());*/ + + if (middle < kmer_back.data()) + { + seg2->get_coding_cost(segment_dir, v_costs2, false, nullptr); + partial_sum(v_costs2.rbegin(), v_costs2.rend(), v_costs2.rbegin()); + } + else + { + seg2->get_coding_cost(segment_rc, v_costs2, true, nullptr); + partial_sum(v_costs2.begin(), v_costs2.end(), v_costs2.begin()); + reverse(v_costs2.begin(), v_costs2.end()); + } + + }; - if (middle < kmer_back.data()) +#ifndef USE_INCREMENTING_BARRIERS + seg1_run(); + seg2_run(zstd_dctx); +#else + bool run_seg2_in_separate_thread = true; + + if (segment_id1 == segment_id2) + run_seg2_in_separate_thread = false; + else if 
(!bar.try_increment()) + run_seg2_in_separate_thread = false; + + if(run_seg2_in_separate_thread) { - seg2->get_coding_cost(segment_dir, v_costs2, false, zstd_dctx); - reverse(v_costs2.begin(), v_costs2.end()); + future fut = async([&] {seg2_run(nullptr); }); + seg1_run(); + + fut.wait(); + bar.decrement(); } else - seg2->get_coding_cost(segment_rc, v_costs2, true, zstd_dctx); - - partial_sum(v_costs1.begin(), v_costs1.end(), v_costs1.begin()); - partial_sum(v_costs2.begin(), v_costs2.end(), v_costs2.begin()); - - reverse(v_costs2.begin(), v_costs2.end()); + { + seg1_run(); + seg2_run(zstd_dctx); + } +#endif uint32_t best_sum = ~0u; uint32_t best_pos = 0; @@ -1354,14 +1622,14 @@ pair CAGCCompressor::find_cand_segment_with_missing_middle_s } // ******************************************************************************************* -pair, bool> CAGCCompressor::find_cand_segment_with_one_splitter(CKmer kmer, contig_t& segment_dir, contig_t& segment_rc, ZSTD_DCtx* zstd_dctx) +pair, bool> CAGCCompressor::find_cand_segment_with_one_splitter(CKmer kmer, contig_t& segment_dir, contig_t& segment_rc, ZSTD_DCtx* zstd_dctx, my_barrier& bar) { const pair empty_pk(~0ull, ~0ull); pair best_pk(~0ull, ~0ull); uint64_t best_estim_size = segment_dir.size() < 16 ? 
segment_dir.size() : segment_dir.size() - 16u; bool is_best_rc = false; - vector>> v_candidates; + vector, std::array>> v_candidates; // filled with array to avoid false sharing auto p = map_segments_terminators.find(kmer.data()); if (p == map_segments_terminators.end()) @@ -1404,7 +1672,7 @@ pair, bool> CAGCCompressor::find_cand_segment_with_one_ get<3>(ck) = v_segments[map_segments[cand_pk]]; } - int64_t segment_size = (int64_t) segment_dir.size(); + int64_t segment_size = (int64_t)segment_dir.size(); stable_sort(v_candidates.begin(), v_candidates.end(), [segment_size](const auto& x, const auto& y) { int64_t x_size = get<3>(x)->get_ref_size(); int64_t y_size = get<3>(y)->get_ref_size(); @@ -1416,12 +1684,23 @@ pair, bool> CAGCCompressor::find_cand_segment_with_one_ return x_size < y_size; }); + { + set> test_set; + + for (auto& cand : v_candidates) + test_set.emplace(get<3>(cand)); + + if (test_set.size() != v_candidates.size()) + cout << "!!!\n"; + } + +#ifndef USE_INCREMENTING_BARRIERS for (auto& candidate : v_candidates) { - auto estim_size = get<3>(candidate)->estimate(get<2>(candidate) ? segment_rc : segment_dir, (uint32_t) best_estim_size, zstd_dctx); + auto estim_size = get<3>(candidate)->estimate(get<2>(candidate) ? 
segment_rc : segment_dir, (uint32_t)best_estim_size, zstd_dctx); auto cand_pk = make_pair(get<0>(candidate), get<1>(candidate)); - if (estim_size < best_estim_size || + if (estim_size < best_estim_size || (estim_size == best_estim_size && cand_pk < best_pk) || (estim_size == best_estim_size && cand_pk == best_pk && !get<2>(candidate))) { @@ -1430,6 +1709,84 @@ pair, bool> CAGCCompressor::find_cand_segment_with_one_ is_best_rc = get<2>(candidate); } } +#else + + int no_extra_threads = 0; + +// if (v_candidates.size() > 1) +// no_extra_threads = bar.try_increment_max((int32_t)v_candidates.size() - 1); + if (v_candidates.size() > 2) + no_extra_threads = bar.try_increment_max((int32_t)(v_candidates.size() - 1) / 2); + + vector v_estim_size(v_candidates.size(), best_estim_size); + vector> v_cand_pk(v_candidates.size(), best_pk); + + if (no_extra_threads == 0) + { + for (size_t i = 0; i < v_candidates.size(); ++i) + { + v_estim_size[i] = get<3>(v_candidates[i])->estimate(get<2>(v_candidates[i]) ? segment_rc : segment_dir, (uint32_t)best_estim_size, zstd_dctx); + + if (v_estim_size[i] < best_estim_size) + best_estim_size = v_estim_size[i]; + } + } + else + { + atomic cand_idx = 0; + atomic a_best_estim_size = best_estim_size; + + auto v_candidates_begin = v_candidates.begin(); + auto n_candidates = v_candidates.size(); + auto v_estim_size_begin = v_estim_size.begin(); + + auto job = [v_candidates_begin, v_estim_size_begin, &cand_idx, n_candidates, &a_best_estim_size, &segment_rc, &segment_dir, &bar] { + while (true) + { + auto j = cand_idx.fetch_add(1); + if (j >= n_candidates) + break; + + auto p_candidates = v_candidates_begin + j; + auto p_estim_size = v_estim_size_begin + j; + auto cur_estim_size = get<3>(*p_candidates)->estimate(get<2>(*p_candidates) ? 
segment_rc : segment_dir, (uint32_t)a_best_estim_size.load(), nullptr); + + *p_estim_size = cur_estim_size; + + uint64_t cur_best_estim_size = a_best_estim_size.load(); + + while(cur_best_estim_size > cur_estim_size) + if (a_best_estim_size.compare_exchange_strong(cur_best_estim_size, cur_estim_size)) // Can fail if some other thread also updates at the same time + break; + }; + }; + + vector> v_fut; + + for (size_t i = 0; i < no_extra_threads; ++i) + v_fut.emplace_back(async(job)); + job(); + + for (auto& f : v_fut) + f.wait(); + bar.decrement(no_extra_threads); + } + + for (size_t i = 0; i < v_candidates.size(); ++i) + { + auto& candidate = v_candidates[i]; + auto cand_pk = make_pair(get<0>(candidate), get<1>(candidate)); + + if (v_estim_size[i] < best_estim_size || + (v_estim_size[i] == best_estim_size && cand_pk < best_pk) || + (v_estim_size[i] == best_estim_size && cand_pk == best_pk && !get<2>(candidate))) + { + best_estim_size = v_estim_size[i]; + best_pk = cand_pk; + is_best_rc = get<2>(candidate); + } + } +#endif if (best_pk == empty_pk) { @@ -1445,9 +1802,195 @@ pair, bool> CAGCCompressor::find_cand_segment_with_one_ return make_pair(best_pk, is_best_rc); } +// #define DEBUG_CANDIDATES +// ******************************************************************************************* +pair, bool> CAGCCompressor::find_cand_segment_using_fallback_minimizers(contig_t& segment, uint64_t max_val) +{ + const size_t max_num_to_estimate = 10; + const bool short_segments = segment_size <= 10000; + + lock_guard lck(mtx_fallback_map); + + if (!zstd_dctx_for_fallback) + zstd_dctx_for_fallback = ZSTD_createDCtx(); + + CKmer kmer(kmer_length, kmer_mode_t::canonical); + + map, vector> cand_seg_counts; + + kmer.Reset(); + + for (auto x : segment) + { + if (x > 3) + kmer.Reset(); + else + { + kmer.insert(x); + + if (kmer.is_full() && fallback_filter(kmer.data())) + { + auto p = map_fallback_minimizers.find(kmer.data()); + + if (p != map_fallback_minimizers.end()) + { + for 
(auto y : p->second) + { + if (y.first != ~0ull && y.second != ~0ull) // !!! TODO: consider to relax + { + if (!kmer.is_dir_oriented()) + swap(y.first, y.second); + cand_seg_counts[y].emplace_back(kmer.data()); + } + } + } + } + } + } + + vector>> pruned_cand_seg_counts; + + for (auto& x : cand_seg_counts) + { + std::sort(x.second.begin(), x.second.end()); + size_t x_size = unique(x.second.begin(), x.second.end()) - x.second.begin(); + if (x_size >= max_val) + pruned_cand_seg_counts.emplace_back((uint64_t)x_size, x.first); + } + + if (pruned_cand_seg_counts.empty()) + return make_pair(pk_empty, false); + + if (pruned_cand_seg_counts.size() <= max_num_to_estimate) + std::sort(pruned_cand_seg_counts.begin(), pruned_cand_seg_counts.end(), greater>>()); + else + { + std::partial_sort(pruned_cand_seg_counts.begin(), pruned_cand_seg_counts.begin() + max_num_to_estimate, pruned_cand_seg_counts.end(), greater>>()); + pruned_cand_seg_counts.resize(max_num_to_estimate); + } + + // Avoid trying poor candidates + while (pruned_cand_seg_counts.back().first * 2 < pruned_cand_seg_counts.front().first) + pruned_cand_seg_counts.pop_back(); + + contig_t segment_rc; + reverse_complement_copy(segment, segment_rc); + +#ifdef DEBUG_CANDIDATES + stringstream ss; + ss << "**** Segment_size: " << segment.size() << " cand_seg_counts: " << cand_seg_counts.size() << "\n"; + + vector> cand_evaluation; + cand_evaluation.reserve(pruned_cand_seg_counts.size()); +#endif + + pair best_pair = pk_empty; + uint64_t best_es = segment.size(); + uint64_t best_es_no_items = 0; + uint64_t best_no_items = pruned_cand_seg_counts.front().first; + + for (auto& x : pruned_cand_seg_counts) + { + decltype(map_segments)::iterator p; + bool is_seg_rc = x.second.first > x.second.second; + + if(!is_seg_rc) + p = map_segments.find(x.second); + else + p = map_segments.find(make_pair(x.second.second, x.second.first)); + + uint64_t es = 0; + int32_t seg_id = -1; + + if (p != map_segments.end()) // Can fail if the 
mappings are to a segment from the same sample - it's ok + { + if (short_segments) // Fast decision based on no. of shared k-mers if the segments are short + { + best_pair = x.second; + best_es = 0; + break; + } + + es = v_segments[p->second]->estimate(is_seg_rc ? segment_rc : segment, best_es, zstd_dctx_for_fallback); + seg_id = p->second; + } + +#ifdef DEBUG_CANDIDATES + cand_evaluation.emplace_back(seg_id, is_seg_rc, es); +#endif + + if (es && es < best_es) + { + best_es = es; + best_pair = x.second; + best_es_no_items = x.first; + } + } + +#ifdef DEBUG_CANDIDATES + ss << "\tBest es: " << best_es << "\t\tBest es. no. items: " << best_es_no_items << "\t\tBest no. items: " << pruned_cand_seg_counts.front().first << endl; + + for (size_t i = 0; i < cand_evaluation.size(); ++i) + ss << "\tSegment id: " << get<0>(cand_evaluation[i]) << " (is rc.: " << get<1>(cand_evaluation[i]) << ") No items : " << get<0>(pruned_cand_seg_counts[i]) << "\tes : " << get<2>(cand_evaluation[i]) << endl; + + cout << ss.str(); +#endif + + // Decide if it is worth to use the found segment. 
It could be better to leave it to be a new reference (in adaptive mode) + if (adaptive_compression) + { + if (short_segments) + { + if(best_es >= segment.size() * 0.9) + return make_pair(pk_empty, false); + } + else + { + if (best_es >= segment.size() * 0.2) + return make_pair(pk_empty, false); + } + } + + if(best_pair.first <= best_pair.second) + return make_pair(best_pair, false); + else + return make_pair(make_pair(best_pair.second, best_pair.first), true); +} + +// ******************************************************************************************* +void CAGCCompressor::add_fallback_mapping(uint64_t splitter1, uint64_t splitter2, const contig_t& segment) +{ + CKmer kmer(kmer_length, kmer_mode_t::canonical); + MurMur64Hash mmh; + + auto splitter_dir = make_pair(splitter1, splitter2); + auto splitter_rev = make_pair(splitter2, splitter1); + + kmer.Reset(); + + for (auto x : segment) + { + if (x > 3) + kmer.Reset(); + else + { + kmer.insert(x); + + if (kmer.is_full() && fallback_filter(kmer.data())) + { + auto& mfm_kd = map_fallback_minimizers[kmer.data()]; + auto to_add = kmer.is_dir_oriented() ? 
splitter_dir : splitter_rev; + + if (count(mfm_kd.begin(), mfm_kd.end(), to_add) == 0) + mfm_kd.emplace_back(to_add); + } + } + } +} + // ******************************************************************************************* bool CAGCCompressor::compress_contig(contig_processing_stage_t contig_processing_stage, string sample_name, string id, contig_t& contig, - ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx, uint32_t thread_id) + ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx, uint32_t thread_id, my_barrier& bar) { CKmer kmer(kmer_length, kmer_mode_t::canonical); @@ -1470,7 +2013,7 @@ bool CAGCCompressor::compress_contig(contig_processing_stage_t contig_processing if (bloom_splitters.check(d) && hs_splitters.check(d)) { auto seg_id = add_segment(sample_name, id, seg_part_no, - get_part(contig, split_pos, pos + 1 - split_pos), split_kmer, kmer, zstd_cctx, zstd_dctx); + move(get_part(contig, split_pos, pos + 1 - split_pos)), split_kmer, kmer, zstd_cctx, zstd_dctx, thread_id, bar); ++seg_part_no; @@ -1497,7 +2040,7 @@ bool CAGCCompressor::compress_contig(contig_processing_stage_t contig_processing if (split_pos < contig.size()) add_segment(sample_name, id, seg_part_no, - get_part(contig, split_pos, contig.size() - split_pos), split_kmer, CKmer(kmer_length, kmer_mode_t::canonical), zstd_cctx, zstd_dctx); + move(get_part(contig, split_pos, contig.size() - split_pos)), split_kmer, CKmer(kmer_length, kmer_mode_t::canonical), zstd_cctx, zstd_dctx, thread_id, bar); return true; } @@ -1528,7 +2071,9 @@ void CAGCCompressor::find_new_splitters(contig_t& ctg, uint32_t thread_id) v_contig_kmers.erase(p_end, v_contig_kmers.end()); - find_splitters_in_contig(ctg, v_contig_kmers.begin(), v_contig_kmers.end(), vv_splitters[thread_id]); + add_fallback_kmers(v_contig_kmers.begin(), v_contig_kmers.end()); + + find_splitters_in_contig(ctg, v_contig_kmers.begin(), v_contig_kmers.end(), vv_splitters[thread_id], vv_fallback_minimizers[thread_id]); } // 
******************************************************************************************* @@ -1640,9 +2185,8 @@ bool CAGCCompressor::AddSampleFiles(vector> _v_sample_file_ if (++cnt_contigs_in_sample >= max_no_contigs_before_synchronization) { // Send synchronization tokens - for (uint32_t i = 0; i < no_workers; ++i) - pq_contigs_desc->Emplace(make_tuple( - adaptive_compression ? contig_processing_stage_t::new_splitters : contig_processing_stage_t::registration, "", "", contig_t()), sample_priority, 0); + pq_contigs_desc->EmplaceManyNoCost(make_tuple( + adaptive_compression ? contig_processing_stage_t::new_splitters : contig_processing_stage_t::registration, "", "", contig_t()), sample_priority, no_workers); cnt_contigs_in_sample = 0; --sample_priority; @@ -1679,10 +2223,9 @@ bool CAGCCompressor::AddSampleFiles(vector> _v_sample_file_ if (!concatenated_genomes && any_contigs_added) { // Send synchronization tokens - for (uint32_t i = 0; i < no_workers; ++i) - pq_contigs_desc->Emplace(make_tuple( - adaptive_compression ? contig_processing_stage_t::new_splitters : contig_processing_stage_t::registration, - "", "", contig_t()), sample_priority, 0); + pq_contigs_desc->EmplaceManyNoCost(make_tuple( + adaptive_compression ? contig_processing_stage_t::new_splitters : contig_processing_stage_t::registration, + "", "", contig_t()), sample_priority, no_workers); --sample_priority; } @@ -1693,9 +2236,8 @@ bool CAGCCompressor::AddSampleFiles(vector> _v_sample_file_ if (concatenated_genomes)// && ++cnt_contigs_in_sample >= max_no_contigs_before_synchronization) { // Send synchronization tokens - for (uint32_t i = 0; i < no_workers; ++i) - pq_contigs_desc->Emplace(make_tuple( - adaptive_compression ? contig_processing_stage_t::new_splitters : contig_processing_stage_t::registration, "", "", contig_t()), sample_priority, 0); + pq_contigs_desc->EmplaceManyNoCost(make_tuple( + adaptive_compression ? 
contig_processing_stage_t::new_splitters : contig_processing_stage_t::registration, "", "", contig_t()), sample_priority, no_workers); cnt_contigs_in_sample = 0; --sample_priority; @@ -1724,7 +2266,7 @@ bool CAGCCompressor::AddSampleFiles(vector> _v_sample_file_ // ******************************************************************************************* bool CAGCCompressor::Create(const string& _file_name, const uint32_t _pack_cardinality, const uint32_t _kmer_length, const string& reference_file_name, const uint32_t _segment_size, - const uint32_t _min_match_len, const bool _concatenated_genomes, const bool _adaptive_compression, const uint32_t _verbosity, const uint32_t no_threads) + const uint32_t _min_match_len, const bool _concatenated_genomes, const bool _adaptive_compression, const uint32_t _verbosity, const uint32_t no_threads, double _fallback_frac) { if (working_mode != working_mode_t::none) return false; @@ -1736,7 +2278,9 @@ bool CAGCCompressor::Create(const string& _file_name, const uint32_t _pack_cardi adaptive_compression = _adaptive_compression; segment_size = _segment_size; verbosity = _verbosity; - + fallback_frac = _fallback_frac; + fallback_filter.reset(fallback_frac); + if (!determine_splitters(reference_file_name, _segment_size, no_threads)) { working_mode = working_mode_t::none; @@ -1779,7 +2323,7 @@ bool CAGCCompressor::Create(const string& _file_name, const uint32_t _pack_cardi // ******************************************************************************************* bool CAGCCompressor::Append(const string& _in_archive_fn, const string& _out_archive_fn, const uint32_t _verbosity, const bool _prefetch_archive, const bool _concatenated_genomes, const bool _adaptive_compression, - const uint32_t no_threads) + const uint32_t no_threads, double _fallback_frac) { if (working_mode != working_mode_t::none) return false; @@ -1789,6 +2333,8 @@ bool CAGCCompressor::Append(const string& _in_archive_fn, const string& _out_arc prefetch_archive = 
_prefetch_archive; concatenated_genomes = _concatenated_genomes; adaptive_compression = _adaptive_compression; + fallback_frac = _fallback_frac; + fallback_filter.reset(fallback_frac); min_match_len = compression_params.min_match_len; uint32_t segment_size = compression_params.segment_size; diff --git a/src/core/agc_compressor.h b/src/core/agc_compressor.h index 1c3148a..5797be8 100644 --- a/src/core/agc_compressor.h +++ b/src/core/agc_compressor.h @@ -7,15 +7,17 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* -#include "../core/agc_basic.h" +#include "../common/agc_basic.h" #include "../core/genome_io.h" #include "../core/hs.h" #include "../core/kmer.h" -#include "../core/utils.h" +#include "../common/utils.h" +#include "../core/utils_adv.h" + #include #include #include @@ -39,11 +41,11 @@ class CBufferedSegPart kmer2(_kmer2), sample_name(_sample_name), contig_name(_contig_name), - seg_data(move(_seg_data)), + seg_data(move(_seg_data)), is_rev_comp(_is_rev_comp), seg_part_no(_seg_part_no) {}; - + seg_part_t() : kmer1(~0ull), kmer2(~0ull), @@ -54,10 +56,58 @@ class CBufferedSegPart seg_part_no{ 0 } {}; - seg_part_t(seg_part_t&&) = default; - seg_part_t& operator=(const seg_part_t&) = default; - seg_part_t& operator=(seg_part_t&&) = default; - seg_part_t(const seg_part_t&) = default; + seg_part_t(const seg_part_t& rhs) + { + kmer1 = rhs.kmer1; + kmer2 = rhs.kmer2; + sample_name = rhs.sample_name; + contig_name = rhs.contig_name; + seg_data = rhs.seg_data; + is_rev_comp = rhs.is_rev_comp; + seg_part_no = rhs.seg_part_no; + } + + seg_part_t(seg_part_t&& rhs) noexcept + { + kmer1 = rhs.kmer1; + kmer2 = rhs.kmer2; + sample_name = move(rhs.sample_name); + contig_name = move(rhs.contig_name); + seg_data = move(rhs.seg_data); + is_rev_comp = rhs.is_rev_comp; + seg_part_no = 
rhs.seg_part_no; + } + + seg_part_t& operator=(const seg_part_t& rhs) { + if (this == &rhs) + return *this; + + kmer1 = rhs.kmer1; + kmer2 = rhs.kmer2; + sample_name = rhs.sample_name; + contig_name = rhs.contig_name; + seg_data = rhs.seg_data; + is_rev_comp = rhs.is_rev_comp; + seg_part_no = rhs.seg_part_no; + + return *this; + } + + seg_part_t& operator=(seg_part_t&& rhs) noexcept + { + if (this == &rhs) + return *this; + + kmer1 = rhs.kmer1; + kmer2 = rhs.kmer2; + sample_name = move(rhs.sample_name); + contig_name = move(rhs.contig_name); + seg_data = move(rhs.seg_data); + is_rev_comp = rhs.is_rev_comp; + seg_part_no = rhs.seg_part_no; + + return *this; + } bool operator<(const struct seg_part_t& x) const { @@ -69,6 +119,7 @@ class CBufferedSegPart } }; +public: // ******************************************************************************************* struct kk_seg_part_t { uint64_t kmer1; @@ -84,11 +135,11 @@ class CBufferedSegPart kmer2(_kmer2), sample_name(_sample_name), contig_name(_contig_name), - seg_data(move(_seg_data)), + seg_data(move(_seg_data)), is_rev_comp(_is_rev_comp), seg_part_no(_seg_part_no) {}; - + kk_seg_part_t() : kmer1{}, kmer2{}, @@ -113,6 +164,7 @@ class CBufferedSegPart } }; +private: // ******************************************************************************************* struct list_seg_part_t { mutex mtx; @@ -126,7 +178,7 @@ class CBufferedSegPart l_seg_part = x.l_seg_part; virt_begin = x.virt_begin; }; - + list_seg_part_t(list_seg_part_t&& x) noexcept { l_seg_part = move(x.l_seg_part); @@ -169,7 +221,7 @@ class CBufferedSegPart l_seg_part.emplace_back(move(seg_part)); } - void emplace(uint64_t kmer1, uint64_t kmer2, const string &sample_name, const string &contig_name, contig_t &seg_data, bool is_rev_comp, uint32_t seg_part_no) + void emplace(uint64_t kmer1, uint64_t kmer2, const string& sample_name, const string& contig_name, contig_t& seg_data, bool is_rev_comp, uint32_t seg_part_no) { lock_guard lck(mtx); 
l_seg_part.emplace_back(kmer1, kmer2, sample_name, contig_name, seg_data, is_rev_comp, seg_part_no); @@ -212,7 +264,7 @@ class CBufferedSegPart return true; } - bool pop(uint64_t &kmer1, uint64_t &kmer2, string &sample_name, string &contig_name, contig_t &seg_data, bool &is_rev_comp, uint32_t &seg_part_no) + bool pop(uint64_t& kmer1, uint64_t& kmer2, string& sample_name, string& contig_name, contig_t& seg_data, bool& is_rev_comp, uint32_t& seg_part_no) { if (virt_begin >= l_seg_part.size()) { @@ -239,7 +291,7 @@ class CBufferedSegPart uint32_t size() { - return (uint32_t) l_seg_part.size(); + return (uint32_t)l_seg_part.size(); } }; @@ -251,6 +303,8 @@ class CBufferedSegPart atomic a_v_part_id; public: + static const int32_t part_id_step = 1; + CBufferedSegPart(uint32_t no_raw_groups) { vl_seg_part.resize(no_raw_groups); @@ -265,45 +319,74 @@ class CBufferedSegPart void add_known(uint32_t group_id, uint64_t kmer1, uint64_t kmer2, const string& sample_name, const string& contig_name, contig_t&& seg_data, bool is_rev_comp, uint32_t seg_part_no) { -// vl_seg_part[group_id].append(seg_part_t(kmer1, kmer2, sample_name, contig_name, seg_data, is_rev_comp, seg_part_no)); // internal mutex + // !!! TODO: use move() here? vl_seg_part[group_id].emplace(kmer1, kmer2, sample_name, contig_name, seg_data, is_rev_comp, seg_part_no); // internal mutex } void add_new(uint64_t kmer1, uint64_t kmer2, const string& sample_name, const string& contig_name, contig_t& seg_data, bool is_rev_comp, uint32_t seg_part_no) { lock_guard lck(mtx); + // !!! TODO: use move() here? 
s_seg_part.emplace(kmer1, kmer2, sample_name, contig_name, seg_data, is_rev_comp, seg_part_no); } void sort_known(uint32_t nt) { lock_guard lck(mtx); + const uint32_t min_parts_in_job = 16; + + uint64_t n_seg_part = vl_seg_part.size(); + + nt = clamp(n_seg_part / min_parts_in_job, 1u, nt); + + auto vl_seg_part_begin = vl_seg_part.begin(); + + atomic seg_part_id = 0; + uint64_t job_step = n_seg_part / (16 * nt); + if (job_step < 16) + job_step = 16; + + auto job = [&seg_part_id, job_step, n_seg_part, vl_seg_part_begin] { + while (true) + { + uint64_t j_from = seg_part_id.fetch_add(job_step); + + if (j_from >= n_seg_part) + break; + + uint64_t j_to = min(j_from + job_step, n_seg_part); + + auto p = vl_seg_part_begin + j_from; + + for (uint64_t j = j_from; j < j_to; ++j, ++p) + p->sort(); + } + }; vector> v_fut; v_fut.reserve(nt); - uint64_t n_seg_part = vl_seg_part.size(); + for (uint64_t i = 0; i < nt - 1; ++i) + v_fut.emplace_back(async(job)); - for (uint64_t i = 0; i < nt; ++i) - v_fut.emplace_back(async([&, i] { - uint64_t j_from = i * n_seg_part / nt; - uint64_t j_to = (i + 1) * n_seg_part / nt; - - for (uint64_t j = j_from; j < j_to; ++j) - vl_seg_part[j].sort(); - })); + job(); for (auto& f : v_fut) f.wait(); } + const set& get_seg_parts() const + { + return s_seg_part; + } + uint32_t process_new() { lock_guard lck(mtx); map, uint32_t> m_kmers; - uint32_t group_id = (uint32_t) vl_seg_part.size(); + uint32_t group_id = (uint32_t)vl_seg_part.size(); // Assign group ids to new segments for (const auto& x : s_seg_part) @@ -314,9 +397,9 @@ class CBufferedSegPart m_kmers[make_pair(x.kmer1, x.kmer2)] = group_id++; } - uint32_t no_new = group_id - (uint32_t) vl_seg_part.size(); + uint32_t no_new = group_id - (uint32_t)vl_seg_part.size(); - if(vl_seg_part.capacity() < group_id) + if (vl_seg_part.capacity() < group_id) vl_seg_part.reserve((uint64_t)(group_id * 1.2)); vl_seg_part.resize(group_id); @@ -351,26 +434,73 @@ class CBufferedSegPart } } + atomic clear_idx{}; + 
const uint64_t job_clear_step = 64; + + void clear_job() + { + uint64_t n_seg_part = vl_seg_part.size(); + auto vl_seg_part_begin = vl_seg_part.begin(); + + while (true) + { + uint64_t loc_idx = clear_idx.fetch_add(job_clear_step); + uint64_t upp_idx = min(loc_idx + job_clear_step, n_seg_part); + + auto p = vl_seg_part_begin + loc_idx; + + for (; loc_idx < upp_idx; ++loc_idx, ++p) + // vl_seg_part[loc_idx].clear(); + p->clear(); + + if (loc_idx >= n_seg_part) + break; + } + } + void clear(uint32_t nt) { lock_guard lck(mtx); + const uint32_t min_parts_in_job = 512; - vector> v_fut; + uint64_t n_seg_part = vl_seg_part.size(); - v_fut.reserve(nt); + nt = clamp(n_seg_part / min_parts_in_job, 1u, nt); - uint64_t n_seg_part = vl_seg_part.size(); + uint64_t job_step = n_seg_part / (64 * nt); + if (job_step < 64) + job_step = 64; + + atomic idx = 0; + + auto vl_seg_part_begin = vl_seg_part.begin(); + + auto job = [&idx, job_step, n_seg_part, vl_seg_part_begin] { + while (true) + { + uint64_t loc_idx = idx.fetch_add(job_step); - for (uint64_t i = 0; i < nt; ++i) - v_fut.emplace_back(async([&, i] { - uint64_t j_from = i * n_seg_part / nt; - uint64_t j_to = (i + 1) * n_seg_part / nt; + if (loc_idx >= n_seg_part) + break; - for (uint64_t j = j_from; j < j_to; ++j) - vl_seg_part[j].clear(); - })); + uint64_t upp_idx = min(loc_idx + job_step, n_seg_part); + + auto p = vl_seg_part_begin + loc_idx; + + for (; loc_idx < upp_idx; ++loc_idx, ++p) + p->clear(); + } + }; + + vector> v_fut; + + v_fut.reserve(nt); + + for (uint64_t i = 0; i < nt - 1; ++i) + v_fut.emplace_back(async(job)); s_seg_part.clear(); + job(); for (auto& f : v_fut) f.wait(); @@ -386,7 +516,12 @@ class CBufferedSegPart int get_vec_id() { // return a_v_part_id.fetch_sub(1); - return a_v_part_id.fetch_sub(10); + return a_v_part_id.fetch_sub(part_id_step); + } + + int get_no_parts() + { + return (int)vl_seg_part.size(); } bool is_empty_part(int group_id) @@ -431,10 +566,45 @@ class CAGCCompressor : public CAGCBasic {} 
}; - using my_barrier = CBarrier; + // ******************************************************************************************* + class kmer_filter_t + { + uint64_t thr = 0ull; + MurMur64Hash mmh; + uint64_t rnd = 0xD73F8BF11046C40Eull; + + public: + kmer_filter_t(double fraction = 0.0) + { + reset(fraction); + } + + void reset(double fraction) + { + if (fraction == 0.0) + thr = 0ull; + else + thr = (uint64_t)(((double)~0ull) * fraction); + } + + explicit operator bool() const + { + return thr != 0ull; + } + + bool operator()(uint64_t x) const + { + return (mmh(x) ^ rnd) < thr; + } + }; + +// using my_barrier = CBarrier; // using my_barrier = CAtomicBarrier; + using my_barrier = CAtomicBarrierWithIncrementing; // using my_barrier = barrier<>; + const pair pk_empty = make_pair(~0ull, ~0ull); + shared_mutex seg_map_mtx; shared_mutex seg_vec_mtx; @@ -459,6 +629,12 @@ class CAGCCompressor : public CAGCBasic unordered_map, MurMur64Hash> map_segments_terminators; // shared_mutex (seg_map_mtx) vector> v_segments; // shared_mutex to vector (seg_vec_mtx) + internal mutexes in stored objects + mutex mtx_fallback_map; + kmer_filter_t fallback_filter; + double fallback_frac = 0.0; + vector>> vv_fallback_minimizers; + unordered_map>> map_fallback_minimizers; // mtx_fallback_map + uint32_t no_segments; atomic id_segment = 0; @@ -487,20 +663,30 @@ class CAGCCompressor : public CAGCBasic unique_ptr> pq_contigs_raw; // internal mutexes unique_ptr> q_contigs_data; // internal mutexes + vector v_cctx; + vector v_dctx; + ZSTD_DCtx* zstd_dctx_for_fallback = nullptr; + bool compress_contig(contig_processing_stage_t contig_processing_stage, string sample_name, string id, contig_t& contig, - ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx, uint32_t thread_id); + ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx, uint32_t thread_id, my_barrier &bar); pair_segment_desc_t add_segment(const string &sample_name, const string &contig_name, uint32_t seg_part_no, - contig_t segment, CKmer kmer_front, 
CKmer kmer_back, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx); + contig_t &&segment, CKmer kmer_front, CKmer kmer_back, ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx, uint32_t thread_id, my_barrier& bar); void register_segments(uint32_t n_t); void store_segments(ZSTD_CCtx* zstd_cctx, ZSTD_DCtx* zstd_dctx); - pair, bool> find_cand_segment_with_one_splitter(CKmer kmer, contig_t& segment_dir, contig_t& segment_rc, ZSTD_DCtx* zstd_dctx); - pair find_cand_segment_with_missing_middle_splitter(CKmer kmer_front, CKmer kmer_back, contig_t& segment_dir, contig_t& segment_rc, ZSTD_DCtx* zstd_dctx); + pair, bool> find_cand_segment_with_one_splitter(CKmer kmer, contig_t& segment_dir, contig_t& segment_rc, ZSTD_DCtx* zstd_dctx, my_barrier& bar); + pair find_cand_segment_with_missing_middle_splitter(CKmer kmer_front, CKmer kmer_back, contig_t& segment_dir, contig_t& segment_rc, ZSTD_DCtx* zstd_dctx, my_barrier& bar); + pair, bool> find_cand_segment_using_fallback_minimizers(contig_t& segment, uint64_t max_val); contig_t get_part(const contig_t& contig, uint64_t pos, uint64_t len); void preprocess_raw_contig(contig_t& ctg); void find_new_splitters(contig_t& ctg, uint32_t thread_id); + void add_fallback_kmers(vector::iterator first, vector::iterator last); + void add_fallback_mapping(uint64_t splitter1, uint64_t splitter2, vector>& cand_fallback_kmers); + void add_fallback_mapping(uint64_t splitter1, uint64_t splitter2, uint64_t kmer, bool is_dir_oriented); + void add_fallback_mapping(uint64_t splitter1, uint64_t splitter2, const contig_t &segment); + // ******************************************************************************************* void append(vector& data, uint32_t num) { @@ -531,6 +717,12 @@ class CAGCCompressor : public CAGCBasic // ******************************************************************************************* bool close_compression(const uint32_t no_threads); + void prepare_compressing_stuctures(const uint32_t n_t); + void release_compressing_stuctures(); 
+ + void compressing_stage1_job(uint32_t thread_id, uint32_t n_t); + void compressing_stage2_job(uint32_t thread_id, uint32_t n_t); + void start_compressing_threads(vector &v_threads, my_barrier &bar, const uint32_t n_t); void start_finalizing_threads(vector& v_threads, const uint32_t n_t); void start_splitter_finding_threads(vector& v_threads, const uint32_t n_t, const vector::iterator v_begin, const vector::iterator v_end, vector>& v_splitters); @@ -549,7 +741,7 @@ class CAGCCompressor : public CAGCBasic void remove_non_singletons(vector& vec, vector& v_duplicated, size_t virtual_begin); void enumerate_kmers(contig_t& ctg, vector &vec); - void find_splitters_in_contig(contig_t& ctg, const vector::iterator v_begin, const vector::iterator v_end, vector& v_splitters); + void find_splitters_in_contig(contig_t& ctg, const vector::iterator v_begin, const vector::iterator v_end, vector& v_splitters, vector>&v_fallbacks); void store_file_type_info(); @@ -560,9 +752,9 @@ class CAGCCompressor : public CAGCBasic ~CAGCCompressor(); bool Create(const string& _file_name, const uint32_t _pack_cardinality, const uint32_t _kmer_length, const string& reference_file_name, const uint32_t _segment_size, - const uint32_t _min_match_len, const bool _concatenated_genomes, const bool _adaptive_compression, const uint32_t _verbosity, const uint32_t _no_threads); + const uint32_t _min_match_len, const bool _concatenated_genomes, const bool _adaptive_compression, const uint32_t _verbosity, const uint32_t _no_threads, double _fallback_frac); bool Append(const string& _in_archive_fn, const string& _out_archive_fn, const uint32_t _verbosity, const bool _prefetch_archive, const bool _concatenated_genomes, const bool _adaptive_compression, - const uint32_t no_threads); + const uint32_t no_threads, double _fallback_frac); void AddCmdLine(const string& cmd_line); diff --git a/src/core/agc_decompressor.cpp b/src/core/agc_decompressor.cpp index df284bb..835c3c6 100644 --- 
a/src/core/agc_decompressor.cpp +++ b/src/core/agc_decompressor.cpp @@ -4,12 +4,12 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* -#include "../core/agc_decompressor.h" -#include "../core/genome_io.h" +#include "agc_decompressor.h" +#include "genome_io.h" #include #include @@ -25,62 +25,6 @@ CAGCDecompressor::~CAGCDecompressor() { } -// ******************************************************************************************* -void CAGCDecompressor::convert_and_split_into_lines(contig_t& ctg, contig_t& working_space, uint32_t line_len) -{ - if (ctg.empty()) - return; - - size_t dest_size = ctg.size() + (ctg.size() + line_len - 1) / line_len; - working_space.resize(dest_size); - - auto p = ctg.data(); - auto q = working_space.data(); - - size_t to_save = ctg.size(); - - for (; to_save > line_len; to_save -= line_len) - { - uint32_t i; - - switch (i = line_len % 8) - { - case 7: *q++ = cnv_num[*p++]; [[fallthrough]]; - case 6: *q++ = cnv_num[*p++]; [[fallthrough]]; - case 5: *q++ = cnv_num[*p++]; [[fallthrough]]; - case 4: *q++ = cnv_num[*p++]; [[fallthrough]]; - case 3: *q++ = cnv_num[*p++]; [[fallthrough]]; - case 2: *q++ = cnv_num[*p++]; [[fallthrough]]; - case 1: *q++ = cnv_num[*p++]; - } - - for (; i < line_len; i += 8) - { - *q++ = cnv_num[*p++]; - *q++ = cnv_num[*p++]; - *q++ = cnv_num[*p++]; - *q++ = cnv_num[*p++]; - *q++ = cnv_num[*p++]; - *q++ = cnv_num[*p++]; - *q++ = cnv_num[*p++]; - *q++ = cnv_num[*p++]; - } - - *q++ = '\n'; - } - - if (to_save) - { - while (to_save--) - *q++ = cnv_num[*p++]; - *q++ = '\n'; - } - - assert(q == working_space.data() + working_space.size()); - - std::swap(ctg, working_space); -} - // ******************************************************************************************* void CAGCDecompressor::gzip_contig(contig_t& ctg, contig_t& 
working_space, refresh::gz_in_memory& gzip_compressor) { @@ -94,10 +38,10 @@ void CAGCDecompressor::gzip_contig(contig_t& ctg, contig_t& working_space, refre } // ******************************************************************************************* -void CAGCDecompressor::start_decompressing_threads(vector& v_threads, const uint32_t n_t, uint32_t gzip_level, uint32_t line_len) +void CAGCDecompressor::start_decompressing_threads(vector& v_threads, const uint32_t n_t, uint32_t gzip_level, uint32_t line_len, bool fast) { for (uint32_t i = 0; i < n_t; ++i) - v_threads.emplace_back([&, i, gzip_level, line_len] { + v_threads.emplace_back([&, i, gzip_level, line_len, fast] { auto zstd_ctx = ZSTD_createDCtx(); @@ -112,13 +56,13 @@ void CAGCDecompressor::start_decompressing_threads(vector& v_threads, co size_t priority = contig_desc.priority; - if (!decompress_contig(contig_desc, zstd_ctx, ctg)) + if (!decompress_contig(contig_desc, zstd_ctx, ctg, fast)) continue; if(line_len == 0) - convert_to_alpha(ctg); + CNumAlphaConverter::convert_to_alpha(ctg); else - convert_and_split_into_lines(ctg, working_space, line_len); + CNumAlphaConverter::convert_and_split_into_lines(ctg, working_space, line_len); if (gzip_level) gzip_contig(ctg, working_space, gzip_compressor); @@ -163,7 +107,7 @@ bool CAGCDecompressor::AssignArchive(const CAGCBasic& agc_basic) } // ******************************************************************************************* -bool CAGCDecompressor::GetCollectionFiles(const string& _path, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, uint32_t verbosity) +bool CAGCDecompressor::GetCollectionFiles(const string& _path, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, bool no_ref, bool fast, uint32_t verbosity) { if (working_mode != working_mode_t::decompression) return false; @@ -179,7 +123,10 @@ bool CAGCDecompressor::GetCollectionFiles(const string& _path, const uint32_t _l return 
false; } - collection_desc->get_samples_list(v_samples); + collection_desc->get_samples_list(v_samples, false); + + if (no_ref && !v_samples.empty()) + v_samples.erase(v_samples.begin()); q_contig_tasks = make_unique>(1, 1); pq_contigs_to_save = make_unique>(no_threads); @@ -194,6 +141,7 @@ bool CAGCDecompressor::GetCollectionFiles(const string& _path, const uint32_t _l uint32_t global_id = 0; string prev_sample_name; bool is_gio_opened = false; + size_t file_id = 0; string eol = ""; @@ -219,7 +167,7 @@ bool CAGCDecompressor::GetCollectionFiles(const string& _path, const uint32_t _l gio.Open(cur_path.string(), true); if (verbosity > 0) { - cerr << eol << cur_path.string(); + cerr << eol << cur_path.string() << " (" << ++file_id << " of " << v_samples.size() << ")"; eol = "\n"; } } @@ -248,7 +196,7 @@ bool CAGCDecompressor::GetCollectionFiles(const string& _path, const uint32_t _l q_contig_tasks->Restart(1); - start_decompressing_threads(v_threads, no_threads, gzip_level, _line_length); + start_decompressing_threads(v_threads, no_threads, gzip_level, _line_length, fast); sample_desc_t sample_desc; @@ -362,7 +310,7 @@ bool CAGCDecompressor::GetSampleFile(const string& _file_name, const vector y.segments.size(); }); + std::sort(v_tasks.begin(), v_tasks.end(), [](auto& x, auto& y) {return x.segments.size() > y.segments.size(); }); q_contig_tasks->Restart(1); @@ -388,6 +336,71 @@ bool CAGCDecompressor::GetSampleFile(const string& _file_name, const vector& sample_names, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, uint32_t verbosity) +{ + if (working_mode != working_mode_t::decompression) + return false; + + contig_t ctg, working_space; + contig_task_t contig_desc; + + FILE* stream; + + if (_file_name.empty()) + { + stream = stdout; +#ifdef _WIN32 + _setmode(_fileno(stream), _O_BINARY); +#endif + } + else + { + stream = fopen(_file_name.c_str(), "wb"); + if (!stream) + { +// if(app_mode) + // !!! 
TODO: check app mode + cerr << "Cannot open destination file: " << _file_name << endl; + return false; + } + setvbuf(stream, nullptr, _IOFBF, 1 << 20); + } + + auto zstd_ctx = ZSTD_createDCtx(); + + CStreamWrapper stream_wrapper(stream, _line_length, gzip_level); + + for (const auto &s : sample_names) + { + sample_desc_t sample_desc; + + if (!collection_desc->get_sample_desc(s, sample_desc)) + { + cerr << "There is no sample " << s << endl; + + return false; + } + + for (auto contig_desc : sample_desc) + { + contig_task_t contig_task(0, "", contig_desc.first, contig_desc.second); + + stream_wrapper.start_contig(contig_desc.first); + + if (!decompress_contig_streaming(contig_task, zstd_ctx, stream_wrapper, false)) + continue; + } + } + + if (!_file_name.empty()) + fclose(stdout); + + ZSTD_freeDCtx(zstd_ctx); + + return true; +} + // ******************************************************************************************* bool CAGCDecompressor::GetSampleSequences(const string& sample_name, vector>>& v_contig_seq, const uint32_t no_threads) { @@ -554,4 +567,99 @@ bool CAGCDecompressor::GetContigFile(const string& _file_name, const vector& contig_names, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, uint32_t verbosity) +{ + if (working_mode != working_mode_t::decompression) + return false; + + vector> v_sample_contig; + name_range_t name_range; + string sample; + + for (const auto &cn : contig_names) + { + if (!analyze_contig_query(cn, sample, name_range)) + { + cerr << "Wrong contig format: " << cn << endl; + continue; + } + + if (!sample.empty()) + { + if (!collection_desc->is_contig_desc(sample, name_range.name)) + { + cerr << "There is no sample:contig pair: " << sample << " : " << name_range.name << endl; + return false; + } + + v_sample_contig.emplace_back(sample, name_range_t(name_range.name, name_range.from, name_range.to)); + } + else + { + auto v_cand_samples = collection_desc->get_samples_for_contig(name_range.name); + 
if (v_cand_samples.size() == 0) + { + cerr << "There is no contig: " << name_range.name << endl; + return false; + } + if (v_cand_samples.size() > 1) + { + cerr << "There are " << v_cand_samples.size() << " samples with conting: " << name_range.name << endl; + return false; + } + + v_sample_contig.emplace_back(v_cand_samples.front(), name_range_t(name_range.name, name_range.from, name_range.to)); + } + } + + FILE* stream; + + if (_file_name.empty()) + { + stream = stdout; +#ifdef _WIN32 + _setmode(_fileno(stream), _O_BINARY); +#endif + } + else + { + stream = fopen(_file_name.c_str(), "wb"); + if (!stream) + { + // if(app_mode) + // !!! TODO: check app mode + cerr << "Cannot open destination file: " << _file_name << endl; + return false; + } + setvbuf(stream, nullptr, _IOFBF, 1 << 20); + } + + auto zstd_ctx = ZSTD_createDCtx(); + + CStreamWrapper stream_wrapper(stream, _line_length, gzip_level); + + for (auto& p_sc : v_sample_contig) + { + vector contig_desc; + collection_desc->get_contig_desc(p_sc.first, p_sc.second.name, contig_desc); + +// contig_task_t contig_task(0, p_sc.first, p_sc.second.name, contig_desc); + contig_task_t contig_task(0, p_sc.first, p_sc.second, contig_desc); + +// stream_wrapper.start_contig(p_sc.second.name); + stream_wrapper.start_contig(p_sc.second.str()); + + if (!decompress_contig_streaming(contig_task, zstd_ctx, stream_wrapper, false)) + continue; + } + + if (!_file_name.empty()) + fclose(stdout); + + ZSTD_freeDCtx(zstd_ctx); + + return true; +} + // EOF diff --git a/src/core/agc_decompressor.h b/src/core/agc_decompressor.h index d437f6e..67dd249 100644 --- a/src/core/agc_decompressor.h +++ b/src/core/agc_decompressor.h @@ -7,29 +7,30 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* -#include "../core/agc_decompressor_lib.h" -#include 
"../../libs/gz_wrapper.h" +#include "../common/agc_decompressor_lib.h" +#include // ******************************************************************************************* // Class supporting only decompression of AGC files - extended version (can store also in gzipped files) class CAGCDecompressor : public CAGCDecompressorLibrary { - void start_decompressing_threads(vector& v_threads, const uint32_t n_t, uint32_t gzip_level = 0, uint32_t line_len = 0); + void start_decompressing_threads(vector& v_threads, const uint32_t n_t, uint32_t gzip_level = 0, uint32_t line_len = 0, bool fast = false); - void convert_and_split_into_lines(contig_t& ctg, contig_t& working_space, uint32_t line_len); void gzip_contig(contig_t& ctg, contig_t& working_space, refresh::gz_in_memory& gzip_compressor); public: CAGCDecompressor(bool _is_app_mode); ~CAGCDecompressor(); - bool GetCollectionFiles(const string& _path, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, uint32_t verbosity); + bool GetCollectionFiles(const string& _path, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, bool no_ref, bool fast, uint32_t verbosity); bool GetSampleFile(const string& _file_name, const vector& sample_names, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, uint32_t verbosity); bool GetContigFile(const string& _file_name, const vector& contig_names, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, uint32_t verbosity); + bool GetSampleForStreaming(const string& _file_name, const vector& sample_names, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, uint32_t verbosity); + bool GetContigForStreaming(const string& _file_name, const vector& contig_names, const uint32_t _line_length, const uint32_t no_threads, const uint32_t gzip_level, uint32_t verbosity); bool GetSampleSequences(const string& sample_name, vector>> &v_contig_seq, const 
uint32_t no_threads); diff --git a/src/core/genome_io.cpp b/src/core/genome_io.cpp index 9eeca82..aab3db7 100644 --- a/src/core/genome_io.cpp +++ b/src/core/genome_io.cpp @@ -4,11 +4,11 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* -#include "../core/genome_io.h" +#include "genome_io.h" #include #include #include @@ -19,10 +19,7 @@ CGenomeIO::CGenomeIO() { writing = false; -// in = nullptr; out = nullptr; -// gz_in = nullptr; -// gz_out = nullptr; is_gzipped = false; use_stdout = false; @@ -46,67 +43,41 @@ CGenomeIO::~CGenomeIO() // ******************************************************************************************* bool CGenomeIO::Open(const string& _file_name, const bool _writing) { -// if (in || out || gz_in || gz_out || sif) if (out || sif) return false; -// is_gzipped = _file_name.length() > 3 && _file_name.substr(_file_name.length() - 3, 3) == ".gz"s; use_stdout = _file_name.empty(); writing = _writing; if (writing) { -/* if (is_gzipped) + if (use_stdout) { - gz_out = gzopen(_file_name.c_str(), "w3"); - if (!gz_out) - return false; - gzbuffer(gz_out, (uint32_t) gz_buffer_size); - } - else*/ - { - if (use_stdout) - { - out = stdout; + out = stdout; #ifdef _WIN32 - _setmode(_fileno(out), _O_BINARY); + _setmode(_fileno(out), _O_BINARY); #endif - } - else - { - out = fopen(_file_name.c_str(), "wb"); - if (!out) - return false; - setvbuf(out, nullptr, _IOFBF, buffer_size); - - } + } + else + { + out = fopen(_file_name.c_str(), "wb"); + if (!out) + return false; + setvbuf(out, nullptr, _IOFBF, write_buffer_size); + } buffer = nullptr; } else { -/* if (is_gzipped) - { - gz_in = gzopen(_file_name.c_str(), "r"); - if (!gz_in) - return false; - gzbuffer(gz_in, (uint32_t) gz_buffer_size); - } - else - { - in = fopen(_file_name.c_str(), "rb"); - if (!in) - return false; - }*/ - 
sif = new refresh::stream_in_file(_file_name); if (!sif->is_open()) return false; sdf = new refresh::stream_decompression(sif); - buffer = new uint8_t[buffer_size]; + buffer = new uint8_t[read_buffer_size]; } buffer_pos = 0; @@ -120,12 +91,7 @@ bool CGenomeIO::Close() { if (writing) { -/* if (gz_out) - { - gzclose(gz_out); - gz_out = nullptr; - } - else*/ if (out) + if (out) { fflush(out); if(!use_stdout) @@ -135,20 +101,8 @@ bool CGenomeIO::Close() } else { -/* if (gz_in) - { - gzclose(gz_in); - gz_in = nullptr; - } - else if (in) - { - fclose(in); - in = nullptr; - }*/ - if (sif) { -// sif->close(); delete sdf; delete sif; sif = nullptr; @@ -171,22 +125,6 @@ size_t CGenomeIO::FileSize() if (!writing) { -/* if (is_gzipped) - { - while (fill_buffer()) - { - s += buffer_filled; - buffer_pos = buffer_filled; - } - gzseek(gz_in, 0, SEEK_SET); - } - else - { - fseek(in, 0, SEEK_END); - s = my_ftell(in); - fseek(in, 0, SEEK_SET); - }*/ - const size_t loc_buf_size = 1 << 25; char* loc_buf = new char[loc_buf_size]; size_t readed; @@ -256,14 +194,9 @@ bool CGenomeIO::fill_buffer() buffer_filled = 0; } - size_t to_read = buffer_size - buffer_filled; + size_t to_read = read_buffer_size - buffer_filled; size_t readed; -/* if (is_gzipped) - readed = gzread(gz_in, buffer + buffer_filled, (uint32_t) to_read); - else - readed = fread(buffer + buffer_filled, 1, to_read, in);*/ - sdf->read((char*) buffer + buffer_filled, to_read, readed); buffer_filled += readed; @@ -274,7 +207,6 @@ bool CGenomeIO::fill_buffer() // ******************************************************************************************* bool CGenomeIO::read_contig_raw(string& id, contig_t& contig) { -// if (!in && !gz_in && !sif) if (!sif) return false; diff --git a/src/core/genome_io.h b/src/core/genome_io.h index 7c3b23b..565b1da 100644 --- a/src/core/genome_io.h +++ b/src/core/genome_io.h @@ -7,24 +7,17 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// 
Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* #include #include #include #include -#include "defs.h" -//#include -/*#ifdef _MSC_VER -#include "../../../3rd_party/zlib-ng/build-vs/zlib.h" -#else -#include "../../../3rd_party/zlib-ng/zlib.h" -#endif*/ -//#include -#include "../../libs/file_wrapper.h" -#include "../../libs/gz_wrapper.h" +#include "../common/defs.h" +#include +#include #ifndef _WIN32 #define my_fseek fseek @@ -55,10 +48,7 @@ class CGenomeIO string file_name; bool writing; -// FILE* in; FILE* out; -// gzFile gz_in; -// gzFile gz_out; bool is_gzipped; bool use_stdout; @@ -69,8 +59,8 @@ class CGenomeIO vector gzip_zero_compressor_buffer; uint8_t* buffer; - const size_t buffer_size = 128 << 20; - const size_t gz_buffer_size = 32 << 20; + const size_t write_buffer_size = 32 << 20; + const size_t read_buffer_size = 4 << 20; size_t buffer_filled; size_t buffer_pos; diff --git a/src/core/hs.h b/src/core/hs.h index d741324..bc99e13 100644 --- a/src/core/hs.h +++ b/src/core/hs.h @@ -7,8 +7,8 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* #if defined(ARCH_X64) diff --git a/src/core/kmer.h b/src/core/kmer.h index 67cf7b6..7bbad8f 100644 --- a/src/core/kmer.h +++ b/src/core/kmer.h @@ -7,14 +7,14 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* -#include "../core/defs.h" +#include "../common/defs.h" #include #include -#include "../core/utils.h" +#include "../common/utils.h" enum class kmer_mode_t {direct, rev_comp, canonical}; diff --git a/src/core/utils_adv.h b/src/core/utils_adv.h new file mode 100644 index 
0000000..9f2301f --- /dev/null +++ b/src/core/utils_adv.h @@ -0,0 +1,349 @@ +#ifndef _UTILS_ADV_H +#define _UTILS_ADV_H + +// ******************************************************************************************* +// This file is a part of AGC software distributed under MIT license. +// The homepage of the AGC project is https://github.com/refresh-bio/agc +// +// Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li +// +// Version: 3.2 +// Date : 2024-11-21 +// ******************************************************************************************* + +#include "../common/defs.h" +#include +#if defined(ARCH_X64) +#include +#elif defined(ARCH_ARM) +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include +#include + +// ***************************************************************************************** +// +class CBarrier +{ +public: + CBarrier(const CBarrier&) = delete; + CBarrier& operator=(const CBarrier&) = delete; + explicit CBarrier(unsigned int count) : + m_count(count), m_generation(0), + m_count_reset_value(count) + { + } + void arrive_and_wait() + { + std::unique_lock< std::mutex > lock(m_mutex); + unsigned int gen = m_generation; + if (--m_count == 0) + { + m_generation++; + m_count = m_count_reset_value; + m_cond.notify_all(); + return; + } + + // m_cond.wait(lock, [&] {return gen != m_generation; }); + while (gen == m_generation) + m_cond.wait(lock); + } +private: + std::mutex m_mutex; + std::condition_variable m_cond; + unsigned int m_count; + unsigned int m_generation; + unsigned int m_count_reset_value; +}; + +// ***************************************************************************************** +// +class CAtomicBarrier +{ +public: + CAtomicBarrier(const CAtomicBarrier&) = delete; + CAtomicBarrier& operator=(const CAtomicBarrier&) = delete; + explicit CAtomicBarrier(int32_t count) : + a_count(count - 1), a_generation(0), + count_reset_value(count - 1) + { + 
} + + void arrive_and_wait() + { + int32_t old_generation = a_generation; + + if (!a_count.fetch_sub(1, memory_order_relaxed)) + { + a_count = count_reset_value; + ++a_generation; + } + + while (a_generation == old_generation) + ; + } +private: + atomic a_count; + atomic a_generation; + int32_t count_reset_value; +}; + +// ***************************************************************************************** +// +class CAtomicBarrierWithIncrementing +{ + std::atomic a_count; + std::atomic a_generation; + int32_t count_reset_value; + +public: + CAtomicBarrierWithIncrementing(const CAtomicBarrierWithIncrementing&) = delete; + CAtomicBarrierWithIncrementing& operator=(const CAtomicBarrierWithIncrementing&) = delete; + explicit CAtomicBarrierWithIncrementing(int32_t count) : + a_count(count - 1), a_generation(0), + count_reset_value(count - 1) + { + } + + void arrive_and_wait() + { + int32_t old_generation = a_generation.load(); + + if (!a_count.fetch_sub(1, memory_order_relaxed)) + { + a_count = count_reset_value; + ++a_generation; + a_generation.notify_all(); + return; + } + + a_generation.wait(old_generation); + } + + bool try_increment(int32_t inc = 1) + { + // return false; + + auto new_count = a_count.fetch_add(inc, memory_order_relaxed) + inc; + + if (new_count <= count_reset_value) + return true; + + a_count.fetch_sub(inc, memory_order_relaxed); + return false; + } + + int32_t try_increment_max(int32_t inc_req) + { + // return 0; + + inc_req = min(inc_req, count_reset_value); + + auto new_count = a_count.fetch_add(inc_req, memory_order_relaxed) + inc_req; + + if (new_count <= count_reset_value) + return inc_req; + + int32_t to_many = new_count - count_reset_value; + + if (to_many >= inc_req) // Can happen if other thread is also in this function at the same time + to_many = inc_req; + + int32_t no_ext = inc_req - to_many; + + a_count.fetch_sub(to_many, memory_order_relaxed); + + return no_ext; + } + + void decrement(int32_t dec = 1) + { + // return; + + 
a_count.fetch_sub(dec); + } +}; + +// ********************************************************************************** +class bloom_set_t { + // const uint32_t no_hashes = 2; + const uint32_t no_hashes = 3; + + MurMur64Hash mmh; + + // vector arr; + uint64_t* arr = nullptr; + uint64_t* raw_arr = nullptr; + + size_t no_elements; + size_t allocated; + size_t mask; + uint32_t mask_shift; + + uint64_t normalize_size(uint64_t size) + { + size *= no_hashes; + size *= 2; + + while (size & (size - 1)) + size &= size - 1; + + return max((uint64_t)256, size * 2); + } + + void allocate(size_t size) + { + if (raw_arr) + delete[] raw_arr; + // arr.clear(); + no_elements = 0; + + allocated = normalize_size(size); + + // arr.resize(allocated / 64, 0); + raw_arr = new uint64_t[allocated / 64 + 7]; + arr = raw_arr; + while (((uint64_t)arr) % 64 != 0) + ++arr; + + fill_n(arr, allocated / 64, 0ull); + + mask_shift = 6 * no_hashes; + mask = (allocated / 64 - 1) << mask_shift; + } + + void insert_impl(uint64_t x) + { + uint64_t h = mmh(x); + + uint64_t pos = (h & mask) >> mask_shift; + + arr[pos] |= (1ull << (h & 63)) | (1ull << ((h >> 6) & 63)) | (1ull << ((h >> 12) & 63)); + // arr[pos] |= (1ull << (h & 63)) | (1ull << ((h >> 6) & 63)); + + ++no_elements; + } + +public: + bloom_set_t(size_t size = 64) + { + allocate(size); + } + + ~bloom_set_t() + { + if (raw_arr) + delete[] raw_arr; + } + + void resize(size_t size) + { + allocate(size); + } + + template + void insert(Iter begin, Iter end) + { + for (auto p = begin; p != end; ++p) + insert_impl(*p); + } + + void insert(uint64_t x) + { + insert_impl(x); + } + + bool check(uint64_t x) + { + uint64_t h = mmh(x); + + uint64_t pos = (h & mask) >> mask_shift; + + return (arr[pos] & (1ull << (h & 63))) && (arr[pos] & (1ull << ((h >> 6) & 63))) && (arr[pos] & (1ull << ((h >> 12) & 63))); + // return (arr[pos] & (1ull << (h & 63))) && (arr[pos] & (1ull << ((h >> 6) & 63))); + } + + double filling_factor() + { + return (double)no_hashes * 
no_elements / allocated; + } +}; + +// ********************************************************************************** +template +constexpr uint32_t pop_count(T x) +{ + uint32_t r = 0; + + for (; x; ++r) + x &= x - 1; + + return r; +} + +// ********************************************************************************** +template +constexpr bool is_power_2(const T x) +{ + return (x & (x - (T)1)) == 0; +} + +// ********************************************************************************** +constexpr uint64_t ilog2(uint64_t x) +{ + uint64_t r = 0; + + for (; x; ++r) + x >>= 1; + + return r; +} + +// ********************************************************************************** +constexpr uint64_t no_bytes(uint64_t x) +{ + uint64_t r = 1; + + x >>= 8; + + for (; x; ++r) + x >>= 8; + + return r; +} + +// ********************************************************************************** +struct hash_pair { + template + size_t operator()(const pair& x) const + { + return hash{}(x.first) ^ hash{}(x.second); + } +}; + +template <> +struct std::hash> +{ + std::size_t operator()(const pair& k) const + { + using std::size_t; + using std::hash; + + return (hash()(k.first)) ^ (hash()(k.second)); + } +}; + +// EOF +#endif \ No newline at end of file diff --git a/src/lib-cxx/agc-api.h b/src/lib-cxx/agc-api.h index cf1b1e9..51722b3 100644 --- a/src/lib-cxx/agc-api.h +++ b/src/lib-cxx/agc-api.h @@ -4,8 +4,8 @@ // // Copyright(C) 2021-2024, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2022-12-22 +// Version: 3.2 +// Date : 2024-11-21 // ******************************************************************************************* #ifndef AGC_API_H diff --git a/src/lib-cxx/lib-cxx.cpp b/src/lib-cxx/lib-cxx.cpp index 637cc65..7a0ca3b 100644 --- a/src/lib-cxx/lib-cxx.cpp +++ b/src/lib-cxx/lib-cxx.cpp @@ -4,11 +4,11 @@ // // Copyright(C) 2021-2022, S.Deorowicz, A.Danek, H.Li // -// Version: 3.1 -// Date : 2024-03-12 +// Version: 3.2 +// Date : 2024-11-21 // 
******************************************************************************************* -#include "../core/agc_decompressor_lib.h" +#include "../common/agc_decompressor_lib.h" #include "agc-api.h" #include diff --git a/src/lib-cxx/lib-cxx.vcxproj b/src/lib-cxx/lib-cxx.vcxproj index 34ffab9..7e31135 100644 --- a/src/lib-cxx/lib-cxx.vcxproj +++ b/src/lib-cxx/lib-cxx.vcxproj @@ -78,11 +78,11 @@ true - ../../3rd_party/zstd/lib;$(VC_IncludePath);$(WindowsSDK_IncludePath); + ../..\3rd_party\libdeflate;../../3rd_party;$(VC_IncludePath);$(WindowsSDK_IncludePath); false - ../../3rd_party/zstd/lib;$(VC_IncludePath);$(WindowsSDK_IncludePath); + ../..\3rd_party\libdeflate;../../3rd_party;$(VC_IncludePath);$(WindowsSDK_IncludePath); @@ -128,7 +128,7 @@ /D_CRT_SECURE_NO_WARNINGS %(AdditionalOptions) - stdcpp17 + stdcpp20 @@ -148,7 +148,7 @@ /D_CRT_SECURE_NO_WARNINGS %(AdditionalOptions) - stdcpp17 + stdcpp20 stdc11 @@ -160,32 +160,34 @@ - - - - - - - - - - + + + + + + + + + + + + + - - - - - - + + + + + + + + + + - - - {8bfd8150-94d5-4bf9-8a50-7bd9929a0850} - - diff --git a/src/lib-cxx/lib-cxx.vcxproj.filters b/src/lib-cxx/lib-cxx.vcxproj.filters index 9acf9b2..401d989 100644 --- a/src/lib-cxx/lib-cxx.vcxproj.filters +++ b/src/lib-cxx/lib-cxx.vcxproj.filters @@ -18,34 +18,43 @@ Header Files - + Header Files - + Header Files - + Header Files - + Header Files - + Header Files - + Header Files - + Header Files - + Header Files - + Header Files - + + Header Files + + + Header Files + + + Header Files + + Header Files @@ -53,22 +62,34 @@ Source Files - + + Source Files + + + Source Files + + + Source Files + + + Source Files + + Source Files - + Source Files - + Source Files - + Source Files - + Source Files - + Source Files diff --git a/py_agc_api/py_agc_api.cpp b/src/py_agc_api/py_agc_api.cpp similarity index 95% rename from py_agc_api/py_agc_api.cpp rename to src/py_agc_api/py_agc_api.cpp index 1eb95fb..c4ef0a4 100644 --- a/py_agc_api/py_agc_api.cpp +++ 
b/src/py_agc_api/py_agc_api.cpp @@ -1,88 +1,88 @@ -#include -#include -#include -#include -#include -#include - -//binding STL container std::vector -PYBIND11_MAKE_OPAQUE(std::vector); - -namespace py = pybind11; - - -PYBIND11_MODULE(py_agc_api, m) { - m.doc() = "Python wrapper for AGC_API."; // optional module docstring - - // StringVector can be used in Python code (binding std::vector) - py::bind_vector>(m, "StringVector") - .def(py::init<>()) - .def("clear", &std::vector::clear) - .def("pop_back", &std::vector::pop_back) - .def("__len__", [](const std::vector &v) { return v.size(); }) - .def("__iter__", [](std::vector &v) { - return py::make_iterator(v.begin(), v.end()); - }, py::keep_alive<0, 1>()); - - // Class that represents agc archive - py::class_(m, "CAGCFile") - .def(py::init<>()) //parameterless constructor - - //Open(file_name, prefetching = true) opens agc archive - // - //@return true for success and false for error - .def("Open", &CAGCFile::Open) - - //Close() closes opened archive - //@return true for success and false for error - .def("Close", &CAGCFile::Close) //Close() closes opened archive - - //NSample() - //@returns number of samples in the archive - .def("NSample", &CAGCFile::NSample) - - //GetReferenceSample() - //@returns reference sample - .def("GetReferenceSample", [](CAGCFile& ptr){ std::string s; ptr.GetReferenceSample(s); return s;}) - - - //NCtg(sample) - //@returns number of contig in sample - .def("NCtg", &CAGCFile::NCtg) - - //ListSample(samples: StringVector) - //@param samples vector of strings (StringVector) with sample names (returned value) - //@return number of samples to be written to - .def("ListSample", &CAGCFile::ListSample) - - //ListCtg(sample, names: StringVector) - //@param sample sample name - //@param names vector of strings (StringVector) with contig names (returned value) - //@return number of contigs in the sample - .def("ListCtg", &CAGCFile::ListCtg) - - //GetCtgLen(sample, name) - //Get the length of a 
contig. - //@param sample sample name; - //@param name contig name - //@return contig length, or <0 for errors - .def("GetCtgLen", &CAGCFile::GetCtgLen) - - //GetCtgSeq(sample, name, start, end) - //@param sample sample name - //@param name contig name - //@param start start offset - //@param end end offset - //@return contig sequence - .def("GetCtgSeq", [](CAGCFile& ptr, const std::string& sample, const std::string& name, int start, int end) { std::string s; ptr.GetCtgSeq(sample, name, start, end, s); return s;}) - - //GetCtgSeq(name, start, end) - //@param name contig name - //@param start start offset - //@param end end offset - //@return contig sequence (if unique name across all contigs in all samples) - .def("GetCtgSeq", [](CAGCFile& ptr, const std::string& name, int start, int end) { std::string s; std::string empty; ptr.GetCtgSeq(empty, name, start, end, s); return s;}) - ; - -} - +#include +#include +#include +#include +#include "../core/agc_decompressor.h" +#include "../lib-cxx/agc-api.h" + +//binding STL container std::vector +PYBIND11_MAKE_OPAQUE(std::vector); + +namespace py = pybind11; + + +PYBIND11_MODULE(py_agc_api, m) { + m.doc() = "Python wrapper for AGC_API."; // optional module docstring + + // StringVector can be used in Python code (binding std::vector) + py::bind_vector>(m, "StringVector") + .def(py::init<>()) + .def("clear", &std::vector::clear) + .def("pop_back", &std::vector::pop_back) + .def("__len__", [](const std::vector &v) { return v.size(); }) + .def("__iter__", [](std::vector &v) { + return py::make_iterator(v.begin(), v.end()); + }, py::keep_alive<0, 1>()); + + // Class that represents agc archive + py::class_(m, "CAGCFile") + .def(py::init<>()) //parameterless constructor + + //Open(file_name, prefetching = true) opens agc archive + // + //@return true for success and false for error + .def("Open", &CAGCFile::Open) + + //Close() closes opened archive + //@return true for success and false for error + .def("Close", 
&CAGCFile::Close) //Close() closes opened archive + + //NSample() + //@returns number of samples in the archive + .def("NSample", &CAGCFile::NSample) + + //GetReferenceSample() + //@returns reference sample + .def("GetReferenceSample", [](CAGCFile& ptr){ std::string s; ptr.GetReferenceSample(s); return s;}) + + + //NCtg(sample) + //@returns number of contig in sample + .def("NCtg", &CAGCFile::NCtg) + + //ListSample(samples: StringVector) + //@param samples vector of strings (StringVector) with sample names (returned value) + //@return number of samples to be written to + .def("ListSample", &CAGCFile::ListSample) + + //ListCtg(sample, names: StringVector) + //@param sample sample name + //@param names vector of strings (StringVector) with contig names (returned value) + //@return number of contigs in the sample + .def("ListCtg", &CAGCFile::ListCtg) + + //GetCtgLen(sample, name) + //Get the length of a contig. + //@param sample sample name; + //@param name contig name + //@return contig length, or <0 for errors + .def("GetCtgLen", &CAGCFile::GetCtgLen) + + //GetCtgSeq(sample, name, start, end) + //@param sample sample name + //@param name contig name + //@param start start offset + //@param end end offset + //@return contig sequence + .def("GetCtgSeq", [](CAGCFile& ptr, const std::string& sample, const std::string& name, int start, int end) { std::string s; ptr.GetCtgSeq(sample, name, start, end, s); return s;}) + + //GetCtgSeq(name, start, end) + //@param name contig name + //@param start start offset + //@param end end offset + //@return contig sequence (if unique name across all contigs in all samples) + .def("GetCtgSeq", [](CAGCFile& ptr, const std::string& name, int start, int end) { std::string s; std::string empty; ptr.GetCtgSeq(empty, name, start, end, s); return s;}) + ; + +} + diff --git a/py_agc_api/py_agc_test.py b/src/py_agc_api/py_agc_test.py similarity index 97% rename from py_agc_api/py_agc_test.py rename to src/py_agc_api/py_agc_test.py index 
9ae3675..e061fb2 100644 --- a/py_agc_api/py_agc_test.py +++ b/src/py_agc_api/py_agc_test.py @@ -1,67 +1,67 @@ -#!/usr/bin/env python3 -import sys -import py_agc_api as agc -import textwrap - - -# open AGC archive ("toy_ex/toy_ex.agc") -agc_arch = agc.CAGCFile() -if not agc_arch.Open("toy_ex/toy_ex.agc", True): - print("Error: cannot open agc archive") - sys.exit(1) - -#Get number of samples in the archive -n = agc_arch.NSample(); -print("No. samples: ", n) - -#Get reference name -reference = agc_arch.GetReferenceSample() -print("Reference sample: ", reference) - -#Get list of samples in the archive -set_list = agc.StringVector() - - -agc_arch.ListSample(set_list); -print("Samples in file with no. of contigs") - -iterator = iter(set_list) -for i in range(n): #for i in range(len(set_list)): - sample = set_list[i]; - no_ctg = agc_arch.NCtg(sample) #Get number of contigs for sample - print(sample, ":", no_ctg) - - -#Get contents of the sample -sample=set_list[0] -print("\nContents of sample:", sample) -#Get number of contigs for the 0th sample -no_ctg = agc_arch.NCtg(sample) -#Get name of contigs of the 0th sample -ctg_list = agc.StringVector() -agc_arch.ListCtg(sample, ctg_list); -#Print name and length of each contig of the 0th sample -for i in range(no_ctg): - ctg_len = agc_arch.GetCtgLen(sample, ctg_list[i]) #Get length of the of the contig - print("length of contig", ctg_list[i],":",ctg_len) #print contig name and length -#Print part of contig in the sample (of length 5 if possible, from position 8, if possible) - start=8 #start position of contig part - length=5 #part length - end=start+length-1 #end position of contig part - ctg_len = agc_arch.GetCtgLen(sample, ctg_list[i]) #Get length of the of the contig - if end >= ctg_len: - end = ctg_len - 1 - if length > ctg_len: - start = 0 #print whole contig - else: - start = end - (length - 1) - print("\tsubsequence start:", start, ", end:", end, ", length:", length) - #Get and print part of 0th contig in the 0th 
sample, from start to end - seq = agc_arch.GetCtgSeq(sample, ctg_list[i], start, end) - print("\tsubsequence length:", len(seq)) - print("\tpos:", start, "-", end, "len:", end-start+1, "seq:",seq); - #Get (query without sample name) and print part of 0th contig in the 0th sample, from start to end (work only if contig name is unique among samples) - seq = agc_arch.GetCtgSeq(ctg_list[i], start, end) - print("\tsubsequence start:", start, ", end:", end, ", length:", length) - print("\tsubsequence length:", len(seq)) - print("\tpos:", start, "-", end, "len:", end-start+1, "seq:",seq); +#!/usr/bin/env python3 +import sys +import py_agc_api as agc +import textwrap + + +# open AGC archive ("toy_ex/toy_ex.agc") +agc_arch = agc.CAGCFile() +if not agc_arch.Open("toy_ex/toy_ex.agc", True): + print("Error: cannot open agc archive") + sys.exit(1) + +#Get number of samples in the archive +n = agc_arch.NSample(); +print("No. samples: ", n) + +#Get reference name +reference = agc_arch.GetReferenceSample() +print("Reference sample: ", reference) + +#Get list of samples in the archive +set_list = agc.StringVector() + + +agc_arch.ListSample(set_list); +print("Samples in file with no. 
of contigs") + +iterator = iter(set_list) +for i in range(n): #for i in range(len(set_list)): + sample = set_list[i]; + no_ctg = agc_arch.NCtg(sample) #Get number of contigs for sample + print(sample, ":", no_ctg) + + +#Get contents of the sample +sample=set_list[0] +print("\nContents of sample:", sample) +#Get number of contigs for the 0th sample +no_ctg = agc_arch.NCtg(sample) +#Get name of contigs of the 0th sample +ctg_list = agc.StringVector() +agc_arch.ListCtg(sample, ctg_list); +#Print name and length of each contig of the 0th sample +for i in range(no_ctg): + ctg_len = agc_arch.GetCtgLen(sample, ctg_list[i]) #Get length of the of the contig + print("length of contig", ctg_list[i],":",ctg_len) #print contig name and length +#Print part of contig in the sample (of length 5 if possible, from position 8, if possible) + start=8 #start position of contig part + length=5 #part length + end=start+length-1 #end position of contig part + ctg_len = agc_arch.GetCtgLen(sample, ctg_list[i]) #Get length of the of the contig + if end >= ctg_len: + end = ctg_len - 1 + if length > ctg_len: + start = 0 #print whole contig + else: + start = end - (length - 1) + print("\tsubsequence start:", start, ", end:", end, ", length:", length) + #Get and print part of 0th contig in the 0th sample, from start to end + seq = agc_arch.GetCtgSeq(sample, ctg_list[i], start, end) + print("\tsubsequence length:", len(seq)) + print("\tpos:", start, "-", end, "len:", end-start+1, "seq:",seq); + #Get (query without sample name) and print part of 0th contig in the 0th sample, from start to end (work only if contig name is unique among samples) + seq = agc_arch.GetCtgSeq(ctg_list[i], start, end) + print("\tsubsequence start:", start, ", end:", end, ", length:", length) + print("\tsubsequence length:", len(seq)) + print("\tpos:", start, "-", end, "len:", end-start+1, "seq:",seq); diff --git a/py_agc_api/set_path.sh b/src/py_agc_api/set_path.sh similarity index 98% rename from py_agc_api/set_path.sh 
rename to src/py_agc_api/set_path.sh index 189af7b..4592e56 100644 --- a/py_agc_api/set_path.sh +++ b/src/py_agc_api/set_path.sh @@ -1,2 +1,2 @@ -SCRIPTPATH="$( cd "$(dirname "$BASH_SOURCE")" ; pwd -P )" -export PYTHONPATH="${PYTHONPATH}:${SCRIPTPATH}/" +SCRIPTPATH="$( cd "$(dirname "$BASH_SOURCE")" ; pwd -P )" +export PYTHONPATH="${PYTHONPATH}:${SCRIPTPATH}/"