Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/4031 journals added sheet duplicated #2444

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
64 changes: 48 additions & 16 deletions doajtest/unit/test_task_datalog_journal_added_update.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import time
import unittest
from datetime import datetime
from typing import List
from unittest.mock import MagicMock
from unittest.mock import patch

from doajtest.fixtures import JournalFixtureFactory
from doajtest.helpers import DoajTestCase
from portality.lib import dates
from portality.models import Journal
from portality.models import Journal, datalog_journal_added
from portality.models.datalog_journal_added import DatalogJournalAdded
from portality.tasks import datalog_journal_added_update
from portality.tasks.datalog_journal_added_update import DatalogJournalAddedUpdate, to_display_data, \
Expand Down Expand Up @@ -39,6 +40,16 @@

class TestDatalogJournalAddedUpdate(DoajTestCase):

def test_sync_datalog_journal_added(self):
    """
    Sync after saving one journal gives one datalog record; sync after saving
    the other three brings the total to four.
    """
    all_journals = create_test_journals(4)
    last_only, the_rest = all_journals[-1:], all_journals[:-1]

    # first pass: only the last journal has been saved
    save_and_block_last(last_only)
    datalog_journal_added_update.sync_datalog_journal_added()
    assert DatalogJournalAdded.count() == 1

    # second pass: the remaining journals are saved, total must be 4
    save_and_block_last(the_rest)
    datalog_journal_added_update.sync_datalog_journal_added()
    assert DatalogJournalAdded.count() == 4

def test_execute__normal(self):
"""
test background job execute
Expand All @@ -50,7 +61,7 @@ def test_execute__normal(self):

save_test_datalog()

journals = save_test_journals(3)
journals = save_test_journals(4)

worksheet = MagicMock()
worksheet.get_all_values.return_value = [
Expand All @@ -64,10 +75,10 @@ def test_execute__normal(self):
.open.return_value
.worksheet.return_value) = worksheet

background_task = background_helper.execute_by_bg_task_type(DatalogJournalAddedUpdate,
filename=input_filename,
worksheet_name=input_worksheet_name,
google_key_path=input_google_key_path)
background_helper.execute_by_bg_task_type(DatalogJournalAddedUpdate,
filename=input_filename,
worksheet_name=input_worksheet_name,
google_key_path=input_google_key_path)

worksheet.get_all_values.assert_called()
new_rows_added_to_excels, row_idx, *_ = worksheet.insert_rows.call_args.args
Expand Down Expand Up @@ -128,17 +139,26 @@ def test_latest_row_index(self):
def test_find_new_datalog_journals(self):
    """
    find_new_datalog_journals should yield fewer journals as the cut-off
    date moves later, and none once it is past every journal's created_date.
    """
    save_test_journals(3)

    def _count_new_datalog_journals(latest_date_str):
        # count how many records are yielded for the given cut-off date string
        datalog_iter = datalog_journal_added_update.find_new_datalog_journals(
            dates.parse(latest_date_str)
        )
        return len(list(datalog_iter))

    # only the current expectations are kept; the earlier assertions expected
    # different counts for the same inputs (e.g. '2101-01-01' == 2)
    assert _count_new_datalog_journals('2101-01-01') == 3
    assert _count_new_datalog_journals('2101-01-01T22:22:22Z') == 2
    assert _count_new_datalog_journals('2102-01-01') == 2
    assert _count_new_datalog_journals('2103-01-01') == 1
    assert _count_new_datalog_journals('2104-01-01') == 0

def test_is_datalog_exist(self):
    """
    is_issn_exists should be truthy for the first two ISSN/date pairs below
    and falsy for the third, after save_test_datalog() has run.
    """
    save_test_datalog()

    # (issn, date_added, expected result)
    expectations = [
        ('1234-3000', '2021-01-01', True),
        ('1234-1000', datetime(2020, 1, 1), True),
        ('9999-9999', datetime(2021, 1, 1), False),
    ]
    for issn, date_added, should_exist in expectations:
        found = datalog_journal_added.is_issn_exists(issn, date_added)
        assert bool(found) == should_exist


def save_test_datalog():
Expand All @@ -147,17 +167,29 @@ def save_test_datalog():

time.sleep(2)

DatalogJournalAdded.refresh()

def save_test_journals(n_journals: int) -> List[Journal]:

def create_test_journals(n_journals):
    """
    Build ``n_journals`` Journal objects from fixture sources WITHOUT saving them.

    The leading journals (at most four) are given fixed ``created_date``
    values, newest first; any further journals keep the fixture's date.

    :param n_journals: number of journals to create
    :return: list of unsaved Journal objects
    """
    sources = JournalFixtureFactory.make_many_journal_sources(count=n_journals, in_doaj=True)
    journals = [Journal(**source) for source in sources]
    assert len(journals) == n_journals

    test_dates = [
        '2103-01-01T03:00:00Z',
        '2102-01-01T02:00:00Z',
        '2101-01-01T22:22:22Z',
        '2101-01-01T01:00:00Z',
    ]
    # zip stops at the shorter sequence, so this works for any n_journals
    # (fewer than, equal to, or more than the number of test dates)
    for journal, created_date in zip(journals, test_dates):
        journal['created_date'] = created_date
    return journals


def save_test_journals(n_journals: int) -> List[Journal]:
    """
    Create ``n_journals`` test journals, pass them to save_and_block_last,
    and return them.
    """
    created = create_test_journals(n_journals)
    save_and_block_last(created)
    return created


Expand Down
10 changes: 9 additions & 1 deletion portality/dao.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from __future__ import annotations

import json
import re
import os
import re
import sys
import time
import urllib.parse
Expand Down Expand Up @@ -1087,6 +1087,14 @@ def _yield_index_alias():
return index_aliases


def is_exist(query: dict, index):
    """
    Check whether at least one document in ``index`` matches ``query``.

    :param query: Elasticsearch request body (e.g. ``{"query": {...}}``);
        the caller's dict is left unmodified
    :param index: name of the index to search
    :return: True if the response reports a total hit count above zero,
        otherwise False
    """
    # work on a shallow copy so the existence-check overrides below do not
    # leak into the caller's query object
    body = dict(query)
    body['size'] = 1
    body['_source'] = False
    # 404 responses are not raised; such a response has no 'hits' entry,
    # so the lookup chain below falls back to 0 and the result is False
    res = ES.search(body=body, index=index, size=1, ignore=[404])

    return res.get('hits', {}).get('total', {}).get('value', 0) > 0


class BlockTimeOutException(Exception):
pass

Expand Down
10 changes: 10 additions & 0 deletions portality/migrate/4031_journals_added_sheet_duplicated/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# 2024-12-25; Issue 4031 - Journals added sheet duplicated

## Execution

Run the script to remove all records in DatalogJournalAdded

python portality/migrate/4031_journals_added_sheet_duplicated/cleanup_4031_journals_added_sheet_duplicated.py


* After running the script, manually **BACK UP** the Google Sheet and then remove all records from it
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from portality.models.datalog_journal_added import DatalogJournalAdded


def main():
    """Remove all DatalogJournalAdded records with a match_all delete-by-query."""
    match_everything = {"match_all": {}}
    DatalogJournalAdded.delete_by_query({"query": match_everything})


# run the cleanup only when this file is executed as a script
if __name__ == '__main__':
    main()
65 changes: 64 additions & 1 deletion portality/models/datalog_journal_added.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from portality.dao import DomainObject
from __future__ import annotations

import elasticsearch

from portality import dao
from portality.dao import DomainObject, ScrollInitialiseException
from portality.lib import coerce
from portality.lib.coerce import COERCE_MAP
from portality.lib.seamless import SeamlessMixin
Expand Down Expand Up @@ -77,3 +82,61 @@ def journal_id(self):
@journal_id.setter
def journal_id(self, val):
self.__seamless__.set_single('journal_id', val)


class LastDatalogJournalAddedQuery:
    """Search body for the single record with the greatest ``date_added``."""

    def query(self):
        """
        :return: match-all search body sorted by ``date_added`` descending
            and limited to one hit
        """
        newest_first = {"date_added": {"order": "desc"}}
        match_everything = {"match_all": {}}
        return {
            "size": 1,
            "sort": [newest_first],
            "query": match_everything,
        }


def find_last_datalog():
    """
    Fetch the first record yielded for LastDatalogJournalAddedQuery.

    :return: that record, or None when nothing is yielded or when the lookup
        raises NotFoundError / ScrollInitialiseException
    """
    last_query = LastDatalogJournalAddedQuery().query()
    try:
        return next(DatalogJournalAdded.iterate(last_query), None)
    except (elasticsearch.exceptions.NotFoundError, ScrollInitialiseException):
        return None


class DateAddedDescQuery:
    """Search body that specifies only an ordering: ``date_added`` descending."""

    def query(self):
        """:return: search body containing just the descending ``date_added`` sort"""
        descending = {'order': 'desc'}
        return {'sort': [{'date_added': descending}]}


class IssnDateMatchQuery:
    """Search body matching records on both an ISSN and a ``date_added`` value."""

    def __init__(self, issn, date_added):
        """
        :param issn: value used in the term clause on ``issn.keyword``
        :param date_added: value used in the term clause on ``date_added``
        """
        self.issn = issn
        self.date_added = date_added

    def query(self):
        """:return: bool/filter search body with one term clause per stored value"""
        issn_clause = {"term": {"issn.keyword": self.issn}}
        date_clause = {"term": {"date_added": self.date_added}}
        return {"query": {"bool": {"filter": [issn_clause, date_clause]}}}


def is_issn_exists(issn, date_added):
    """
    Check whether the DatalogJournalAdded index holds a record for the given
    ISSN and date_added value.

    :param issn: ISSN to look for
    :param date_added: date_added value to look for
    :return: result of dao.is_exist for the IssnDateMatchQuery
    """
    match_query = IssnDateMatchQuery(issn, date_added).query()
    target_index = DatalogJournalAdded.index_name()
    return dao.is_exist(match_query, target_index)
2 changes: 1 addition & 1 deletion portality/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@
"old_data_cleanup": {"month": "*", "day": "12", "day_of_week": "*", "hour": "6", "minute": "30"},
"monitor_bgjobs": {"month": "*", "day": "*/6", "day_of_week": "*", "hour": "10", "minute": "0"},
"find_discontinued_soon": {"month": "*", "day": "*", "day_of_week": "*", "hour": "0", "minute": "3"},
"datalog_journal_added_update": {"month": "*", "day": "*", "day_of_week": "*", "hour": "*", "minute": "*/30"}
"datalog_journal_added_update": {"month": "*", "day": "*", "day_of_week": "*", "hour": "4", "minute": "30"}
}


Expand Down
4 changes: 2 additions & 2 deletions portality/tasks/anon_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import shutil
import uuid
from typing import Callable, NoReturn
from typing import Callable

from portality import models, dao
from portality.background import BackgroundTask
Expand Down Expand Up @@ -111,7 +111,7 @@ def _copy_on_complete(path, logger_fn, tmpStore, mainStore, container):


def run_anon_export(tmpStore, mainStore, container, clean=False, limit=None, batch_size=100000,
logger_fn: Callable[[str], NoReturn] = None):
logger_fn: Callable[[str], None] = None):
if logger_fn is None:
logger_fn = print
if clean:
Expand Down
Loading
Loading