[IMP] queue_job: remove cron garbage collector and automatically requeue jobs in timeout

[IMP] queue_job: increment 'retry' when re-queuing jobs that have been killed
OCA · Dec 23, 2024 · e0e9327 · e0e9327
1 parent 2413ef6
commit e0e9327
Show file tree

Hide file tree

Showing 11 changed files with 167 additions and 143 deletions.
diff --git a/queue_job/README.rst b/queue_job/README.rst
@@ -137,18 +137,7 @@ Configuration
 .. [1] It works with the threaded Odoo server too, although this way
        of running Odoo is obviously not for production purposes.
 
-* Be sure to check out *Jobs Garbage Collector* CRON and change *enqueued_delta* and *started_delta* parameters to your needs.
-
-  * ``enqueued_delta``: Spent time in minutes after which an enqueued job is considered stuck.
-    Set it to 0 to disable this check.
-  * ``started_delta``: Spent time in minutes after which a started job is considered stuck.
-    This parameter should not be less than ``--limit-time-real // 60`` parameter in your configuration.
-    Set it to 0 to disable this check. Set it to -1 to automate it, based in the server's ``--limit-time-real`` config parameter.
-
-  .. code-block:: python
-
-    # `model` corresponds to 'queue.job' model
-    model.requeue_stuck_jobs(enqueued_delta=1, started_delta=-1)
+* Jobs that remain in ``enqueued`` or ``started`` state (because, for instance, their worker has been killed) will be automatically re-queued.
 
 Usage
 =====

diff --git a/queue_job/__manifest__.py b/queue_job/__manifest__.py
@@ -2,7 +2,7 @@
 
 {
     "name": "Job Queue",
-    "version": "16.0.2.7.1",
+    "version": "16.0.2.8.0",
     "author": "Camptocamp,ACSONE SA/NV,Odoo Community Association (OCA)",
     "website": "https://github.com/OCA/queue",
     "license": "LGPL-3",

diff --git a/queue_job/controllers/main.py b/queue_job/controllers/main.py
@@ -31,6 +31,8 @@ def _try_perform_job(self, env, job):
         job.set_started()
         job.store()
         env.cr.commit()
+        job.lock()
+
         _logger.debug("%s started", job)
 
         job.perform()

diff --git a/queue_job/data/queue_data.xml b/queue_job/data/queue_data.xml
@@ -1,17 +1,6 @@
 <?xml version="1.0" encoding="utf-8" ?>
 <odoo>
     <data noupdate="1">
-        <record id="ir_cron_queue_job_garbage_collector" model="ir.cron">
-            <field name="name">Jobs Garbage Collector</field>
-            <field name="interval_number">5</field>
-            <field name="interval_type">minutes</field>
-            <field name="numbercall">-1</field>
-            <field ref="model_queue_job" name="model_id" />
-            <field name="state">code</field>
-            <field
-                name="code"
-            >model.requeue_stuck_jobs(enqueued_delta=1, started_delta=-1)</field>
-        </record>
         <!-- Queue-job-related subtypes for messaging / Chatter -->
         <record id="mt_job_failed" model="mail.message.subtype">
             <field name="name">Job failed</field>

diff --git a/queue_job/job.py b/queue_job/job.py
@@ -238,6 +238,34 @@ def load_many(cls, env, job_uuids):
         recordset = cls.db_records_from_uuids(env, job_uuids)
         return {cls._load_from_db_record(record) for record in recordset}
 
+    def lock(self):
+        self.env.cr.execute(
+            """
+            SELECT
+                *
+            FROM
+                queue_job_locks
+            WHERE
+                id in (
+                    SELECT
+                        id
+                    FROM
+                        queue_job
+                    WHERE
+                        uuid = %s
+                        AND state='started'
+                )
+            FOR UPDATE;
+        """,
+            [self.uuid],
+        )
+
+        # 1 job should be locked
+        if not 1 == len(self.env.cr.fetchall()):
+            raise RetryableJobError(
+                f"Trying to lock job that wasn't started, uuid: {self.uuid}"
+            )
+
     @classmethod
     def _load_from_db_record(cls, job_db_record):
         stored = job_db_record
@@ -517,6 +545,11 @@ def perform(self):
 
         The job is executed with the user which has initiated it.
         """
+        if self.max_retries and self.retry >= self.max_retries:
+            raise FailedJobError(
+                "Job: %s, Max. retries (%d) reached" % (self.uuid, self.max_retries)
+            )
+
         self.retry += 1
         try:
             self.result = self.func(*tuple(self.args), **self.kwargs)
@@ -820,6 +853,23 @@ def set_started(self):
         self.date_started = datetime.now()
         self.worker_pid = os.getpid()
 
+        # add job to list of lockable jobs
+        self.env.cr.execute(
+            """
+            INSERT INTO
+                queue_job_locks (id)
+            SELECT
+                id
+            FROM
+                queue_job
+            WHERE
+                uuid = %s
+            ON CONFLICT(id)
+            DO NOTHING;
+        """,
+            [self.uuid],
+        )
+
     def set_done(self, result=None):
         self.state = DONE
         self.exc_name = None

diff --git a/queue_job/jobrunner/runner.py b/queue_job/jobrunner/runner.py
@@ -114,22 +114,6 @@
 * After creating a new database or installing queue_job on an
   existing database, Odoo must be restarted for the runner to detect it.
 
-* When Odoo shuts down normally, it waits for running jobs to finish.
-  However, when the Odoo server crashes or is otherwise force-stopped,
-  running jobs are interrupted while the runner has no chance to know
-  they have been aborted. In such situations, jobs may remain in
-  ``started`` or ``enqueued`` state after the Odoo server is halted.
-  Since the runner has no way to know if they are actually running or
-  not, and does not know for sure if it is safe to restart the jobs,
-  it does not attempt to restart them automatically. Such stale jobs
-  therefore fill the running queue and prevent other jobs to start.
-  You must therefore requeue them manually, either from the Jobs view,
-  or by running the following SQL statement *before starting Odoo*:
-
-.. code-block:: sql
-
-  update queue_job set state='pending' where state in ('started', 'enqueued')
-
 .. rubric:: Footnotes
 
 .. [1] From a security standpoint, it is safe to have an anonymous HTTP
@@ -155,7 +139,7 @@
 from odoo.tools import config
 
 from . import queue_job_config
-from .channels import ENQUEUED, NOT_DONE, PENDING, ChannelManager
+from .channels import ENQUEUED, NOT_DONE, ChannelManager
 
 SELECT_TIMEOUT = 60
 ERROR_RECOVERY_DELAY = 5
@@ -207,35 +191,14 @@ def _connection_info_for(db_name):
 
 
 def _async_http_get(scheme, host, port, user, password, db_name, job_uuid):
-    # Method to set failed job (due to timeout, etc) as pending,
-    # to avoid keeping it as enqueued.
-    def set_job_pending():
-        connection_info = _connection_info_for(db_name)
-        conn = psycopg2.connect(**connection_info)
-        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
-        with closing(conn.cursor()) as cr:
-            cr.execute(
-                "UPDATE queue_job SET state=%s, "
-                "date_enqueued=NULL, date_started=NULL "
-                "WHERE uuid=%s and state=%s "
-                "RETURNING uuid",
-                (PENDING, job_uuid, ENQUEUED),
-            )
-            if cr.fetchone():
-                _logger.warning(
-                    "state of job %s was reset from %s to %s",
-                    job_uuid,
-                    ENQUEUED,
-                    PENDING,
-                )
-
     # TODO: better way to HTTP GET asynchronously (grequest, ...)?
     #       if this was python3 I would be doing this with
     #       asyncio, aiohttp and aiopg
     def urlopen():
         url = "{}://{}:{}/queue_job/runjob?db={}&job_uuid={}".format(
             scheme, host, port, db_name, job_uuid
         )
+        # pylint: disable=except-pass
         try:
             auth = None
             if user:
@@ -249,10 +212,9 @@ def urlopen():
             # for codes between 500 and 600
             response.raise_for_status()
         except requests.Timeout:
-            set_job_pending()
+            pass
         except Exception:
             _logger.exception("exception in GET %s", url)
-            set_job_pending()
 
     thread = threading.Thread(target=urlopen)
     thread.daemon = True
@@ -343,6 +305,60 @@ def set_job_enqueued(self, uuid):
                 (ENQUEUED, uuid),
             )
 
+    def requeue_dead_jobs(self):
+        """
+        Set started and enqueued jobs that are not locked back to pending
+
+        A job is locked while it is being executed.
+        When a job is killed, its lock is released.
+
+        A buffer on 'date_enqueued' checks that the job
+        has been enqueued for more than 10 seconds.
+        This prevents requeuing jobs before they have actually started.
+
+        When Odoo shuts down normally, it waits for running jobs to finish.
+        However, when the Odoo server crashes or is otherwise force-stopped,
+        running jobs are interrupted while the runner has no chance to know
+        they have been aborted.
+        """
+
+        with closing(self.conn.cursor()) as cr:
+            query = """
+            UPDATE
+                queue_job
+            SET
+                state='pending',
+                retry=(CASE WHEN state='started' THEN retry+1 ELSE retry END)
+            WHERE
+                id in (
+                    SELECT
+                        id
+                    FROM
+                        queue_job_locks
+                    WHERE
+                        id in (
+                            SELECT
+                                id
+                            FROM
+                                queue_job
+                            WHERE
+                                state IN ('enqueued','started')
+                                AND date_enqueued <
+                                (now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
+                        )
+                    FOR UPDATE SKIP LOCKED
+                )
+            RETURNING uuid
+            """
+
+            cr.execute(query)
+
+            for (uuid,) in cr.fetchall():
+                _logger.warning(
+                    "Re-queued job with uuid: %s",
+                    uuid,
+                )
+
 
 class QueueJobRunner(object):
     def __init__(
@@ -424,6 +440,11 @@ def initialize_databases(self):
                         self.channel_manager.notify(db_name, *job_data)
                 _logger.info("queue job runner ready for db %s", db_name)
 
+    def requeue_dead_jobs(self):
+        for db in self.db_by_name.values():
+            if db.has_queue_job:
+                db.requeue_dead_jobs()
+
     def run_jobs(self):
         now = _odoo_now()
         for job in self.channel_manager.get_jobs_to_run(now):
@@ -516,6 +537,7 @@ def run(self):
                 _logger.info("database connections ready")
                 # inner loop does the normal processing
                 while not self._stop:
+                    self.requeue_dead_jobs()
                     self.process_notifications()
                     self.run_jobs()
                     self.wait_notification()

diff --git a/queue_job/migrations/16.0.2.7.0/pre-migration.py b/queue_job/migrations/16.0.2.7.0/pre-migration.py
@@ -0,0 +1,35 @@
+# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl.html)
+
+
+def migrate(cr, version):
+    # Create job lock table
+    cr.execute(
+        """
+            CREATE TABLE IF NOT EXISTS queue_job_locks (
+                id INT PRIMARY KEY,
+                CONSTRAINT
+                    queue_job_locks_queue_job_id_fkey
+                FOREIGN KEY (id)
+                REFERENCES queue_job (id) ON DELETE CASCADE
+            );
+        """
+    )
+
+    # Deactivate cron garbage collector
+    cr.execute(
+        """
+            UPDATE
+                ir_cron
+            SET
+                active=False
+            WHERE id IN (
+                SELECT res_id
+                FROM
+                    ir_model_data
+                WHERE
+                    module='queue_job'
+                    AND model='ir.cron'
+                    AND name='ir_cron_queue_job_garbage_collector'
+            );
+        """
+    )
diff --git a/queue_job/models/queue_job.py b/queue_job/models/queue_job.py
@@ -6,7 +6,6 @@
 from datetime import datetime, timedelta
 
 from odoo import _, api, exceptions, fields, models
-from odoo.osv import expression
 from odoo.tools import config, html_escape
 
 from odoo.addons.base_sparse_field.models.fields import Serialized
@@ -417,58 +416,6 @@ def autovacuum(self):
                     break
         return True
 
-    def requeue_stuck_jobs(self, enqueued_delta=1, started_delta=0):
-        """Fix jobs that are in a bad states
-
-        :param in_queue_delta: lookup time in minutes for jobs
-                               that are in enqueued state,
-                               0 means that it is not checked
-
-        :param started_delta: lookup time in minutes for jobs
-                              that are in started state,
-                              0 means that it is not checked,
-                              -1 will use `--limit-time-real` config value
-        """
-        if started_delta == -1:
-            started_delta = (config["limit_time_real"] // 60) + 1
-        return self._get_stuck_jobs_to_requeue(
-            enqueued_delta=enqueued_delta, started_delta=started_delta
-        ).requeue()
-
-    def _get_stuck_jobs_domain(self, queue_dl, started_dl):
-        domain = []
-        now = fields.datetime.now()
-        if queue_dl:
-            queue_dl = now - timedelta(minutes=queue_dl)
-            domain.append(
-                [
-                    "&",
-                    ("date_enqueued", "<=", fields.Datetime.to_string(queue_dl)),
-                    ("state", "=", "enqueued"),
-                ]
-            )
-        if started_dl:
-            started_dl = now - timedelta(minutes=started_dl)
-            domain.append(
-                [
-                    "&",
-                    ("date_started", "<=", fields.Datetime.to_string(started_dl)),
-                    ("state", "=", "started"),
-                ]
-            )
-        if not domain:
-            raise exceptions.ValidationError(
-                _("If both parameters are 0, ALL jobs will be requeued!")
-            )
-        return expression.OR(domain)
-
-    def _get_stuck_jobs_to_requeue(self, enqueued_delta, started_delta):
-        job_model = self.env["queue.job"]
-        stuck_jobs = job_model.search(
-            self._get_stuck_jobs_domain(enqueued_delta, started_delta)
-        )
-        return stuck_jobs
-
     def related_action_open_record(self):
         """Open a form view with the record(s) of the job.