metabrainz · phw · Jan 7, 2024 · Dec 21, 2023 · Dec 21, 2023 · Dec 21, 2023
diff --git a/picard/acoustid/__init__.py b/picard/acoustid/__init__.py
@@ -34,10 +34,7 @@
 from PyQt6 import QtCore
 
 from picard import log
-from picard.acoustid.json_helpers import (
-    max_source_count,
-    parse_recording,
-)
+from picard.acoustid.recordings import RecordingResolver
 from picard.config import get_config
 from picard.const import (
     DEFAULT_FPCALC_THREADS,
@@ -49,6 +46,7 @@
     find_executable,
     win_prefix_longpath,
 )
+from picard.webservice.api_helpers import AcoustIdAPIHelper
 
 
 def get_score(node):
@@ -76,7 +74,7 @@ def find_fpcalc():
 
 class AcoustIDClient(QtCore.QObject):
 
-    def __init__(self, acoustid_api):
+    def __init__(self, acoustid_api: AcoustIdAPIHelper):
         super().__init__()
         self._queue = deque()
         self._running = 0
@@ -93,7 +91,6 @@ def get_max_processes(self):
         return config.setting['fpcalc_threads'] or DEFAULT_FPCALC_THREADS
 
     def _on_lookup_finished(self, task, document, http, error):
-        doc = {}
         if error:
             mparms = {
                 'error': http.errorString(),
@@ -108,42 +105,16 @@ def _on_lookup_finished(self, task, document, http, error):
                 mparms,
                 echo=None
             )
+            task.next_func({}, http, error)
         else:
             try:
-                recording_list = doc['recordings'] = []
                 status = document['status']
                 if status == 'ok':
-                    results = document.get('results') or []
-                    for result in results:
-                        recordings = result.get('recordings') or []
-                        max_sources = max_source_count(recordings)
-                        result_score = get_score(result)
-                        for recording in recordings:
-                            parsed_recording = parse_recording(recording)
-                            if parsed_recording is not None:
-                                # Calculate a score based on result score and sources for this
-                                # recording relative to other recordings in this result
-                                score = min(recording.get('sources', 1) / max_sources, 1.0) * 100
-                                parsed_recording['score'] = score * result_score
-                                parsed_recording['acoustid'] = result['id']
-                                recording_list.append(parsed_recording)
-
-                    if results:
-                        if not recording_list:
-                            # Set AcoustID in tags if there was no matching recording
-                            task.file.metadata['acoustid_id'] = results[0]['id']
-                            task.file.update()
-                            log.debug(
-                                "AcoustID: Found no matching recordings for '%s',"
-                                " setting acoustid_id tag to %r",
-                                task.file.filename, results[0]['id']
-                            )
-                        else:
-                            log.debug(
-                                "AcoustID: Lookup successful for '%s' (recordings: %d)",
-                                task.file.filename,
-                                len(recording_list)
-                            )
+                    resolver = RecordingResolver(
+                        self._acoustid_api.webservice,
+                        document,
+                        callback=partial(self._on_recording_resolve_finish, task, document, http))
+                    resolver.resolve()
                 else:
                     mparms = {
                         'error': document['error']['message'],
@@ -157,11 +128,32 @@ def _on_lookup_finished(self, task, document, http, error):
                         mparms,
                         echo=None
                     )
+                    task.next_func({}, http, error)
             except (AttributeError, KeyError, TypeError) as e:
                 log.error("AcoustID: Error reading response", exc_info=True)
-                error = e
-
-        task.next_func(doc, http, error)
+                task.next_func({}, http, e)
+
+    def _on_recording_resolve_finish(self, task, document, http, result=None, error=None):
+        """Log the outcome of the recording resolver and call `task.next_func`."""
+        if result:
+            log.debug(
+                "AcoustID: Lookup successful for '%s' (recordings: %d)",
+                task.file.filename, len(result)
+            )
+        else:
+            lookup_results = document.get('results')
+            if lookup_results:
+                # No recording matched: store at least the AcoustID of the
+                # first lookup result in the tags of the file
+                acoustid = lookup_results[0].get('id')
+                task.file.metadata['acoustid_id'] = acoustid
+                task.file.update()
+                log.debug(
+                    "AcoustID: Found no matching recordings for '%s',"
+                    " setting acoustid_id tag to %r",
+                    task.file.filename, acoustid
+                )
+        task.next_func({'recordings': result}, http, error)
 
     def _lookup_fingerprint(self, task, result=None, error=None):
         if task.file.state == File.REMOVED:

diff --git a/picard/acoustid/json_helpers.py b/picard/acoustid/json_helpers.py
@@ -144,13 +144,5 @@ def parse_recording(recording):
     return recording_mb
 
 
-def max_source_count(recordings):
-    """Given a list of recordings return the highest number of sources.
-    This ignores recordings without metadata.
-    """
-    sources = [
-        r.get('sources', 1)
-        for r in recordings
-        if r.get('title')
-    ]
-    return max(sources + [1])
+def recording_has_metadata(recording: dict) -> bool:  # True if there is an 'id' key and a 'title' that is not None
+    return 'id' in recording and recording.get('title') is not None
diff --git a/picard/acoustid/recordings.py b/picard/acoustid/recordings.py
@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+#
+# Picard, the next-generation MusicBrainz tagger
+#
+# Copyright (C) 2023 Philipp Wolfer
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+from collections import (
+    defaultdict,
+    deque,
+    namedtuple,
+)
+from functools import partial
+from typing import (
+    Dict,
+    List,
+)
+
+from PyQt6.QtNetwork import QNetworkReply
+
+from picard.acoustid.json_helpers import (
+    parse_recording,
+    recording_has_metadata,
+)
+from picard.webservice import WebService
+from picard.webservice.api_helpers import MBAPIHelper
+
+
+# Only do an extra lookup for recordings without metadata if they have more than
+# this fraction of the sources of the recording with the most sources.
+SOURCE_THRESHOLD_NO_METADATA = 0.25
+
+# Load at most this number of recordings without metadata per AcoustID
+MAX_NO_METADATA_RECORDINGS = 3
+
+
+class Recording:
+    """A parsed recording with the AcoustID result score and source count."""
+    recording: dict
+    result_score: float
+    sources: int
+
+    def __init__(self, recording, result_score=1.0, sources=1):
+        # Plain attributes (not a tuple), as `sources` gets updated in place
+        self.recording, self.result_score, self.sources = recording, result_score, sources
+
+
+IncompleteRecording = namedtuple('IncompleteRecording', 'mbid acoustid result_score sources')  # metadata still to be loaded
+
+
+class RecordingResolver:
+    """Given an AcoustID lookup result returns a list of MB recordings.
+    The recordings are either directly taken from the AcoustID result or, if the
+    results return only the MBID without metadata, loaded via the MB web service.
+    """
+
+    _recording_map: Dict[str, Dict[str, Recording]]
+
+    def __init__(self, ws: WebService, doc: dict, callback: callable) -> None:
+        """Prepare resolving `doc`; `callback(recordings, error)` gets the outcome."""
+        self._mbapi = MBAPIHelper(ws)
+        self._doc = doc
+        self._callback = callback
+        self._recording_map = defaultdict(dict)  # AcoustID -> MBID -> Recording
+        self._recording_cache = dict()  # MBID -> parsed or loaded recording
+        self._missing_metadata = deque()  # IncompleteRecording entries to load
+
+    def resolve(self) -> None:
+        """Collect the recordings of all results, then load the missing ones.
+        Recordings without metadata are only loaded if they have an MBID and pass both limits.
+        """
+        results = self._doc.get('results') or []
+        incomplete_counts = defaultdict(int)
+        for result in results:
+            recordings = result.get('recordings') or []
+            result_score = get_score(result)
+            acoustid = result.get('id')
+            max_sources = max_source_count_raw_recording(recordings)
+            # Most sources first, so the per AcoustID limit keeps the best ones
+            for recording in sorted(recordings, key=lambda r: r.get('sources', 1), reverse=True):
+                mbid = recording.get('id')
+                sources = recording.get('sources', 1)
+                if recording_has_metadata(recording):
+                    mb_recording = parse_recording(recording)
+                    self._recording_cache[mbid] = mb_recording
+                    self._recording_map[acoustid][mbid] = Recording(mb_recording, result_score, sources)
+                elif (mbid and sources / max_sources > SOURCE_THRESHOLD_NO_METADATA
+                        and incomplete_counts[acoustid] < MAX_NO_METADATA_RECORDINGS):
+                    self._missing_metadata.append(IncompleteRecording(mbid, acoustid, result_score, sources))
+                    incomplete_counts[acoustid] += 1
+
+        self._load_recordings()
+
+    def _load_recordings(self):
+        """Process the next queued recording or, if none is left, send the results."""
+        if not self._missing_metadata:
+            self._send_results()
+            return
+
+        mbid = self._missing_metadata[0].mbid
+        if mbid in self._recording_cache:
+            # Already known from an earlier entry, no web service request needed
+            self._recording_request_finished(mbid, self._recording_cache[mbid], None, None)
+        else:
+            self._mbapi.get_track_by_id(
+                mbid,
+                partial(self._recording_request_finished, mbid),
+                inc=('artists', 'release-groups', 'releases', 'media'),
+            )
+
+    def _recording_request_finished(self, original_mbid, mb_recording, http, error):
+        """Store the recording loaded for the first queue entry, then continue.
+
+        `original_mbid` is the MBID that was requested. The loaded recording
+        can carry a different MBID if the requested one was redirected.
+        """
+        recording = self._missing_metadata.popleft()
+        if error:
+            if error == QNetworkReply.NetworkError.ContentNotFoundError:
+                # Recording does not exist, ignore and move on
+                self._load_recordings()
+            else:
+                self._send_results(error)
+            return
+
+        mbid = mb_recording.get('id') if mb_recording else None
+        recording_dict = self._recording_map[recording.acoustid]
+        if mbid:
+            self._recording_cache[mbid] = mb_recording
+            # This was a redirect, cache the old MBID as well
+            if original_mbid != mbid:
+                self._recording_cache[original_mbid] = mb_recording
+            if mbid not in recording_dict:
+                recording_dict[mbid] = Recording(mb_recording, recording.result_score, recording.sources)
+            else:
+                # Already listed for this AcoustID, only add up the sources
+                recording_dict[mbid].sources += recording.sources
+        self._load_recordings()
+
+    def _send_results(self, error=None):
+        """Pass the list of all resolved recordings and `error` to the callback."""
+        self._callback(list(parse_recording_map(self._recording_map)), error)
+
+
+def get_score(node: dict) -> float:
+    try:
+        return float(node.get('score', 1.0))  # a missing 'score' counts as 1.0
+    except (TypeError, ValueError):
+        return 1.0  # value not convertible to float: same fallback as a missing one
+
+
+def parse_recording_map(recording_map: Dict[str, Dict[str, Recording]]):
+    """Yield each parsed recording with 'score', 'acoustid' and 'sources' set."""
+    for acoustid, recordings in recording_map.items():
+        max_sources = max_source_count(recordings.values())
+        for recording in recordings.values():
+            parsed_recording = recording.recording
+            if parsed_recording is None:
+                continue  # nothing was parsed for this entry, do not yield None
+            # Score: result score weighted by this recording's share of sources
+            score = min(recording.sources / max_sources, 1.0) * 100
+            parsed_recording['score'] = score * recording.result_score
+            parsed_recording['acoustid'] = acoustid
+            parsed_recording['sources'] = recording.sources
+            yield parsed_recording
+
+
+def max_source_count(recordings: List[Recording]):
+    """Given a list of recordings return the highest number of sources.
+
+    All given recordings are taken into account. The result is never lower
+    than 1, not even for an empty list.
+    """
+    return max({r.sources for r in recordings} | {1})
+
+
+def max_source_count_raw_recording(recordings: List[dict]):
+    """Given a list of recordings return the highest number of sources.
+
+    All given recordings are taken into account, one without a 'sources' entry
+    counts as 1. The result is never lower than 1, not even for an empty list.
+    """
+    return max({r.get('sources', 1) for r in recordings} | {1})