From d5d57ac3f0a2f70794d1ed56b61291f5b4cd5a91 Mon Sep 17 00:00:00 2001 From: Thomas <71355143+thomas694@users.noreply.github.com> Date: Sun, 1 Sep 2024 22:27:27 +0200 Subject: [PATCH] Fix issue #564 Problems in Likes crawler --- .../Crawler/TumblrLikedByCrawler.cs | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs index c979cd77..48adf2c8 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs @@ -201,6 +201,16 @@ private async Task CrawlPageAsync(int crawlerNumber) return; } + if (isLikesUrl) + { + var posts = ExtractPosts(document); + await DownloadPage(posts); + } + else + { + await AddUrlsToDownloadListAsync(document); + } + pagination = ExtractNextPageLink(document); pageNumber++; var notWithinTimespan = !CheckIfWithinTimespan(pagination); @@ -215,16 +225,6 @@ private async Task CrawlPageAsync(int crawlerNumber) } nextPage.Add(Blog.Url + (isLikesUrl ? "?before=" : "/page/" + pageNumber + "/") + pagination); - if (isLikesUrl) - { - var posts = ExtractPosts(document); - await DownloadPage(posts); - } - else - { - await AddUrlsToDownloadListAsync(document); - } - Interlocked.Increment(ref numberOfPagesCrawled); UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled); if (notWithinTimespan) @@ -537,7 +537,7 @@ private void DownloadMedia(DataModels.TumblrSearchJson.Content content, Post dat if (content.Provider == "tumblr" || url.Contains("tumblr.com") || Blog.RegExVideos) { string thumbnailUrl = content.Poster[0].Url; - AddToDownloadList(new PhotoPost(thumbnailUrl, data.Id, data.UnixTimestamp.ToString(), BuildFileName(thumbnailUrl, data, index))); + AddToDownloadList(new PhotoPost(thumbnailUrl, thumbnailUrl, data.Id, data.UnixTimestamp.ToString(), BuildFileName(thumbnailUrl, data, index))); } } // can only download preview image for non-tumblr (embedded) video posts @@ -556,12 +556,13 @@ private void DownloadMedia(DataModels.TumblrSearchJson.Content content, Post dat { if (Blog.DownloadPhoto) { + var postedUrl = url; if (url.Contains("tumblr.com/")) { url = RetrieveOriginalImageUrl(url, 2000, 3000, false); url = CheckPnjUrl(url); } - AddToDownloadList(new PhotoPost(url, data.Id, data.UnixTimestamp.ToString(), BuildFileName(url, data, index))); + AddToDownloadList(new PhotoPost(url, postedUrl, data.Id, data.UnixTimestamp.ToString(), BuildFileName(url, data, index))); } } } @@ -740,20 +741,14 @@ private static long ExtractNextPageLink(string document) // const string htmlPagination = "(id=\"next_page_link\" href=\"[A-Za-z0-9_/:.-]+/([0-9]+)/([A-Za-z0-9]+))\""; - const string jsonPagination = "&before=([0-9]*)"; - const string jsonPagination2 = "\\?before=([0-9]*)"; + const string jsonPagination = @"(&|\\?|\\u0026)before=([0-9]*)"; _ = long.TryParse(Regex.Match(document, htmlPagination).Groups[3].Value, out var unixTime); if (unixTime == 0) { - var r = Regex.Match(document, jsonPagination); - _ = long.TryParse(r.Groups[1].Value, out unixTime); - - if (unixTime == 0) - { - _ = long.TryParse(Regex.Match(document, jsonPagination2).Groups[1].Value, out unixTime); - } + var r = Regex.Matches(document, jsonPagination); + _ = long.TryParse(r[r.Count-1].Groups[2].Value, out unixTime); } return unixTime;