Skip to content

Commit

Permalink
Fix issue #404: Blog crawler seems to rescan whole blog
Browse files Browse the repository at this point in the history
- The normal crawler didn't recognize pinned posts.
  • Loading branch information
thomas694 committed Dec 4, 2022
1 parent 5e0024a commit ed3e684
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
using TumblThree.Applications.Services;
using TumblThree.Domain;
using TumblThree.Domain.Models.Blogs;
using Newtonsoft.Json;
using System.Dynamic;

namespace TumblThree.Applications.Crawler
{
Expand All @@ -28,6 +30,8 @@ namespace TumblThree.Applications.Crawler
[PartCreationPolicy(CreationPolicy.NonShared)]
public class TumblrBlogCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
{
// Captures (as group 1) the JSON object literal assigned to window['___INITIAL_STATE___'] in a page's source.
// The ".*" is greedy and no RegexOptions are supplied, so "." does not match newlines: the capture
// extends to the last "};" found on the same line as the assignment.
private static readonly Regex extractJsonFromPage = new Regex("window\\['___INITIAL_STATE___'] = ({.*});");

private readonly IDownloader downloader;
private readonly ITumblrToTextParser<Post> tumblrJsonParser;
private readonly IPostQueue<CrawlerData<Post>> jsonQueue;
Expand Down Expand Up @@ -301,11 +305,21 @@ private async Task<ulong> GetHighestPostIdAsync()

private async Task<ulong> GetHighestPostIdCoreAsync()
{
string document = await GetApiPageWithRetryAsync(0);
var url = "https://www.tumblr.com/" + TumblThree.Domain.Models.Blogs.Blog.ExtractName(Blog.Url);
string document = await GetRequestAsync(url);
string pinnedId = "";
if (document.Contains("___INITIAL_STATE___"))
{
var extracted = extractJsonFromPage.Match(document).Groups[1].Value;
dynamic obj = JsonConvert.DeserializeObject<ExpandoObject>(extracted);
pinnedId = obj?.PeeprRoute?.initialTimeline?.objects?[0]?.id ?? "";
}

document = await GetApiPageWithRetryAsync(0);
var response = ConvertJsonToClass<TumblrApiJson>(document);

Blog.Posts = response.PostsTotal;
Post post = response.Posts?.FirstOrDefault();
Post post = response.Posts?.FirstOrDefault(x => x.Id != pinnedId);
if (DateTime.TryParse(post?.DateGmt, out var latestPost)) Blog.LatestPost = latestPost;
_ = ulong.TryParse(post?.Id, out var highestId);
return highestId;
Expand Down
2 changes: 1 addition & 1 deletion src/TumblThree/TumblThree.Domain/Models/Blogs/Blog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1157,7 +1157,7 @@ protected static string ExtractSubDomain(string url)
return null;
}

/// <summary>
/// Returns the blog name for the given URL by delegating to <see cref="ExtractSubDomain"/>.
/// </summary>
/// <param name="url">The blog URL to extract the name from.</param>
/// <returns>The value returned by <see cref="ExtractSubDomain"/> for <paramref name="url"/>.</returns>
// NOTE(review): the visible tail of ExtractSubDomain ends with "return null", so this may return null — confirm at call sites.
// Diff view: the "protected" signature is the removed line and the "public" signature is the added line
// (this file shows 1 addition & 1 deletion); the change only widens the method's accessibility.
protected static string ExtractName(string url)
public static string ExtractName(string url)
{
return ExtractSubDomain(url);
}
Expand Down

0 comments on commit ed3e684

Please sign in to comment.