From 02ba1e67cd677868b07831fe6d2b899f3ab15977 Mon Sep 17 00:00:00 2001 From: Thomas <71355143+thomas694@users.noreply.github.com> Date: Wed, 13 Apr 2022 22:34:43 +0200 Subject: [PATCH] Fix issue #231 Download gifv / pnj with their real file extension - Tumblr serves gifs as either gif or webm, and pngs as either png or jpg, for performance and bandwidth reasons. It uses the "virtual" file extensions gifv and pnj in its URLs; recently it has also started using the plain png extension even though a jpg may be returned. - So instead of saving the downloaded files with these "virtual" extensions, we now save them with their real extensions. --- .../Crawler/AbstractTumblrCrawler.cs | 9 +++-- .../Downloader/AbstractDownloader.cs | 19 +++++++--- .../Downloader/FileDownloader.cs | 35 ++++++++++++++++--- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/src/TumblThree/TumblThree.Applications/Crawler/AbstractTumblrCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/AbstractTumblrCrawler.cs index b080f05b..424255b2 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/AbstractTumblrCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/AbstractTumblrCrawler.cs @@ -586,11 +586,14 @@ private string BuildFileNameCore(string url, string blogName, DateTime date, int %x "_{number}" ({number}: 2..n) %y " ({number})" ({number}: 2..n) */ - string filename = Blog.FilenameTemplate; - url = url.IndexOf('?') > 0 ? 
url.Substring(0, url.IndexOf('?')) : url; - filename += Path.GetExtension(FileName(url)); + string extension = Path.GetExtension(FileName(url)); + if (extension.ToLower() == ".gifv") + extension = ".gif"; + else if (extension.ToLower() == ".pnj") + extension += ".png"; + string filename = Blog.FilenameTemplate + extension; if (ContainsCI(filename, "%f")) filename = ReplaceCI(filename, "%f", Path.GetFileNameWithoutExtension(FileName(url))); if (ContainsCI(filename, "%d")) filename = ReplaceCI(filename, "%d", date.ToString("yyyyMMdd")); if (ContainsCI(filename, "%e")) filename = ReplaceCI(filename, "%e", date.ToString("yyyyMMddHHmmss")); diff --git a/src/TumblThree/TumblThree.Applications/Downloader/AbstractDownloader.cs b/src/TumblThree/TumblThree.Applications/Downloader/AbstractDownloader.cs index bf6d24a4..b26c927e 100644 --- a/src/TumblThree/TumblThree.Applications/Downloader/AbstractDownloader.cs +++ b/src/TumblThree/TumblThree.Applications/Downloader/AbstractDownloader.cs @@ -289,7 +289,8 @@ protected virtual async Task DownloadBinaryPostAsync(TumblrPost downloadIt string fileName = FileName(downloadItem); UpdateProgressQueueInformation(Resources.ProgressSkipFile, fileName); } - else if (!shellService.Settings.LoadAllDatabases && blog.CheckDirectoryForFiles && blog.CheckIfBlogShouldCheckDirectory(FileName(downloadItem), FileNameNew(downloadItem))) + else if (!shellService.Settings.LoadAllDatabases && blog.CheckDirectoryForFiles && (blog.CheckIfBlogShouldCheckDirectory(FileNameUrl(downloadItem), FileNameNew(downloadItem)) + || blog.CheckIfBlogShouldCheckDirectory(FileName(downloadItem), FileNameNew(downloadItem)))) { string fileName = AddFileToDb(downloadItem); UpdateProgressQueueInformation(Resources.ProgressSkipFile, fileName); @@ -346,7 +347,7 @@ protected string AddFileToDb(TumblrPost downloadItem) { if (AppendTemplate == null) { - files.AddFileToDb(FileName(downloadItem), downloadItem.Filename); + files.AddFileToDb(FileNameUrl(downloadItem), 
downloadItem.Filename); return downloadItem.Filename; } return files.AddFileToDb(FileName(downloadItem), downloadItem.Filename, AppendTemplate); @@ -359,7 +360,7 @@ public bool CheckIfFileExistsInDB(string filenameUrl) protected bool CheckIfFileExistsInDB(TumblrPost downloadItem) { - string filename = FileName(downloadItem); + string filename = FileNameUrl(downloadItem); if (shellService.Settings.LoadAllDatabases) { return managerService.CheckIfFileExistsInDB(filename, shellService.Settings.LoadArchive); @@ -410,11 +411,21 @@ protected static string Url(TumblrPost downloadItem) return downloadItem.Url; } - protected virtual string FileName(TumblrPost downloadItem) + protected virtual string FileNameUrl(TumblrPost downloadItem) { return downloadItem.Url.Split('/').Last(); } + protected virtual string FileName(TumblrPost downloadItem) + { + string filename = downloadItem.Url.Split('/').Last(); + if (Path.GetExtension(filename).ToLower() == ".gifv") + filename = Path.GetFileNameWithoutExtension(filename) + ".gif"; + if (Path.GetExtension(filename).ToLower() == ".pnj") + filename += ".png"; + return filename; + } + protected static string FileNameNew(TumblrPost downloadItem) { return downloadItem.Filename; diff --git a/src/TumblThree/TumblThree.Applications/Downloader/FileDownloader.cs b/src/TumblThree/TumblThree.Applications/Downloader/FileDownloader.cs index 997da356..b7424902 100644 --- a/src/TumblThree/TumblThree.Applications/Downloader/FileDownloader.cs +++ b/src/TumblThree/TumblThree.Applications/Downloader/FileDownloader.cs @@ -42,14 +42,23 @@ public async Task DownloadFileWithResumeAsync(string url, string destinati { var fileInfo = new FileInfo(destinationPath); totalBytesReceived = fileInfo.Length; - if (totalBytesReceived >= await CheckDownloadSizeAsync(url).TimeoutAfter(settings.TimeOut)) return true; + var result = await CheckDownloadSizeAsync(url, destinationPath).TimeoutAfter(settings.TimeOut); + if (totalBytesReceived >= result.contentLength) return 
true; + if (destinationPath != result.destinationPath) + { + File.Delete(destinationPath); + destinationPath = result.destinationPath; + fileInfo = new FileInfo(destinationPath); + totalBytesReceived = fileInfo.Length; + } } if (ct.IsCancellationRequested) return false; var fileMode = totalBytesReceived > 0 ? FileMode.Append : FileMode.Create; - using (var fileStream = new FileStream(destinationPath, fileMode, FileAccess.Write, FileShare.Read, bufferSize, true)) + var fileStream = new FileStream(destinationPath, fileMode, FileAccess.Write, FileShare.Read, bufferSize, true); + try { while (true) { @@ -69,6 +78,15 @@ public async Task DownloadFileWithResumeAsync(string url, string destinati bool isChunked = false; using (var response = await request.GetResponseAsync().TimeoutAfter(settings.TimeOut)) { + if (url.Contains("tumblr.com") && (url.Contains(".png") || url.Contains(".pnj")) + && Path.GetExtension(destinationPath).ToLower() == ".png" && (response.Headers["Content-Type"]?.Contains("jpeg") ?? false)) + { + fileStream.Dispose(); + File.Delete(destinationPath); + destinationPath = Path.Combine(Path.GetDirectoryName(destinationPath), Path.GetFileNameWithoutExtension(destinationPath) + ".jpg"); + fileStream = new FileStream(destinationPath, FileMode.Create, FileAccess.Write, FileShare.Read, bufferSize, true); + } + isChunked = response.Headers.ToString().Contains("chunked"); totalBytesToReceive = totalBytesReceived + (response.ContentLength == -1 ? 
0 : response.ContentLength); @@ -124,9 +142,13 @@ public async Task DownloadFileWithResumeAsync(string url, string destinati return true; } + finally + { + fileStream?.Dispose(); + } } - private async Task CheckDownloadSizeAsync(string url) + private async Task<(long contentLength, string destinationPath)> CheckDownloadSizeAsync(string url, string destinationPath) { var requestRegistration = new CancellationTokenRegistration(); try @@ -136,7 +158,12 @@ private async Task CheckDownloadSizeAsync(string url) using (var response = await request.GetResponseAsync()) { - return response.ContentLength; + if (url.Contains("tumblr.com") && (url.Contains(".png") || url.Contains(".pnj")) + && Path.GetExtension(destinationPath).ToLower() == ".png" && (response.Headers["Content-Type"]?.Contains("jpeg") ?? false)) + { + destinationPath = Path.Combine(Path.GetDirectoryName(destinationPath), Path.GetFileNameWithoutExtension(destinationPath) + ".jpg"); + } + return (response.ContentLength, destinationPath); } } finally