Skip to content

Commit

Permalink
Rm deprecated wget, replace with requests to debug reldir issue
Browse files Browse the repository at this point in the history
  • Loading branch information
agahkarakuzu committed Oct 6, 2023
1 parent 0d05342 commit 787d641
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 24 deletions.
2 changes: 1 addition & 1 deletion repo2data/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.7.0"
__version__ = "2.8.0"
58 changes: 38 additions & 20 deletions repo2data/repo2data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
"""
import os
import json
import wget
import requests
import subprocess
import re
import urllib
import urllib.request
import patoolib
import time


class Repo2Data():
Expand Down Expand Up @@ -194,23 +195,39 @@ def _already_downloaded(self):

return dl

def _wget_download(self):
    """Fetch the ``src`` link into the destination path with the wget library.

    A transfer reported as truncated (``ContentTooShortError``) is retried;
    at most three attempts are made. If all of them are truncated the
    method returns without raising.
    """
    source = self._data_requirement_file["src"]
    print("Info : Starting to download with wget %s ..." % (source))
    # One iteration per attempt; `failed` is the number of truncated
    # transfers seen so far, which is what the warning reports.
    for failed in range(1, 4):
        try:
            # Download with standard weblink
            wget.download(source, out=self._dst_path)
        except urllib.error.ContentTooShortError:
            print("Warning : Truncated data, retry %d ..." % (failed))
        else:
            # wget's progress output does not end its line; finish it here.
            print(" ")
            break
def _url_download(self):
    """Download the file behind the ``src`` URL with the requests library.

    Under the assumption that the download link points to a single
    tar/zip etc. file, the response is streamed to a file inside the
    destination directory. The file is named after the last
    ``/``-separated segment of the URL.

    An attempt fails when the server answers with a status other than
    200 or when requests raises (connection error, timeout, interrupted
    transfer). Failed attempts are retried after a short delay, up to
    three attempts in total.

    Returns
    -------
    str or None
        Path of the downloaded file, or None when every attempt failed.
    """
    url = self._data_requirement_file["src"]
    directory = self._dst_path
    max_retries = 3
    retry_delay = 5  # seconds to wait between two attempts
    # Without a timeout requests can block forever on a stalled server.
    timeout = 30  # seconds
    # Large chunks keep the number of write calls low for big archives.
    chunk_size = 64 * 1024
    for retry in range(max_retries):
        try:
            # The context manager releases the connection even when the
            # transfer is aborted half-way.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code == 200:
                    # Create the directory if it doesn't exist
                    os.makedirs(directory, exist_ok=True)
                    # Get the filename from the URL
                    filename = url.split('/')[-1]
                    # Path to save the file
                    filepath = os.path.join(directory, filename)
                    # Save the content of the response to a file; a retry
                    # reopens with 'wb' and so overwrites partial data.
                    with open(filepath, 'wb') as file:
                        for chunk in response.iter_content(chunk_size=chunk_size):
                            file.write(chunk)
                    print(f'File downloaded to: {filepath}')
                    return filepath
                print(f'Attempt {retry + 1} - Failed to download the file. Status code: {response.status_code}')
        except requests.exceptions.RequestException as err:
            # Network failures and truncated streams are retried like a
            # bad status code instead of aborting the whole download.
            print(f'Attempt {retry + 1} - Failed to download the file. Error: {err}')
        if retry < max_retries - 1:
            print(f'Retrying in {retry_delay} seconds...')
            time.sleep(retry_delay)
    # If hits here means retries failed.
    print('Download failed after multiple attempts.')
    return None

def _gdrive_download(self):
"""Install the data with google drive utility"""
Expand Down Expand Up @@ -291,13 +308,14 @@ def _osf_download(self):

def _scan_dl_type(self):
"""Detect which function to use for download"""
# if it is an http link, then we use wget
# If an http link is provided or the url does not match one of the providers
# (osf, google, datalad, git), then fall back to requests to download the file.
if ((re.match(".*?(https://).*?", self._data_requirement_file["src"])
or re.match(".*?(http://).*?", self._data_requirement_file["src"]))
and not re.match(".*?(\\.git)", self._data_requirement_file["src"])
and not re.match(".*?(drive\\.google\\.com).*?", self._data_requirement_file["src"])
and not re.match(".*?(https://osf\\.io).*?", self._data_requirement_file["src"])):
self._wget_download()
self._url_download()
# if the source link has a .git, we use datalad
elif re.match(".*?(\\.git)", self._data_requirement_file["src"]):
self._datalad_download()
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ pytest==6.2.0
osfclient==0.0.5
gdown==4.2.0
zenodo-get==1.3.4
requests
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

setup(name='repo2data',
version=repo2data.utils.get_version(),
description='Automatic data fetcher from a remote server.',
description='To download data from a variety of providers.',
url='https://github.com/SIMEXP/Repo2Data',
download_url='https://github.com/SIMEXP/Repo2Data/archive/v{}.tar.gz'.format(repo2data.utils.get_version()),
author='Loic TETREL',
author_email='loic.tetrel.pro@gmail.com',
author_email='roboneurolibre@gmail.com',
license='MIT',
packages=['repo2data'],
scripts=['bin/repo2data'],
Expand All @@ -16,7 +16,7 @@
'patool',
#seg-fault with datalad
'datalad',
'wget',
'requests',
'osfclient',
'gdown',
'zenodo-get'
Expand Down

0 comments on commit 787d641

Please sign in to comment.