ai-content-maker/.venv/Lib/site-packages/pooch/downloaders.py

1158 lines
40 KiB
Python
Raw Permalink Normal View History

2024-05-03 04:18:51 +03:00
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
The classes that actually handle the downloads.
"""
import os
import sys
import ftplib
import warnings
from .utils import parse_url
try:
from tqdm import tqdm
except ImportError:
tqdm = None
try:
import paramiko
except ImportError:
paramiko = None
def choose_downloader(url, progressbar=False):
    """
    Choose the appropriate downloader for the given URL based on the protocol.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`,
        created with the given *progressbar*.

    Raises
    ------
    ValueError
        If the protocol of *url* is not one of the supported protocols.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    known_downloaders = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    protocol = parse_url(url)["protocol"]
    if protocol not in known_downloaders:
        # List the protocol names themselves rather than the repr of the
        # dict view (which would read "dict_keys([...])" in the message).
        supported = ", ".join(f"'{name}'" for name in known_downloaders)
        raise ValueError(
            f"Unrecognized URL protocol '{protocol}' in '{url}'. "
            f"Must be one of {supported}."
        )
    return known_downloaders[protocol](progressbar=progressbar)
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Raises
    ------
    ValueError
        If ``progressbar=True`` and ``tqdm`` is not installed.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Only the built-in (tqdm) progress bar needs the optional dependency;
        # a custom progress bar object is used as given.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
            Anything with a ``write`` attribute is treated as an already open
            file and is left open when the download finishes.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
            Not used by this downloader.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            timeout = self.kwargs.get("timeout", 5)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", 5)
        kwargs.setdefault("stream", True)
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            try:
                response.raise_for_status()
                content = response.iter_content(chunk_size=self.chunk_size)
                total = int(response.headers.get("content-length", 0))
                if self.progressbar is True:
                    # Need to use ascii characters on Windows because there
                    # isn't always full unicode support
                    # (see https://github.com/tqdm/tqdm/issues/454)
                    use_ascii = bool(sys.platform == "win32")
                    progress = tqdm(
                        total=total,
                        ncols=79,
                        ascii=use_ascii,
                        unit="B",
                        unit_scale=True,
                        leave=True,
                    )
                elif self.progressbar:
                    progress = self.progressbar
                    progress.total = total
                for chunk in content:
                    if chunk:
                        output_file.write(chunk)
                        output_file.flush()
                        if self.progressbar:
                            # Use the chunk size here because chunk may be
                            # much larger if the data are decompressed by
                            # requests after reading (happens with text
                            # files).
                            progress.update(self.chunk_size)
                # Make sure the progress bar gets filled even if the actual
                # number of chunks is smaller than expected. This happens when
                # streaming text files that are compressed by the server when
                # sending (gzip). Binary files don't experience this.
                if self.progressbar:
                    progress.reset()
                    progress.update(total)
                    progress.close()
            finally:
                # A streamed response keeps its connection checked out until
                # the body is fully consumed or the response is closed, so
                # close it explicitly (this matters when the download fails
                # half way through).
                response.close()
        finally:
            if ispath:
                output_file.close()
        return None
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    Raises
    ------
    ValueError
        If ``progressbar=True`` and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
            Anything with a ``write`` attribute is treated as an already open
            file and is left open when the download finishes.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
            Not used by this downloader.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        ispath = not hasattr(output_file, "write")
        # The outer try guarantees the FTP session is ended even if opening
        # the output file fails; the inner one guarantees the file we opened
        # is closed even if ending the FTP session raises.
        try:
            if ispath:
                # pylint: disable=consider-using-with
                output_file = open(output_file, "w+b")
                # pylint: enable=consider-using-with
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                command = f"RETR {parsed_url['path']}"
                if self.progressbar:
                    # Make sure the file is set to binary mode, otherwise we
                    # can't get the file size.
                    # See: https://stackoverflow.com/a/22093848
                    ftp.voidcmd("TYPE I")
                    use_ascii = bool(sys.platform == "win32")
                    progress = tqdm(
                        total=int(ftp.size(parsed_url["path"])),
                        ncols=79,
                        ascii=use_ascii,
                        unit="B",
                        unit_scale=True,
                        leave=True,
                    )
                    with progress:

                        def callback(data):
                            "Update the progress bar and write to output"
                            progress.update(len(data))
                            output_file.write(data)

                        ftp.retrbinary(command, callback, blocksize=self.chunk_size)
                else:
                    ftp.retrbinary(
                        command, output_file.write, blocksize=self.chunk_size
                    )
            finally:
                if ispath:
                    output_file.close()
        finally:
            ftp.quit()
        return None
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used when connecting; only the user
        name and password are passed to the server.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard
        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
        be installed.

    Raises
    ------
    ValueError
        If ``paramiko`` is not installed, or if a progress bar is requested
        and ``tqdm`` is not installed. Both problems are reported in a single
        error message.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
            Not used by this downloader.

        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method of the channel: it has to be called.
            # Assigning to it would only overwrite the method on the channel
            # object and leave the timeout unset.
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar and write to output"
                        progress.total = int(total)
                        # paramiko reports the cumulative number of bytes, the
                        # progress bar expects the increment
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Close the SFTP session before the transport that carries it, and
            # make sure the transport is closed even if that fails.
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. This allows "URL"s to be specified with the DOI instead of the actual
    HTTP download link. Uses the :mod:`requests` library to manage downloads
    and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        # Everything is stored untouched and handed to HTTPDownloader at
        # download time.
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        self.kwargs = kwargs

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        The repository's API is queried to translate the DOI and file name
        into the actual HTTP download URL, which is then fetched with
        :class:`pooch.HTTPDownloader` (and so with :func:`requests.get`).

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """
        doi_parts = parse_url(url)
        repository = doi_to_repository(doi_parts["netloc"])

        # The file name is the URL path without its leading slash
        name_in_archive = doi_parts["path"]
        if name_in_archive[0] == "/":
            name_in_archive = name_in_archive[1:]
        http_url = repository.download_url(name_in_archive)

        # Delegate the actual transfer to the HTTP downloader
        http_downloader = HTTPDownloader(
            progressbar=self.progressbar,
            chunk_size=self.chunk_size,
            **self.kwargs,
        )
        http_downloader(http_url, output_file, pooch)
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of its archive by following the doi.org link.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository (the final URL of the
        response from doi.org).

    Raises
    ------
    ValueError
        If the response has a 4xx or 5xx status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # doi.org takes us to the repository website
    resolved = requests.get(f"https://doi.org/{doi}", timeout=5)
    archive_url = resolved.url
    if resolved.status_code in range(400, 600):
        raise ValueError(
            f"Archive with doi:{doi} not found (see {archive_url}). Is the DOI correct?"
        )
    return archive_url
def doi_to_repository(doi):
    """
    Build the data repository object that corresponds to a DOI.

    The DOI is resolved to its archive URL and each known repository class is
    asked, in turn, whether it can handle it (chain of responsibility). The
    first one that can is returned.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes can handle the archive.

    """
    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    if doi[-1] == "/":
        doi = doi[:-1]

    # Find out where the DOI points to
    archive_url = doi_to_url(doi)

    # Ask the candidates in order and stop at the first one that accepts
    candidates = (FigshareRepository, ZenodoRepository, DataverseRepository)
    for candidate in candidates:
        data_repository = candidate.initialize(archive_url=archive_url, doi=doi)
        if data_repository is not None:
            return data_repository

    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    """
    Interface shared by the data repository classes.

    Subclasses override :meth:`initialize`, :meth:`download_url` and
    :meth:`populate_registry`.
    """

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create the repository object if the URL belongs to this repository.

        This is one link of a chain of responsibility: a class that cannot
        handle the given repository URL returns `None`, otherwise it returns
        a `DataRepository` instance. This base implementation always returns
        `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """

        raise NotImplementedError  # pragma: no cover
class ZenodoRepository(DataRepository):  # pylint: disable=missing-class-docstring
    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Both are filled in lazily by the properties below
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Create the repository object if the URL points to Zenodo.

        This is one link of a chain of responsibility: `None` is returned
        when the archive is not hosted at ``zenodo.org``, otherwise a
        `DataRepository` instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        host = parse_url(archive_url)["netloc"]
        return cls(doi, archive_url) if host == "zenodo.org" else None

    @property
    def api_response(self):
        """Cached API response from Zenodo"""
        if self._api_response is not None:
            return self._api_response

        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        # The record ID is the last component of the archive URL
        record_id = self.archive_url.split("/")[-1]
        reply = requests.get(f"{self.base_api_url}/{record_id}", timeout=5)
        self._api_response = reply.json()
        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        The version is detected from the field that holds the file names in
        the API response: ``"key"`` (legacy) or ``"filename"`` (new).

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If neither field is present in every file entry.
        """
        if self._api_version is not None:
            return self._api_version

        if all("key" in entry for entry in self.api_response["files"]):
            self._api_version = "legacy"
        elif all("filename" in entry for entry in self.api_response["files"]):
            self._api_version = "new"
        else:
            raise ValueError(
                "Couldn't determine the version of the Zenodo API for "
                f"{self.archive_url} (doi:{self.doi})."
            )
        return self._api_version

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the archive.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        This method supports both the legacy and the new API.
        """
        if self.api_version == "legacy":
            # The legacy API provides the download link of each file
            entries = {entry["key"]: entry for entry in self.api_response["files"]}
            if file_name in entries:
                return entries[file_name]["links"]["self"]
        else:
            # With the new API the link has to be built from the record ID
            names = [entry["filename"] for entry in self.api_response["files"]]
            if file_name in names:
                article_id = self.api_response["id"]
                return (
                    f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"
                )

        raise ValueError(
            f"File '{file_name}' not found in data archive "
            f"{self.archive_url} (doi:{self.doi})."
        )

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.

        This method supports both the legacy and the new API.
        """
        for entry in self.api_response["files"]:
            digest = entry["checksum"]
            if self.api_version == "legacy":
                # Legacy checksums are stored in the registry as they come
                name_field = "key"
            else:
                # The new API gives a bare md5 hash, so add the prefix
                name_field = "filename"
                digest = f"md5:{digest}"
            pooch.registry[entry[name_field]] = digest
class FigshareRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Filled in lazily by the api_response property
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is taken from the last dot-separated part of the DOI
        suffix when it has the form ``v<digits>`` (e.g. ``v1``).

        Returns
        -------
        version : int or None
            The version number, or None if version is not available in the
            doi.
        """
        # Only what comes after the last slash matters, so DOIs with more
        # than one slash (or none at all) don't cause an error.
        suffix = self.doi.rsplit("/", maxsplit=1)[-1]
        # Split the suffix by dots and keep the last part
        last_part = suffix.rsplit(".", maxsplit=1)[-1]
        # A last part that is empty, or that starts with "v" without being
        # followed only by digits, is not a version.
        if not last_part.startswith("v") or not last_part[1:].isdecimal():
            return None
        return int(last_part[1:])

    @property
    def api_response(self):
        """Cached API response from Figshare"""
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            article = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=5,
            ).json()[0]
            article_id = article["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                # Get list of files using article id and the version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=5)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the archive.
        """
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
            )
        download_url = files[file_name]["download_url"]
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
class DataverseRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Set through the api_response property (or by initialize)
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Create the repository object if the URL points to a DataVerse instance.

        This is one link of a chain of responsibility. The host of the
        archive is queried as if it were a DataVerse instance: when the
        request comes back with a 4xx or 5xx status, `None` is returned.
        Otherwise a `DataRepository` instance is returned, which reuses the
        response of that request.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        # Access the DOI as if this was a DataVerse instance
        probe = cls._get_api_response(doi, archive_url)

        # If we failed, this is probably not a DataVerse instance
        if probe.status_code in range(400, 600):
            return None

        # Keep the response so the API isn't queried a second time
        instance = cls(doi, archive_url)
        instance.api_response = probe
        return instance

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Perform the actual API request

        This is a separate ``classmethod`` because it is needed both before
        and after an instance is initialized.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        parts = parse_url(archive_url)
        endpoint = (
            f"{parts['protocol']}://{parts['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}"
        )
        return requests.get(endpoint, timeout=5)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""
        if self._api_response is None:
            self._api_response = self._get_api_response(
                self.doi, self.archive_url
            )  # pragma: no cover
        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Update the cached API response"""
        self._api_response = response

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the latest version of the archive.
        """
        parts = parse_url(self.archive_url)
        listing = self.api_response.json()["data"]["latestVersion"]["files"]

        # Index the file descriptions by file name
        data_files = {}
        for entry in listing:
            data_files[entry["dataFile"]["filename"]] = entry["dataFile"]

        if file_name not in data_files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )

        # The download link is built from the ID of the file
        return (
            f"{parts['protocol']}://{parts['netloc']}/api/access/datafile/"
            f"{data_files[file_name]['id']}"
        )

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        listing = self.api_response.json()["data"]["latestVersion"]["files"]
        for entry in listing:
            pooch.registry[entry["dataFile"]["filename"]] = (
                f"md5:{entry['dataFile']['md5']}"
            )