# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Test the downloader classes and functions separately from the Pooch core.
"""
import os
import sys
from tempfile import TemporaryDirectory

import pytest

# tqdm and paramiko are optional dependencies: tests that need them are skipped
# (or, conversely, only run) depending on whether the import succeeded.
try:
    import tqdm
except ImportError:
    tqdm = None

try:
    import paramiko
except ImportError:
    paramiko = None

from .. import Pooch
from ..downloaders import (
    HTTPDownloader,
    FTPDownloader,
    SFTPDownloader,
    DOIDownloader,
    choose_downloader,
    FigshareRepository,
    ZenodoRepository,
    DataverseRepository,
    doi_to_url,
)
from ..processors import Unzip
from .utils import (
    pooch_test_url,
    check_large_data,
    check_tiny_data,
    data_over_ftp,
    pooch_test_figshare_url,
    pooch_test_zenodo_url,
    pooch_test_zenodo_with_slash_url,
    pooch_test_dataverse_url,
)


BASEURL = pooch_test_url()
FIGSHAREURL = pooch_test_figshare_url()
ZENODOURL = pooch_test_zenodo_url()
ZENODOURL_W_SLASH = pooch_test_zenodo_with_slash_url()
DATAVERSEURL = pooch_test_dataverse_url()


def _assert_progressbar_printed(capsys):
    """
    Assert that the last progress bar written to stderr shows 100% completion.

    Shared by the HTTP/DOI, FTP and SFTP progress bar tests, which all check
    the captured stderr in exactly the same way.

    Parameters
    ----------
    capsys
        The pytest ``capsys`` fixture of the calling test. Its captured output
        is consumed by this call.
    """
    captured = capsys.readouterr()
    # The bar redraws itself using carriage returns, so the last chunk is the
    # final state of the bar.
    printed = captured.err.split("\r")[-1].strip()
    assert len(printed) == 79
    if sys.platform == "win32":
        progress = "100%|####################"
    else:
        progress = "100%|████████████████████"
    # Bar size is not always the same so can't reliably test the whole bar.
    assert printed[:25] == progress


@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.parametrize(
    "url",
    [
        BASEURL + "tiny-data.txt",  # HTTPDownloader
        FIGSHAREURL,  # DOIDownloader
    ],
)
def test_progressbar_kwarg_passed(url):
    """The progressbar keyword argument must pass through choose_downloader"""
    downloader = choose_downloader(url, progressbar=True)
    assert downloader.progressbar is True


# Requesting a progress bar needs tqdm as well as paramiko (see
# test_downloader_progressbar_fails), so skip when either one is missing.
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.skipif(paramiko is None, reason="requires paramiko")
def test_progressbar_kwarg_passed_sftp():
    """The progressbar keyword argument must pass through choose_downloader"""
    url = "sftp://test.rebex.net/pub/example/pocketftp.png"
    downloader = choose_downloader(url, progressbar=True)
    assert downloader.progressbar is True


def test_unsupported_protocol():
    "Should raise ValueError when protocol is not supported"
    with pytest.raises(ValueError):
        choose_downloader("httpup://some-invalid-url.com")
    # Simulate the DOI format
    with pytest.raises(ValueError):
        choose_downloader("doii:XXX/XXX/file")


@pytest.mark.network
def test_invalid_doi_repository():
    "Should fail if data repository is not supported"
    with pytest.raises(ValueError) as exc:
        # Use the DOI of the Pooch paper in JOSS (not a data repository)
        DOIDownloader()(
            url="doi:10.21105/joss.01943/file_name.txt", output_file=None, pooch=None
        )
    assert "Invalid data repository 'joss.theoj.org'" in str(exc.value)


@pytest.mark.network
def test_doi_url_not_found():
    "Should fail if the DOI is not found"
    with pytest.raises(ValueError) as exc:
        doi_to_url(doi="NOTAREALDOI")
    assert "Is the DOI correct?" in str(exc.value)


@pytest.mark.network
@pytest.mark.parametrize(
    "repository,doi",
    [
        (FigshareRepository, "10.6084/m9.figshare.14763051.v1"),
        (ZenodoRepository, "10.5281/zenodo.4924875"),
        (DataverseRepository, "10.11588/data/TKCFEF"),
    ],
    ids=["figshare", "zenodo", "dataverse"],
)
def test_figshare_url_file_not_found(repository, doi):
    "Should fail if the file is not found in the archive"
    with pytest.raises(ValueError) as exc:
        url = doi_to_url(doi)
        repo = repository.initialize(doi, url)
        repo.download_url(file_name="bla.txt")
    assert "File 'bla.txt' not found" in str(exc.value)


@pytest.mark.network
@pytest.mark.parametrize(
    "url",
    [FIGSHAREURL, ZENODOURL, DATAVERSEURL],
    ids=["figshare", "zenodo", "dataverse"],
)
def test_doi_downloader(url):
    "Test the DOI downloader"
    # Use the test data we have on the repository
    with TemporaryDirectory() as local_store:
        downloader = DOIDownloader()
        outfile = os.path.join(local_store, "tiny-data.txt")
        downloader(url + "tiny-data.txt", outfile, None)
        check_tiny_data(outfile)


@pytest.mark.network
def test_zenodo_downloader_with_slash_in_fname():
    """
    Test the Zenodo downloader when the path contains a forward slash

    Related to issue #336
    """
    # Use the test data we have on the repository
    with TemporaryDirectory() as local_store:
        base_url = ZENODOURL_W_SLASH + "santisoler/pooch-test-data-v1.zip"
        downloader = DOIDownloader()
        outfile = os.path.join(local_store, "test-data.zip")
        downloader(base_url, outfile, None)
        # unpack the downloaded zip file so we can check the integrity of
        # tiny-data.txt
        fnames = Unzip()(outfile, action="download", pooch=None)
        # Exactly one extracted file is expected to match; the tuple unpacking
        # fails otherwise.
        (fname,) = [f for f in fnames if "tiny-data.txt" in f]
        check_tiny_data(fname)


@pytest.mark.network
def test_figshare_unspecified_version():
    """
    Test if passing a Figshare url without a version warns about it, but still
    downloads it.
    """
    url = FIGSHAREURL
    # Remove the last bits of the doi, where the version is specified, and
    # put back the trailing slash
    url = url[: url.rindex(".")] + "/"
    # Create expected warning message
    # NOTE(review): the slice assumes the url has a 4-character protocol prefix
    # (presumably "doi:") and ends with "/" -- confirm against
    # pooch_test_figshare_url.
    doi = url[4:-1]
    warning_msg = f"The Figshare DOI '{doi}' doesn't specify which version of "
    with TemporaryDirectory() as local_store:
        downloader = DOIDownloader()
        outfile = os.path.join(local_store, "tiny-data.txt")
        with pytest.warns(UserWarning, match=warning_msg):
            downloader(url + "tiny-data.txt", outfile, None)


@pytest.mark.network
@pytest.mark.parametrize(
    "version, missing, present",
    [
        (
            1,
            "LC08_L2SP_218074_20190114_20200829_02_T1-cropped.tar.gz",
            "cropped-before.tar.gz",
        ),
        (
            2,
            "cropped-before.tar.gz",
            "LC08_L2SP_218074_20190114_20200829_02_T1-cropped.tar.gz",
        ),
    ],
)
def test_figshare_data_repository_versions(version, missing, present):
    """
    Test if setting the version in Figshare DOI works as expected
    """
    # Use a Figshare repo as example (we won't download files from it since
    # they are too big)
    doi = f"10.6084/m9.figshare.21665630.v{version}"
    url = f"https://doi.org/{doi}/"
    figshare = FigshareRepository(doi, url)
    filenames = [item["name"] for item in figshare.api_response]
    assert present in filenames
    assert missing not in filenames


@pytest.mark.network
def test_ftp_downloader(ftpserver):
    "Test ftp downloader"
    with data_over_ftp(ftpserver, "tiny-data.txt") as url:
        with TemporaryDirectory() as local_store:
            downloader = FTPDownloader(port=ftpserver.server_port)
            outfile = os.path.join(local_store, "tiny-data.txt")
            downloader(url, outfile, None)
            check_tiny_data(outfile)


@pytest.mark.network
@pytest.mark.skipif(paramiko is None, reason="requires paramiko to run SFTP")
def test_sftp_downloader():
    "Test sftp downloader"
    with TemporaryDirectory() as local_store:
        downloader = SFTPDownloader(username="demo", password="password")
        url = "sftp://test.rebex.net/pub/example/pocketftp.png"
        outfile = os.path.join(local_store, "pocketftp.png")
        downloader(url, outfile, None)
        assert os.path.exists(outfile)


@pytest.mark.network
@pytest.mark.skipif(paramiko is None, reason="requires paramiko to run SFTP")
def test_sftp_downloader_fail_if_file_object():
    "Downloader should fail when a file object rather than string is passed"
    with TemporaryDirectory() as local_store:
        downloader = SFTPDownloader(username="demo", password="password")
        url = "sftp://test.rebex.net/pub/example/pocketftp.png"
        outfile = os.path.join(local_store, "pocketftp.png")
        with open(outfile, "wb") as outfile_obj:
            with pytest.raises(TypeError):
                downloader(url, outfile_obj, None)


@pytest.mark.skipif(paramiko is not None, reason="paramiko must be missing")
def test_sftp_downloader_fail_if_paramiko_missing():
    "test must fail if paramiko is not installed"
    with pytest.raises(ValueError) as exc:
        SFTPDownloader()
    assert "'paramiko'" in str(exc.value)


@pytest.mark.skipif(tqdm is not None, reason="tqdm must be missing")
@pytest.mark.parametrize("downloader", [HTTPDownloader, FTPDownloader, SFTPDownloader])
def test_downloader_progressbar_fails(downloader):
    "Make sure an error is raised if trying to use progressbar without tqdm"
    with pytest.raises(ValueError) as exc:
        downloader(progressbar=True)
    assert "'tqdm'" in str(exc.value)


@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.parametrize(
    "url,downloader",
    [(BASEURL, HTTPDownloader), (FIGSHAREURL, DOIDownloader)],
    ids=["http", "figshare"],
)
def test_downloader_progressbar(url, downloader, capsys):
    "Setup a downloader function that prints a progress bar for fetch"
    download = downloader(progressbar=True)
    with TemporaryDirectory() as local_store:
        fname = "tiny-data.txt"
        url = url + fname
        outfile = os.path.join(local_store, fname)
        download(url, outfile, None)
        # Read stderr and make sure the progress bar is printed only when told
        _assert_progressbar_printed(capsys)
        # Check that the downloaded file has the right content
        check_tiny_data(outfile)


@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
def test_downloader_progressbar_ftp(capsys, ftpserver):
    "Setup an FTP downloader function that prints a progress bar for fetch"
    with data_over_ftp(ftpserver, "tiny-data.txt") as url:
        download = FTPDownloader(progressbar=True, port=ftpserver.server_port)
        with TemporaryDirectory() as local_store:
            outfile = os.path.join(local_store, "tiny-data.txt")
            download(url, outfile, None)
            # Read stderr and make sure the progress bar is printed only when
            # told
            _assert_progressbar_printed(capsys)
            # Check that the file was actually downloaded
            check_tiny_data(outfile)


@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.skipif(paramiko is None, reason="requires paramiko")
def test_downloader_progressbar_sftp(capsys):
    "Setup an SFTP downloader function that prints a progress bar for fetch"
    downloader = SFTPDownloader(progressbar=True, username="demo", password="password")
    with TemporaryDirectory() as local_store:
        url = "sftp://test.rebex.net/pub/example/pocketftp.png"
        outfile = os.path.join(local_store, "pocketftp.png")
        downloader(url, outfile, None)
        # Read stderr and make sure the progress bar is printed only when told
        _assert_progressbar_printed(capsys)
        # Check that the file was actually downloaded
        assert os.path.exists(outfile)


@pytest.mark.network
def test_downloader_arbitrary_progressbar(capsys):
    "Setup a downloader function with an arbitrary progress bar class."

    class MinimalProgressDisplay:
        """A minimalist replacement for tqdm.tqdm"""

        def __init__(self, total):
            self.count = 0
            self.total = total

        def __repr__(self):
            """represent current completion"""
            return str(self.count) + "/" + str(self.total)

        def render(self):
            """print self.__repr__ to stderr"""
            print(f"\r{self}", file=sys.stderr, end="")

        def update(self, i):
            """modify completion and render"""
            self.count = i
            self.render()

        def reset(self):
            """set counter to 0"""
            self.count = 0

        @staticmethod
        def close():
            """print a new empty line"""
            print("", file=sys.stderr)

    # The total is left unset here; the expected "336/336" below shows that a
    # total has been assigned by the time the final state is rendered.
    pbar = MinimalProgressDisplay(total=None)
    download = HTTPDownloader(progressbar=pbar)
    with TemporaryDirectory() as local_store:
        fname = "large-data.txt"
        url = BASEURL + fname
        outfile = os.path.join(local_store, "large-data.txt")
        download(url, outfile, None)
        # Read stderr and make sure the progress bar is printed only when told
        captured = capsys.readouterr()
        printed = captured.err.split("\r")[-1].strip()

        progress = "336/336"
        assert printed == progress

        # Check that the downloaded file has the right content
        check_large_data(outfile)


class TestZenodoAPISupport:
    """
    Test support for different Zenodo APIs

    The class attributes below are canned API responses served by a local HTTP
    server (the ``httpserver`` fixture), so these tests need no real Zenodo
    access.
    """

    article_id = 123456
    doi = f"10.0001/zenodo.{article_id}"
    doi_url = f"https://doi.org/{doi}"
    file_name = "my-file.zip"
    file_url = (
        "https://zenodo.org/api/files/513d7033-93a2-4eeb-821c-2fb0bbab0012/my-file.zip"
    )
    file_checksum = "2942bfabb3d05332b66eb128e0842cff"
    # Legacy-style response: file entries use "key" and an "md5:"-prefixed
    # checksum.
    legacy_api_response = {
        "created": "2021-20-19T08:00:00.000000+00:00",
        "modified": "2021-20-19T08:00:00.000000+00:00",
        "id": article_id,
        "doi": doi,
        "doi_url": doi_url,
        "files": [
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "key": file_name,
                "checksum": f"md5:{file_checksum}",
                "links": {
                    "self": file_url,
                },
            }
        ],
    }
    # New-style response: file entries use "filename" and a bare checksum.
    new_api_response = {
        "created": "2021-20-19T08:00:00.000000+00:00",
        "modified": "2021-20-19T08:00:00.000000+00:00",
        "id": article_id,
        "doi": doi,
        "doi_url": doi_url,
        "files": [
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "filename": file_name,
                "checksum": file_checksum,
                "links": {
                    "self": file_url,
                },
            }
        ],
    }
    # Invalid response: mixes one new-style and one legacy-style file entry,
    # so the API version cannot be determined.
    invalid_api_response = {
        "created": "2021-20-19T08:00:00.000000+00:00",
        "modified": "2021-20-19T08:00:00.000000+00:00",
        "id": article_id,
        "doi": doi,
        "doi_url": doi_url,
        "files": [
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "filename": file_name,
                "checksum": file_checksum,
                "links": {
                    "self": file_url,
                },
            },
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "key": file_name,
                "checksum": f"md5:{file_checksum}",
                "links": {
                    "self": file_url,
                },
            },
        ],
    }

    @pytest.mark.parametrize(
        "api_version, api_response",
        [
            ("legacy", legacy_api_response),
            ("new", new_api_response),
            ("invalid", invalid_api_response),
        ],
    )
    def test_api_version(self, httpserver, api_version, api_response):
        """
        Test if the API version is correctly detected.
        """
        # Create a local http server
        httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
            api_response
        )
        # Create Zenodo downloader
        downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
        # Override base url for the API of the downloader
        downloader.base_api_url = httpserver.url_for("")
        # Check if the API version is correctly identified
        if api_version != "invalid":
            assert downloader.api_version == api_version
        else:
            msg = "Couldn't determine the version of the Zenodo API"
            with pytest.raises(ValueError, match=msg):
                # Accessing the attribute is what triggers the detection; the
                # value itself is not needed.
                _ = downloader.api_version

    @pytest.mark.parametrize(
        "api_version, api_response",
        [("legacy", legacy_api_response), ("new", new_api_response)],
    )
    def test_download_url(self, httpserver, api_version, api_response):
        """
        Test if the download url is correct for each API version.
        """
        # Create a local http server
        httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
            api_response
        )
        # Create Zenodo downloader
        downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
        # Override base url for the API of the downloader
        downloader.base_api_url = httpserver.url_for("")
        # Check if the download url is correct
        download_url = downloader.download_url(file_name=self.file_name)
        if api_version == "legacy":
            assert download_url == self.file_url
        else:
            expected_url = (
                "https://zenodo.org/records/"
                f"{self.article_id}/files/{self.file_name}?download=1"
            )
            assert download_url == expected_url

    @pytest.mark.parametrize(
        "api_response",
        [legacy_api_response, new_api_response],
    )
    def test_populate_registry(self, httpserver, tmp_path, api_response):
        """
        Test if population of registry is correctly done for each API version.
        """
        # Create a local http server
        httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
            api_response
        )
        # Create sample pooch object
        puppy = Pooch(base_url="", path=tmp_path)
        # Create Zenodo downloader
        downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
        # Override base url for the API of the downloader
        downloader.base_api_url = httpserver.url_for("")
        # Populate registry
        downloader.populate_registry(puppy)
        # Both API flavours must end up with an "md5:"-prefixed hash
        assert puppy.registry == {self.file_name: f"md5:{self.file_checksum}"}