import shutil from pathlib import Path from typing import Tuple from wasabi import msg from .commands import run_command from .filesystem import is_subpath_of, make_tempdir def git_checkout( repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False ): git_version = get_git_version() if dest.exists(): msg.fail("Destination of checkout must not exist", exits=1) if not dest.parent.exists(): msg.fail("Parent of destination of checkout must exist", exits=1) if sparse and git_version >= (2, 22): return git_sparse_checkout(repo, subpath, dest, branch) elif sparse: # Only show warnings if the user explicitly wants sparse checkout but # the Git version doesn't support it err_old = ( f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) " f"that doesn't fully support sparse checkout yet." ) err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled." msg.warn( f"{err_unk if git_version == (0, 0) else err_old} " f"This means that more files than necessary may be downloaded " f"temporarily. To only download the files needed, make sure " f"you're using Git v2.22 or above." ) with make_tempdir() as tmp_dir: cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}" run_command(cmd, capture=True) # We need Path(name) to make sure we also support subdirectories try: source_path = tmp_dir / Path(subpath) if not is_subpath_of(tmp_dir, source_path): err = f"'{subpath}' is a path outside of the cloned repository." msg.fail(err, repo, exits=1) shutil.copytree(str(source_path), str(dest)) except FileNotFoundError: err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')" msg.fail(err, repo, exits=1) def git_sparse_checkout(repo, subpath, dest, branch): # We're using Git, partial clone and sparse checkout to # only clone the files we need # This ends up being RIDICULOUS. omg. # So, every tutorial and SO post talks about 'sparse checkout'...But they # go and *clone* the whole repo. Worthless. And cloning part of a repo # turns out to be completely broken. The only way to specify a "path" is.. # a path *on the server*? The contents of which, specifies the paths. Wat. # Obviously this is hopelessly broken and insecure, because you can query # arbitrary paths on the server! So nobody enables this. # What we have to do is disable *all* files. We could then just checkout # the path, and it'd "work", but be hopelessly slow...Because it goes and # transfers every missing object one-by-one. So the final piece is that we # need to use some weird git internals to fetch the missings in bulk, and # *that* we can do by path. # We're using Git and sparse checkout to only clone the files we need with make_tempdir() as tmp_dir: # This is the "clone, but don't download anything" part. cmd = ( f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} --filter=blob:none" ) run_command(cmd) # Now we need to find the missing filenames for the subpath we want. # Looking for this 'rev-list' command in the git --help? Hah. cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}" ret = run_command(cmd, capture=True) git_repo = _http_to_git(repo) # Now pass those missings into another bit of git internals missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")]) if not missings: err = ( f"Could not find any relevant files for '{subpath}'. " f"Did you specify a correct and complete path within repo '{repo}' " f"and branch {branch}?" ) msg.fail(err, exits=1) cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}" run_command(cmd, capture=True) # And finally, we can checkout our subpath cmd = f"git -C {tmp_dir} checkout {branch} {subpath}" run_command(cmd, capture=True) # Get a subdirectory of the cloned path, if appropriate source_path = tmp_dir / Path(subpath) if not is_subpath_of(tmp_dir, source_path): err = f"'{subpath}' is a path outside of the cloned repository." msg.fail(err, repo, exits=1) shutil.move(str(source_path), str(dest)) def git_repo_branch_exists(repo: str, branch: str) -> bool: """Uses 'git ls-remote' to check if a repository and branch exists repo (str): URL to get repo. branch (str): Branch on repo to check. RETURNS (bool): True if repo:branch exists. """ get_git_version() cmd = f"git ls-remote {repo} {branch}" # We might be tempted to use `--exit-code` with `git ls-remote`, but # `run_command` handles the `returncode` for us, so we'll rely on # the fact that stdout returns '' if the requested branch doesn't exist ret = run_command(cmd, capture=True) exists = ret.stdout != "" return exists def get_git_version( error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", ) -> Tuple[int, int]: """Get the version of git and raise an error if calling 'git --version' fails. error (str): The error message to show. RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns (0, 0) if the version couldn't be determined. """ try: ret = run_command("git --version", capture=True) except Exception: raise RuntimeError(error) stdout = ret.stdout.strip() if not stdout or not stdout.startswith("git version"): return 0, 0 version = stdout[11:].strip().split(".") return int(version[0]), int(version[1]) def _http_to_git(repo: str) -> str: if repo.startswith("http://"): repo = repo.replace(r"http://", r"https://") if repo.startswith(r"https://"): repo = repo.replace("https://", "git@").replace("/", ":", 1) if repo.endswith("/"): repo = repo[:-1] repo = f"{repo}.git" return repo