# Source code for symforce.examples.bundle_adjustment_in_the_large.download_dataset

# ----------------------------------------------------------------------------
# SymForce - Copyright 2022, Skydio, Inc.
# This source code is under the Apache 2.0 license found in the LICENSE file.
# ----------------------------------------------------------------------------

"""
Script to download Bundle-Adjustment-in-the-Large datasets into the `./data` folder
"""

from __future__ import annotations

import bz2
import re
import urllib.parse
from html.parser import HTMLParser
from pathlib import Path

import argh
import requests

# Directory containing this script; downloaded datasets are written to `data/<dataset>` beneath it
CURRENT_DIR = Path(__file__).resolve().parent

# Root URL against which the dataset index pages and the data file links are requested
BASE_URL = "https://grail.cs.washington.edu/projects/bal"


class BALParser(HTMLParser):
    """
    HTML parser that records the `href` target of every anchor (`<a>`) tag it is fed

    After calling `feed`, the collected targets are available in `links`, in the order the
    tags appeared in the document.
    """

    def __init__(self) -> None:
        super().__init__()
        # href values of the <a> tags seen so far, in the order they were encountered
        self.links: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """
        Collect the `href` value(s) of an `<a>` start tag; every other tag is ignored

        Args:
            tag: Name of the tag being opened
            attrs: (name, value) pairs of the tag's attributes; the value is None for an
                attribute written without a value, and such attributes are skipped
        """
        if tag != "a":
            return

        self.links.extend(
            value for name, value in attrs if name == "href" and value is not None
        )


# Dataset names accepted as the `dataset` command-line argument of `main`
DATASETS = ["ladybug", "trafalgar", "dubrovnik", "venice", "final"]


@argh.arg("dataset", choices=DATASETS)
def main(dataset: str) -> None:
    """
    Download every problem file of one BAL dataset and store it, decompressed, under `./data`

    Fetches the dataset's index page, follows each link that points at a bz2-compressed
    problem file, and writes the decompressed contents to `data/<dataset>/problem-*.txt` next
    to this script.

    Args:
        dataset: Name of the dataset to download, one of `DATASETS`

    Raises:
        requests.HTTPError: If the index page or any problem file request returns an HTTP
            error status
    """
    # Seconds to wait for the server to connect or send data, so that a stalled connection
    # cannot hang the script forever
    timeout_s = 60

    index_response = requests.get(f"{BASE_URL}/{dataset}.html", timeout=timeout_s)
    # Fail loudly on an HTTP error instead of parsing an error page as the dataset index
    index_response.raise_for_status()

    parser = BALParser()
    parser.feed(index_response.content.decode())

    # parents=True so this also works when the `data` directory does not exist yet
    output_dir = CURRENT_DIR / "data" / dataset
    output_dir.mkdir(parents=True, exist_ok=True)

    # The pattern does not change between links, so build it once
    problem_url_pattern = re.compile(rf"data/{dataset}/problem-.+\.txt\.bz2")

    for data_url in parser.links:
        if not problem_url_pattern.match(data_url):
            continue

        print(f"Downloading {data_url}")

        result = requests.get(f"{BASE_URL}/{data_url}", timeout=timeout_s)
        # Otherwise an error body would be handed to bz2 and fail with an unrelated message
        result.raise_for_status()
        result_uncompressed = bz2.decompress(result.content)

        # `.stem` drops only the final `.bz2` suffix, leaving the `problem-*.txt` file name
        problem_name = Path(urllib.parse.urlparse(data_url).path).stem
        (output_dir / problem_name).write_bytes(result_uncompressed)


# When executed as a script, hand `main` to argh to run it as the command-line entry point
if __name__ == "__main__":
    argh.dispatch_command(main)