Source code for cfpq_data.dataset.data

"""Download graph data from dataset."""
import logging
import os
import pathlib
import shutil

import requests

from typing import Union

from cfpq_data.config import DATA, GRAPHS_DIR, GRAMMARS_DIR, BENCHMARKS_DIR, VERSION

__all__ = [
    "DATASET_URL",
    "GRAMMARS_URL",
    "BENCHMARK_URL",
    "DATASET",
    "GRAMMAR_TEMPLATES",
    "BENCHMARKS",
    "download",
    "download_grammars",
    "download_benchmark",
]

DATASET_URL = f"https://cfpq-data.storage.yandexcloud.net/{VERSION[0]}.0.0/graph/"
GRAMMARS_URL = f"https://cfpq-data.storage.yandexcloud.net/{VERSION[0]}.0.0/grammar/"
BENCHMARK_URL = f"https://cfpq-data.storage.yandexcloud.net/{VERSION[0]}.0.0/benchmark/"

DATASET = [
    "skos",
    "wc",
    "generations",
    "travel",
    "univ",
    "atom",
    "biomedical",
    "bzip",
    "foaf",
    "people",
    "pr",
    "funding",
    "ls",
    "wine",
    "pizza",
    "gzip",
    "core",
    "pathways",
    "enzyme",
    "eclass",
    "go_hierarchy",
    "go",
    "apache",
    "init",
    "mm",
    "geospecies",
    "ipc",
    "lib",
    "block",
    "arch",
    "crypto",
    "security",
    "sound",
    "net",
    "fs",
    "drivers",
    "postgre",
    "kernel",
    "taxonomy",
    "taxonomy_hierarchy",
    "avrora",
    "batik",
    "eclipse",
    "fop",
    "h2",
    "jython",
    "luindex",
    "lusearch",
    "pmd",
    "sunflow",
    "tomcat",
    "tradebeans",
    "tradesoap",
    "xalan",
]


GRAMMAR_TEMPLATES = [
    "c_alias",
    "dyck",
    "java_points_to",
    "nested_parentheses",
]


BENCHMARKS = [
    "MS_Reachability",
]


[docs] def download(name: str) -> pathlib.Path: """Download graph data from dataset. Parameters ---------- name : str The name of the graph from the dataset. Examples -------- >>> from cfpq_data import * >>> path = download("generations") Returns ------- path : Path Path to the file with graph data. """ if name in DATASET: logging.info(f"Found graph with {name=}") GRAPHS_DIR.mkdir(exist_ok=True, parents=True) graph_archive = GRAPHS_DIR / f"{name}.tar.gz" graph = GRAPHS_DIR / name / f"{name}.csv" with requests.get( url=DATASET_URL + f"{name}.tar.gz", stream=True, ) as r: with open(graph_archive, "wb") as f: shutil.copyfileobj(r.raw, f) logging.info(f"Load archive {graph_archive=}") shutil.unpack_archive(graph_archive, GRAPHS_DIR) logging.info(f"Unzip graph {name=} to file {graph=}") os.remove(graph_archive) logging.info(f"Remove archive {graph_archive=}") return graph else: raise FileNotFoundError(f"No graph with {name=} found")
[docs] def download_grammars( template: str, *, graph_name: Union[str, None] = None ) -> Union[pathlib.Path, None]: """Download grammars of the given template. Parameters ---------- template : str The name of the grammar template from the dataset. graph_name : Union[str, None] The name of the specified graph from the dataset or None for downloading example grammars. Examples -------- >>> from cfpq_data import * >>> path = download_grammars("java_points_to", graph_name="avrora") Returns ------- path : Union[Path, None] Path to the directory with grammars data or None if there is no such grammars in dataset. """ if template not in GRAMMAR_TEMPLATES: raise FileNotFoundError(f"No grammar {template=} found") if graph_name is None: logging.info(f"Found grammar {template=}") grammars_name = f"{template}" url = GRAMMARS_URL + f"example/{grammars_name}.tar.gz" elif graph_name in DATASET: logging.info(f"Found graph with {graph_name=} and grammar {template=}") grammars_name = f"{template}_{graph_name}" url = GRAMMARS_URL + f"{grammars_name}.tar.gz" else: raise FileNotFoundError(f"No graph with {graph_name=} found") GRAMMARS_DIR.mkdir(exist_ok=True, parents=True) grammar_archive = GRAMMARS_DIR / f"{grammars_name}.tar.gz" grammars = GRAMMARS_DIR / grammars_name with requests.get( url=url, stream=True, ) as r: if r.status_code == 404: logging.info( f"No grammars with {template=} for graph with {graph_name=} found" ) return None else: with open(grammar_archive, "wb") as f: shutil.copyfileobj(r.raw, f) logging.info(f"Load archive {grammar_archive=}") shutil.unpack_archive(grammar_archive, GRAMMARS_DIR) logging.info( f"Unzip grammars with {template=} for graph with {graph_name=} to directory {grammars=}" ) os.remove(grammar_archive) logging.info(f"Remove archive {grammar_archive=}") return grammars
[docs] def download_benchmark(name: str) -> pathlib.Path: """Download benchmark data. Parameters ---------- name : str The name of the benchmark. Examples -------- >>> from cfpq_data import * >>> path = download_benchmark("MS_Reachability") Returns ------- path : Path Path to the directory with benchmark data. """ if name in BENCHMARKS: logging.info(f"Found benchmark with {name=}") BENCHMARKS_DIR.mkdir(exist_ok=True, parents=True) benchmark_archive = BENCHMARKS_DIR / f"{name}.tar.gz" benchmark = BENCHMARKS_DIR / name with requests.get( url=BENCHMARK_URL + f"{name}.tar.gz", stream=True, ) as r: with open(benchmark_archive, "wb") as f: shutil.copyfileobj(r.raw, f) logging.info(f"Load archive {benchmark_archive=}") shutil.unpack_archive(benchmark_archive, BENCHMARKS_DIR) logging.info(f"Unzip benchmark {name=} to directory {benchmark=}") os.remove(benchmark_archive) logging.info(f"Remove archive {benchmark_archive=}") return benchmark else: raise FileNotFoundError(f"No benchmark with {name=} found")