Source code for cfpq_data.dataset.data

Name: CFPQ_Data
License: https://www.apache.org/licenses/LICENSE-2.0
"""Download graph data from dataset."""
import logging
import os
import pathlib
import shutil

import requests

from typing import Union

from cfpq_data.config import DATA, GRAPHS_DIR, GRAMMARS_DIR, BENCHMARKS_DIR, VERSION

__all__ = [
    "DATASET_URL",
    "GRAMMARS_URL",
    "BENCHMARK_URL",
    "DATASET",
    "GRAMMAR_TEMPLATES",
    "BENCHMARKS",
    "download",
    "download_grammars",
    "download_benchmark",
]

DATASET_URL = f"https://cfpq-data.storage.yandexcloud.net/{VERSION[0]}.0.0/graph/"
GRAMMARS_URL = f"https://cfpq-data.storage.yandexcloud.net/{VERSION[0]}.0.0/grammar/"
BENCHMARK_URL = f"https://cfpq-data.storage.yandexcloud.net/{VERSION[0]}.0.0/benchmark/"

DATASET = [
    "skos",
    "wc",
    "generations",
    "travel",
    "univ",
    "atom",
    "biomedical",
    "bzip",
    "foaf",
    "people",
    "pr",
    "funding",
    "ls",
    "wine",
    "pizza",
    "gzip",
    "core",
    "pathways",
    "enzyme",
    "eclass",
    "go_hierarchy",
    "go",
    "apache",
    "init",
    "mm",
    "geospecies",
    "ipc",
    "lib",
    "block",
    "arch",
    "crypto",
    "security",
    "sound",
    "net",
    "fs",
    "drivers",
    "postgre",
    "kernel",
    "taxonomy",
    "taxonomy_hierarchy",
    "avrora",
    "batik",
    "eclipse",
    "fop",
    "h2",
    "jython",
    "luindex",
    "lusearch",
    "pmd",
    "sunflow",
    "tomcat",
    "tradebeans",
    "tradesoap",
    "xalan",
]


GRAMMAR_TEMPLATES = [
    "c_alias",
    "dyck",
    "java_points_to",
    "nested_parentheses",
]


BENCHMARKS = [
    "MS_Reachability",
]



[docs]
def download(name: str) -> pathlib.Path:
    """Download graph data from dataset.

    Parameters
    ----------
    name : str
        The name of the graph from the dataset.

    Examples
    --------
    >>> from cfpq_data import *
    >>> path = download("generations")

    Returns
    -------
    path : Path
        Path to the file with graph data.
    """
    if name in DATASET:
        logging.info(f"Found graph with {name=}")

        GRAPHS_DIR.mkdir(exist_ok=True, parents=True)

        graph_archive = GRAPHS_DIR / f"{name}.tar.gz"
        graph = GRAPHS_DIR / name / f"{name}.csv"

        with requests.get(
            url=DATASET_URL + f"{name}.tar.gz",
            stream=True,
        ) as r:
            with open(graph_archive, "wb") as f:
                shutil.copyfileobj(r.raw, f)

        logging.info(f"Load archive {graph_archive=}")

        shutil.unpack_archive(graph_archive, GRAPHS_DIR)

        logging.info(f"Unzip graph {name=} to file {graph=}")

        os.remove(graph_archive)

        logging.info(f"Remove archive {graph_archive=}")

        return graph
    else:
        raise FileNotFoundError(f"No graph with {name=} found")




[docs]
def download_grammars(
    template: str, *, graph_name: Union[str, None] = None
) -> Union[pathlib.Path, None]:
    """Download grammars of the given template.

    Parameters
    ----------
    template : str
        The name of the grammar template from the dataset.

    graph_name : Union[str, None]
        The name of the specified graph from the dataset or None for downloading example grammars.

    Examples
    --------
    >>> from cfpq_data import *
    >>> path = download_grammars("java_points_to", graph_name="avrora")

    Returns
    -------
    path : Union[Path, None]
        Path to the directory with grammars data or None if there is no such grammars in dataset.
    """
    if template not in GRAMMAR_TEMPLATES:
        raise FileNotFoundError(f"No grammar {template=} found")

    if graph_name is None:
        logging.info(f"Found grammar {template=}")
        grammars_name = f"{template}"
        url = GRAMMARS_URL + f"example/{grammars_name}.tar.gz"
    elif graph_name in DATASET:
        logging.info(f"Found graph with {graph_name=} and grammar {template=}")
        grammars_name = f"{template}_{graph_name}"
        url = GRAMMARS_URL + f"{grammars_name}.tar.gz"
    else:
        raise FileNotFoundError(f"No graph with {graph_name=} found")

    GRAMMARS_DIR.mkdir(exist_ok=True, parents=True)

    grammar_archive = GRAMMARS_DIR / f"{grammars_name}.tar.gz"
    grammars = GRAMMARS_DIR / grammars_name

    with requests.get(
        url=url,
        stream=True,
    ) as r:
        if r.status_code == 404:
            logging.info(
                f"No grammars with {template=} for graph with {graph_name=} found"
            )
            return None
        else:
            with open(grammar_archive, "wb") as f:
                shutil.copyfileobj(r.raw, f)

    logging.info(f"Load archive {grammar_archive=}")

    shutil.unpack_archive(grammar_archive, GRAMMARS_DIR)

    logging.info(
        f"Unzip grammars with {template=} for graph with {graph_name=} to directory {grammars=}"
    )

    os.remove(grammar_archive)

    logging.info(f"Remove archive {grammar_archive=}")

    return grammars




[docs]
def download_benchmark(name: str) -> pathlib.Path:
    """Download benchmark data.

    Parameters
    ----------
    name : str
        The name of the benchmark.

    Examples
    --------
    >>> from cfpq_data import *
    >>> path = download_benchmark("MS_Reachability")

    Returns
    -------
    path : Path
        Path to the directory with benchmark data.
    """
    if name in BENCHMARKS:
        logging.info(f"Found benchmark with {name=}")

        BENCHMARKS_DIR.mkdir(exist_ok=True, parents=True)

        benchmark_archive = BENCHMARKS_DIR / f"{name}.tar.gz"
        benchmark = BENCHMARKS_DIR / name

        with requests.get(
            url=BENCHMARK_URL + f"{name}.tar.gz",
            stream=True,
        ) as r:
            with open(benchmark_archive, "wb") as f:
                shutil.copyfileobj(r.raw, f)

        logging.info(f"Load archive {benchmark_archive=}")

        shutil.unpack_archive(benchmark_archive, BENCHMARKS_DIR)

        logging.info(f"Unzip benchmark {name=} to directory {benchmark=}")

        os.remove(benchmark_archive)

        logging.info(f"Remove archive {benchmark_archive=}")

        return benchmark
    else:
        raise FileNotFoundError(f"No benchmark with {name=} found")