From 138e706d7f8faa318e9e27a14f07d49567f6ba8c Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Wed, 20 May 2026 16:22:12 +0530 Subject: [PATCH 1/3] chore: remove migrated aboutcode.federated code Signed-off-by: Keshav Priyadarshi --- aboutcode/federated/CHANGELOG.rst | 9 - aboutcode/federated/README.rst | 69 - aboutcode/federated/__init__.py | 1702 ----------------- .../foo/aboutcode-federated-config.yml | 1034 ---------- aboutcode/federated/tests/test_federated.py | 302 --- pyproject-aboutcode.federated.toml | 76 - 6 files changed, 3192 deletions(-) delete mode 100644 aboutcode/federated/CHANGELOG.rst delete mode 100644 aboutcode/federated/README.rst delete mode 100644 aboutcode/federated/__init__.py delete mode 100644 aboutcode/federated/tests/test_data/all-presets/foo/aboutcode-federated-config.yml delete mode 100644 aboutcode/federated/tests/test_federated.py delete mode 100644 pyproject-aboutcode.federated.toml diff --git a/aboutcode/federated/CHANGELOG.rst b/aboutcode/federated/CHANGELOG.rst deleted file mode 100644 index ecc7152da..000000000 --- a/aboutcode/federated/CHANGELOG.rst +++ /dev/null @@ -1,9 +0,0 @@ -Changelog -============= - - -v0.1.0 (October 20, 2025) ---------------------------- - -- Initial release of the ``aboutcode.federated`` library based on - original work in the ``aboutcode.hashid`` library. \ No newline at end of file diff --git a/aboutcode/federated/README.rst b/aboutcode/federated/README.rst deleted file mode 100644 index a88f9b05d..000000000 --- a/aboutcode/federated/README.rst +++ /dev/null @@ -1,69 +0,0 @@ -aboutcode.federated -=================== - -This is a library of utilities to compute ids and file paths for AboutCode -federated data based on Package URL - - -Federated data utilities goal is to handle content-defined and hash-addressable -Package data keyed by PURL stored in many Git repositories. This approach to -federate decentralized data is called FederatedCode. - - -Overview -======== - -The main design elements for these utilities are: - -1. **Data Federation**: A Data Federation is a database, representing a consistent, -non-overlapping set of data kind clusters (like scans, vulnerabilities or SBOMs) -across many package ecosystems, aka. PURL types. -A Federation is similar to a traditional database. - -2. **Data Cluster**: A Data Federation contains Data Clusters, where a Data Cluster -purpose is to store the data of a single kind (like scans) across multiple PURL -types. The cluster name is the data kind name and is used as the prefix for -repository names. A Data Cluster is akin to a table in a traditional database. - -3. **Data Repository**: A DataCluster contains of one or more Git Data Repository, -each storing datafiles of the cluster data kind and a one PURL type, spreading -the datafiles in multiple Data Directories. The name is data-kind +PURL- -type+hashid. A Repository is similar to a shard or tablespace in a traditionale -database. - -4. **Data Directory**: In a Repository, a Data Directory contains the datafiles for -PURLs. The directory name PURL-type+hashid - -5. **Data File**: This is a Data File of the DataCluster's Data Kind that is -stored in subdirectories structured after the PURL components:: - - namespace/name/version/qualifiers/subpath: - -- Either at the level of a PURL name: namespace/name, -- Or at the PURL version level namespace/name/version, -- Or at the PURL qualifiers+PURL subpath level. - -A Data File can be for instance a JSON scan results file, or a list of PURLs in -YAML. - -For example, a list of PURLs as a Data Kind would stored at the name -subdirectory level:: - - gem-0107/gem/random_password_generator/purls.yml - -Or a ScanCode scan as a Data Kind at the version subdirectory level:: - - gem-0107/npm/file/3.24.3/scancode.yml - - - -License -------- - -Copyright (c) AboutCode and others. All rights reserved. - -SPDX-License-Identifier: Apache-2.0 - -See https://github.com/aboutcode-org/vulnerablecode for support or download. - -See https://aboutcode.org for more information about AboutCode OSS projects. diff --git a/aboutcode/federated/__init__.py b/aboutcode/federated/__init__.py deleted file mode 100644 index e036955e9..000000000 --- a/aboutcode/federated/__init__.py +++ /dev/null @@ -1,1702 +0,0 @@ -# -# Copyright (c) AboutCode and others. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/vulnerablecode for support or download. -# See https://aboutcode.org for more information about our open source projects. -# - -from dataclasses import dataclass -from dataclasses import field as datafield -from hashlib import sha256 -from pathlib import Path -from typing import Any -from typing import Iterable -from typing import Optional -from typing import Tuple -from typing import Union -from urllib.parse import quote -from urllib.parse import urlsplit - -import requests -import saneyaml -import uritemplate -from packageurl import PackageURL -from packageurl import normalize_qualifiers -from packageurl import normalize_subpath -from packageurl import normalize_version - -__version__ = "0.1.0" - -""" -Federated data utilities to handle content-defined and hash-addressable Package -Federated data utilities goal is to handle content-defined and hash-addressable -Package data keyed by PURL stored in many Git repositories. This approach to -federate decentralized data is called FederatedCode. - - -Overview -======== - -The main design elements are: - -1. Data Federation: A Data Federation is a database, representing a consistent, -non-overlapping set of data kind clusters (like scans, vulnerabilities or SBOMs) -across many package ecosystems, aka. PURL types. -A Federation is similar to a traditional database. - -2. Data Cluster: A Data Federation contains Data Clusters, where a Data Cluster -purpose is to store the data of a single kind (like scans) across multiple PURL -types. The cluster name is the data kind name and is used as the prefix for -repository names. A Data Cluster is akin to a table in a traditional database. - -3. Data Repository: A DataCluster contains of one or more Git Data Repository, -each storing datafiles of the cluster data kind and a one PURL type, spreading -the datafiles in multiple Data Directories. The name is data-kind +PURL- -type+hashid. A Repository is similar to a shard or tablespace in a traditionale -database. - -4. Data Directory: In a Repository, a Data Directory contains the datafiles for -PURLs. The directory name PURL-type+hashid - -5. Data File: This is a Data File of the DataCluster's Data Kind that is -stored in subdirectories structured after the PURL components:: - - namespace/name/version/qualifiers/subpath: - -- Either at the level of a PURL name: namespace/name, -- Or at the PURL version level namespace/name/version, -- Or at the PURL qualifiers+PURL subpath level. - -A Data File can be for instance a JSON scan results file, or a list of PURLs in -YAML. - -For example, a list of PURLs as a Data Kind would stored at the name -subdirectory level:: - - gem-0107/gem/random_password_generator/purls.yml - -Or a ScanCode scan as a Data Kind at the version subdirectory level:: - - gem-0107/npm/file/3.24.3/scancode.yml - - -Design -====== - -The core approach is to distribute the many datafiles for a package in multiple -directories stored in multiple Git repositories, so that each directory and repo -is not too big, with not too many files, and files are spread roughly evenly -across all the directories and repositories. - -At the same time the design is such that it is possible to directly access a -single datafile across all these directories and Git repositories knowing only -its package PURL and resolve that to a URL to fetch a single datafile directly -by using the Git web interface (like on GitHub, Gitlab or gitweb) - - -Why not using a single Git repo? --------------------------------- - -We need multiple Git repositories to avoid very big repositories that are -impractical to use. We want each repo to be under the common limits of public -repository hosting services, like GitHub and its 5GB limit. Typicaly a maximum -size of 5GB and a target size of about 1GB of compressed content makes the most -sense. We store text and Git combination of XDiff, XDelta a zlib compression -typically can reduce the stored size by about 5, meaning that a 1GB repo may -contain about 5GB actual uncompressed text. - - -Why not using a single dir in a repo? --------------------------------------- - -Multiple directories are needed to store many package datafiles to avoid -directories with too many files in the same directory, which makes every -filesystem performance suffer. Typically a max of about 10,000 files in a -directory is a decent target. - - -Hash-based content distribution -------------------------------- - -To distribute files roughly evenly across repositories and directories and still -using PURL as a key, we use a hashid derived from a hash computed on the PURL -string and use that to generate repositories and directory names. - -It then becomes possible to distribute the data across many Git repositories and -directories evenly and compute a URL and path to access a datafile directly -from a PURL. - - -Object hierarchy ----------------- - -- **federation**: defined by its name and a Git repo with a config file with - clusters configuration for data kind and PURL type parameters, enabling pointing - to multiple repositories - - - **cluster**: identified by the data kind name, prefixing its data repos - - - **repo**: data repo (Git) identified by datakind+PURL-type+hashid - - - **directory**: dir in a repo, identified by PURL-type+PURL-hashid - - - **PURL path**: ns/name/version/extra_path derived from the PURL - - - **datafile**: file storing the data as text JSON/YAML/XML - -Example -------- - -For instance, in the aboutcode data federation, for a cluster about purl -versions, we would have: - -- data federation definition git repo, with its config file. - - aboutcode-data/aboutcode-data - - aboutcode-federation-config.yml - -- data cluster repos name prefix is the data kind - - aboutcode-data/purls - -- data repository git repo, with a purl sub dir tree and datafile. - The first repo name has a hash of 0000 which is the first PURL hashid of the - range of PURL hashid stored in this repo's dirs. - - - aboutcode-data/purls-gem-0000/ - -- data directory, with a purl sub dir tree and datafile. The dir name - composed of type+hashid. - - - aboutcode-data/purls-gem-0000/gem-0107/ - -- PURL subdirectory, and datafile, here list of PURLs for the gem named rails: - - aboutcode-data/purls-gem-0000/gem-0107/rails/purls.yml - -In this example, if the base URL for this cluster is at the aboutcode-data -GitHub organization, so the URL to the purls.yml datafile is inferred this way -based on the cluster config:: - - https://github.com/ - aboutcode-data/purls-gem-0000/ - raw/refs/heads/main/ - gem-0107/rails/purls.yml - - -More Design details -=================== - -The DataCluster and Data kind design aligns with the needs of users: for -example, a user using only vulnerability data for Java and JavaScript may not -care directly for Haskell metadata. Or may care only for another kind of data -like fingerprints. - -* DataCluster: A set of repos for only one data kind for many package types. - -* Data Kind: Identifier for the kind of data stored in the datafile of - DataCluster, like PURL versions, or the original API metadata files, or high - level scans, or scans with file details, reachability slices, fingerprints, or - vulnerability advisories and so on. - -* Repository: A repo is a Git repo that stores a group of Directories of a - DataCluster/data kind, like for all the npms with a PURL hash of 0000 to 1023, - where we store npm metadata files for each PURL. All repo names in a cluster - share the same data-kind prefix. - -* Directory: Named after a PURL type and PURL hashid, it stores the datafiles - for the PURLs that hash to that hashid. - - -Naming conventions -------------------- - -- Federation: like aboutcode-data. Also the name of the config repo. - -- DataCluster name prefix: data kind stored in that cluster, like "purls" or "scancode" - -- For data repos: data kind + PURL type + PURL hashid like - purls-npm-0512 or purls-scancode-scans-0000 - The PURL hashid is the first hashid of a range of hashid stored in that repo. - -- For data dirs in a repo: PURL type + dir_number like npm-0513 or pypi-0000. - The hashid is that of the PURLs whose data files are stored in that directory. - - -PURL Hashid ------------ - -The PURL hashid is central to the design and is simply a number between 0 and -1023 (e.g., 1024 values which is a power of two). - -It could be updated to up 8192 in the future, but 1024 is good enough to spread -files in multiple dirs. - -The Core PURL is a PURL without version, subpath and qualifiers. We hash this -Core PURL as UTF-8-encoded bytes using SHA256. - -The first few bytes of the SHA256 binary digest are converted to an integer -using little endian encoding, then converted modulo a max value of 1024 to yield -an integer converted to a 4-chars, zero-padded string between 0000 and 1023. - -Based on this hashid and the data kind and PURL type, directories are grouped in -one or more Git reposities of a cluster, based on a cluster-defined number of -directories of a type per Git repo. - - -Example of repo and dir names ------------------------------ - -With 4 dirs per repo, we get 256 repos, like these - -purls-npm-0000 - npm-0000 - npm-0001 - npm-0002 - npm-0003 - -purls-npm-0004 - npm-0004 - npm-0005 - npm-0006 - npm-0007 - -purls-npm-0008 - npm-0008 - ... and so on - - -And with 512 dirs per repo, we get 2 repos: - -purls-npm-0000 - npm-0000 - npm-0001 - npm-0002 - ... - npm-0511 - -purls-npm-0512 - npm-0512 - npm-0513 - ... - npm-1023 - - -Git repos sizing assumptions for each ecosystems -------------------------------------------------- - -For small ecosystems with few packages, like luarocks or swift, a single Git -repo or a few repos may be enough to store all the data of a kind. There, a -luarocks cluster of repos will have a single Git repo, with 1024 root -directories. - -At the other end of the spectrum, a package type with many packages like npm may -need 1024 Git repositories to store all the metadata. In this case a npm cluster -of repos will have 1024 Git repos, each with a single root directory. - -We can start with reasonable assumptions wrt. the size of each cluster, as a -number of directory per Git repo and the volume of data we would store in each -using these starting values: - -1. For super large ecosystems (with ~5M packages): - -- one dir per repo, yielding 1,024 repos -- github, npm - -2. For large ecosystems (with ~500K packages) - -- eight dirs per repo, yielding 128 repos -- golang, maven, nuget, perl, php, pypi, ruby, huggingface - -3. For medium ecosystems (with ~50K packages) - -- 32 dirs per repo, yielding 32 Git repositories -- alpm, bitbucket, cocoapods, composer, deb, docker, gem, generic, - mlflow, pub, rpm, cargo - -4. For small ecosystem (with ~2K packages) - -- 1,024 directories in one git repository -- all others - -For instance, say we want a cluster to store all the npm PURLs. As of 2025-10, -npm hosts about 4M unique package names (and roughly 20 versions per name on -average with ~80M updates in total in https://replicate.npmjs.com/). Storing 4M -names takes about 100MB uncompressed. Adding versions would take about 2GB -uncompressed. This means that we can store comfortably all npm PURLs in a single -repository size-wise, but we may want to use more repositories anyway as storing -4M directories and purls.yml files in a single repo will not be a happy event, -so using 32 repos with 32 dirs or 64 repos with 16 dirs may be a better -approach. - -See also original post on the approach: -- https://github.com/aboutcode-org/federatedcode/issues/3#issuecomment-2388371726 - - -Rebalancing and splitting a DataCluster repos ------------------------------------------------- - -We can rebalance a cluster, like when we first store the data in a cluster with -a single Git repository for a given PURL type, and later split this repo to more -repos, without loosing the ability to address datafiles directly just knowing a -PURL and without having to rename all the files and directories. - -In this design, the directory names are stable and do not change as long as we -keep the default 1024 hash values for the PURL hashid. The only thing that -changes are the repo names when more repos are created from a split, when the -size of a Git repo grows too large. - -When a split to occur, we should perform these operations: - -- lock the cluster as "read-only" for the duration of a split operation. This is - to signal to processes and tool that are updating the cluster that they cannot - push new data to there yet. This could be done by updating the cluster config - or the federation config. - -- copy existing Git repos to be split to new repos based on the new number of - directories per repo. - -- filter Git history in existing and new repos to keep only the history related - to the directories stored in a given repo. - -- update the cluster config file in cluster Git repo with the new number of - directories - -- push new Git and existing Git repos - -- unlock the cluster. - -We may need to keep the old and new Clusters around too, and may need to add a -simple DataCluster version suffix in Cluster names, and a way to redirect from an -old frozen, inactive DataCluster to a new rebalanced one. - -It may even be possible to continue writing to a cluster as long as writing is -done in two places until the split is completed. In practice split should be -reasonably rare and reasonably fast, making this a lesser issue. - -It is also possible to change the PURL hashid range for a DataCluster, say going -from 1024 to 2049, 4096 or 8192. This would imply moving all the files around -are the directory structure would change from the new hashids. This is likely -to be an exceptional operation. - -""" - -PACKAGE_REPOS_NAME_PREFIX = "aboutcode-packages" - -KIND_PURLS_FILENAME = "purls.yml" -KIND_VULNERABILITIES_FILENAME = "vulnerabilities.yml" - - -def get_package_purls_yml_file_path(purl: Union[PackageURL, str]): - """ - Return the path to a Package purls.yml YAML for a purl. - """ - return get_package_base_dir(purl) / KIND_PURLS_FILENAME - - -def get_package_vulnerabilities_yml_file_path(purl: Union[PackageURL, str]): - """ - Return the path to a Package vulnerabilities.yml YAML for a purl. - """ - return get_package_base_dir(purl) / KIND_VULNERABILITIES_FILENAME - - -def get_package_base_dir(purl: Union[PackageURL, str]): - """ - Return the base path to a Package directory (ignoring version) for a purl - """ - if isinstance(purl, str): - purl = PackageURL.from_string(purl) - - path_elements = package_path_elements(purl) - phash, core_path, _pversion, _extra_path = path_elements - return Path(f"{PACKAGE_REPOS_NAME_PREFIX}-{purl.type}-{phash}") / core_path - - -@dataclass -class DataFederation: - """ - A data federation is the root object and holds the configuration defining its - data clusters, data kinds, PURL types and data repositories. - """ - - # Hardcoded Aboutcode known "root" federation URL that is the parent all of - # all Git remote repositories - ABCD_FED_ROOT_URL = "https://github.com/aboutcode-data" - # and federation name - ABCD_FED_NAME = "aboutcode-data" - - CONFIG_FILENAME = "aboutcode-federated-config.yml" - - # name for this federation. Used as the prefix for all repos - name: str - # Root dir of all federation local data, like all Git repos checkout. - local_root_dir: Path = None - # root URL for all Git repos for this federation - remote_root_url: str = None - description: Optional[str] = datafield(default="") - documentation_url: Optional[str] = datafield(default="") - # SPDX license expression - data_license: Optional[str] = datafield(default="") - data_maintainers: list["DataMaintainer"] = datafield(default_factory=list) - - # List of DataCluster objects - # Each cluster is for a single, unique data kind in a federation. - data_clusters: list["DataCluster"] = datafield(default_factory=list, repr=False) - - _data_clusters_by_data_kind: dict[str, "DataCluster"] = datafield( - default_factory=dict, repr=False, init=False - ) - - def __post_init__(self): - self.populate_clusters() - - def populate_clusters(self): - self._data_clusters_by_data_kind = { - cluster.data_kind: cluster for cluster in self.data_clusters - } - - def add_cluster(self, cluster): - self._data_clusters_by_data_kind[cluster.data_kind] = cluster - self.data_clusters = list(self._data_clusters_by_data_kind.values()) - - @property - def local_config_dir(self): - # this is also the directory of the config Git repo checkout - return self.local_root_dir / self.name - - @property - def local_config_file(self): - return self.local_config_dir / self.CONFIG_FILENAME - - @classmethod - def remote_config_file_url( - cls, - remote_root_url: str, - federation_name: str, - ): - """Return a URL to directly download the federation config file""" - return build_direct_federation_config_file_url( - remote_root_url=remote_root_url, - federation_name=federation_name, - config_filename=cls.CONFIG_FILENAME, - ) - - @property - def config_repo(self) -> "GitRepo": - """ - Return the GitRepo that contains the configuration for this federation. - """ - return GitRepo( - name=self.name, - local_root_dir=self.local_root_dir, - remote_root_url=self.remote_root_url, - ) - - @classmethod - def from_dict( - cls, - data: dict, - local_root_dir: Path = None, - remote_root_url: str = None, - ) -> "DataFederation": - """ - Return a DataFederation from a configuration mapping. - """ - name = data["name"] - - rru = data.get("remote_root_url") - if remote_root_url and rru != remote_root_url: - raise TypeError(f"Inconsistent remote_root_urls: {rru!r} and {remote_root_url!r}") - - data_clusters = data.get("data_clusters") or [] - - data_kinds = sorted(c["data_kind"] for c in data_clusters) - if data_kinds != sorted(set(data_kinds)): - raise TypeError(f"Duplicated data kinds: {data_kinds}") - - data_clusters = [DataCluster.from_dict(data=cluster) for cluster in data_clusters] - - data_maintainers = data.get("data_maintainers") or [] - data_maintainers = [DataMaintainer(**mnt) for mnt in data_maintainers] - - return cls( - name=name, - local_root_dir=local_root_dir and Path(local_root_dir) or None, - remote_root_url=remote_root_url, - description=data.get("description"), - documentation_url=data.get("documentation_url"), - data_license=data.get("data_license"), - data_maintainers=data_maintainers, - data_clusters=data_clusters, - ) - - @classmethod - def load(cls, name: str, local_root_dir: Path, remote_root_url: str = None) -> "DataFederation": - """ - Return an existing DataFederation loaded from ``local_root_dir`` using - the existing configuration file at its conventional location. - """ - lrd = Path(local_root_dir).resolve() - lcf = lrd / name / cls.CONFIG_FILENAME - return cls.from_yaml_config( - name=name, - text=lcf.read_text(), - remote_root_url=remote_root_url, - local_root_dir=lrd, - ) - - @classmethod - def from_url( - cls, - name: str, - remote_root_url: str, - local_root_dir: Path = None, - ) -> "DataFederation": - """ - Return a DataFederation loaded from a remote configuration file. - """ - rcf_url = build_direct_federation_config_file_url( - remote_root_url=remote_root_url, - federation_name=name, - config_filename=cls.CONFIG_FILENAME, - ) - headers = {"User-Agent": "AboutCode/FederatedCode"} - response = requests.get(url=rcf_url, headers=headers) - if not response.ok: - raise Exception(f"Failed to fetch Federation config: {rcf_url}") - - return cls.from_yaml_config( - name=name, - text=response.text, - remote_root_url=remote_root_url, - local_root_dir=local_root_dir, - ) - - @classmethod - def from_yaml_config( - cls, - name: str, - text: str, - local_root_dir: Path = None, - remote_root_url: str = None, - ) -> "DataFederation": - """ - Return a DataFederation loaded from a YAML configuration text. - """ - data = saneyaml.load(text) - - if data["name"] != name: - raise TypeError( - f"Inconsistent federation name {name!r} " f"with YAML config text: {text!r}" - ) - - lrd = local_root_dir and Path(local_root_dir) or None - return cls.from_dict(data=data, local_root_dir=lrd, remote_root_url=remote_root_url) - - def to_dict(self): - """ - Return a mapping for this federation configuration. - """ - return dict( - name=self.name, - remote_root_url=self.remote_root_url, - description=self.description, - documentation_url=self.documentation_url, - data_license=self.data_license, - data_maintainers=[m.to_dict() for m in self.data_maintainers], - data_clusters=[dc.to_dict() for dc in self.data_clusters], - ) - - def to_yaml(self): - """ - Return a YAML text string for this federation configuration. - """ - return saneyaml.dump(self.to_dict()) - - def dump(self): - """ - Write federation configuration file as YAML. - """ - if not (lrd := self.local_root_dir): - raise ValueError(f"Cannot dump without a local_root_dir : {lrd!r}") - Path(self.local_config_file).write_text(self.to_yaml()) - - @classmethod - def init(cls, name, local_root_dir, remote_root_url=None) -> "DataFederation": - """ - Initialize a new DataFederation in local_root_dir. Fetch the remote - config repo if remote_root_url is provided and the repo exists there. - """ - local_root_dir = Path(local_root_dir).resolve() - local_config_repo_dir = local_root_dir / name - # create dir if needed - # or check if this is a git repo? - # if not init git repo - # create basic config and save that in the config file - if remote_root_url: - # TODO: clone or sync? repo in local_config_repo_dir - # raise NotImplementedError("remote_repo_url is not yet supported.") - pass - - raise NotImplementedError() - - def git_init(self): - """ - Create all Git repos for this federation as needed. Sets the remote - if the remote_root_url is defined. - """ - raise NotImplementedError() - - @classmethod - def bootstrap(cls, local_root_dir) -> "DataFederation": - """ - Return the root, seed DataFederation from AboutCode, bootstrapping in - local_root_dir. - """ - return DataFederation.init( - name=cls.ABCD_FED_NAME, - local_root_dir=local_root_dir, - remote_root_url=cls.ABCD_FED_ROOT_URL, - ) - - def get_cluster(self, data_kind: str) -> "DataCluster": - """ - Return a DataCluster for this data kind or None. - """ - return self._data_clusters_by_data_kind.get(data_kind) - - def get_datafile_download_url(self, data_kind: str, purl: Union[str, PackageURL]) -> Path: - """ - Return the direct download URL to the data file for a data kind given a - PURL, or None. - """ - cluster = self.get_cluster(data_kind=data_kind) - return cluster.get_datafile_download_url(purl=purl) - - def get_local_datafile(self, data_kind: str, purl: Union[str, PackageURL]) -> "LocalDataFile": - """ - Return a LocalDataFile for a data kind given a PURL, or None. - """ - cluster = self.get_cluster(data_kind=data_kind) - return cluster.get_datafile_local_path(purl=purl) - - -@dataclass -class LocalDataFile: - """A local data file stored optionally in a GitRepo""" - - path: Path - git_repo: "GitRepo" = None - - -@dataclass(order=True) -class DataCluster: - """ - AboutCode Federation DataCluster. - """ - - # The name for the data kind stored in this data cluster. There is only one - # per cluster and the name is unique in a federation. - # this is the name of cluster - data_kind: str - - # a URI template to build the path to the datafile for this data kind. - # this is the path relative to the root of a cluster directory. It does not - # include directory and repository. - # - # For instance for a purls.yml file stored for each package: - # {/namespace}/{name}/purls.yml - # - # For a scancode.json file stored for each package version: - # {/namespace}/{name}/{version}/scancode.json - datafile_path_template: str - - # list of unique PurlTypeConfig for types stored in this data cluster. - # "default" is the type that applies to all types not listed here by default - # and it will be added if not provided. - purl_type_configs: list["PurlTypeConfig"] = datafield( - default_factory=list, - repr=False, - ) - - # JSON or XML schema URL for the file format of this data kind if available - data_schema_url: Optional[str] = datafield(default="") - - # description of the data kind format, and description of how this data kind - # is created: which tool, option, etc for instance, a short description of a - # tool and the tool options, like a scancode toolkit command line option, or - # the URL to an API whe we fetch API data - description: Optional[str] = datafield(default="") - - documentation_url: Optional[str] = datafield(default="") - - # SPDX license expression - data_license: Optional[str] = datafield(default="") - - data_maintainers: list["DataMaintainer"] = datafield(default_factory=list) - - # mapping of {purl_type: DataRepository} for the repos stored in this data - # cluster. This is auto populated and not serialized in the config file. - _data_repositories_by_purl_type: dict[str, "DataRepository"] = datafield( - default_factory=dict, - init=False, - repr=False, - ) - - # mapping of {purl_type: PurlTypeConfig} for the repos stored in this data - # cluster. This is auto populated and not serialized in the config file. - _configs_by_purl_type: dict[str, "PurlTypeConfig"] = datafield( - default_factory=dict, - init=False, - repr=False, - ) - - def __post_init__(self): - self.populate_repos() - self.populate_configs() - - def populate_repos(self): - """ - Populate the DataRepository for this DataCluster data kind and PurlTypeConfig. - """ - kind = self.data_kind - drbpt = self._data_repositories_by_purl_type - - for ptc in self.purl_type_configs: - drbpt[ptc.purl_type] = [repo for repo in ptc.get_repos(data_kind=kind)] - - def populate_configs(self): - for ptc in self.purl_type_configs: - self._configs_by_purl_type[ptc.purl_type] = ptc - - @classmethod - def from_dict(cls, data: dict) -> "DataCluster": - ptcs = [PurlTypeConfig(**pt) for pt in data.get("purl_type_configs", [])] - - ptypes = sorted(pt.purl_type for pt in ptcs) - if ptypes != sorted(set(ptypes)): - raise ValueError(f"Duplicate purl types: {ptypes!r}") - - if "default" not in ptypes: - ptcs.append(PurlTypeConfig.default_config()) - - data_maintainers = data.get("data_maintainers") or [] - data_maintainers = [DataMaintainer(**mnt) for mnt in data_maintainers] - - return cls( - data_kind=data["data_kind"], - datafile_path_template=data.get("datafile_path_template"), - purl_type_configs=ptcs, - data_schema_url=data.get("data_schema_url"), - description=data.get("description"), - documentation_url=data.get("documentation_url"), - data_license=data.get("data_license"), - data_maintainers=data_maintainers, - ) - - def to_dict(self): - return dict( - data_kind=self.data_kind, - datafile_path_template=self.datafile_path_template, - purl_type_configs=[pt.to_dict() for pt in self.purl_type_configs], - data_schema_url=self.data_schema_url, - description=self.description, - documentation_url=self.documentation_url, - data_license=self.data_license, - data_maintainers=[m.to_dict() for m in self.data_maintainers], - ) - - def split_cluster(self, number_of_repos, number_of_dirs): - """ - Split the repositories of a cluster in more repositories and directories - """ - raise NotImplementedError() - - def get_datafile_download_url(self, purl: Union[str, PackageURL]) -> str: - """ - Return the direct download URL to the data file of the data kind stored - in this cluster given a PURL. - """ - raise NotImplementedError() - - purl = as_purl(purl) - # FIXME: create as member - purl_type_config_by_type = {ptc.purl_type: ptc for ptc in self.purl_type_configs} - purl_type_config = purl_type_config_by_type(purl.type, self.default_config()) - - ppe = package_path_elements(purl, max_value=purl_type_config.number_of_dirs) - purl_hash, core_path, version, extra_path = ppe - - direct_url = None - # construct a path based on path template - # construct a URL - return direct_url - - def get_local_datafile(self, purl: Union[str, PackageURL]) -> LocalDataFile: - """ - Return a LocalDataFile of the data kind stored in this cluster given a - PURL, or None - """ - raise NotImplementedError() - - def get_config(self, purl_type: str) -> "PurlTypeConfig": - """ - Return a PurlTypeConfig for this purl type. - """ - if purl_type not in self._configs_by_purl_type: - return self._configs_by_purl_type["default"] - return self._configs_by_purl_type[purl_type] - - def get_datafile_relative_path(self, purl: Union[str, PackageURL]) -> str: - """ - Return the datfile path relative to the root of a cluster directory - given a PURL. - """ - purl = as_purl(purl=purl) - - if not purl.version and "{version}" in self.datafile_path_template: - raise ValueError( - f"DataCluster '{self.data_kind}' needs PackageURL with version to generate path." - ) - - template = uritemplate.URITemplate(self.datafile_path_template) - return template.expand( - namespace=purl.namespace, - name=purl.name, - version=purl.version, - ) - - def get_repo_and_dir_hash(self, purl: Union[str, PackageURL]) -> Tuple[str, str]: - """ - Return the repository hash and directory hash given a PURL. - """ - purl = as_purl(purl=purl) - ptc = self.get_config(purl.type) - purl_hashid = compute_purl_hash(purl=purl) - purl_hash = int(purl_hashid) - repo_hash = purl_hash - (purl_hash % ptc.numbers_of_dirs_per_repo) - return f"{repo_hash:04}", purl_hashid - - def get_datafile_repo_and_path(self, purl: Union[str, PackageURL]) -> Tuple[str, str]: - """ - Return the repository name and relative path to the datafile of the data kind stored - in this cluster given a PURL. - """ - purl = as_purl(purl) - repo_hash, dir_hash = self.get_repo_and_dir_hash(purl) - relative_datafile_path = self.get_datafile_relative_path(purl) - - directory_name = f"{purl.type}-{dir_hash}" - repository_name = f"{self.data_kind}-{purl.type}-{repo_hash}" - datafile_path = f"{directory_name}{relative_datafile_path}" - - return repository_name, datafile_path - - -@dataclass -class PurlTypeConfig: - """ - Configuration settings for a PURL type stored in a DataCluster - """ - - # Maximum number of dirs we can support - # at 10Gb per dir, that would support 80TB - MAX_NUMBER_OF_DIRS = 8192 - - # purl type or "default" for a default that applies to all types - purl_type: str - - # number of repos for this PURL type in a cluster - number_of_repos: int = 1 - - # number of dirs for this PURL type in a cluster. Also defines the max PURL - # hash value. - number_of_dirs: int = 1024 - - def to_dict(self) -> dict[str, Any]: - return dict( - purl_type=self.purl_type, - number_of_repos=self.number_of_repos, - number_of_dirs=self.number_of_dirs, - ) - - def __post_init__(self): - self.number_of_repos = int(self.number_of_repos) - self.number_of_dirs = int(self.number_of_dirs) - - if not self.number_of_dirs or self.number_of_dirs > self.MAX_NUMBER_OF_DIRS: - raise TypeError( - f"number_of_dirs {self.number_of_dirs!r} " - f"must be between 1 and {self.MAX_NUMBER_OF_DIRS} included" - ) - - if not is_valid_power_of_two(self.number_of_dirs): - raise TypeError(f"number_of_dirs must be a power of 2, " f"not {self.number_of_dirs!r}") - - if not self.number_of_repos or self.number_of_repos > self.number_of_dirs: - raise TypeError( - f"number_of_repos {self.number_of_repos!r} must be between " - f"1 and {self.number_of_dirs!r}" - ) - - if not is_valid_power_of_two(self.number_of_repos): - raise TypeError( - f"number_of_repos must be a power of 2, " f"not {self.number_of_repos!r}" - ) - - @property - def numbers_of_dirs_per_repo(self) -> int: - """ - Return the number of directories in each repos for this type. - It can be any power of 2 from 1 to number_of_dirs (default to 1024) - """ - return self.number_of_dirs // self.number_of_repos - - @property - def hashids(self) -> list[str]: - """ - Return a list of hashid 4-char strings for this PURL type. - """ - # all possible hashids as 4-char strings padded with zeros - return [f"{v:04}" for v in range(self.number_of_dirs)] - - def get_repos(self, data_kind: str) -> Iterable["DataRepository"]: - """ - Yield DataRepository (populated with DataDirectory) for this PURL type. - """ - purl_type = self.purl_type - dirs_per_repo = self.numbers_of_dirs_per_repo - # all possible hashids as 4-char strings padded with zeros - hashids = self.hashids - - for i in range(0, self.number_of_dirs, dirs_per_repo): - hashids_of_repo = hashids[i : i + dirs_per_repo] - yield DataRepository.from_hashids( - data_kind=data_kind, - purl_type=purl_type, - hashids=hashids_of_repo, - ) - - @classmethod - def default_config(cls) -> "PurlTypeConfig": - """ - Return the default used when nothing is specified for a type - """ - return cls( - purl_type="default", - number_of_repos=1, - number_of_dirs=cls.number_of_dirs, - ) - - @classmethod - def large_size_configs(cls): - """ - Return a list of initial PurlTypeConfig for common types to be used as - template when configuring clusters from scratch for storing data of - large size (scans, etc) - """ - - # This is an initial tiering by type system for storing package metadata - # where the datafile would be large. - # The tiers are as follows: - # 1. Super Large Ecosystem (~5M packages): 1,024 git repositories - # 2. Large Ecosystem (~500K packages): 128 git repositories - # 3. Medium Ecosystem (~50K packages): 16 repositories - # 4. Small Ecosystem (~2K packages): 1 git repository - NUMBER_OF_REPOS_BY_PURL_TYPE = { - # Super Large Ecosystem - "github": 1024, - "npm": 1024, - # Large Ecosystem - "golang": 128, - "maven": 128, - "nuget": 128, - "perl": 128, - "php": 128, - "pypi": 128, - "ruby": 128, - # Medium Ecosystem - "alpm": 16, - "bitbucket": 16, - "cargo": 16, - "cocoapods": 16, - "composer": 16, - "deb": 16, - "docker": 16, - "gem": 16, - "generic": 16, - "huggingface": 16, - "mlflow": 16, - "pub": 16, - "rpm": 16, - # Small Ecosystem all use the default - "default": 1, - } - return [ - cls(purl_type=pt, number_of_repos=nor, number_of_dirs=cls.number_of_dirs) - for pt, nor in NUMBER_OF_REPOS_BY_PURL_TYPE.items() - ] - - @classmethod - def medium_size_configs(cls): - """ - Return a list of initial PurlTypeConfig for common types to be used as - template when configuring clusters from scratch for storing data of - medium size (metadata files, etc.) - """ - NUMBER_OF_REPOS_BY_PURL_TYPE = { - # Super Large Ecosystem - "github": 256, - "npm": 256, - # Large Ecosystem - "golang": 32, - "maven": 32, - "nuget": 32, - "perl": 32, - "php": 32, - "pypi": 32, - "ruby": 32, - # Medium Ecosystem - "alpm": 8, - "bitbucket": 8, - "cargo": 8, - "cocoapods": 8, - "composer": 8, - "deb": 8, - "docker": 8, - "gem": 8, - "generic": 8, - "huggingface": 8, - "mlflow": 8, - "pub": 8, - "rpm": 8, - # Small Ecosystem all use the default - "default": 1, - } - return [ - cls(purl_type=pt, number_of_repos=nor, number_of_dirs=cls.number_of_dirs) - for pt, nor in NUMBER_OF_REPOS_BY_PURL_TYPE.items() - ] - - @classmethod - def small_size_configs(cls): - """ - Return a list of initial PurlTypeConfig for common types to be used as - template when configuring clusters from scratch for storing data of - medium size (purls, etc.) - """ - NUMBER_OF_REPOS_BY_PURL_TYPE = { - # Super Large Ecosystem - "github": 128, - "npm": 128, - # Large Ecosystem - "golang": 16, - "maven": 16, - "nuget": 16, - "perl": 16, - "php": 16, - "pypi": 16, - "ruby": 16, - # Medium Ecosystem - "alpm": 4, - "bitbucket": 4, - "cargo": 4, - "cocoapods": 4, - "composer": 4, - "deb": 4, - "docker": 4, - "gem": 4, - "generic": 4, - "huggingface": 4, - "mlflow": 4, - "pub": 4, - "rpm": 4, - # Small Ecosystem all use the default - "default": 1, - } - return [ - cls(purl_type=pt, number_of_repos=nor, number_of_dirs=cls.number_of_dirs) - for pt, nor in NUMBER_OF_REPOS_BY_PURL_TYPE.items() - ] - - -def cluster_preset(): - """ - Return a mapping of preset DataCluster by data kind for registered kinds. - """ - clusters = [ - DataCluster( - data_kind="purls", - description="List of fully qualified PURL strings for a package, sorted by version.", - datafile_path_template="{/namespace}/{name}/purls.yml", - purl_type_configs=PurlTypeConfig.small_size_configs(), - data_schema_url="", - documentation_url="https://github.com/package-url/purl-spec/", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="api_package_metadata", - description="Raw API response datafiles for a package (ignoring versions). " - "Each datafile path and schema is PURL type-specific " - "and not documented here.", - # FIXME: a POM is in XML, some metadata files may be code - datafile_path_template="", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="api_package_version_responses", - description="Raw API response datafiles for a package versions. " - "Each datafile path and schema is PURL type-specific " - "and not documented here.", - # FIXME: a POM is in XML, some metadata files may be code - datafile_path_template="", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="purldb", - description="PurlDB normalized metadata datafiles for each package " - "versions. Does not include fingerprints and symbols.", - datafile_path_template="{/namespace}/{name}/{version}/purldb.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - # legacy, moving to advisories instead - DataCluster( - data_kind="vulnerabilities", - description="VulnerableCode vulnerabilities for each package. " - "Also includes a separate vulnerabilities directory/", - datafile_path_template="{/namespace}/{name}/vulnerabilities.json", - purl_type_configs=[PurlTypeConfig.default_config()], - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="security_advisories", - description="VulnerableCode security advisories for each package version.", - datafile_path_template="{/namespace}/{name}/{version}/advisories.yml", - purl_type_configs=[PurlTypeConfig.default_config()], - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="scancode_toolkit_scans", - description="scancode toolkit scans for each package version.", - datafile_path_template="{/namespace}/{name}/{version}/scancode-toolkit.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="scancode_fingerprints", - description="scancode_fingerprints for each package version.", - datafile_path_template="{/namespace}/{name}/{version}/scancode-fingerprints.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="cyclonedx14_sboms", - description="CycloneDX v1.4 sboms for each package version", - datafile_path_template="{/namespace}/{name}/{version}/cyclonedx-14.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="cyclonedx15_sboms", - description="CycloneDX v1.5 sboms for each package version", - datafile_path_template="{/namespace}/{name}/{version}/cyclonedx-15.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="cyclonedx16_sboms", - description="CycloneDX v1.6 sboms for each package version", - datafile_path_template="{/namespace}/{name}/{version}/cyclonedx-16.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="spdx2_sboms", - description="SPDX version 2.x sboms for each package version", - datafile_path_template="{/namespace}/{name}/{version}/spdx-2.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="atom_slices", - description="Atom slices for each package version", - datafile_path_template="{/namespace}/{name}/{version}/atom.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="atom_vulnerable_slices", - description="Atom vulnerable_slices for each vulnerable package version", - # FIXME: need to qualify these with an advisory / CVE? - datafile_path_template="{/namespace}/{name}/{version}/atom-vulnerable.json", - purl_type_configs=PurlTypeConfig.large_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - DataCluster( - data_kind="openssf_security_scorecards", - description="OpenSSf security_scorecards for package", - # FIXME: need to qualify these with an advisory / CVE? - datafile_path_template="{/namespace}/{name}/security_scorecard.json", - purl_type_configs=PurlTypeConfig.medium_size_configs(), - data_schema_url="", - documentation_url="", - data_license="CC-BY-4.0", - ), - ] - return {dc.data_kind: dc for dc in clusters} - - -@dataclass -class DataRepository: - """ - A Data Repository (Git repo or local plain dir) in a DataCluster - """ - - data_kind: str - purl_type: str - start_hashid: str - - data_directories: list["DataDirectory"] = datafield( - default_factory=list, - repr=False, - ) - - @property - def name(self): - return f"{self.data_kind}-{self.purl_type}-{self.start_hashid}" - - @classmethod - def from_hashids( - cls, - data_kind: str, - purl_type: str, - hashids: list[str], - ) -> "DataRepository": - """ - Return a new DataRepository to store ``data_kind`` of ``purl_type`` for - a list of ``hashids``. - """ - - data_directories = [DataDirectory(purl_type=purl_type, hashid=hashid) for hashid in hashids] - - # always the 1st hashid of the range of hashid stored in that repo - start_hashid = hashids[0] - - return cls( - data_kind=data_kind, - purl_type=purl_type, - start_hashid=start_hashid, - data_directories=data_directories, - ) - - @property - def git_repo(self) -> "GitRepo": - """ - Return the GitRepo that contains the data for this DataRepository. - """ - return GitRepo( - name=self.name, - local_root_dir=self.local_root_dir, - remote_root_url=self.remote_root_url, - ) - - -@dataclass -class DataDirectory: - """ - A Data Directory in a Data Repository - """ - - purl_type: str - hashid: str - - local_root_dir: Path = None - - def __post_init__(self): - if len(self.hashid) != 4: - raise TypeError(f"Invalid hashid length. Must be 4: {self.hashid!r}") - - @property - def name(self): - return f"{self.purl_type}-{self.hashid}" - - def local_dir_path(self, local_root_dir, repo_name) -> Union[Path, None]: - return local_root_dir / repo_name / self.name - - -@dataclass -class DataMaintainer: - """ - Person or org that maintains a data federation or cluster - """ - - name: str - email: Optional[str] = None - url: Optional[str] = None - - def to_dict(self): - return dict( - name=self.name, - email=self.email, - url=self.url, - ) - - -@dataclass -class GitRepo: - """ - A Git Repo. - """ - - # the name of the repo also the checkout local dir name - name: str - # The path to the local root directory that contains this git repo - local_root_dir: Path - # The root URL that contains the a Git repo with this name - remote_root_url: str = None - - @property - def local_repo_dir(self): - return self.local_root_dir / self.name - - def remote_repo_url(self): - return f"{self.remote_root_url}" + uritemplate.expand("{/name}", name=self.name) - - def is_real_git(self): - """ - Return True if this local repo is initialized on disk, False if this is - just some directory. - """ - return (self.local_repo_dir / ".git").exists() - - def __post_init__(self): - self.local_root_dir = Path(self.local_root_dir).resolve() - - def init(self): - raise NotImplementedError() - - def clone(self): - raise NotImplementedError() - - def pull(self): - raise NotImplementedError() - - def push(self): - raise NotImplementedError() - - -def build_direct_federation_config_file_url( - remote_root_url: str, - federation_name: str, - config_filename: str, -): - """ - Return the URL to download a remote config file for a federation - """ - return build_raw_download_url( - root_url=remote_root_url, - repo=federation_name, - path=config_filename, - branch="main", - ) - - -def build_raw_download_url( - root_url: str, - repo: str, - path: str, - branch: str = "main", - builder=None, -): - """ - Return a direct access raw URL to a file in a know public repo. - """ - _scheme, server, _path, _query, _fragment = urlsplit(root_url) - if not builder: - git_url_builder_by_server = { - "github.com": build_raw_download_url_github, - "gitlab.com": build_raw_download_url_gitlab, - "codeberg.org": build_raw_download_url_codeberg, - } - builder = git_url_builder_by_server[server] - - return builder(root_url=root_url, repo=repo, path=path, branch=branch) - - -def build_raw_download_url_github( - root_url: str, - repo: str, - path: str, - branch: str = "main", -): - """ - Return a direct access raw URL to a file in a github repo. - """ - # NB: an alternative could be - # https://raw.githubusercontent.com/{org}/{repo}/refs/heads/main/{path} - return "/".join([root_url, repo, "raw/refs/heads", branch, path]) - - -def build_raw_download_url_gitlab( - root_url: str, - repo: str, - path: str, - branch: str = "main", -): - """ - Return a direct access raw URL to a file in a gitlab repo. - """ - # note that the org can be multiple path segments - return "/".join([root_url, repo, "-/raw", branch, path]) - - -def build_raw_download_url_codeberg( - root_url: str, - repo: str, - path: str, - branch: str = "main", -): - """ - Return a direct access raw URL to a file in a codeberg repo. - """ - return "/".join([root_url, repo, "raw/branch", branch, path]) - - -def compute_purl_hash(purl: Union[PackageURL, str], max_value: int = 1024) -> str: - """ - Return a hash string from a ``purl`` string or object. - - The PURL is normalized and we drop its version, qualifiers and subpath. This - four characters hash string is the integer hash value between 0000 and 1023, - left-padded with zeros. - - The function is designed to be easily portable across tech stacks and easy - to implement in many programming languages: - - - the hash is based on sha256, available is all common languages, - - the hash is based on the hash integer value between, left padded with 0 - - we use simple arithmetic on integer with modulo. - - Use these steps to compute a PURL hash: - - - Convert the PURL to a core PURL with only type, namespace and name. - - Compute a SHA256 hash on that core PURL string encoded to bytes as UTF-8. - - Convert that hash value to an integer. - - Compute a modulo on that integer with the the max value. - With default max_value of 1024, this yields an int between 0 and 1023. - - Convert that integer to a 4-characters string left-padded with zero. - - For example:: - - The hash does not change with version or qualifiers:: - >>> compute_purl_hash("pkg:pypi/univers@30.12.0") - '0145' - >>> compute_purl_hash("pkg:pypi/univers@10.12.0") - '0145' - >>> compute_purl_hash("pkg:pypi/univers@30.12.0?foo=bar#sub/path") - '0145' - - The hash is left padded with zeros:: - >>> compute_purl_hash("pkg:pypi/expressionss") - '0760' - - We use the canonical PURL. Here pypi normalization always uses dash for - underscore :: - - >>> compute_purl_hash("pkg:pypi/license_expression") - '0297' - >>> compute_purl_hash("pkg:pypi/license-expression") - '0297' - - Originally designed in : - https://github.com/aboutcode-org/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154 - """ - - core_purl = get_core_purl(purl).to_string() - return _compute_hash(core_purl=core_purl, max_value=max_value) - - -def _compute_hash(core_purl: str, max_value: int = 1024) -> str: - """ - Return a hash string from a ``core_purl`` string. The core purl string - must be computed ahead - - For example:: - - >>> compute_purl_hash("pkg:pypi/univers") - '0145' - - The hash is left padded with zeros:: - >>> compute_purl_hash("pkg:pypi/expressionss") - '0760' - """ - - core_purl_bytes = core_purl.encode("utf-8") - hash_bytes = sha256(core_purl_bytes).digest() - # Only keep the first 4 bytes to avoid creating very large integers. - # We only support up to 8192 hashes max_value, 2**13 , aka 13 bits. - # So 2 bytes are enough. - hash_bytes = hash_bytes[:2] - # Convert bytes to integer, using little endian - hash_int = int.from_bytes(hash_bytes, byteorder="little") - # compute modulo max value - short_int = hash_int % max_value - # return as 4-char string left padded with 0 - return f"{short_int:04}" - - -def is_valid_power_of_two(n: int, max_value: int = 1024): - """ - Return True if ``n`` is a power of two between 1 and ``max_value``. - Use bit manipulations. - - See https://stackoverflow.com/questions/57025836 - """ - return n > 0 and n <= max_value and (n & (n - 1) == 0) - - -def percent_quote_more(qs): - """ - Return a percent-quoted string from ``qs`` string by quoting all non-quoted - characters, but ignoring already quoted characters. This makes the quoted - string safe to use in a path as a directory or file name. - - For example:: - >>> percent_quote_more("foo") - 'foo' - - >>> percent_quote_more("foo/bar") - 'foo%2Fbar' - - >>> percent_quote_more("foo:bar") - 'foo%3Abar' - - >>> percent_quote_more("foo%2Fbar") - 'foo%2Fbar' - """ - if not qs: - return qs - try: - return quote(qs, safe="%") - except Exception as e: - raise Exception(f"Failed to percent_quote_more: {qs!r}") from e - - -def as_purl(purl: Union[PackageURL, str]): - """ - Return a PackageURL from ``purl`` object or string. - """ - if isinstance(purl, str): - purl = PackageURL.from_string(purl) - elif not isinstance(purl, PackageURL): - raise ValueError(f"purl {purl!r} must be of type PackageURL or str, not {type(purl)!r}") - return purl - - -def get_core_purl(purl: Union[PackageURL, str]): - """ - Return a new "core" purl from a ``purl`` object or string, dropping version, - qualifiers and subpath. - """ - purl = as_purl(purl) - purld = purl.to_dict() - del purld["version"] - del purld["qualifiers"] - del purld["subpath"] - return PackageURL(**purld) - - -def package_path_elements( - purl: Union[PackageURL, str], - max_value: int = 1024, -): - """ - Return a 4-tuple of POSIX path strings from the ``purl`` string or object. - - The tuple members are: - (short-purl-hash, core-purl-path, purl-version, purl-extra-path) - - These members can be joined as needed with a POSIX "/" path separator to - create a repository and directory structures in a DataCluster. - - short-purl-hash: PURL-based hash, up to max_value - core-purl-path: type/namespace/name - purl-version: PURL version, further percent-quoted for safe path usage - extra_path: qualifiers#subpath combined and percent-quoted for safe path usage - - For example: - - We use the same hash and base path for different versions of the same PURL:: - - >>> package_path_elements("pkg:pypi/license_expression@30.3.1") - ('0297', 'pypi/license-expression', '30.3.1', '') - >>> package_path_elements("pkg:pypi/license_expression@10.3.1") - ('0297', 'pypi/license-expression', '10.3.1', '') - - We percent-quote versions and qualifiers+subpath elements to make these safe - to use as directory names in filesystems. We avoid double encoding of - already quoted parts:: - - >>> package_path_elements("pkg:pypi/license_expression@30.3.1?foo=bar&baz=bar#sub/path") - ('0297', 'pypi/license-expression', '30.3.1', 'baz%3Dbar%26foo%3Dbar%23sub%2Fpath') - - The function accepts also a PURL object:: - - >>> purl = PackageURL( - ... type="pypi", - ... name="license_expression", - ... version="b#ar/?30.3.2!", - ... qualifiers=dict(foo="bar"), - ... subpath="a/b/c") - >>> package_path_elements(purl) - ('0297', 'pypi/license-expression', 'b%23ar%2F%3F30.3.2%21', 'foo%3Dbar%23a%2Fb%2Fc') - """ - purl = as_purl(purl) - core_purl = get_core_purl(purl).to_string() - - # core path is kept encoded, just stripped from the pkg: prefix - _pkg, _, core_path = core_purl.partition(":") - purl_hash = _compute_hash(core_purl=core_purl, max_value=max_value) - - version = normalize_version(purl.version, purl.type) - if version: - version = percent_quote_more(version) - - extra_path = "" - if pq := purl.qualifiers: - # note that we percent-quote everything including the / character - extra_path = percent_quote_more(normalize_qualifiers(pq, encode=True)) - - if psp := purl.subpath: - psp = normalize_subpath(psp, encode=True) - extra_path += percent_quote_more(f"#{psp}") - - return purl_hash, core_path, version, extra_path diff --git a/aboutcode/federated/tests/test_data/all-presets/foo/aboutcode-federated-config.yml b/aboutcode/federated/tests/test_data/all-presets/foo/aboutcode-federated-config.yml deleted file mode 100644 index ca4204bb7..000000000 --- a/aboutcode/federated/tests/test_data/all-presets/foo/aboutcode-federated-config.yml +++ /dev/null @@ -1,1034 +0,0 @@ -name: foo -remote_root_url: -description: -documentation_url: -data_license: -data_maintainers: [] -data_clusters: - - data_kind: api_package_metadata - datafile_path_template: - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: Raw API response datafiles for a package (ignoring versions). Each datafile - path and schema is PURL type-specific and not documented here. - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: api_package_version_responses - datafile_path_template: - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: Raw API response datafiles for a package versions. Each datafile path and schema - is PURL type-specific and not documented here. - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: atom_slices - datafile_path_template: '{/namespace}/{name}/{version}/atom.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: Atom slices for each package version - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: atom_vulnerable_slices - datafile_path_template: '{/namespace}/{name}/{version}/atom-vulnerable.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: Atom vulnerable_slices for each vulnerable package version - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: cyclonedx14_sboms - datafile_path_template: '{/namespace}/{name}/{version}/cyclonedx-14.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: CycloneDX v1.4 sboms for each package version - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: cyclonedx15_sboms - datafile_path_template: '{/namespace}/{name}/{version}/cyclonedx-15.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: CycloneDX v1.5 sboms for each package version - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: cyclonedx16_sboms - datafile_path_template: '{/namespace}/{name}/{version}/cyclonedx-16.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: CycloneDX v1.6 sboms for each package version - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: openssf_security_scorecards - datafile_path_template: '{/namespace}/{name}/security_scorecard.json' - purl_type_configs: - - purl_type: github - number_of_repos: 256 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 256 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 32 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 32 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 32 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 32 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 32 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 32 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 32 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 8 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: OpenSSf security_scorecards for package - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: purldb - datafile_path_template: '{/namespace}/{name}/{version}/purldb.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: PurlDB normalized metadata datafiles for each package versions. Does not include - fingerprints and symbols. - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: purls - datafile_path_template: '{/namespace}/{name}/purls.yml' - purl_type_configs: - - purl_type: github - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 4 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: List of fully qualified PURL strings for a package, sorted by version. - documentation_url: https://github.com/package-url/purl-spec/ - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: scancode_fingerprints - datafile_path_template: '{/namespace}/{name}/{version}/scancode-fingerprints.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: scancode_fingerprints for each package version. - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: scancode_toolkit_scans - datafile_path_template: '{/namespace}/{name}/{version}/scancode-toolkit.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: scancode toolkit scans for each package version. - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: security_advisories - datafile_path_template: '{/namespace}/{name}/{version}/advisories.yml' - purl_type_configs: - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: VulnerableCode security advisories for each package version. - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: spdx2_sboms - datafile_path_template: '{/namespace}/{name}/{version}/spdx-2.json' - purl_type_configs: - - purl_type: github - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: npm - number_of_repos: 1024 - number_of_dirs: 1024 - - purl_type: golang - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: maven - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: nuget - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: perl - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: php - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: pypi - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: ruby - number_of_repos: 128 - number_of_dirs: 1024 - - purl_type: alpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: bitbucket - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cargo - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: cocoapods - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: composer - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: deb - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: docker - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: gem - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: generic - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: huggingface - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: mlflow - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: pub - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: rpm - number_of_repos: 16 - number_of_dirs: 1024 - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: SPDX version 2.x sboms for each package version - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] - - data_kind: vulnerabilities - datafile_path_template: '{/namespace}/{name}/vulnerabilities.json' - purl_type_configs: - - purl_type: default - number_of_repos: 1 - number_of_dirs: 1024 - data_schema_url: - description: VulnerableCode vulnerabilities for each package. Also includes a separate vulnerabilities - directory/ - documentation_url: - data_license: CC-BY-4.0 - data_maintainers: [] diff --git a/aboutcode/federated/tests/test_federated.py b/aboutcode/federated/tests/test_federated.py deleted file mode 100644 index 07adb4c35..000000000 --- a/aboutcode/federated/tests/test_federated.py +++ /dev/null @@ -1,302 +0,0 @@ -# -# Copyright (c) AboutCode and others. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/vulnerablecode for support or download. -# See https://aboutcode.org for more information about our open source projects. -# - -from pathlib import Path -from pickle import FALSE - -import pytest -import requests -from packageurl import PackageURL - -from aboutcode.federated import DataCluster -from aboutcode.federated import DataDirectory -from aboutcode.federated import DataFederation -from aboutcode.federated import DataMaintainer -from aboutcode.federated import DataRepository -from aboutcode.federated import GitRepo -from aboutcode.federated import PurlTypeConfig -from aboutcode.federated import as_purl -from aboutcode.federated import build_direct_federation_config_file_url -from aboutcode.federated import cluster_preset -from aboutcode.federated import compute_purl_hash -from aboutcode.federated import get_core_purl -from aboutcode.federated import is_valid_power_of_two -from aboutcode.federated import package_path_elements -from aboutcode.federated import percent_quote_more - -TEST_DATA = Path(__file__).parent / "test_data" - -REGEN = False - - -def test_DataFederation_from_dict_and_to_dict(tmp_path): - data = { - "name": "fed", - "remote_root_url": "https://example.com", - "description": "desc", - "documentation_url": "doc", - "data_license": "MIT", - "maintainers": [{"name": "x"}], - "data_clusters": [], - } - fed = DataFederation.from_dict(data, local_root_dir=tmp_path) - d = fed.to_dict() - assert "name" in d - - -def test_DataFederation_basic(tmp_path): - f = DataFederation( - name="fed", - local_root_dir=tmp_path, - remote_root_url="https://foo.com", - ) - assert f.local_config_dir == tmp_path / "fed" - assert str(f.local_config_file).endswith("fed/aboutcode-federated-config.yml") - assert isinstance(f.config_repo, GitRepo) - - -def test_DataFederation_remote_config_file_url(): - url = DataFederation.remote_config_file_url( - remote_root_url="https://github.com/org", federation_name="fed" - ) - assert url == "https://github.com/org/fed/raw/refs/heads/main/aboutcode-federated-config.yml" - - -def test_DataFederation_load(tmp_path): - # setup - cfg_file = tmp_path / "fed" / DataFederation.CONFIG_FILENAME - cfg_file.parent.mkdir(parents=True) - cfg_file.write_text("name: fed\n") - - # test - fed = DataFederation.load("fed", tmp_path) - assert fed.name == "fed" - assert fed.data_clusters == [] - - -def test_DataFederation_from_url(monkeypatch): - class Response: - ok = True - text = "name: fed\n" "remote_root_url: https://github.com/org\n" - - monkeypatch.setattr(requests, "get", lambda url, headers: Response()) - fed = DataFederation.from_url(name="fed", remote_root_url="https://github.com/org") - assert fed.name == "fed" - assert fed.data_clusters == [] - - -def test_DataCluster_from_dict(): - data = { - "data_kind": "x", - "datafile_path_template": "{/foo}/data.json", - "purl_type_configs": [], - } - DataCluster.from_dict(data) - - -def test_PurlTypeConfig_basic(): - ptc = PurlTypeConfig(purl_type="npm", number_of_repos=4, number_of_dirs=16) - assert ptc.numbers_of_dirs_per_repo == 4 - assert len(ptc.hashids) == 16 - repos = list(ptc.get_repos(data_kind="purls")) - assert len(repos) == 4 - assert all(len(r.data_directories) == 4 for r in repos) - assert all(isinstance(r, DataRepository) for r in repos) - - -def test_PurlTypeConfig_validates_settings(): - with pytest.raises(TypeError): - PurlTypeConfig(purl_type="npm", number_of_repos=3, number_of_dirs=16) - with pytest.raises(TypeError): - PurlTypeConfig(purl_type="npm", number_of_repos=4, number_of_dirs=0) - with pytest.raises(TypeError): - PurlTypeConfig(purl_type="npm", number_of_repos=8, number_of_dirs=4) - - -def test_PurlTypeConfig_defaults_and_presets(): - d = PurlTypeConfig.default_config() - assert isinstance(d, PurlTypeConfig) - - assert d.purl_type == "default" - large = PurlTypeConfig.large_size_configs() - assert all(isinstance(ptc, PurlTypeConfig) for ptc in large) - - medium = PurlTypeConfig.medium_size_configs() - assert all(isinstance(ptc, PurlTypeConfig) for ptc in medium) - - small = PurlTypeConfig.small_size_configs() - assert all(isinstance(ptc, PurlTypeConfig) for ptc in small) - - -def test_DataRepository_from_hashids(): - repo = DataRepository.from_hashids("purls", "npm", ["0000", "0001"]) - assert repo.name == "purls-npm-0000" - assert len(repo.data_directories) == 2 - - -def test_DataDirectory(): - d = DataDirectory(purl_type="pypi", hashid="0256") - assert d.name == "pypi-0256" - - -def test_DataDirectory_with_local_dir(tmp_path): - d = DataDirectory(purl_type="npm", hashid="0010", local_root_dir=tmp_path) - assert d.name == "npm-0010" - path = d.local_dir_path(local_root_dir=tmp_path, repo_name="repo") - assert str(path).endswith("repo/npm-0010") - - -def test_DataMaintainer(): - m = DataMaintainer(name="John", email="a@b.com", url="https://x.com") - assert m.to_dict() == dict(name="John", email="a@b.com", url="https://x.com") - - m = DataMaintainer(name="John") - assert m.to_dict() == dict(name="John", email=None, url=None) - - -def test_build_direct_federation_config_file_url(): - url = build_direct_federation_config_file_url( - remote_root_url="https://github.com/aboutcode-data", - federation_name="aboutcode-data", - config_filename="aboutcode-federated-config.yml", - ) - assert ( - url - == "https://github.com/aboutcode-data/aboutcode-data/raw/refs/heads/main/aboutcode-federated-config.yml" - ) - - -def test_compute_purl_hash(): - p1 = "pkg:pypi/univers@1.0.0" - h1 = compute_purl_hash(p1) - p2 = "pkg:pypi/univers@2.0.0" - h2 = compute_purl_hash(p2) - assert h1 == h2 - assert h1 == "0145" - - -def test_is_valid_power_of_two(): - assert not is_valid_power_of_two(0) - assert is_valid_power_of_two(1) - assert is_valid_power_of_two(2) - assert not is_valid_power_of_two(3) - assert not is_valid_power_of_two(3, max_value=256) - assert is_valid_power_of_two(4, max_value=4) - assert is_valid_power_of_two(1024) - assert not is_valid_power_of_two(1024, max_value=256) - assert not is_valid_power_of_two(2048) - assert not is_valid_power_of_two(2048, max_value=1024) - assert is_valid_power_of_two(8192, max_value=8192) - - -def test_percent_quote_more(): - assert percent_quote_more("abc/def") == "abc%2Fdef" - assert percent_quote_more("abc%2Fdef") == "abc%2Fdef" - assert percent_quote_more("abc:def") == "abc%3Adef" - assert percent_quote_more("") == "" - - -def test_as_purl(): - p = "pkg:pypi/example@1.0.0?file_name=foo.bar&key=value#sub/path" - purl = as_purl(p) - assert isinstance(purl, PackageURL) - assert purl.to_string() == p - - purl2 = as_purl(purl) - assert isinstance(purl2, PackageURL) - assert purl2 == purl - - with pytest.raises(ValueError): - purl = as_purl(123) - - with pytest.raises(ValueError): - purl = as_purl("foo") - - -def test_get_core_purl(): - p = "pkg:pypi/example@1.0.0?file_name=foo.bar&key=value#sub/path" - core = get_core_purl(p) - assert core.to_string() == "pkg:pypi/example" - - -def test_package_path_elements(): - purl = "pkg:pypi/license_expression@30.3.1" - phash, core, ver, extra = package_path_elements(purl) - assert isinstance(phash, str) - assert "pypi" in core - assert ver == "30.3.1" - assert extra == "" - purl2 = "pkg:pypi/license_expression@30.3.1?foo=bar#sub/path" - phash, core, ver, extra = package_path_elements(purl2) - assert "%3D" in extra - - -PURLS_AND_HASHES = [ - ("pkg:maven/org.apache.commons/io", "0604"), - ("pkg:GOLANG/google.golang.org/genproto@abcdedf#/googleapis/api/annotations/", "0643"), - ("pkg:golang/google.golang.org/genproto", "0643"), - ("pkg:golang/github.com/nats-io/nats-server/v2/server@v1.2.9", "0107"), - ("pkg:bitbucket/birKenfeld/pyGments-main@244fd47e07d1014f0aed9c", "0913"), - ("pkg:github/Package-url/purl-Spec@244fd47e07d1004f0aed9c", "0694"), - ("pkg:github/package-url/purl-spec", "0694"), - ("pkg:deb/debian/curl@7.50.3-1?arch=i386&distro=jessie", "0320"), - ("pkg:docker/customer/dockerimage@sha256:244fd47e07d1004f0aed9c?repository_url=gcr.io", "0387"), - ("pkg:gem/jruby-launcher@1.1.2?Platform=java", "0884"), - ( - "pkg:Maven/org.apache.xmlgraphics/batik-anim@1.9.1?repositorY_url=repo.spring.io/release&classifier=sources", - "0758", - ), - ( - "pkg:Maven/org.apache.xmlgraphics/batik-anim@1.9.1?repositorY_url=repo.spring.io/release&extension=pom", - "0758", - ), - ("pkg:maven/org.apache.xmlgraphics/batik-anim", "0758"), - ("pkg:Maven/net.sf.jacob-project/jacob@1.14.3?type=dll&classifier=x86", "0221"), - ("pkg:maven/net.sf.jacob-project/jacob", "0221"), - ("pkg:npm/%40angular/animation@12.3.1", "1001"), - ("pkg:Nuget/EnterpriseLibrary.Common@6.0.1304", "0820"), - ("pkg:PYPI/Django-package@1.11.1.dev1", "0603"), - ("pkg:pypi/django_package", "0603"), - ("pkg:composer/guzzlehttp/promises@2.0.2", "0925"), - ("pkg:Rpm/fedora/curl@7.50.3-1.fc25?Arch=i386&Distro=fedora-25", "0832"), - ("pkg:rpm/fedora/curl@7.50.3-1.fc25?Arch=i386&Distro=fedora-25", "0832"), - ("pkg:maven/HTTPClient/HTTPClient@0.3-3", "0084"), - ("pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value", "0566"), - ("pkg:npm/@babel/core#/googleapis/api/annotations/", "0985"), - ("pkg:npm/@babel/core@1.0.2#/googleapis/api/annotations/", "0985"), - ("pkg:npm/core@1.0.2#/googleapis/api/annotations/", "0775"), - ("pkg:npm/core#/googleapis/api/annotations/", "0775"), -] - - -@pytest.mark.parametrize("purl, purl_hash", PURLS_AND_HASHES) -def test_purl_hash(purl, purl_hash): - result_hash, *_ = package_path_elements(purl) - assert result_hash == purl_hash - - -def test_federation_with_all_cluster_preset(): - df = DataFederation(name="foo", data_clusters=sorted(cluster_preset().values())) - local_root_dir = TEST_DATA / "all-presets" - if False: - df.local_root_dir = local_root_dir - df.dump() - df2 = DataFederation.load(name="foo", local_root_dir=local_root_dir) - assert df.to_dict() == df2.to_dict() - - -def test_datacluster_get_datafile_repo_and_path(): - df = DataFederation(name="foo", data_clusters=sorted(cluster_preset().values())) - local_root_dir = TEST_DATA / "all-presets" - - df = DataFederation.load(name="foo", local_root_dir=local_root_dir) - purls_dc = df.get_cluster("purls") - result_repo, result_path = purls_dc.get_datafile_repo_and_path("pkg:npm/foo/bar") - - assert result_repo == "purls-npm-0920" - assert result_path == "npm-0927/foo/bar/purls.yml" diff --git a/pyproject-aboutcode.federated.toml b/pyproject-aboutcode.federated.toml deleted file mode 100644 index ed6b42857..000000000 --- a/pyproject-aboutcode.federated.toml +++ /dev/null @@ -1,76 +0,0 @@ -[build-system] -requires = [ "flot>=0.7.0" ] -build-backend = "flot.buildapi" - -[project] -name = "aboutcode.federated" -version = "0.1.0" -description = "A library for AboutCode PURL-based federated identifiers" -readme = "aboutcode/federated/README.rst" -license = { text = "Apache-2.0" } -requires-python = ">=3.9" - -authors = [ - { name = "AboutCode and others", email = "info@aboutcode.org" }, -] - -keywords = [ - "purl", - "Package-URL", - "open source", - "package", - "sca", - "scan", - "hash", -] - -classifiers = [ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Topic :: Software Development", - "Topic :: Utilities", -] - -dependencies = [ - "packageurl_python == 0.15.6", - "saneyaml == 0.6.0", - "requests == 2.25.1", - "uritemplate == 4.2.0", -] - -urls = { Homepage = "https://github.com/aboutcode-org/vulnerablecode" } - - -[tool.bumpversion] -current_version = "0.1.0" -allow_dirty = true - -files = [ - { filename = "pyproject-aboutcode.federated.toml" }, -] - -[tool.flot] -includes = [ - "aboutcode/federated/**/*", -] - -excludes = [ - # Python compiled files - "**/*.py[cod]", - "**/*.egg-info", - # Various junk and temp files - "**/.DS_Store", - "**/*~", - "**/.*.sw[po]", - "**/.ve", - "**/*.bak", - "**/.ipynb_checkpoints", - "aboutcode/hashid/**/*", - "aboutcode/federated/tests/**/*", -] - -metadata_files = ["apache-2.0.LICENSE", "NOTICE"] - -editable_paths = ["aboutcode"] From f22ba338a56d45ca0f64f777e1dd905234b58102 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Wed, 20 May 2026 16:23:41 +0530 Subject: [PATCH 2/3] feat: add packaged aboutcode.federated dependency Signed-off-by: Keshav Priyadarshi --- requirements-dev.txt | 4 ++-- requirements.txt | 4 ++-- setup.cfg | 7 +++++-- .../pipelines/exporters/federate_vulnerabilities.py | 2 +- .../pipelines/v2_improvers/reference_collect_commits.py | 2 +- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 8eb058b95..4e6b8bc5f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -111,7 +111,7 @@ readme_renderer==43.0 redis==6.2.0 referencing==0.36.0 regex==2026.2.28 -requests==2.32.4 +requests==2.33.1 requests-toolbelt==1.0.0 restructuredtext_lint==2.0.2 rfc3986==2.0.0 @@ -119,7 +119,7 @@ rich==14.3.3 rpds-py==0.30.0 rq==1.15.1 rq-scheduler==0.13.1 -saneyaml==0.6.0 +saneyaml==0.6.1 SecretStorage==3.5.0 semantic-version==2.9.0 semver==3.0.4 diff --git a/requirements.txt b/requirements.txt index 5940ea835..a138c813e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -62,11 +62,11 @@ PyYAML==6.0.1 redis==6.2.0 referencing==0.36.0 regex==2026.2.28 -requests==2.32.4 +requests==2.33.1 rpds-py==0.30.0 rq==1.15.1 rq-scheduler==0.13.1 -saneyaml==0.6.0 +saneyaml==0.6.1 semantic-version==2.9.0 semver==3.0.4 six==1.16.0 diff --git a/setup.cfg b/setup.cfg index 372544c0f..2a64ae707 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,7 +77,7 @@ install_requires = # file and data formats binaryornot==0.4.4 - saneyaml==0.6.0 + saneyaml==0.6.1 beautifulsoup4==4.13.0 python-dateutil==2.8.2 toml==0.10.2 @@ -90,7 +90,7 @@ install_requires = # networking GitPython==3.1.41 - requests==2.32.4 + requests==2.33.1 fetchcode==0.8.2 #pipeline @@ -108,6 +108,9 @@ install_requires = #hashid uritemplate==4.2.0 + #federation + aboutcode.federated==1.0.3 + [options.extras_require] dev = diff --git a/vulnerabilities/pipelines/exporters/federate_vulnerabilities.py b/vulnerabilities/pipelines/exporters/federate_vulnerabilities.py index 93eacaa1f..d82a7f3d6 100644 --- a/vulnerabilities/pipelines/exporters/federate_vulnerabilities.py +++ b/vulnerabilities/pipelines/exporters/federate_vulnerabilities.py @@ -14,11 +14,11 @@ from operator import attrgetter from pathlib import Path +from aboutcode.federated import DataFederation from aboutcode.pipeline import LoopProgress from django.conf import settings from django.utils import timezone -from aboutcode.federated import DataFederation from vulnerabilities.pipelines import VulnerableCodePipeline from vulnerabilities.pipes import export from vulnerabilities.pipes import federatedcode diff --git a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py index e694b9a14..c4fc908de 100644 --- a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py +++ b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py @@ -8,12 +8,12 @@ # from collections import defaultdict +from aboutcode.federated import get_core_purl from aboutcode.pipeline import LoopProgress from django.db.models import Prefetch from packageurl.contrib.purl2url import purl2url from packageurl.contrib.url2purl import url2purl -from aboutcode.federated import get_core_purl from vulnerabilities.models import AdvisoryReference from vulnerabilities.models import AdvisoryV2 from vulnerabilities.models import ImpactedPackage From b6281acbb39f105f128782e9d0abba7a3311047f Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Wed, 20 May 2026 18:36:49 +0530 Subject: [PATCH 3/3] chore: remove release workflow for aboutcode.federated Signed-off-by: Keshav Priyadarshi --- .../pypi-release-aboutcode-federated.yml | 34 ------------------- 1 file changed, 34 deletions(-) delete mode 100644 .github/workflows/pypi-release-aboutcode-federated.yml diff --git a/.github/workflows/pypi-release-aboutcode-federated.yml b/.github/workflows/pypi-release-aboutcode-federated.yml deleted file mode 100644 index 0acc8a578..000000000 --- a/.github/workflows/pypi-release-aboutcode-federated.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: Build aboutcode.federated Python distributions and publish on PyPI - -on: - workflow_dispatch: - push: - tags: - - "aboutcode.federated/*" - -jobs: - build-and-publish: - name: Build and publish library to PyPI - runs-on: ubuntu-22.04 - permissions: - contents: read - - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: 3.11 - - - name: Install flot - run: python -m pip install flot --user - - - name: Build binary wheel and source tarball - run: python -m flot --pyproject pyproject-aboutcode.federated.toml --sdist --wheel --output-dir dist/ - - - name: Publish to PyPI - if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 - with: - password: ${{ secrets.PYPI_API_TOKEN_ABOUTCODE_FEDERATED }}