# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Performs a number of sanity checks on Magika's documentation."""

from __future__ import annotations

import re
import sys
from dataclasses import dataclass
from pathlib import Path

import click
import requests

REPO_ROOT_DIR = Path(__file__).parent.parent.parent
assert REPO_ROOT_DIR.is_dir() and (REPO_ROOT_DIR / ".git").is_dir()

# Repo-relative path prefixes (POSIX separators) whose Markdown files are
# skipped by the link check.
IGNORE_PREFIX_PATTERNS = [
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    "python/.venv",
    "python/dist",
    "website/node_modules",
    "website/dist",
    "js/node_modules",
    "js/dist",
    "website-ng/node_modules",
    "website-ng/dist",
]

# READMEs that are published outside the repository (python/README.md is used
# on pypi, js/README.md ends up on npm). Links in these files must be either
# external or pure anchors.
PUBLISHED_READMES = ("python/README.md", "js/README.md")

# Upper bound, in seconds, for each registry request, so that an unresponsive
# server cannot hang the check forever.
HTTP_TIMEOUT_SECONDS = 30


@click.command()
@click.option("-v", "--verbose", is_flag=True)
def main(verbose: bool) -> None:
    """Run all documentation checks; exit with status 1 if any of them fails."""
    # Build the list eagerly so that every check runs and reports its own
    # errors, even when an earlier one has already failed.
    results = [
        check_versions_are_up_to_date(),
        check_markdown_links(verbose),
    ]

    if not all(results):
        print("There was at least one error.")
        sys.exit(1)

    print("Everything looks good.")


def check_versions_are_up_to_date() -> bool:
    """Checks that the mentioned latest versions and models are up to date.

    The expected (version, model) pairs are collected from the package
    registries and from the sources in this repository, and compared with the
    table found in the bindings overview page.

    Returns True if everything is good, False otherwise."""

    # Actual last versions and models
    rust_cli_latest_stable_version = get_max_stable_version_for_crate("magika-cli")
    rust_lib_latest_stable_version = get_max_stable_version_for_crate("magika")
    rust_default_model_name = get_rust_default_model_name()
    python_latest_stable_version = get_python_latest_stable_version()
    python_default_model_name = get_python_default_model_name()
    javascript_latest_stable_version = get_latest_version_for_npm_package("magika")
    javascript_default_model_name = get_javascript_default_model_name()
    demo_model_name = get_demo_model_name()

    # The comparison below is positional, so the order of these rows must
    # match the order of the rows in overview.md.
    expected_table = [
        (rust_cli_latest_stable_version, rust_default_model_name),
        (python_latest_stable_version, python_default_model_name),
        (javascript_latest_stable_version, javascript_default_model_name),
        (rust_lib_latest_stable_version, rust_default_model_name),
        ("-", demo_model_name),
        ("-", "-"),
    ]

    # Extract documented last versions and models
    bindings_overview_path = (
        REPO_ROOT_DIR
        / "website-ng"
        / "src"
        / "content"
        / "docs"
        / "cli-and-bindings"
        / "overview.md"
    )
    assert bindings_overview_path.is_file()

    parsed_table = []
    for line in bindings_overview_path.read_text(encoding="utf-8").splitlines():
        # This is a hack to parse the table in the binding's overview, but it is
        # simple and self-contained enough to not cause problems. And we'll
        # notice immediately if things break.
        if not line.startswith("| ["):
            continue
        cols = line.split("|")
        latest_version = cols[3].strip(" `")
        default_model = cols[4].strip()
        if default_model != "-":
            # The cell is a Markdown link; keep only the link text, without
            # backticks.
            default_model = default_model.split("]")[0].split("[")[1].strip(" `")
        parsed_table.append((latest_version, default_model))

    if expected_table == parsed_table:
        return True

    # NOTE: the `=` specifier prints the variable names, so the two locals
    # must keep these exact names for the message to stay the same.
    print(
        f"ERROR: Found stale information in binding's overview table:\n{expected_table=}\n{parsed_table=}"
    )
    return False


def _fetch_json(url: str) -> dict:
    """GET `url` and return the decoded JSON body.

    Raises requests.HTTPError on a 4xx/5xx response, and the usual requests
    exceptions on connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=HTTP_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.json()


def get_python_latest_stable_version() -> str:
    """Return the version PyPI reports under `info.version` for `magika`."""
    package_info = _fetch_json("https://pypi.org/pypi/magika/json")
    latest_stable_version = package_info.get("info", {}).get("version", None)
    assert latest_stable_version is not None
    return latest_stable_version


def get_python_default_model_name() -> str:
    """Return the `_DEFAULT_MODEL_NAME` value found in the python sources."""
    magika_path = REPO_ROOT_DIR / "python" / "src" / "magika" / "magika.py"
    return extract_one_match_with_regex_from_file(
        magika_path, '_DEFAULT_MODEL_NAME = "([a-zA-Z0-9_]+)"'
    )


def get_javascript_default_model_name() -> str:
    """Return the `MODEL_VERSION` value found in js/magika.ts."""
    magika_path = REPO_ROOT_DIR / "js" / "magika.ts"
    return extract_one_match_with_regex_from_file(
        magika_path, 'static MODEL_VERSION = "([a-zA-Z0-9_]+)";'
    )


def get_demo_model_name() -> str:
    """Get the model name used by the demo."""
    demo_path = (
        REPO_ROOT_DIR / "website-ng" / "src" / "components" / "MagikaDemo.svelte"
    )
    return extract_one_match_with_regex_from_file(
        demo_path, 'const MAGIKA_MODEL_VERSION = "([a-zA-Z0-9_]+)";'
    )


def get_rust_default_model_name() -> str:
    """Return the name of the target of the rust/gen/model symlink."""
    model_symlink_path = REPO_ROOT_DIR / "rust" / "gen" / "model"
    assert model_symlink_path.is_symlink()
    return model_symlink_path.readlink().name


def check_markdown_links(verbose: bool) -> bool:
    """Checks that links in Markdown files are OK.

    Every link must be valid according to extract_uris_infos_from_file(). In
    addition, links in the READMEs listed in PUBLISHED_READMES must be either
    external or pure anchors.

    Returns True if everything is good, False otherwise."""

    with_errors = False
    for path in enumerate_markdown_files_in_dir(Path(".")):
        if verbose:
            print(f"Analyzing file: {path}")

        rel_path = path.relative_to(REPO_ROOT_DIR)
        is_published_readme = rel_path.as_posix() in PUBLISHED_READMES

        for ui in extract_uris_infos_from_file(path, verbose=verbose):
            if not ui.is_valid:
                with_errors = True
                print(f"ERROR: {rel_path} has non-valid uri: {ui.uri}")

            # A repo-relative link would be broken once the README is rendered
            # outside of the repository.
            if is_published_readme and not ui.is_external and not ui.is_pure_anchor:
                with_errors = True
                print(
                    f"ERROR: {rel_path}, in {rel_path.parent.as_posix()}/, has a non-external uri: {ui.uri}"
                )

    return not with_errors


def enumerate_markdown_files_in_dir(rel_dir: Path) -> list[Path]:
    """Return the sorted `*.md` files found recursively under `rel_dir`.

    `rel_dir` is interpreted relative to the repository root; the process
    exits with status 1 if it is absolute. Files whose repo-relative path
    starts with one of IGNORE_PREFIX_PATTERNS are left out.
    """
    if rel_dir.is_absolute():
        print(f"{rel_dir} is not relative")
        sys.exit(1)

    a_dir = REPO_ROOT_DIR / rel_dir
    assert a_dir.is_dir()

    # The patterns are written with "/" separators, so compare against the
    # POSIX form of the path to be independent from the platform's separator.
    ignored_prefixes = tuple(IGNORE_PREFIX_PATTERNS)
    return [
        md_path
        for md_path in sorted(a_dir.rglob("*.md"))
        if not md_path.relative_to(REPO_ROOT_DIR).as_posix().startswith(ignored_prefixes)
    ]


def _site_link_exists(uri: str) -> bool:
    """Tell whether an absolute (site-rooted) link points to known content.

    The link is accepted if, once the anchor and the "/magika/" prefix are
    removed, it matches a directory, a `.md` file, or a `.mdx` file under the
    website's docs directory, or if its last component is a file in the
    website's public directory.
    """
    # Drop the anchor, as done for relative links: it does not take part in
    # the path lookup.
    target = uri.split("#", 1)[0]

    # Joining a path with an absolute segment discards everything on the left
    # of it, so make sure that what we append is always relative.
    site_rel_path = target.removeprefix("/magika/").lstrip("/")
    docs_target = (
        REPO_ROOT_DIR / "website-ng" / "src" / "content" / "docs" / site_rel_path
    )
    public_target = REPO_ROOT_DIR / "website-ng" / "public" / Path(target).name

    return (
        docs_target.is_dir()
        or docs_target.with_suffix(".md").is_file()
        or docs_target.with_suffix(".mdx").is_file()
        or public_target.is_file()
    )


def _relative_link_exists(path: Path, uri: str) -> bool:
    """Tell whether `uri`, relative to the file at `path`, is a file or a dir."""
    # The URI may carry an anchor; remove it so that we can check whether the
    # file exists or not.
    rel_file_path = uri.split("#")[0]
    abs_path = path.parent / rel_file_path
    return abs_path.is_file() or abs_path.is_dir()


def extract_uris_infos_from_file(path: Path, verbose: bool) -> list[UriInfo]:
    """Extract and classify all the Markdown inline links found in a file.

    Args:
        path: the Markdown file to scan.
        verbose: whether to print each URI as it is analyzed.

    Returns:
        One UriInfo per `[text](uri)` occurrence, in order of appearance.
    """
    uri_regex = r"\[.*?\]\((.*?)\)"
    uris: list[str] = re.findall(uri_regex, path.read_text(encoding="utf-8"))

    # We treat links pointing to our own repo in a special way. For
    # simplicity, we only deal with links pointing to the main branch.
    repo_main_prefix_url = "https://github.com/google/magika/blob/main/"

    uris_infos: list[UriInfo] = []
    for uri in uris:
        if verbose:
            print(f"Analyzing uri: {uri}")

        is_external = uri.startswith(("http://", "https://"))
        is_insecure = uri.startswith("http://")
        is_pure_anchor = uri.startswith("#")

        if is_external:
            if uri.startswith(repo_main_prefix_url):
                rel_path = uri.removeprefix(repo_main_prefix_url)
                assert rel_path.find("#") == -1, (
                    "Local links with anchors not supported yet"
                )
                is_valid = (REPO_ROOT_DIR / rel_path).is_file()
            else:
                # We mark any other external link as valid, as actually checking
                # it's too much of a pain.
                is_valid = True
            if is_insecure:
                print(f"WARNING: {uri} is not using https")
        elif is_pure_anchor:
            is_valid = True
        elif Path(uri).is_absolute():
            is_valid = _site_link_exists(uri)
        else:
            is_valid = _relative_link_exists(path, uri)

        uris_infos.append(
            UriInfo(
                uri=uri,
                is_external=is_external,
                is_valid=is_valid,
                is_pure_anchor=is_pure_anchor,
                is_insecure=is_insecure,
            )
        )

    return uris_infos


def get_max_stable_version_for_crate(crate_name: str) -> str:
    """Return the `max_stable_version` crates.io reports for `crate_name`."""
    crate_info = _fetch_json(f"https://crates.io/api/v1/crates/{crate_name}")
    return crate_info["crate"]["max_stable_version"]


def get_latest_version_for_npm_package(package_name: str) -> str:
    """Return the `version` the npm registry reports for `package_name`'s latest."""
    package_info = _fetch_json(f"https://registry.npmjs.org/{package_name}/latest")
    return package_info["version"]


def extract_one_match_with_regex_from_file(path: Path, regex: str) -> str:
    """Extract one (and only one!) match with a regex from a file.

    The regex is matched against each whole line, stripped of surrounding
    whitespace, and the first capture group is returned.

    Raises an error in case of zero or more than one hits.
    """
    assert path.is_file(), f"{path} is not a file"

    pattern = re.compile(regex)
    matching_str = None
    for line in path.read_text(encoding="utf-8").splitlines():
        m = pattern.fullmatch(line.strip())
        if m:
            # If we already found something, there is a bug somewhere
            assert matching_str is None, (
                f"More than one line in {path} matches {regex!r}"
            )
            matching_str = m.group(1)

    assert matching_str is not None, f"No line in {path} matches {regex!r}"
    return matching_str


@dataclass(kw_only=True)
class UriInfo:
    """What the link checker knows about one link found in a Markdown file."""

    # The link target, exactly as written in the file.
    uri: str
    # True if the URI starts with "http://" or "https://".
    is_external: bool
    # Outcome of the existence check done by extract_uris_infos_from_file().
    is_valid: bool
    # True if the URI starts with "#".
    is_pure_anchor: bool
    # True if the URI starts with "http://".
    is_insecure: bool


if __name__ == "__main__":
    main()