2024-08-29 14:37:34 +00:00
|
|
|
#!/usr/bin/env python3
|
2025-10-07 17:39:43 +00:00
|
|
|
# Copyright 2025 Google LLC
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
2024-08-29 14:37:34 +00:00
|
|
|
|
2025-03-31 15:14:58 +00:00
|
|
|
import enum
|
2024-08-29 15:04:41 +00:00
|
|
|
import json
|
2024-08-29 14:37:34 +00:00
|
|
|
import shutil
|
2025-03-05 13:49:13 +00:00
|
|
|
import sys
|
2024-08-29 14:37:34 +00:00
|
|
|
from pathlib import Path
|
2025-03-05 13:49:13 +00:00
|
|
|
from typing import Optional
|
2024-08-29 14:37:34 +00:00
|
|
|
|
|
|
|
|
import click
|
|
|
|
|
|
2025-03-31 15:14:58 +00:00
|
|
|
# Names of the models (sub-directories of ASSETS_MODELS_DIR) that main() copies
# into the python package when --models-names is not given on the command line.
MODELS_NAMES_TO_INCLUDE_IN_PYTHON_PACKAGE = [
    "standard_v3_3",
]
|
2024-08-29 14:37:34 +00:00
|
|
|
|
2025-03-05 13:49:13 +00:00
|
|
|
# Repository layout. Every directory used by the script is resolved relative to
# the repository root and checked at import time, so that a misplaced script
# fails immediately instead of writing files to an unexpected location.

# The repository root is two levels above the directory containing this script.
REPO_ROOT_DIR = Path(__file__).parent.parent.parent
assert REPO_ROOT_DIR.is_dir() and (REPO_ROOT_DIR / ".git").is_dir()

# Shared assets: the source of everything this script copies or generates from.
ASSETS_DIR = REPO_ROOT_DIR / "assets"
assert ASSETS_DIR.is_dir()

CONTENT_TYPES_KB_PATH = ASSETS_DIR / "content_types_kb.min.json"
assert CONTENT_TYPES_KB_PATH.is_file()

ASSETS_MODELS_DIR = ASSETS_DIR / "models"
assert ASSETS_MODELS_DIR.is_dir()

# Python package: destinations of the synced KB, models and generated enum.
PYTHON_ROOT_DIR = REPO_ROOT_DIR / "python"
assert PYTHON_ROOT_DIR.is_dir()

PYTHON_CONTENT_TYPES_KB_PATH = (
    PYTHON_ROOT_DIR / "src" / "magika" / "config" / "content_types_kb.min.json"
)

PYTHON_MODELS_DIR = PYTHON_ROOT_DIR / "src" / "magika" / "models"
assert PYTHON_MODELS_DIR.is_dir()

PYTHON_CONTENT_TYPES_LABELS_PY_PATH = (
    PYTHON_ROOT_DIR / "src" / "magika" / "types" / "content_type_label.py"
)

# JS package: destination of the generated TypeScript files.
JS_ROOT_DIR = REPO_ROOT_DIR / "js"
# Check the JS directory itself (this used to re-check PYTHON_ROOT_DIR).
assert JS_ROOT_DIR.is_dir()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Target(enum.StrEnum):
    """Package to synchronize; the accepted values of the CLI ``target`` argument."""

    # Regenerate the TypeScript files under JS_ROOT_DIR.
    JS = "js"
    # Sync the KB, the generated enum module and the models of the python package.
    PYTHON = "python"
|
|
|
|
|
|
2024-08-29 14:37:34 +00:00
|
|
|
|
2025-03-05 13:49:13 +00:00
|
|
|
@click.command()
@click.argument("target", type=Target)
@click.option(
    "--models-names",
    "models_names_str",
    help="Comma-separated list of models names to import in the package",
)
def main(target: Target, models_names_str: Optional[str]) -> None:
    """Sync files derived from the shared assets into the TARGET package.

    TARGET is either "python" or "js".
    """
    if target == Target.PYTHON:
        if models_names_str is None:
            models_names = MODELS_NAMES_TO_INCLUDE_IN_PYTHON_PACKAGE
        else:
            # Drop empty entries (e.g. from a trailing comma): an empty name
            # would resolve to the models directory itself.
            stripped_names = (raw.strip() for raw in models_names_str.split(","))
            models_names = [name for name in stripped_names if name]

        if not models_names:
            print("ERROR: no model names given")
            sys.exit(1)

        # Validate every requested model before touching the package, so that a
        # typo cannot leave the python package with its models deleted.
        missing_models = [
            name for name in models_names if not (ASSETS_MODELS_DIR / name).is_dir()
        ]
        if missing_models:
            for name in missing_models:
                print(f'ERROR: model "{name}" not found in {ASSETS_MODELS_DIR}')
            sys.exit(1)

        print(f"Including these models in the python package: {models_names}")

        update_python_content_type_kb()
        update_python_content_type_label_py()

        # Start from an empty models directory so stale models do not linger.
        print(f"Deleting {PYTHON_MODELS_DIR}")
        shutil.rmtree(PYTHON_MODELS_DIR)
        for model_name in models_names:
            add_model_to_python_package(model_name)

    elif target == Target.JS:
        update_js_content_type_files()

        # FIXME: the model is currently copied manually
        print("WARNING: copying the model is currently NOT supported by this script")
|
2025-03-05 13:49:13 +00:00
|
|
|
|
2025-03-31 15:14:58 +00:00
|
|
|
|
|
|
|
|
def update_python_content_type_kb() -> None:
    """Copy the content types KB from the assets into the python package.

    The destination's parent directory is created when it does not exist yet.
    """
    source_path, destination_path = CONTENT_TYPES_KB_PATH, PYTHON_CONTENT_TYPES_KB_PATH
    print(f"Syncing python's content types KB: {source_path} => {destination_path}")

    destination_dir = destination_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(source_path, destination_path)
|
2024-08-29 14:37:34 +00:00
|
|
|
|
|
|
|
|
|
2025-03-05 13:49:13 +00:00
|
|
|
def add_model_to_python_package(model_name: str) -> None:
    """Copy one model directory from the assets into the python package.

    Args:
        model_name: Name of a sub-directory of ASSETS_MODELS_DIR.

    Exits the process with status 1 when the model directory does not exist.
    The destination must not exist yet (``shutil.copytree`` refuses to
    overwrite an existing directory).
    """
    assets_model_dir = ASSETS_MODELS_DIR / model_name
    if not assets_model_dir.is_dir():
        # The closing quote around the model name was missing in the message.
        print(f'ERROR: model "{model_name}" not found')
        sys.exit(1)

    python_model_dir = PYTHON_MODELS_DIR / model_name

    print(f"Adding model {assets_model_dir} => {python_model_dir}")
    shutil.copytree(assets_model_dir, python_model_dir)
|
2024-08-29 14:37:34 +00:00
|
|
|
|
|
|
|
|
|
2025-03-31 15:14:58 +00:00
|
|
|
CONTENT_TYPE_LABEL_PY_SOURCE_PREFIX = """
|
2024-08-29 15:04:41 +00:00
|
|
|
# Copyright 2024 Google LLC
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from magika.types.strenum import StrEnum
|
|
|
|
|
|
|
|
|
|
# NOTE: DO NOT EDIT --- This file is automatically generated.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# This is the list of all possible content types we know about; however, models
|
2025-03-17 09:06:33 +00:00
|
|
|
# support a smaller subset of them. See model's README.md for details.
|
2024-08-29 15:04:41 +00:00
|
|
|
class ContentTypeLabel(StrEnum):
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
2025-03-31 15:14:58 +00:00
|
|
|
def update_python_content_type_label_py() -> None:
    """Regenerate the python ``ContentTypeLabel`` enum module from the KB.

    Reads the content types KB (a JSON object keyed by content type label) and
    writes a module made of CONTENT_TYPE_LABEL_PY_SOURCE_PREFIX, one enum
    member per label in sorted order, and a ``__repr__`` method returning
    ``str(self)``.
    """
    print(f"Updating {PYTHON_CONTENT_TYPES_LABELS_PY_PATH}")

    # Read and write explicitly as UTF-8 so the result does not depend on the
    # locale of the machine running the script.
    kb = json.loads(CONTENT_TYPES_KB_PATH.read_text(encoding="utf-8"))

    member_indent = " " * 4
    enum_body_lines = []
    for ct_label_str in sorted(kb):
        # A python identifier cannot start with a digit: prefix those labels
        # with an underscore.
        name_prefix = "_" if ct_label_str[0].isdigit() else ""
        enum_body_lines.append(
            f'{member_indent}{name_prefix}{ct_label_str.upper()} = "{ct_label_str}"'
        )

    # The indentation of the generated method is spelled out here instead of
    # relying on whitespace embedded in a multi-line literal.
    repr_method_lines = [
        f"{member_indent}def __repr__(self) -> str:",
        f"{member_indent * 2}return str(self)",
    ]

    out = (
        CONTENT_TYPE_LABEL_PY_SOURCE_PREFIX.strip()
        + "\n"
        + "\n".join(enum_body_lines)
        + "\n"
        + "\n"
        + "\n".join(repr_method_lines)
        + "\n"
    )

    PYTHON_CONTENT_TYPES_LABELS_PY_PATH.write_text(out, encoding="utf-8")
|
2024-08-29 15:04:41 +00:00
|
|
|
|
|
|
|
|
|
2025-04-06 09:29:51 +00:00
|
|
|
# License header and "do not edit" notice placed at the top of every generated
# TypeScript file (it is stripped before use).
COPYRIGHT_AND_DONOT_EDIT_PREFIX = """
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE: DO NOT EDIT --- This file is automatically generated by sync.py.
"""


# Head of the generated content-type-label.ts: the shared header followed by
# the opening of the ContentTypeLabel enum. update_js_content_type_files()
# appends the enum members and the closing brace.
CONTENT_TYPE_LABEL_TS_SOURCE_PREFIX = (
    COPYRIGHT_AND_DONOT_EDIT_PREFIX.strip()
    + "\n\n"
    + """
// This is the list of all possible content types we know about; however, models
// support a smaller subset of them. See model's README.md for details.
export enum ContentTypeLabel {
""".strip()
)
|
2025-03-31 15:14:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def update_js_content_type_files() -> None:
    """Regenerate the TypeScript content type files of the JS package from the KB.

    Two files are written under ``JS_ROOT_DIR / "src"``:

    * ``content-type-label.ts``: the ``ContentTypeLabel`` enum, one member per
      KB label in sorted order.
    * ``content-types-infos.ts``: a ``ContentTypesInfos`` object mapping every
      label to its ``label`` and ``is_text`` fields.

    Raises:
        KeyError: if a KB entry has no ``"is_text"`` field.
    """
    # Read and write explicitly as UTF-8 so the result does not depend on the
    # locale of the machine running the script.
    kb = json.loads(CONTENT_TYPES_KB_PATH.read_text(encoding="utf-8"))

    # Enum member name of every label, in sorted label order. An identifier
    # cannot start with a digit, so those labels get an underscore prefix.
    # Computed once and shared by both generated files.
    enum_name_by_label = {
        label: (f"_{label.upper()}" if label[0].isdigit() else label.upper())
        for label in sorted(kb)
    }

    # Update content type labels enum
    content_type_label_ts_path = JS_ROOT_DIR / "src" / "content-type-label.ts"

    enum_indent = " " * 2
    enum_body_lines = [
        f'{enum_indent}{enum_name} = "{label}",'
        for label, enum_name in enum_name_by_label.items()
    ]

    out = (
        CONTENT_TYPE_LABEL_TS_SOURCE_PREFIX.strip()
        + "\n"
        + "\n".join(enum_body_lines)
        + "\n"
        + "}\n"
    ).strip() + "\n"

    content_type_label_ts_path.write_text(out, encoding="utf-8")
    print(f"Updated {content_type_label_ts_path}")

    # Update content types info
    content_types_infos_ts_path = JS_ROOT_DIR / "src" / "content-types-infos.ts"

    # The file is assembled line by line, with the indentation of every emitted
    # line spelled out rather than embedded in a multi-line literal.
    info_lines = [
        COPYRIGHT_AND_DONOT_EDIT_PREFIX.strip(),
        "",
        'import { ContentTypeInfo } from "./content-type-info";',
        'import { ContentTypeLabel } from "./content-type-label";',
        "",
        "export type ContentTypesInfos = Record<ContentTypeLabel, ContentTypeInfo>;",
        "",
        "export const ContentTypesInfos = {",
        "  get: (): ContentTypesInfos => ({",
    ]
    for label, enum_name in enum_name_by_label.items():
        is_text_literal = "true" if kb[label]["is_text"] else "false"
        info_lines += [
            f"    [ContentTypeLabel.{enum_name}]: {{",
            f"        label: ContentTypeLabel.{enum_name},",
            f"        is_text: {is_text_literal},",
            "    },",
        ]
    info_lines += [
        "  })",
        "};",
    ]
    content_types_info_content = "\n".join(info_lines) + "\n"

    content_types_infos_ts_path.write_text(content_types_info_content, encoding="utf-8")
    print(f"Updated {content_types_infos_ts_path}")
|
|
|
|
|
|
|
|
|
|
|
2024-08-29 14:37:34 +00:00
|
|
|
# Script entry point. main() is a click command, so it is called without
# arguments: click supplies `target` and `--models-names` from the command line.
if __name__ == "__main__":
    main()
|