SIGN IN SIGN UP
google / magika UNCLAIMED

Fast and accurate AI powered file content types detection

0 0 2 Python
2024-08-29 14:37:34 +00:00
#!/usr/bin/env python3
2025-10-07 17:39:43 +00:00
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
2024-08-29 14:37:34 +00:00
import enum
import json
2024-08-29 14:37:34 +00:00
import shutil
2025-03-05 13:49:13 +00:00
import sys
2024-08-29 14:37:34 +00:00
from pathlib import Path
2025-03-05 13:49:13 +00:00
from typing import Optional
2024-08-29 14:37:34 +00:00
import click
# Models copied into the python package when the `--models-names` option is
# not given on the command line. Each entry is looked up as a sub-directory
# of ASSETS_MODELS_DIR.
MODELS_NAMES_TO_INCLUDE_IN_PYTHON_PACKAGE = [
    "standard_v3_3",
]
2024-08-29 14:37:34 +00:00
2025-03-05 13:49:13 +00:00
# Repository layout, resolved relative to this script (which sits two
# directories below the repository root). Every directory/file this script
# depends on is asserted at import time so a wrong checkout layout fails fast,
# before anything is deleted or overwritten.
REPO_ROOT_DIR = Path(__file__).parent.parent.parent
assert REPO_ROOT_DIR.is_dir() and (REPO_ROOT_DIR / ".git").is_dir()

# Source of truth: shared assets (content types knowledge base and models).
ASSETS_DIR = REPO_ROOT_DIR / "assets"
assert ASSETS_DIR.is_dir()
CONTENT_TYPES_KB_PATH = ASSETS_DIR / "content_types_kb.min.json"
assert CONTENT_TYPES_KB_PATH.is_file()
ASSETS_MODELS_DIR = ASSETS_DIR / "models"
assert ASSETS_MODELS_DIR.is_dir()

# Destinations inside the python package.
PYTHON_ROOT_DIR = REPO_ROOT_DIR / "python"
assert PYTHON_ROOT_DIR.is_dir()
PYTHON_CONTENT_TYPES_KB_PATH = (
    PYTHON_ROOT_DIR / "src" / "magika" / "config" / "content_types_kb.min.json"
)
PYTHON_MODELS_DIR = PYTHON_ROOT_DIR / "src" / "magika" / "models"
assert PYTHON_MODELS_DIR.is_dir()
PYTHON_CONTENT_TYPES_LABELS_PY_PATH = (
    PYTHON_ROOT_DIR / "src" / "magika" / "types" / "content_type_label.py"
)

# Destination root for the JS package. The check targets JS_ROOT_DIR itself;
# previously PYTHON_ROOT_DIR was re-asserted here and the JS directory was
# never validated.
JS_ROOT_DIR = REPO_ROOT_DIR / "js"
assert JS_ROOT_DIR.is_dir()
class Target(enum.StrEnum):
JS = "js"
PYTHON = "python"
2024-08-29 14:37:34 +00:00
2025-03-05 13:49:13 +00:00
@click.command()
@click.argument("target", type=Target)
@click.option(
    "--models-names",
    "models_names_str",
    help="Comma-separated list of models names to import in the package",
)
def main(target: Target, models_names_str: Optional[str]) -> None:
    """Sync content types (and, for python, models) from the assets into TARGET."""
    # NOTE: click shows the docstring above as the command's --help text, so
    # it is kept to a single user-facing sentence; details live in comments.
    if target == Target.PYTHON:
        models_names = _parse_models_names(models_names_str)
        print(f"Including these models in the python package: {models_names}")

        # Validate everything up front: the steps below overwrite files and
        # delete PYTHON_MODELS_DIR, so a bad model name must be rejected
        # before any of that happens rather than after the directory is gone.
        if not models_names:
            print("ERROR: no model names specified", file=sys.stderr)
            sys.exit(1)
        missing_models = [
            name for name in models_names if not (ASSETS_MODELS_DIR / name).is_dir()
        ]
        if missing_models:
            print(
                f"ERROR: models not found in {ASSETS_MODELS_DIR}: {missing_models}",
                file=sys.stderr,
            )
            sys.exit(1)

        update_python_content_type_kb()
        update_python_content_type_label_py()

        print(f"Deleting {PYTHON_MODELS_DIR}")
        shutil.rmtree(PYTHON_MODELS_DIR)
        for model_name in models_names:
            add_model_to_python_package(model_name)

    elif target == Target.JS:
        update_js_content_type_files()

        # FIXME: the model is currently copied manually
        print("WARNING: copying the model is currently NOT supported by this script")


def _parse_models_names(models_names_str: Optional[str]) -> list[str]:
    """Turn the raw ``--models-names`` value into a clean list of names.

    Args:
        models_names_str: Comma-separated names, or None when the option was
            not given.

    Returns:
        The default model list when the option was not given; otherwise the
        stripped names in their original order, with empty entries (e.g. from
        a trailing comma) and repeated names dropped. An empty name would
        resolve to the models directory itself, and a repeated name would
        make the second copy fail on an already-existing destination.
    """
    if models_names_str is None:
        return list(MODELS_NAMES_TO_INCLUDE_IN_PYTHON_PACKAGE)
    stripped_names = (name.strip() for name in models_names_str.split(","))
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(name for name in stripped_names if name))
2025-03-05 13:49:13 +00:00
def update_python_content_type_kb() -> None:
    """Copy the content types KB from the assets into the python package.

    Copies ``CONTENT_TYPES_KB_PATH`` to ``PYTHON_CONTENT_TYPES_KB_PATH``,
    creating the destination's parent directories first if they are missing.
    """
    destination_dir = PYTHON_CONTENT_TYPES_KB_PATH.parent
    print(
        f"Syncing python's content types KB: {CONTENT_TYPES_KB_PATH} => {PYTHON_CONTENT_TYPES_KB_PATH}"
    )
    destination_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(CONTENT_TYPES_KB_PATH, PYTHON_CONTENT_TYPES_KB_PATH)
2024-08-29 14:37:34 +00:00
2025-03-05 13:49:13 +00:00
def add_model_to_python_package(model_name: str) -> None:
    """Copy one model directory from the assets into the python package.

    Args:
        model_name: Name of a sub-directory of ``ASSETS_MODELS_DIR``; it is
            copied to a directory of the same name under
            ``PYTHON_MODELS_DIR``.

    Exits the process with status 1 when the name is empty or the model
    directory does not exist. ``shutil.copytree`` raises if the destination
    directory already exists.
    """
    assets_model_dir = ASSETS_MODELS_DIR / model_name
    # An empty name would resolve to ASSETS_MODELS_DIR itself and copy every
    # model, so it is rejected like any other unknown model.
    if not model_name or not assets_model_dir.is_dir():
        print(f'ERROR: model "{model_name}" not found', file=sys.stderr)
        sys.exit(1)
    python_model_dir = PYTHON_MODELS_DIR / model_name
    print(f"Adding model {assets_model_dir} => {python_model_dir}")
    shutil.copytree(assets_model_dir, python_model_dir)
2024-08-29 14:37:34 +00:00
# Fixed header of the generated python content type label module: license
# text, the StrEnum import, the do-not-edit notice, and the opening line of
# the ContentTypeLabel class. update_python_content_type_label_py() strips
# this text and appends the enum members after it.
CONTENT_TYPE_LABEL_PY_SOURCE_PREFIX = """
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from magika.types.strenum import StrEnum
# NOTE: DO NOT EDIT --- This file is automatically generated.
# This is the list of all possible content types we know about; however, models
# support a smaller subset of them. See model's README.md for details.
class ContentTypeLabel(StrEnum):
"""
def update_python_content_type_label_py() -> None:
    """Regenerate the python ``ContentTypeLabel`` enum module from the KB.

    Reads the JSON object at ``CONTENT_TYPES_KB_PATH`` and writes
    ``PYTHON_CONTENT_TYPES_LABELS_PY_PATH`` with one enum member per key, in
    sorted key order, followed by a ``__repr__`` method that returns
    ``str(self)``. Both files are handled as UTF-8 regardless of the
    platform's locale encoding.
    """
    print(f"Updating {PYTHON_CONTENT_TYPES_LABELS_PY_PATH}")

    kb = json.loads(CONTENT_TYPES_KB_PATH.read_text(encoding="utf-8"))

    indent = " " * 4
    enum_body_lines = []
    for ct_label_str in sorted(kb):
        member_name = ct_label_str.upper()
        # A python identifier cannot start with a digit, so such labels get a
        # leading underscore in the member name (the value stays untouched).
        if ct_label_str[0].isdigit():
            member_name = f"_{member_name}"
        enum_body_lines.append(f'{indent}{member_name} = "{ct_label_str}"')

    # The method is spelled out with explicit indentation instead of relying
    # on whitespace embedded in a multi-line literal, so the generated module
    # is always validly indented.
    repr_method_lines = [
        f"{indent}def __repr__(self) -> str:",
        f"{indent * 2}return str(self)",
    ]

    out = (
        CONTENT_TYPE_LABEL_PY_SOURCE_PREFIX.strip()
        + "\n"
        + "\n".join(enum_body_lines)
        + "\n"
        + "\n"
        + "\n".join(repr_method_lines)
        + "\n"
    )

    PYTHON_CONTENT_TYPES_LABELS_PY_PATH.write_text(out, encoding="utf-8")
# License header and do-not-edit notice placed at the top of every generated
# TypeScript file.
COPYRIGHT_AND_DONOT_EDIT_PREFIX = """
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// NOTE: DO NOT EDIT --- This file is automatically generated by sync.py.
"""
# Header of the generated TypeScript label file: the shared license/notice
# block, a blank line, then the comment and opening line of the
# ContentTypeLabel enum. The members and the closing brace are appended by
# update_js_content_type_files().
CONTENT_TYPE_LABEL_TS_SOURCE_PREFIX = (
    COPYRIGHT_AND_DONOT_EDIT_PREFIX.strip()
    + "\n\n"
    + """
// This is the list of all possible content types we know about; however, models
// support a smaller subset of them. See model's README.md for details.
export enum ContentTypeLabel {
""".strip()
)
def update_js_content_type_files() -> None:
    """Regenerate the TypeScript content type sources from the KB.

    Reads the JSON object at ``CONTENT_TYPES_KB_PATH`` and writes two files
    under ``JS_ROOT_DIR / "src"``:

    * ``content-type-label.ts``: the ``ContentTypeLabel`` enum, one member
      per KB key in sorted order.
    * ``content-types-infos.ts``: a record with one entry per KB key holding
      its label and its ``is_text`` flag.

    Both files are read/written as UTF-8 regardless of the platform's locale
    encoding.

    Raises:
        KeyError: if a KB entry has no ``"is_text"`` key.
    """
    kb = json.loads(CONTENT_TYPES_KB_PATH.read_text(encoding="utf-8"))
    sorted_labels = sorted(kb)

    # Update content type labels enum
    content_type_label_ts_path = JS_ROOT_DIR / "src" / "content-type-label.ts"
    enum_body_lines = [
        (" " * 2) + f'{_ts_enum_member_name(ct_label_str)} = "{ct_label_str}",'
        for ct_label_str in sorted_labels
    ]
    out = (
        CONTENT_TYPE_LABEL_TS_SOURCE_PREFIX.strip()
        + "\n"
        + "\n".join(enum_body_lines)
        + "\n"
        + "}\n"
    ).strip() + "\n"
    content_type_label_ts_path.write_text(out, encoding="utf-8")
    print(f"Updated {content_type_label_ts_path}")

    # Update content types info
    # NOTE(review): the whitespace inside the template literals below is
    # reproduced exactly as it appears in this file; confirm it matches the
    # formatting expected for the generated TypeScript.
    content_types_infos_ts_path = JS_ROOT_DIR / "src" / "content-types-infos.ts"
    # Collect the pieces in a list and join once instead of growing a string
    # with += on every KB entry.
    chunks = [
        COPYRIGHT_AND_DONOT_EDIT_PREFIX.strip() + "\n\n",
        """
import { ContentTypeInfo } from "./content-type-info";
import { ContentTypeLabel } from "./content-type-label";
export type ContentTypesInfos = Record<ContentTypeLabel, ContentTypeInfo>;
export const ContentTypesInfos = {
get: (): ContentTypesInfos => ({
""".strip()
        + "\n",
    ]
    for ct_label_str in sorted_labels:
        ct_label_enum = _ts_enum_member_name(ct_label_str)
        is_text = kb[ct_label_str]["is_text"]
        chunks.append(
            " "
            + f"""
[ContentTypeLabel.{ct_label_enum}]: {{
label: ContentTypeLabel.{ct_label_enum},
is_text: {"true" if is_text else "false"},
}},
""".strip()
            + "\n"
        )
    chunks.append(" })\n};\n")
    content_types_infos_ts_path.write_text("".join(chunks), encoding="utf-8")
    print(f"Updated {content_types_infos_ts_path}")


def _ts_enum_member_name(ct_label_str: str) -> str:
    """Return the enum member name used for a content type label.

    The name is the upper-cased label, prefixed with an underscore when the
    label starts with a digit so the result is a valid identifier.
    """
    member_name = ct_label_str.upper()
    if ct_label_str[0].isdigit():
        return f"_{member_name}"
    return member_name
2024-08-29 14:37:34 +00:00
# Run the click command only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()