#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# /// script
# requires-python = ">=3.10"
# dependencies = [
#   "rich>=13.6.0",
#   "pyyaml>=6.0.3",
# ]
# ///
from __future__ import annotations

import os
import re
from collections import defaultdict
from pathlib import Path

import rich
import yaml

AIRFLOW_SOURCES_PATH = Path(__file__).parents[1]

# Directories to scan
PYPROJECT_TOML_FILES = AIRFLOW_SOURCES_PATH.rglob("providers/**/pyproject.toml")

# Patterns to identify Airflow metadata DB access
DB_PATTERNS: list[tuple[re.Pattern, re.Pattern | None]] = [
    (re.compile(r"from airflow\.utils\.session"), None),
    (re.compile(r"from airflow\.settings import Session"), None),
    (re.compile(r"@provide_session"), None),
    (re.compile(r"from sqlalchemy\.orm\.session"), None),
    (re.compile(r"session\.query"), None),
]

AFFECTED_PROVIDERS: dict[str, list[Path]] = defaultdict(list)
MATCHES: dict[Path, list[str]] = defaultdict(list)


def line_matches_pattern(line: str, patterns: list[tuple[re.Pattern, re.Pattern | None]]) -> bool:
    """Check if a line matches any metadata DB access pattern."""
    return any(
        pattern.search(line) and not (exclude_pattern and exclude_pattern.search(line))
        for pattern, exclude_pattern in patterns
    )


def any_line_matches_pattern(filepath: Path) -> bool:
    """Scan a single file for metadata DB access patterns."""
    lines = filepath.read_text().splitlines()
    matches = False
    for i, line in enumerate(lines, start=1):
        if line_matches_pattern(line, DB_PATTERNS):
            rich.print(f"[bright_blue]Match found[/] in {filepath} -> #{i}:{line}")
            MATCHES[filepath].append(
                f"[Line:{i}](https://github.com/apache/airflow/blob/main/{filepath}#L{i}): {line} "
            )
            matches = True
    return matches


def scan_directory(directory):
    provider_name = yaml.safe_load((directory / "provider.yaml").read_text())["package-name"]
    for path in (directory / "src").rglob("*.py"):
        rel_path = path.relative_to(AIRFLOW_SOURCES_PATH)
        if any_line_matches_pattern(rel_path):
            rich.print(f"[green]Found metadata DB access in {path}[/]")
            AFFECTED_PROVIDERS[provider_name].append(rel_path)


def main():
    for pyproject_toml in PYPROJECT_TOML_FILES:
        directory = pyproject_toml.parent
        if os.path.exists(directory):
            rich.print(f"Scanning src folder of {directory}...")
            scan_directory(directory)
    print()
    print(f"Found {len(AFFECTED_PROVIDERS)} providers with metadata DB access patterns:")
    print()
    for provider in sorted(AFFECTED_PROVIDERS):
        print(f"## Provider: {provider}\n")
        for file in AFFECTED_PROVIDERS[provider]:
            print(f" - [ ] [{file.name}](https://github.com/apache/airflow/blob/main/{file})")
            for match in MATCHES[file]:
                print(f"    - {match}")
        print()


if __name__ == "__main__":
    main()