#!/usr/bin/env python3
"""Generate a compact onboarding summary for a codebase (stdlib only)."""

from __future__ import annotations

import argparse
import json
import os
from collections import Counter
from pathlib import Path
from typing import Dict, Iterable, List

# Directory names that are never scanned. iter_files() and
# top_level_structure() remove any subdirectory whose name is in this set
# from os.walk's dirnames, so the walk does not descend into it. Matching is
# by exact directory name, at any depth below the root.
IGNORED_DIRS = {
    ".git",
    "node_modules",
    ".next",
    "dist",
    "build",
    "coverage",
    "venv",
    ".venv",
    "__pycache__",
}

# Maps a file suffix (lower-case, including the leading dot) to the language
# name shown in the report. detect_languages() lower-cases each file's suffix
# before looking it up here; files whose suffix is not listed are not counted.
EXT_TO_LANG = {
    ".py": "Python",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".js": "JavaScript",
    ".jsx": "JavaScript",
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".kt": "Kotlin",
    ".rb": "Ruby",
    ".php": "PHP",
    ".cs": "C#",
    ".c": "C",
    ".cpp": "C++",
    ".h": "C/C++",
    ".swift": "Swift",
    ".sql": "SQL",
    ".sh": "Shell",
}

# Paths, relative to the scanned root, whose presence find_key_configs()
# reports. Each entry is tested with Path.exists(), so an entry may name a
# directory (".github/workflows") as well as a file. Only the root is
# checked, not nested packages, and hits are reported in this list's order.
KEY_CONFIG_FILES = [
    "package.json",
    "pnpm-workspace.yaml",
    "turbo.json",
    "nx.json",
    "lerna.json",
    "tsconfig.json",
    "next.config.js",
    "next.config.mjs",
    "pyproject.toml",
    "requirements.txt",
    "go.mod",
    "Cargo.toml",
    "docker-compose.yml",
    "Dockerfile",
    ".github/workflows",
]


def iter_files(root: Path) -> Iterable[Path]:
    """Lazily yield the path of every file found beneath *root*.

    Subdirectories named in ``IGNORED_DIRS`` are skipped entirely. Entries
    for which ``Path.is_file()`` is false (for example broken symlinks) are
    not yielded.
    """
    for folder, subfolders, names in os.walk(root):
        # In-place assignment is what tells os.walk not to descend further.
        subfolders[:] = [sub for sub in subfolders if sub not in IGNORED_DIRS]
        here = Path(folder)
        yield from filter(Path.is_file, (here / name for name in names))


def detect_languages(paths: Iterable[Path]) -> Dict[str, int]:
    """Count files per language, judged by lower-cased file suffix.

    Paths whose suffix is not in ``EXT_TO_LANG`` are ignored. The returned
    dict is ordered by descending count, with ties broken alphabetically by
    language name.
    """
    recognised = (EXT_TO_LANG.get(entry.suffix.lower()) for entry in paths)
    tally = Counter(language for language in recognised if language)

    # Two stable passes: alphabetical first, then by count. The second pass
    # keeps the alphabetical order among languages with equal counts.
    ranking = sorted(tally.items(), key=lambda pair: pair[0])
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranking)


def find_key_configs(root: Path) -> List[str]:
    """Return the ``KEY_CONFIG_FILES`` entries that exist directly under *root*.

    Entries are returned as the relative strings from the checklist, in
    checklist order. Existence is the only test, so directories count too.
    """
    return [
        candidate
        for candidate in KEY_CONFIG_FILES
        if root.joinpath(candidate).exists()
    ]


def top_level_structure(root: Path, max_depth: int) -> List[str]:
    """Return an indented outline of *root*, at most *max_depth* levels deep.

    Each directory below the root contributes a ``name/`` line indented two
    spaces per level. It is followed by up to ten of its files, sorted by
    name and indented one level further. Files whose name starts with "."
    are omitted. Directories named in ``IGNORED_DIRS`` are never entered,
    and siblings are visited in sorted order.

    Args:
        root: Directory to outline.
        max_depth: Deepest directory level to list; the root is level 0. A
            negative value yields an empty outline.

    Returns:
        The outline as a list of lines, without trailing newlines.
    """
    lines: List[str] = []
    for dirpath, dirnames, filenames in os.walk(root):
        rel = Path(dirpath).relative_to(root)
        # The root comes back as Path("."), whose ``parts`` is empty, so it
        # sits at depth 0 without a special case.
        depth = len(rel.parts)
        if depth > max_depth:
            # Only reachable for the root itself (negative max_depth), since
            # descent stops at the limit below.
            dirnames[:] = []
            continue

        indent = "  " * depth
        if depth:
            lines.append(f"{indent}{rel.name}/")

        visible_files = [f for f in sorted(filenames) if not f.startswith(".")]
        lines.extend(f"{indent}  {filename}" for filename in visible_files[:10])

        if depth >= max_depth:
            # Children would be too deep to show, so do not let os.walk scan
            # them at all.
            dirnames[:] = []
        else:
            # Pruning here means an ignored directory is never visited, so
            # no per-directory re-check is needed.
            dirnames[:] = sorted(d for d in dirnames if d not in IGNORED_DIRS)
    return lines


def build_report(root: Path, max_depth: int) -> Dict[str, object]:
    """Scan *root* once and assemble all summary facts into a single dict.

    Keys, in order: ``root``, ``file_count``, ``languages``,
    ``key_config_files``, ``top_extensions`` (the 12 most common suffixes,
    with ``"<no-ext>"`` for files without one), ``largest_files`` (up to 20
    ``(relative path, size in bytes)`` pairs, biggest first) and
    ``directory_structure`` (outline lines limited to *max_depth*).
    """
    scanned = list(iter_files(root))

    extension_tally: Counter[str] = Counter()
    for entry in scanned:
        extension_tally[entry.suffix.lower() or "<no-ext>"] += 1

    # Size every file, then keep the 20 biggest. The sort is stable, so
    # equally sized files stay in scan order.
    sized = [(str(entry.relative_to(root)), entry.stat().st_size) for entry in scanned]
    sized.sort(key=lambda pair: pair[1], reverse=True)

    report: Dict[str, object] = {}
    report["root"] = str(root)
    report["file_count"] = len(scanned)
    report["languages"] = detect_languages(scanned)
    report["key_config_files"] = find_key_configs(root)
    report["top_extensions"] = dict(extension_tally.most_common(12))
    report["largest_files"] = sized[:20]
    report["directory_structure"] = top_level_structure(root, max_depth)
    return report


def format_size(num_bytes: int) -> str:
    """Return *num_bytes* as a human-readable size with one decimal place.

    The value is scaled in steps of 1024 through B, KB, MB and GB. Anything
    larger stays in GB (for example ``"5120.0GB"``).

    The unit is chosen from the figure as it will be printed. A size that
    would round up to ``1024.0`` in one unit is shown in the next unit
    instead, so 1048575 bytes is ``"1.0MB"`` rather than ``"1024.0KB"``.
    """
    units = ("B", "KB", "MB", "GB")
    value = float(num_bytes)
    for unit in units[:-1]:
        # Compare the rounded figure, because that is what ".1f" prints.
        if round(value, 1) < 1024:
            return f"{value:.1f}{unit}"
        value /= 1024
    # Largest unit: no further scaling, whatever the magnitude.
    return f"{value:.1f}{units[-1]}"


def print_text(report: Dict[str, object]) -> None:
    """Write *report* to standard output as a plain-text summary.

    Sections appear in a fixed order: header, languages, key config files,
    the ten largest files, and the first 200 lines of the directory outline.
    The languages and config sections print a placeholder bullet when they
    have nothing to show.
    """
    print("Codebase Onboarding Summary")
    print(f"Root: {report['root']}")
    print(f"Total files: {report['file_count']}")
    print()

    print("Languages detected")
    language_counts = report["languages"]
    if not language_counts:
        print("- No recognized source file extensions")
    else:
        for language, total in language_counts.items():
            print(f"- {language}: {total}")
    print()

    print("Key config files")
    present_configs = report["key_config_files"]
    if not present_configs:
        print("- None found from default checklist")
    else:
        for config_name in present_configs:
            print(f"- {config_name}")
    print()

    print("Largest files")
    biggest = report["largest_files"][:10]
    for entry in biggest:
        rel_path, byte_count = entry
        print(f"- {rel_path}: {format_size(byte_count)}")
    print()

    print("Directory structure")
    outline = report["directory_structure"][:200]
    for outline_row in outline:
        print(outline_row)


def parse_args() -> argparse.Namespace:
    """Parse the process command line into ``path``, ``max_depth`` and ``json``."""
    cli = argparse.ArgumentParser(
        description="Scan a repository and generate onboarding summary facts.",
    )
    # (flags, add_argument keyword settings), registered in order.
    argument_table = (
        (("path",), {"help": "Path to project directory"}),
        (
            ("--max-depth",),
            {
                "type": int,
                "default": 2,
                "help": "Max depth for structure output (default: 2)",
            },
        ),
        (("--json",), {"action": "store_true", "help": "Print JSON output"}),
    )
    for flags, settings in argument_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()


def main() -> int:
    """Run the command-line tool: scan the given directory and print the report.

    Returns 0 on success. Exits via ``SystemExit`` with a message when the
    given path is not an existing directory.
    """
    options = parse_args()
    root = Path(options.path).expanduser().resolve()
    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"Path is not a directory: {root}")

    # A requested depth below 1 is raised to 1.
    depth_limit = options.max_depth if options.max_depth > 1 else 1
    report = build_report(root, max_depth=depth_limit)

    if not options.json:
        print_text(report)
    else:
        print(json.dumps(report, indent=2))
    return 0


if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    exit_status = main()
    raise SystemExit(exit_status)
