MokoCassiopeia/scripts/maintenance/validate_file_headers.py

#!/usr/bin/env python3
"""
Copyright (C) 2026 Moko Consulting <hello@mokoconsulting.tech>

This file is part of a Moko Consulting project.

SPDX-License-Identifier: GPL-3.0-or-later

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

FILE INFORMATION
DEFGROUP: MokoStandards.Scripts
INGROUP: MokoStandards.Validation
REPO: https://github.com/mokoconsulting-tech/MokoStandards
PATH: /scripts/validate_file_headers.py
VERSION: 05.00.00
BRIEF: Validate copyright headers and file information in repository files
"""

import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Tuple

# File extensions that require headers.
# Keys are compared against ``Path.suffix`` (exact, case-sensitive match); files
# with any other suffix are reported as exempt. The values are language labels;
# the code in this file only tests key membership.
HEADER_REQUIRED_EXTENSIONS = {
    '.py': 'python',
    '.php': 'php',
    '.md': 'markdown',
    '.yml': 'yaml',
    '.yaml': 'yaml',
    '.sh': 'shell',
    '.js': 'javascript',
    '.ts': 'typescript',
    '.css': 'css',
}

# Files that are exempt from header requirements.
# Matched against the bare file name (``Path.name``), regardless of directory.
EXEMPT_FILES = {
    'package.json',
    'package-lock.json',
    'composer.json',
    'composer.lock',
    'Gemfile.lock',
    '.gitignore',
    '.gitattributes',
    '.editorconfig',
    'LICENSE',
}

# Patterns indicating generated files.
# Case-sensitive substrings; a file is treated as generated when any of them
# appears within its first 1000 characters.
GENERATED_PATTERNS = [
    'DO NOT EDIT',
    'AUTO-GENERATED',
    'AUTOGENERATED',
    'Generated by',
]

# Required patterns in header.
# Every entry must appear as a substring within the first 2000 characters.
REQUIRED_HEADER_PATTERNS = [
    'Copyright (C)',
    'Moko Consulting',
    'GPL-3.0-or-later',
]

# Required file information patterns.
# Every entry must appear as a substring within the first 2000 characters.
REQUIRED_FILE_INFO_PATTERNS = [
    'FILE INFORMATION',
    'DEFGROUP:',
    'REPO:',
    'PATH:',
    'VERSION:',
    'BRIEF:',
]

# Required markdown metadata patterns.
# Checked only for ``.md`` files; each heading may appear anywhere in the file.
REQUIRED_MARKDOWN_METADATA = [
    '## Metadata',
    '## Revision History',
]


def is_exempt_file(filepath: Path) -> bool:
    """Return True when *filepath* is excluded from header validation.

    A file is excluded when its name is listed in ``EXEMPT_FILES`` or when any
    component of its path is ``vendor``, ``node_modules`` or ``.git``.
    """
    if filepath.name in EXEMPT_FILES:
        return True

    # Dependency trees and Git internals are skipped wherever they appear in
    # the path.
    skipped_components = ('vendor', 'node_modules', '.git')
    path_components = filepath.parts
    return any(name in path_components for name in skipped_components)


def is_generated_file(content: str) -> bool:
    """Report whether *content* carries an auto-generation marker near its top.

    Only the first 1000 characters are searched, using a case-sensitive
    substring test against each entry of ``GENERATED_PATTERNS``.
    """
    head = content[:1000]
    for marker in GENERATED_PATTERNS:
        if marker in head:
            return True
    return False


def check_copyright_header(content: str, filepath: Path) -> Tuple[bool, List[str]]:
    """Verify that the copyright/licence markers appear near the top of a file.

    Args:
        content: Full text of the file.
        filepath: Path of the file; accepted for a uniform checker signature
            and not read here.

    Returns:
        ``(ok, issues)`` where ``issues`` holds one message per entry of
        ``REQUIRED_HEADER_PATTERNS`` missing from the first 2000 characters,
        and ``ok`` is True when that list is empty.
    """
    header_text = content[:2000]
    issues = [
        f"Missing required pattern: {required}"
        for required in REQUIRED_HEADER_PATTERNS
        if required not in header_text
    ]
    return not issues, issues


def check_file_information(content: str, filepath: Path) -> Tuple[bool, List[str]]:
    """Verify that the FILE INFORMATION block fields appear near the top of a file.

    Args:
        content: Full text of the file.
        filepath: Path of the file; accepted for a uniform checker signature
            and not read here.

    Returns:
        ``(ok, issues)`` where ``issues`` holds one message per entry of
        ``REQUIRED_FILE_INFO_PATTERNS`` missing from the first 2000
        characters, and ``ok`` is True when that list is empty.
    """
    header_text = content[:2000]
    absent = [field for field in REQUIRED_FILE_INFO_PATTERNS if field not in header_text]
    issues = [f"Missing required file info: {field}" for field in absent]
    return not issues, issues


def check_markdown_metadata(content: str, filepath: Path) -> Tuple[bool, List[str]]:
    """Verify that a markdown document contains the mandatory section headings.

    Unlike the header checks, the whole document is searched.

    Args:
        content: Full text of the markdown file.
        filepath: Path of the file; accepted for a uniform checker signature
            and not read here.

    Returns:
        ``(ok, issues)`` where ``issues`` holds one message per entry of
        ``REQUIRED_MARKDOWN_METADATA`` that is absent, and ``ok`` is True when
        that list is empty.
    """
    issues = []
    for heading in REQUIRED_MARKDOWN_METADATA:
        if heading in content:
            continue
        issues.append(f"Missing required section: {heading}")
    return not issues, issues


def validate_file(filepath: Path) -> Dict[str, Any]:
    """Validate the header of a single file.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        A dict with the keys ``path`` (str), ``valid`` (bool), ``issues``
        (list of str), ``exempt`` (bool) and ``generated`` (bool).
        Exempt and generated files are returned with ``valid`` left True and
        no issues; a file that cannot be read is returned as invalid with a
        single "Error reading file" issue.
    """
    issues: List[str] = []
    result: Dict[str, Any] = {
        'path': str(filepath),
        'valid': True,
        'issues': issues,
        'exempt': False,
        'generated': False,
    }

    # Exempt by name/location, or a file type that does not need a header.
    if is_exempt_file(filepath) or filepath.suffix not in HEADER_REQUIRED_EXTENSIONS:
        result['exempt'] = True
        return result

    # Only I/O and decoding failures are expected here; anything else is a
    # programming error and should surface instead of being reported as a
    # header problem.
    try:
        content = filepath.read_text(encoding='utf-8')
    except (OSError, UnicodeError) as e:
        result['valid'] = False
        issues.append(f"Error reading file: {e}")
        return result

    # Generated files are reported separately and not checked further.
    if is_generated_file(content):
        result['generated'] = True
        return result

    # Every checker shares the (content, filepath) -> (ok, issues) signature;
    # markdown files get one additional check.
    checks = [check_copyright_header, check_file_information]
    if filepath.suffix == '.md':
        checks.append(check_markdown_metadata)

    for check in checks:
        _, found = check(content, filepath)
        issues.extend(found)

    result['valid'] = not issues
    return result


def validate_repository(repo_path: Path) -> Dict[str, Any]:
    """Validate every file found below *repo_path*.

    Args:
        repo_path: Directory to scan recursively.

    Returns:
        A dict with the integer counters ``total``, ``validated``, ``valid``,
        ``invalid``, ``exempt`` and ``generated``, plus ``files``: the list of
        per-file result dicts produced by ``validate_file``. ``total`` counts
        every regular file encountered, including exempt ones.
    """
    results: Dict[str, Any] = {
        'total': 0,
        'validated': 0,
        'valid': 0,
        'invalid': 0,
        'exempt': 0,
        'generated': 0,
        'files': [],
    }

    # rglob visits every file on disk below repo_path, whether or not it is
    # tracked by version control; exclusions are applied by validate_file.
    for filepath in repo_path.rglob('*'):
        if not filepath.is_file():
            continue

        results['total'] += 1

        result = validate_file(filepath)
        results['files'].append(result)

        # Each file lands in exactly one bucket: exempt, generated, or
        # validated (the latter split into valid / invalid).
        if result['exempt']:
            results['exempt'] += 1
        elif result['generated']:
            results['generated'] += 1
        else:
            results['validated'] += 1
            if result['valid']:
                results['valid'] += 1
            else:
                results['invalid'] += 1

    return results


def print_report(results: Dict[str, Any], verbose: bool = False) -> bool:
    """Print the validation report to stdout.

    Args:
        results: Summary dict with the counters ``total``, ``validated``,
            ``valid``, ``invalid``, ``exempt``, ``generated`` and a ``files``
            list of per-file dicts (keys ``path``, ``valid``, ``issues``,
            ``exempt``, ``generated``).
        verbose: When True, also list the files that passed validation.

    Returns:
        True when no invalid files were reported, False otherwise.
    """
    rule = "=" * 70
    divider = "-" * 70

    def was_checked(file_result: Dict[str, Any]) -> bool:
        # Exempt and generated files keep valid=True without ever being
        # checked, so they must not appear in either file listing.
        return not file_result['exempt'] and not file_result['generated']

    print(rule)
    print("FILE HEADER VALIDATION REPORT")
    print(rule)
    print()
    print(f"Total files found:     {results['total']}")
    print(f"Files validated:       {results['validated']}")
    print(f"Valid headers:         {results['valid']}")
    print(f"Invalid headers:       {results['invalid']}")
    print(f"Exempt files:          {results['exempt']}")
    print(f"Generated files:       {results['generated']}")
    print()

    if results['invalid'] > 0:
        print("FILES WITH ISSUES:")
        print(divider)
        for file_result in results['files']:
            if not file_result['valid'] and was_checked(file_result):
                print(f"\n{file_result['path']}")
                for issue in file_result['issues']:
                    print(f"  ✗ {issue}")
        print()

    if verbose and results['valid'] > 0:
        print("\nVALID FILES:")
        print(divider)
        for file_result in results['files']:
            # Only files that were actually validated, matching the
            # "Valid headers" counter above.
            if file_result['valid'] and was_checked(file_result):
                print(f"  ✓ {file_result['path']}")
        print()

    print(rule)

    if results['invalid'] > 0:
        # Guard against division by zero if the counters are inconsistent.
        compliance_rate = (results['valid'] / results['validated'] * 100) if results['validated'] > 0 else 0
        print(f"Compliance Rate: {compliance_rate:.1f}%")
        print()
        print("ACTION REQUIRED: Fix files with missing or invalid headers")
        return False

    print("✓ All validated files have proper headers")
    return True


def main() -> None:
    """Command-line entry point; always terminates via ``sys.exit``.

    Options:
        --path             Directory to scan (default: current directory).
        --verbose          Also list the files that passed validation.
        --fail-on-invalid  Exit with status 1 when invalid headers are found.

    Exit status is 1 when the path does not exist or is not a directory, or
    when ``--fail-on-invalid`` is given and invalid headers were found;
    otherwise 0.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Validate copyright headers and file information'
    )
    parser.add_argument(
        '--path',
        default='.',
        help='Path to repository (default: current directory)'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show all files including valid ones'
    )
    parser.add_argument(
        '--fail-on-invalid',
        action='store_true',
        help='Exit with error code if invalid headers found'
    )

    args = parser.parse_args()

    repo_path = Path(args.path).resolve()

    if not repo_path.exists():
        print(f"Error: Path does not exist: {repo_path}", file=sys.stderr)
        sys.exit(1)

    # A regular file would be scanned as an empty tree and reported as fully
    # compliant, so reject it explicitly.
    if not repo_path.is_dir():
        print(f"Error: Path is not a directory: {repo_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Validating files in: {repo_path}")
    print()

    results = validate_repository(repo_path)
    success = print_report(results, args.verbose)

    # Invalid headers only affect the exit status when explicitly requested.
    if args.fail_on_invalid and not success:
        sys.exit(1)

    sys.exit(0)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()