-
Notifications
You must be signed in to change notification settings - Fork 0
/
scan_repo_o.py
132 lines (119 loc) · 6.07 KB
/
scan_repo_o.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import ast  # For analyzing Python files
import fnmatch
import glob
import json
import os
import re
from datetime import datetime
from pathlib import Path
def crawl_repository(base_dir):
    """Scan *base_dir* for source files and build a project overview.

    Walks the tree once per file category, collecting per-file metadata,
    raw content, and lightweight insights (word counts for markdown;
    function/docstring/comment stats for Python), then dumps everything
    to ``project_overview.json`` inside *base_dir*.

    Parameters:
        base_dir: Root directory of the repository to scan.

    Returns:
        tuple: ``(project_structure, report_file)`` where
        ``project_structure`` is the collected dict and ``report_file``
        is the path of the JSON report that was written.
    """
    project_structure = {}

    # File patterns of interest, keyed by a human-readable category name.
    extensions = {
        'python': '*.py',
        'markdown': '*.md',
        'html': '*.html',
        'javascript': '*.js',
        'javascript jsx': '*.jsx',
        'javascript ts': '*.ts',
        'javascript tsx': '*.tsx',
        'css': '*.css',
        'json': '*.json',
        'txt': '*.txt',
        'yaml': '*.yaml'
    }

    # Directories to exclude from scanning (virtualenvs, VCS, IDE cruft, caches).
    excluded_dirs = {'venv', 'env', '.env', '__pycache__', 'node_modules',
                     '.git', '.idea', '.vscode', 'shared_venv', 'local_gpt2'}
    # Exclude this script itself and any previously generated report, so
    # a rescan never ingests its own output.
    excluded_files = {os.path.basename(__file__), "project_overview.json"}

    def is_excluded(path):
        # True when any path component is an excluded directory, or when
        # the basename is an excluded file.
        if any(part in excluded_dirs for part in Path(path).parts):
            return True
        return os.path.basename(path) in excluded_files

    for category, pattern in extensions.items():
        project_structure[category] = []
        for dirpath, dirnames, filenames in os.walk(base_dir):
            # Prune in place so os.walk never descends into excluded dirs.
            dirnames[:] = [d for d in dirnames if d not in excluded_dirs]
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                if is_excluded(filepath):
                    continue
                # FIX: use fnmatch.fnmatch directly; the original's
                # glob.fnmatch.fnmatch relied on glob's undocumented
                # internal import of the fnmatch module.
                if not fnmatch.fnmatch(filename, pattern):
                    continue
                file_info = {
                    'name': filename,
                    'path': os.path.relpath(filepath, base_dir),
                    'size': os.path.getsize(filepath),
                    'last_modified': datetime.fromtimestamp(
                        os.path.getmtime(filepath)).strftime('%Y-%m-%d %H:%M:%S'),
                    'content': '',
                    'insights': {}
                }
                # Extract content (and insights) based on file type; any
                # read/parse failure is recorded in 'content' instead of
                # aborting the whole scan.
                try:
                    if filename.endswith('.json') or filename.endswith('.txt'):
                        with open(filepath, 'r', encoding='utf-8') as f:
                            file_info['content'] = json.load(f) if filename.endswith('.json') else f.read()
                    elif filename.endswith('.md'):
                        with open(filepath, 'r', encoding='utf-8') as f:
                            file_info['content'] = f.read()
                        file_info['insights']['word_count'] = len(file_info['content'].split())
                    elif filename.endswith('.py'):
                        with open(filepath, 'r', encoding='utf-8') as f:
                            file_info['content'] = f.read()
                        # Extract comments, docstrings, and function count.
                        tree = ast.parse(file_info['content'])
                        docstrings = []
                        functions = 0
                        for node in ast.walk(tree):
                            # FIX: also count `async def` functions; the
                            # original counted only plain FunctionDef.
                            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                                functions += 1
                                if ast.get_docstring(node):
                                    docstrings.append(ast.get_docstring(node))
                            # FIX: ast.Str is deprecated (removed in 3.12);
                            # bare string expressions are ast.Constant now.
                            if (isinstance(node, ast.Expr)
                                    and isinstance(node.value, ast.Constant)
                                    and isinstance(node.value.value, str)):
                                docstrings.append(node.value.value)
                        # Regex to capture '#' comments.
                        comments = re.findall(r'#.*', file_info['content'])
                        file_info['insights'] = {
                            'function_count': functions,
                            'docstrings': docstrings[:3],  # limit for readability
                            'comments': comments[:3]  # limit for readability
                        }
                except Exception as e:
                    file_info['content'] = f"Error reading file: {e}"
                project_structure[category].append(file_info)

    # Add directory tree structure (excluding environment directories and
    # excluded files).
    project_structure['directories'] = []
    for dirpath, dirnames, filenames in os.walk(base_dir):
        dirnames[:] = [d for d in dirnames if d not in excluded_dirs]
        # Filter out excluded files from the directory listing.
        filenames = [f for f in filenames if not is_excluded(os.path.join(dirpath, f))]
        if is_excluded(dirpath):
            continue
        project_structure['directories'].append({
            'directory': os.path.relpath(dirpath, base_dir),
            'subdirectories': dirnames,
            'files': filenames
        })

    # Summarize and write the report next to the scanned tree.
    report_file = os.path.join(base_dir, 'project_overview.json')
    with open(report_file, 'w', encoding='utf-8') as report:
        json.dump(project_structure, report, indent=4, ensure_ascii=False)

    return project_structure, report_file
if __name__ == "__main__":
    # Scan the directory this script lives in.
    repo_path = str(Path(__file__).parent)
    project_overview, report_path = crawl_repository(repo_path)

    print(f"Project overview report saved at: {report_path}")
    print("Here’s a summary of the structure:")

    # Console summary covers file categories only; the directory tree is
    # left to the JSON report.
    for category, files in project_overview.items():
        if category == 'directories':
            continue
        print(f"\nCategory: {category}")
        for file_info in files:
            print(f" - {file_info['name']} ({file_info['size']} bytes) last modified: {file_info['last_modified']} in {file_info['path']}")
            if file_info.get('content'):
                print(f" Content preview: {str(file_info['content'])[:100]}...")
            if file_info.get('insights'):
                print(f" Insights: {file_info['insights']}")