-
Notifications
You must be signed in to change notification settings - Fork 0
/
scan_repo_o.py
132 lines (119 loc) · 6.07 KB
/
scan_repo_o.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import ast  # For analyzing Python files
import fnmatch
import glob
import json
import os
import re
from datetime import datetime
from pathlib import Path
def crawl_repository(base_dir):
    """Scan *base_dir* for source files and build a project overview.

    Walks the tree once per file category, collecting per-file metadata,
    raw content, and lightweight insights (word counts for markdown;
    function/docstring/comment stats for Python), then dumps everything
    to ``project_overview.json`` inside *base_dir*.

    Parameters:
        base_dir: Root directory of the repository to scan.

    Returns:
        tuple: ``(project_structure, report_file)`` where
        ``project_structure`` is the collected dict and ``report_file``
        is the path of the JSON report that was written.
    """
    project_structure = {}

    # File patterns of interest, keyed by a human-readable category name.
    extensions = {
        'python': '*.py',
        'markdown': '*.md',
        'html': '*.html',
        'javascript': '*.js',
        'javascript jsx': '*.jsx',
        'javascript ts': '*.ts',
        'javascript tsx': '*.tsx',
        'css': '*.css',
        'json': '*.json',
        'txt': '*.txt',
        'yaml': '*.yaml'
    }

    # Directories to exclude from scanning (virtualenvs, VCS, IDE cruft, caches).
    excluded_dirs = {'venv', 'env', '.env', '__pycache__', 'node_modules',
                     '.git', '.idea', '.vscode', 'shared_venv', 'local_gpt2'}
    # Exclude this script itself and any previously generated report, so
    # a rescan never ingests its own output.
    excluded_files = {os.path.basename(__file__), "project_overview.json"}

    def is_excluded(path):
        # True when any path component is an excluded directory, or when
        # the basename is an excluded file.
        if any(part in excluded_dirs for part in Path(path).parts):
            return True
        return os.path.basename(path) in excluded_files

    for category, pattern in extensions.items():
        project_structure[category] = []
        for dirpath, dirnames, filenames in os.walk(base_dir):
            # Prune in place so os.walk never descends into excluded dirs.
            dirnames[:] = [d for d in dirnames if d not in excluded_dirs]
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                if is_excluded(filepath):
                    continue
                # FIX: use fnmatch.fnmatch directly; the original's
                # glob.fnmatch.fnmatch relied on glob's undocumented
                # internal import of the fnmatch module.
                if not fnmatch.fnmatch(filename, pattern):
                    continue
                file_info = {
                    'name': filename,
                    'path': os.path.relpath(filepath, base_dir),
                    'size': os.path.getsize(filepath),
                    'last_modified': datetime.fromtimestamp(
                        os.path.getmtime(filepath)).strftime('%Y-%m-%d %H:%M:%S'),
                    'content': '',
                    'insights': {}
                }
                # Extract content (and insights) based on file type; any
                # read/parse failure is recorded in 'content' instead of
                # aborting the whole scan.
                try:
                    if filename.endswith('.json') or filename.endswith('.txt'):
                        with open(filepath, 'r', encoding='utf-8') as f:
                            file_info['content'] = json.load(f) if filename.endswith('.json') else f.read()
                    elif filename.endswith('.md'):
                        with open(filepath, 'r', encoding='utf-8') as f:
                            file_info['content'] = f.read()
                        file_info['insights']['word_count'] = len(file_info['content'].split())
                    elif filename.endswith('.py'):
                        with open(filepath, 'r', encoding='utf-8') as f:
                            file_info['content'] = f.read()
                        # Extract comments, docstrings, and function count.
                        tree = ast.parse(file_info['content'])
                        docstrings = []
                        functions = 0
                        for node in ast.walk(tree):
                            # FIX: also count `async def` functions; the
                            # original counted only plain FunctionDef.
                            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                                functions += 1
                                if ast.get_docstring(node):
                                    docstrings.append(ast.get_docstring(node))
                            # FIX: ast.Str is deprecated (removed in 3.12);
                            # bare string expressions are ast.Constant now.
                            if (isinstance(node, ast.Expr)
                                    and isinstance(node.value, ast.Constant)
                                    and isinstance(node.value.value, str)):
                                docstrings.append(node.value.value)
                        # Regex to capture '#' comments.
                        comments = re.findall(r'#.*', file_info['content'])
                        file_info['insights'] = {
                            'function_count': functions,
                            'docstrings': docstrings[:3],  # limit for readability
                            'comments': comments[:3]  # limit for readability
                        }
                except Exception as e:
                    file_info['content'] = f"Error reading file: {e}"
                project_structure[category].append(file_info)

    # Add directory tree structure (excluding environment directories and
    # excluded files).
    project_structure['directories'] = []
    for dirpath, dirnames, filenames in os.walk(base_dir):
        dirnames[:] = [d for d in dirnames if d not in excluded_dirs]
        # Filter out excluded files from the directory listing.
        filenames = [f for f in filenames if not is_excluded(os.path.join(dirpath, f))]
        if is_excluded(dirpath):
            continue
        project_structure['directories'].append({
            'directory': os.path.relpath(dirpath, base_dir),
            'subdirectories': dirnames,
            'files': filenames
        })

    # Summarize and write the report next to the scanned tree.
    report_file = os.path.join(base_dir, 'project_overview.json')
    with open(report_file, 'w', encoding='utf-8') as report:
        json.dump(project_structure, report, indent=4, ensure_ascii=False)

    return project_structure, report_file
if __name__ == "__main__":
    # Scan the directory this script lives in.
    repo_path = str(Path(__file__).parent)
    project_overview, report_path = crawl_repository(repo_path)

    print(f"Project overview report saved at: {report_path}")
    print("Here’s a summary of the structure:")

    # Console summary covers file categories only; the directory tree is
    # left to the JSON report.
    for category, files in project_overview.items():
        if category == 'directories':
            continue
        print(f"\nCategory: {category}")
        for file_info in files:
            print(f" - {file_info['name']} ({file_info['size']} bytes) last modified: {file_info['last_modified']} in {file_info['path']}")
            if file_info.get('content'):
                print(f" Content preview: {str(file_info['content'])[:100]}...")
            if file_info.get('insights'):
                print(f" Insights: {file_info['insights']}")