-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
263 lines (181 loc) · 9.25 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import streamlit as st
import requests
import os
from dotenv import load_dotenv
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.document_loaders import CSVLoader,TextLoader,DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS
import utils.config as config
from github import Github
from utils.constants import *
# Load variables from a local .env file (if present) into the process
# environment before any of the API clients below are created.
load_dotenv()

# Re-export the API credentials for the libraries that read them from
# os.environ. os.getenv() returns None for an unset variable and os.environ
# only accepts strings, so assigning unconditionally would raise TypeError at
# import time; copy only the values that are actually set.
for _env_key in ("OPENAI_API_KEY", "GITHUB_TOKEN", "ACTIVELOOP_TOKEN"):
    _env_value = os.getenv(_env_key)
    if _env_value is not None:
        os.environ[_env_key] = _env_value

# Page-level Streamlit configuration for the app.
st.set_page_config(
    page_title="GitHub Repositories List",
    page_icon=":computer:",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Function to fetch GitHub repositories
@st.cache_data  # Cache data so that we don't have to fetch it again
def fetch_github_repos(username):
    """Fetch the repositories of a GitHub user through the REST API.

    Pages through ``/users/{username}/repos`` 50 entries at a time until an
    empty page is returned.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        A list of repository dicts exactly as returned by the API. The list
        is empty when the user has no repositories or when the first request
        fails (unknown user, rate limit, network error). If a later page
        fails, the repositories gathered so far are returned.
    """
    url = f"https://api.github.com/users/{username}/repos"
    repos = []
    page = 1
    while True:
        try:
            response = requests.get(
                url, params={"page": page, "per_page": 50}, timeout=10
            )
            data = response.json() if response.status_code == 200 else None
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not valid JSON.
            break
        # Error responses are JSON objects (e.g. {"message": "Not Found"}),
        # which are truthy; without this check the loop would never end.
        if not isinstance(data, list) or not data:
            break
        repos.extend(data)
        page += 1
    return repos
# Function to display repositories
def display_repos(repos):
    """Write one Markdown link per repository to the Streamlit page.

    Args:
        repos: Iterable of mappings that each provide the keys ``"name"``
            and ``"html_url"``.
    """
    for entry in repos:
        name, link = entry["name"], entry["html_url"]
        st.write(f"[{name}]({link})")
# def final_analysis(df):
# df.to_csv("data.csv")
# loader = CSVLoader(file_path="data.csv", encoding ="utf-8")
# csv_data = loader.load()
# csv_embeddings = OpenAIEmbeddings()
# vectors = FAISS.from_documents(csv_data, csv_embeddings)
# # Create a question-answering chain using the index
# chain = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=vectors.as_retriever(), input_key="question")
# # Pass a query to the chain
# query = """
# You are an inteelligent CSV Agent who can understand CSV files and their contents. You are given a CSV file with the following columns: Repository Name, Repository Link, Analysis. You are asked to find the most technically complex and challenging repository from the given CSV file.
# What is the most technically challenging repository from the given CSV file?
# Return the name of the repository , the link to the repository and the analysis of the repository showing why it is the most technically challenging/Complex repository.
# The output should be in the following format:
# Repository Name: <name of the repository>
# Repository Link: <link to the repository>
# Analysis: <analysis of the repository>
# Provide a clickable link to the repository as well like this:
# [Repository Name](Repository Link)
# """
# response = chain({"question": query})
# st.write(response['result'])
# st.stop()
# Standing instructions for the model. They are prepended to the prompt
# template below; the chain itself fills {context} with the retrieved CSV rows.
# The column list matches the CSV written by get_user_repos.
_ANALYSIS_INSTRUCTIONS = """ You are Supersmart Github Repository AI system. You are a superintelligent AI that answers questions about Github Repositories and can understand the technical complexity if the repo.
You are:
- helpful & friendly
- good at answering complex questions in simple language
- an expert in all programming languages
- able to infer the intent of the user's question
Remember You are an inteelligent CSV Agent who can understand CSV files and their contents. You are given a CSV file with the following columns: name, url, description, language, stars, forks, labels, issues, contents. You are asked to find the most technically complex and challenging repository from the given CSV file.
To measure the technical complexity of a GitHub repository using the provided API endpoints, You will analyze various factors such as the number of commits, branches, pull requests, issues,contents , number of forks , stars , and contributors. Additionally, you will consider the programming languages used, the size of the codebase, and the frequency of updates.
You will Analyze the following GitHub repository factors to determine the technical complexity of the codebase and calculate a complexity score for each project:
1.Description
2.languages used in the repository
3.Number of stars
4.Number of forks
5.Labels of the repository
6.Description of the repository
7.Contents of the repository
You can consider other factors as well if you think they are relevant for determining the technical complexity of a GitHub repository.
Calculate the complexity score for each project by assigning weights to each factor and summing up the weighted scores.
The project with the highest complexity score will be considered the most technically complex.
Here is the approach or chain-of-thought process , you can use to reach to the solution :
Step 1: Analyze each row and it's contents in the CSV file , each Row represents a Github Repository
"""

_PROMPT_TEMPLATE = _ANALYSIS_INSTRUCTIONS + """
Understand the following to answer the question in an efficient way
{context}
Question: {question}
Now answer the question. Let's think step by step:"""

# Question sent to the chain; {username} is filled in per call.
_QUERY_TEMPLATE = """
Which is the most technically challenging repository from the given CSV file?
Return the name of the repository , the link to the repository and the analysis of the repository showing why it is the most technically challenging/Complex repository.Try to provide a detailed analysis to hold your answer strong
The output should be in the following format:
Repository Name: <name of the repository>
Repository Link: <link to the repository>
Analysis: <analysis of the repository>
Provide a clickable link to the repository as well like this:
To get the repo url , you can use this format :
The username is : "{username}"
"https://github.com/{username}/repository_name"
[Repository Name](Repository Link) --> This is Important.Don't skip it
Let's think step by step about how to answer this question:
"""


def _call_or_default(fetch, default):
    """Return ``fetch()``, or *default* when the GitHub API rejects the call."""
    from github import GithubException

    try:
        return fetch()
    except GithubException:
        return default


def _summarize_repo(repo):
    """Flatten one PyGithub repository into plain, CSV-friendly values."""
    labels = _call_or_default(
        lambda: [label.name for label in repo.get_labels()], []
    )
    issue_count = _call_or_default(
        lambda: repo.get_issues(state="all").totalCount, 0
    )
    # Listing the root directory fails for a repository without commits.
    contents = _call_or_default(lambda: repo.get_contents(""), [])
    if not isinstance(contents, list):
        contents = [contents]
    return {
        "name": repo.name,
        "url": repo.html_url,
        "description": repo.description,
        "language": repo.language,
        "stars": repo.stargazers_count,
        "forks": repo.forks_count,
        # Store readable text instead of API objects, whose repr would
        # otherwise be what ends up in the CSV.
        "labels": ", ".join(labels),
        "issues": issue_count,
        "contents": ", ".join(item.path for item in contents),
    }


def get_user_repos(username):
    """Analyse a user's repositories and display the most complex one.

    Collects metadata for every repository of *username*, writes it to
    ``repo_data.csv``, indexes the rows in a FAISS vector store and asks a
    RetrievalQA chain which repository is the most technically complex. The
    answer is written to the Streamlit page.

    Args:
        username: The username of the GitHub user.

    Returns:
        Nothing. The function always ends by calling ``st.stop()``, which
        halts the current Streamlit script run.
    """
    # Use the configured token when there is one; fall back to an anonymous
    # client otherwise.
    token = os.getenv("GITHUB_TOKEN")
    client = Github(token) if token else Github()
    repo_info = [
        _summarize_repo(repo)
        for repo in client.get_user(username).get_repos()
    ]
    if not repo_info:
        # Nothing to index; building a vector store from no rows would fail.
        st.info("No repositories available to analyze.")
        st.stop()

    pd.DataFrame(repo_info).to_csv("repo_data.csv", index=False)
    loader = CSVLoader(file_path="repo_data.csv", encoding="utf-8")
    vectors = FAISS.from_documents(loader.load(), OpenAIEmbeddings())

    # Create a question-answering chain using the index
    prompt = PromptTemplate(
        template=_PROMPT_TEMPLATE, input_variables=["context", "question"]
    )
    chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="stuff",
        retriever=vectors.as_retriever(),
        input_key="question",
        chain_type_kwargs={"prompt": prompt},
    )

    st.subheader("Most Technically Complex Github Repository is")
    result = chain({"question": _QUERY_TEMPLATE.format(username=username)})
    st.write(result["result"])
    # Halt this script run so nothing after the caller's call site executes.
    st.stop()
def _clear_username():
    """Empty the username text box; used as the Clear button's callback."""
    # Widget state may only be changed from a callback, which Streamlit runs
    # before the next script run, so no explicit rerun is needed.
    st.session_state["github_username"] = ""


# Main app
def main():
    """Build the Streamlit page and run the analysis for a submitted user.

    Renders the title, the sidebar (username input, Submit/Clear buttons and
    the About section). When Submit is pressed it lists the user's
    repositories and starts the complexity analysis, or shows an error when
    no repositories could be fetched.
    """
    config.init()
    # Set up the app title and sidebar
    st.title("GitHub Automated Analysis Tool")
    st.sidebar.title("GitHub Automated Analysis Tool")
    # Input field for GitHub username
    username = st.sidebar.text_input(
        "Enter GitHub Username", key="github_username"
    )
    # Submit and clear buttons
    submit_button = st.sidebar.button("Submit")
    st.sidebar.button("Clear", on_click=_clear_username)
    st.sidebar.header("About")
    st.sidebar.info("This Python-based tool , when given a GitHub user's URL, returns the most technically complex and challenging repository from that user's profile. The tool will use GPT and LangChain to assess each repository individually before determining the most technically challenging one.")
    st.divider()
    st.sidebar.write("This tool is created by [Abhishek Ranjan](https://github.com/AbhishekRP2002).")
    # Display the repositories
    if submit_button:
        username = username.strip()
        if not username:
            st.warning("Please enter a GitHub username.")
            return
        st.subheader(f"Repositories for {username}")
        repos = fetch_github_repos(username)
        if repos:
            display_repos(repos)
            st.info("Analysis of the repositories using LangChain and ChatGPT started. Please wait...")
            get_user_repos(username)
        else:
            st.error("Invalid username or unable to fetch repositories")
# Build the app only when this file is run as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()