diff --git a/backend/Procfile b/backend/Procfile
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/README.md b/backend/README.md
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/__init__.py b/backend/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/api/__init__.py b/backend/api/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/api/routes/__init__.py b/backend/api/routes/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/core/__init__.py b/backend/core/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/core/config.py b/backend/core/config.py
deleted file mode 100644
index f20cca4e..00000000
--- a/backend/core/config.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import os
-from dotenv import load_dotenv
-
-load_dotenv()
-
-FIREBASE_DATABASE_URL = os.getenv('FIREBASE_DATABASE_URL')
-FIREBASE_CREDENTIALS_PATH = os.getenv('FIREBASE_CREDENTIALS_PATH')
-
-MAX_BACKGROUND_WORKERS = 4
\ No newline at end of file
diff --git a/backend/db/__init__.py b/backend/db/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/db/firebase.py b/backend/db/firebase.py
deleted file mode 100644
index 02a8a6d8..00000000
--- a/backend/db/firebase.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import firebase_admin
-from firebase_admin import credentials, firestore_async
-from backend.core.config import FIREBASE_DATABASE_URL, FIREBASE_CREDENTIALS_PATH
-
-db = None
-
-
-def initialize_firebase():
-    global db
-    cred = credentials.Certificate(FIREBASE_CREDENTIALS_PATH)
-    firebase_admin.initialize_app(cred, {
-        'databaseURL': FIREBASE_DATABASE_URL
-    })
-    db = firestore_async.client()
-
-
-# Connect to Firebase
-def connect_to_firebase():
-    initialize_firebase()
diff --git a/backend/db/models/__init__.py b/backend/db/models/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/db/models/application/__init__.py b/backend/db/models/application/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/db/models/order/__init__.py b/backend/db/models/order/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/db/models/schools.py b/backend/db/models/schools.py
deleted file mode 100644
index c3d59e15..00000000
--- a/backend/db/models/schools.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from dataclasses import dataclass
-
-
-@dataclass
-class School:
-    name: str
-    description: str
-    location: str
-    
\ No newline at end of file
diff --git a/backend/main.py b/backend/main.py
deleted file mode 100644
index 77a6c080..00000000
--- a/backend/main.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import uvicorn
-from apscheduler.triggers.interval import IntervalTrigger
-from fastapi import FastAPI
-
-from backend.api.routes import auth
-from backend.db.firebase import connect_to_firebase
-from backend.services.api.discover import fetch_utoronto_discover
-from backend.utils.scheduler import SchedulerManager
-
-
-scheduler = SchedulerManager()
-
-
-def setup_server():
-    global scheduler
-    connect_to_firebase()
-
-    scheduler.start()
-
-
-app = FastAPI()
-setup_server()
-app.include_router(auth.router, prefix='/auth')
-
-if __name__ == "__main__":
-    uvicorn.run(app, host='0.0.0.0', port=8002)
diff --git a/backend/myhack-ieee-firebase-adminsdk-6lmvc-ec183162b9.json b/backend/myhack-ieee-firebase-adminsdk-6lmvc-ec183162b9.json
deleted file mode 100644
index 4ab504a1..00000000
--- a/backend/myhack-ieee-firebase-adminsdk-6lmvc-ec183162b9.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "type": "service_account",
-  "project_id": "myhack-ieee",
-  "private_key_id": "ec183162b91f9bbadd75a6c41367b327a973b256",
-  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQC8wu9jMw7BkxSA\n1EvM+kpR7fxPsl4CbTrimleE4HoM18OUoNDyu27Qd5d+hWmf3T5QuQvQ5m0e2QS9\nZPHeJikYMvjxeGXa1YLtovpzRCxsL4IzqGK8FXLPfgyzsdJYwtGRDOCq9c1/IPRk\n/N2UhweXAxJZ+ib050sWLWL4A2zHosvdSPGn45j3OU7cSLsjwyb0VUADsMxOrPvF\nKGfex6vvTthJy49t7OcIK3J27a2q0lfcG2jrZiq6dbM9jbcBEXNNHeqBDQkJ2Jca\nqgK1mihkqlli7TFON3OeMlVMpQUSz51Hh2Lcl4Lq8DykHxwDSg7EjlXKLtjqw7FY\nTx/rLQaLAgMBAAECggEABOBQSNK/1dZT2EQjtSbeFqG8O97SuONTexkbm64k2QIV\n5lgFLdATu8TrH/SYVu3jNzigioP5W6uliLL74I5Bz4M5lRtUHkQXvsXQDHw9XYoY\npfb6nfs73QpXj6jPU0WtHj8j6qaw6VDTn2w82SJn2snUhoiEyxzSfr8raFbGT53k\njmWISc4MamXAQVOlTgJzCFOMVFIt2PMayZ2V3mvjbVrsvbLcErvKhEM5QcsyjAf1\nKbyjjGoupVEcv8E2HeWjOxu0mN+BFwTbiVqBwlv+Pgx8nDrkYngLAzpM2n8diy+z\nLyx8hlqBUcbxXRO2hRUyz4H5MjGxgOrL+qWidRd7uQKBgQD42wn0ECWgTfmQUtM3\nXFWXzqE7zHLm3rpcR3JxX01iESwY/Lcp5/BcXLOrLVJGCmIkcCPNxx96X6URmB8+\nJE0GaWuG56fK4NwdGgpZE/6UGohdLKLlcriP3WcUSbOeBacIelPtjbHmRf9X9S9i\nrOMWUhH/nbogDF8MsXoxJURzdQKBgQDCLjwqTNd+HOZuYijBX7wH/6lVIiWGm85f\nRMySPn5k5ctVNT+taPl6yNeRTXhFuI5DMaE4sgBd3o5kH52rWbRgrgbPLIIhMyAU\nDex17/jvxAy3TYAvhsC+ql4RQnTMd+Bly0j5ywlY2/EsqB7Pv2KZ/B7IKUwXlerv\nIaxkdN5R/wKBgQDM7wTpWorB0rTcZ3jNNFrAY0dgCWPuQClUaPoT5xnA0sdv5F2q\nQvkr9qN3KiGA0Hg17atugLapfi5fqNjBwf80cog8VnVVm4to49L1vIN/z8HQiTu0\nnJ7kyr1idbdXQOxnDOYk0PVZ3vcCpkVZi2qPLRLpYCiwz79OorEv27LdKQKBgGsS\nJXjJoDVY6DLLi199U2gxsARSbNC5juT3QboOHZGzKBhW4ULUVGRA5KSpS/1d04v3\njMd/VzZrZqaMzFrUfuKkcvq/tw1pbHHCb7VkWiaTOtPENz99lUBNHstzkoXAQArB\nOf8K1p+Zv3V5SmwRQuRXof2Kz/tdXgr3zWAjFQevAoGAPx2khV18JoJKeu1mJNnn\nqjaHtFJarHvaRbIe1DiA1bQ1yDXeMzEXOjx9CiK2G8JJnPG4gzrjgCwvSIjVfTU5\nEQF+LONcCfkb0LwRurMLT8/1jlXSGY/wxl3U5gk+fjcJGSZ4bzlEs91Vu7bMM5jv\nAUApq4PvscRnyPMzjw7ISZg=\n-----END PRIVATE KEY-----\n",
-  "client_email": "firebase-adminsdk-6lmvc@myhack-ieee.iam.gserviceaccount.com",
-  "client_id": "103308082050954120103",
-  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-  "token_uri": "https://oauth2.googleapis.com/token",
-  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/firebase-adminsdk-6lmvc%40myhack-ieee.iam.gserviceaccount.com",
-  "universe_domain": "googleapis.com"
-}
diff --git a/backend/services/__init__.py b/backend/services/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/utils/__init__.py b/backend/utils/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/backend/utils/scheduler.py b/backend/utils/scheduler.py
deleted file mode 100644
index 7f31a8d5..00000000
--- a/backend/utils/scheduler.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import logging
-
-from apscheduler.executors.pool import ProcessPoolExecutor
-from apscheduler.schedulers.asyncio import AsyncIOScheduler
-
-from backend.core import config
-
-
-class SchedulerManager:
-    logger = logging.getLogger(__name__)
-
-    def __init__(self):
-        executors = {
-            'default': {'type': 'asyncio'},
-            'processpool': ProcessPoolExecutor(max_workers=config.MAX_BACKGROUND_WORKERS)
-        }
-        self.scheduler = AsyncIOScheduler(executors=executors)
-
-    def add_async_job(self, job, interval):
-        self.scheduler.add_job(job, interval)
-
-    def add_sync_job(self, job, interval):
-        self.scheduler.add_job(job, interval, executor='processpool')
-
-    def start(self):
-        self.scheduler.start()
diff --git a/data_module/main.py b/data_module/main.py
deleted file mode 100644
index e9ee64af..00000000
--- a/data_module/main.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import os
-import pandas
-from utils.organize_img import organize_data
-from utils.scraper import amazon_image_scrape as az_scrape
-from utils.scraper import google_image_scrape as gg_scrape
-from utils.parse_list import parse_data
-from datetime import datetime
-import time
-
-parent_dir = 'img'
-os.makedirs(parent_dir, exist_ok=True) # create the parent directory if it doesn't exist
-
-master_df = pd.DataFrame([], columns=['Title', 'URL', 'Time', 'Source']) # create master data frame
-
-parse_data()
-
-# scrape images for each hardware item
-for name in arr_item:
-    # output_amazon_df = az_scrape(name[0], name[1])
-    # master_df = pd.concat([output_amazon_df , master_df], axis=0)
-    output_google_df = gg_scrape(name[0], name[1])
-    master_df = pd.concat([output_google_df, master_df], axis=0)
-
-master_df.to_excel("image_metadata.xlsx") # export data frame to excel file
-
-organize_data()
diff --git a/data_module/utils/organize_img.py b/data_module/utils/organize_img.py
deleted file mode 100644
index c7f6cc25..00000000
--- a/data_module/utils/organize_img.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import os
-import pandas as pd
-import shutil
-
-def organize_data(arr_item):
-    og = './img'
-    for item in arr_item:
-        path = os.path.join(og, item[1])
-        os.makedirs(path, exist_ok=True)
-
-    parent_folder = './dataset'
-    df = pd.read_excel('../IEEE Hardware Inventory 2023-2024.xlsx')
-    unique_categories = pd.unique(df['categories'])
-
-    for category in unique_categories:
-        path = os.path.join(parent_folder, category)
-        os.makedirs(path, exist_ok=True)
-
-    num_rows = df.shape[0]
-    column_name = 'categories'
-
-    for i in range(1, num_rows, 1):
-        cell_value = df.at[0, column_name]
-        target = './dataset/{category}'.format(category=cell_value)
-        original = './img/{name_item}'.format(name_item=arr_item[0][1])
-        shutil.move(original, target)
\ No newline at end of file
diff --git a/data_module/utils/parse_list.py b/data_module/utils/parse_list.py
deleted file mode 100644
index 0f2fdabf..00000000
--- a/data_module/utils/parse_list.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import pandas as pd
-import os
-import cv2 as cv
-import shutil
-
-def parse_data():
-    # read hardware component Excel sheet
-    hardware_components = pd.read_excel(
-        '../IEEE Hardware Inventory 2023-2024.xlsx',
-        sheet_name='2022-2023 inventory',
-    )
-
-    size_list = hardware_components['name'].size # total number of hardware items
-    arr_item = [] # holds all item's full names
-
-    # loop through the rows and get the item's: name, manufacturer, and model
-    for i in range(size_list):
-        curr_item = []
-        # if a field does not exist, replace with empty string
-        name = "" if str(hardware_components['name'].iloc[i]) == "nan" else str(hardware_components['name'].iloc[i])
-        manufacturer = "" if str(hardware_components['manufacturer'].iloc[i]) == "nan" else str(hardware_components['manufacturer'].iloc[i])
-        model = "" if str(hardware_components['model_number'].iloc[i]) == "nan" else str(hardware_components['model_number'].iloc[i])
-        curr_item.append(name + " " + manufacturer + " " + model) # append into arr
-        curr_item.append(name.replace(" ", ""))
-        arr_item.append(curr_item)
\ No newline at end of file
diff --git a/data_module/utils/scraper.py b/data_module/utils/scraper.py
deleted file mode 100644
index cf8821ba..00000000
--- a/data_module/utils/scraper.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import requests
-import pandas as pd
-import re
-import os
-import time
-import urllib
-from bs4 import BeautifulSoup
-from datetime import datetime
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.common.exceptions import TimeoutException
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as ec
-from urllib.parse import urlencode, urljoin
-
-# used to avoid Amazon blocking the scraper
-SCRAPEOPS_API_KEY = 'ced5bffb-0941-4ba2-8faa-ee55f55f1f0a'
-
-def scrapeops_url(url):
-    payload = {'api_key': SCRAPEOPS_API_KEY, 'url': url, 'country': 'us'}
-    proxy_url = 'https://proxy.scrapeops.io/v1/?' + urlencode(payload)
-    return proxy_url
-
-def amazon_image_scrape(name_item, topic_name):
-    search_url = f'https://www.amazon.com/s?k={topic_name}&page=1'
-
-    html = requests.get(scrapeops_url(search_url))
-
-    print(html)
-    soup = BeautifulSoup(html.content, features="lxml")
-    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})
-
-    arr_list = [] # store searched result links
-
-    df = pd.DataFrame([], columns=['Title', 'Source URL', 'Time', 'Source']) # create data frame
-
-    for link in links:
-        arr_list.append(link.get('href'))
-
-    for count, link in enumerate(arr_list):
-        webpage_url = "https://www.amazon.com" + link
-        webpage = requests.get(webpage_url, headers=HEADERS)
-        new_soup = BeautifulSoup(webpage.content, "lxml")
-
-        title = None
-
-        # get product title
-        try:
-            title = new_soup.find("span", attrs={"id": 'productTitle'}).text.strip()
-        except AttributeError:
-            print("Cannot retrieve product title")
-
-        current_time = datetime.now()
-        formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
-
-        row = {
-            "Title": title,
-            "Source URL": webpage_url,
-            "Time": formatted_time,
-            "Source": "Amazon"
-        }
-
-        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
-
-        images = re.findall('"hiRes":"(.+?)"', webpage.text)
-
-        for num, img_url in enumerate(images):
-            img_dir = "./img/{topic}/".format(topic=name_item)
-            os.makedirs(img_dir, exist_ok=True) # create the directory if it doesn't exist
-            img_filename = img_dir + "image{num}_amazon.jpg".format(num=num)
-            img_filename.format(topic=name_item, i=num)
-            urllib.request.urlretrieve(img_url, img_filename) # retrieve images
-
-    return df
-
-def scroll_to_end(webdriver):
-    webdriver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-    time.sleep(10)
-
-def google_image_scrape(topic_name, name_item):
-    chrome_options = webdriver.ChromeOptions() # creates instance of ChromeOptions class
-    chrome_options.add_argument('--headless') # runs Chrome in headless mode
-    chrome_options.add_argument('--no-sandbox') # disable sandbox mode in Chrome
-    chrome_options.add_argument('--disable-dev-shm-usage') # disables '/dev/shm/ for shared resources
-    driver = webdriver.Chrome('chromedriver', options=chrome_options) # defines path to ChromeDriver executable
-    driver.maximize_window()
-
-    df = pd.DataFrame([], columns=['Title', 'Source URL', 'Time', 'Source']) # create data frame
-
-    search_url = "https://www.google.com/search?q={topic}&tbm=isch&ved=2ahUKEwii57id66j_AhVUGFkFHYNpAuYQ2-cCegQIABAA&oq=bluetooth+wireless+module+bluefruit+le+shield+adafruit&gs_lcp=CgNpbWcQAzIECCMQJ1DnCFjnCGC1EWgAcAB4AIABP4gBdpIBATKYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=Cxt8ZOK1JtSw5NoPg9OJsA4&bih=568&biw=1251&hl=en"
-    driver.get(search_url.format(topic=topic_name)) # replaces topic in the search_url to the desired topic
-
-    # Wait for images to load
-    while True:
-        last_height = driver.execute_script('return document.body.scrollHeight')
-
-        scroll_to_end(driver)
-
-        new_height = driver.execute_script('return document.body.scrollHeight')
-
-        if new_height == last_height:
-            break
-
-        last_height = new_height
-
-
-    img_results = driver.find_elements(By.XPATH, "//img[contains(@class,'Q4LuWd')]") # returns a list of image elements found on webpage
-    div_elements = driver.find_elements(By.CSS_SELECTOR, 'div.isv-r.PNCib.MSM1fd.BUooTd')
-
-    images = []
-    for i in range(len(img_results)):
-        img_src = img_results[i].get_attribute('src')
-        if img_src:
-            images.append(img_src)
-            current = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-            title_element = div_elements[i].find_element(By.CSS_SELECTOR, 'h3.bytUYc')
-            title = title_element.text
-
-            try:
-                link_element = div_elements[i].find_element(By.CSS_SELECTOR, 'a.VFACy.kGQAp.sMi44c.d0NI4c.lNHeqe.WGvvNb')
-                url = link_element.get_attribute('href')
-            except Exception:
-                print("Link element not found within the div element")
-
-            row = {"Title": title, "Source URL": url, "Time": current, "Source": "Google"}
-            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
-
-    for num in range(len(images)):
-        img_dir = "./img/{topic}/".format(topic=name_item)
-        os.makedirs(img_dir, exist_ok=True) # Create the directory if it doesn't exist
-        img_filename = img_dir + "image{num}_google.jpg".format(num=num)
-        urllib.request.urlretrieve(str(images[num]), img_filename)
-
-    driver.quit()
-
-    return df
-
diff --git a/research/__init__.py b/research/__init__.py
deleted file mode 100755
index e69de29b..00000000
diff --git a/research/models/__init__.py b/research/models/__init__.py
deleted file mode 100755
index e69de29b..00000000
diff --git a/research/models/base.py b/research/models/base.py
deleted file mode 100755
index b39f01be..00000000
--- a/research/models/base.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import torch
-
-from abc import ABC, abstractmethod
-
-
-class BaseModel(ABC):
-    @abstractmethod
-    def preprocess(self, image: torch.Tensor)-> torch.Tensor:
-        pass
diff --git a/research/models/dinov2/__init__.py b/research/models/dinov2/__init__.py
deleted file mode 100755
index e69de29b..00000000
diff --git a/research/models/dinov2/model.py b/research/models/dinov2/model.py
deleted file mode 100755
index c1a52e10..00000000
--- a/research/models/dinov2/model.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import time
-
-import PIL.Image
-import torch
-
-from research.data.synthetic.utils.datapoints.image import ImageType, ImageData
-from research.models.base import BaseModel
-from torchvision import transforms
-from enum import Enum, auto
-
-IMAGENET_DEFAULT_MEAN = (0.485, 0.4546, 0.406)
-IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
-
-
-class ModelSize(Enum):
-    SMALL = 's'
-    BIG = 'b'
-    LARGE = 'l'
-    GIANT = 'g'
-
-
-class DinoV2(BaseModel):
-    def __init__(self, model_size: ModelSize, resize_size: int = 518, crop_size: int = 518):
-        self.transform = transforms.Compose([
-            transforms.Resize(resize_size, interpolation=transforms.InterpolationMode.BICUBIC, antialias=True),
-            transforms.CenterCrop(crop_size),
-            transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)
-        ])
-
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.model = torch.hub.load('facebookresearch/dinov2', f'dinov2_vit{model_size.value}14').to(self.device)
-        self.model.eval()
-
-    def preprocess(self, image: torch.Tensor) -> torch.Tensor:
-        return self.transform(image)
-
-    def inference(self, image: ImageType):
-        image = ImageData(image).get_image(with_wrapper=False)
-        image = image.type(torch.FloatTensor).to(self.device)
-        image = self.preprocess(image).unsqueeze(0)
-        with torch.no_grad():
-            features = self.model(image)
-        return features.cpu().detach().numpy()
-
-
-if __name__ == "__main__":
-    model = DinoV2(ModelSize.BIG)
-    features = model.inference(
-        'https://img.freepik.com/premium-vector/young-girl-anime-style-character-vector-illustration-design-manga-anime-girl_147933-100.jpg?w=2000')
-    print(features.shape)
diff --git a/research/models/dinov2/retrieval/__init__.py b/research/models/dinov2/retrieval/__init__.py
deleted file mode 100755
index e69de29b..00000000
diff --git a/research/models/dinov2/retrieval/dinov2.py b/research/models/dinov2/retrieval/dinov2.py
deleted file mode 100755
index 610fb288..00000000
--- a/research/models/dinov2/retrieval/dinov2.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import torch
-from torchvision import transforms
-
-dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
-
-transform1 = transforms.Compose([
-    transforms.Resize(520),
-    transforms.CenterCrop(518), #should be multiple of model patch_size
-    transforms.ToTensor(),
-    transforms.Normalize(mean=0.5, std=0.2)
-    ])
-
-patch_size = dinov2_vitl14.patch_size # patchsize=14
-
-#520//14
-patch_h = 520//patch_size
-patch_w = 520//patch_size
-
-feat_dim = 1024 # vitl14
-
-folder_path = "harryported_giffin_images/"
-total_features = []
-with torch.no_grad():
-    for img_path in os.listdir(folder_path):
-        img_path = os.path.join(folder_path, img_path)
-        img = Image.open(img_path).convert('RGB')
-        img_t = transform1(img)
-
-        features_dict = dinov2_vitl14.forward_features(img_t.unsqueeze(0))
-        features = features_dict['x_norm_patchtokens']
-        total_features.append(features)
-
-total_features = torch.cat(total_features, dim=0)
-total_features.shape
\ No newline at end of file
diff --git a/research/tests/__init__.py b/research/tests/__init__.py
deleted file mode 100755
index e69de29b..00000000
diff --git a/research/utils/__init__.py b/research/utils/__init__.py
deleted file mode 100755
index e69de29b..00000000
diff --git a/research/utils/ann/__init__.py b/research/utils/ann/__init__.py
deleted file mode 100755
index 00c5f5c0..00000000
--- a/research/utils/ann/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .base import ANN
-from research.utils.ann.factory import ANNFactory
-from research.utils.ann.faiss import Faiss
-from research.utils.ann.numpy import NumPy
-from research.utils.ann.torch import Torch
diff --git a/research/utils/ann/base.py b/research/utils/ann/base.py
deleted file mode 100755
index 552e604d..00000000
--- a/research/utils/ann/base.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import datetime
-import platform
-
-from abc import ABC, abstractmethod
-
-
-class ANN(ABC):
-
-    def __init__(self, config):
-        self.backend = None
-        self.config = config
-
-    @abstractmethod
-    def load(self, path):
-        ...
-
-    @abstractmethod
-    def index(self, embeddings):
-        ...
-
-    @abstractmethod
-    def append(self, embeddings):
-        ...
-
-    @abstractmethod
-    def delete(self, ids):
-        ...
-
-    @abstractmethod
-    def search(self, queries, limit):
-        ...
-
-    @abstractmethod
-    def count(self):
-        ...
-
-    @abstractmethod
-    def save(self, path):
-        ...
-
-    def setting(self, name, default=None):
-        backend = self.config.get(self.config["backend"])
-
-        setting = backend.get(name) if backend else None
-        return setting if setting else default
-
-    def metadata(self, settings=None):
-        create = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-        if settings:
-            self.config["build"] = {
-                "create": create,
-                "python": platform.python_version(),
-                "settings": settings,
-                "system": f"{platform.system()} ({platform.machine()})",
-            }
-
-        self.config["update"] = create
diff --git a/research/utils/ann/factory.py b/research/utils/ann/factory.py
deleted file mode 100755
index 823942d7..00000000
--- a/research/utils/ann/factory.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from ..util import Resolver
-
-from .faiss import Faiss
-from .numpy import NumPy
-from .torch import Torch
-
-
-class ANNFactory:
-    @staticmethod
-    def create(config):
-        ann = None
-        backend = config.get("backend", "faiss")
-
-        match backend:
-            case "faiss":
-                backend = Faiss(config)
-            case "numpy":
-                ann = NumPy(config)
-            case "torch":
-                ann = Torch(config)
-            case _:
-                ann = ANNFactory.resolve(backend, config)
-
-        config["backend"] = backend
-
-        return ann
-
-    @staticmethod
-    def resolve(backend, config):
-        try:
-            return Resolver()(backend)(config)
-        except Exception as e:
-            raise ImportError(f"Unable to resolve ann backend: '{backend}'") from e
diff --git a/research/utils/ann/faiss.py b/research/utils/ann/faiss.py
deleted file mode 100755
index 9de3536d..00000000
--- a/research/utils/ann/faiss.py
+++ /dev/null
@@ -1,130 +0,0 @@
-import math
-
-import numpy as np
-
-from faiss import index_factory, IO_FLAG_MMAP, METRIC_INNER_PRODUCT, read_index, write_index
-
-from .base import ANN
-
-
-class Faiss(ANN):
-    """
-    Builds an ANN index using the Faiss library.
-    """
-
-    def load(self, path):
-        # Load index
-        self.backend = read_index(path, IO_FLAG_MMAP if self.setting("mmap") is True else 0)
-
-    def index(self, embeddings):
-        # Compute model training size
-        train, sample = embeddings, self.setting("sample")
-        if sample:
-            indices = sorted(np.random.choice(train.shape[0], int(sample * train.shape[0]), replace=False))
-            train = train[indices]
-
-        # Configure embeddings index. Inner product is equal to cosine similarity on normalized vectors.
-        params = self.configure(embeddings.shape[0], train.shape[0])
-        self.backend = index_factory(embeddings.shape[1], params, METRIC_INNER_PRODUCT)
-
-        # Train model
-        self.backend.train(train)
-
-        # Add embeddings - position in embeddings is used as the id
-        self.backend.add_with_ids(embeddings, np.arange(embeddings.shape[0], dtype=np.int64))
-
-        # Add id offset and index build metadata
-        self.config["offset"] = embeddings.shape[0]
-        self.metadata({"components": params})
-
-    def append(self, embeddings):
-        new = embeddings.shape[0]
-
-        # Append new ids - position in embeddings + existing offset is used as the id
-        self.backend.add_with_ids(embeddings, np.arange(self.config["offset"], self.config["offset"] + new, dtype=np.int64))
-
-        # Update id offset and index metadata
-        self.config["offset"] += new
-        self.metadata()
-
-    def delete(self, ids):
-        # Remove specified ids
-        self.backend.remove_ids(np.array(ids, dtype=np.int64))
-
-    def search(self, queries, limit):
-        # Run the query
-        self.backend.nprobe = self.nprobe()
-        scores, ids = self.backend.search(queries, limit)
-
-        # Map results to [(id, score)]
-        results = []
-        for x, score in enumerate(scores):
-            results.append(list(zip(ids[x].tolist(), score.tolist())))
-
-        return results
-
-    def count(self):
-        return self.backend.ntotal
-
-    def save(self, path):
-        # Write index
-        write_index(self.backend, path)
-
-    def configure(self, count, train):
-        """
-        Configures settings for a new index.
-
-        Args:
-            count: initial number of embeddings rows
-            train: number of rows selected for model training
-
-        Returns:
-            user-specified or generated components setting
-        """
-
-        # Lookup components setting
-        components = self.setting("components")
-
-        if components:
-            return components
-
-        # Get storage setting
-        storage = "SQ8" if self.setting("quantize", self.config.get("quantize")) else "Flat"
-
-        # Small index, use storage directly with IDMap
-        if count <= 5000:
-            return f"IDMap,{storage}"
-
-        x = self.cells(train)
-        components = f"IVF{x},{storage}"
-
-        return components
-
-    def cells(self, count):
-        """
-        Calculates the number of IVF cells for an IVF index.
-
-        Args:
-            count: number of embeddings rows
-
-        Returns:
-            number of IVF cells
-        """
-
-        # Calculate number of IVF cells where x = min(4 * sqrt(embeddings count), embeddings count / 39)
-        # Faiss requires at least 39 * x data points
-        return min(round(4 * math.sqrt(count)), int(count / 39))
-
-    def nprobe(self):
-        """
-        Gets or derives the nprobe search parameter.
-
-        Returns:
-            nprobe setting
-        """
-
-        # Get size of embeddings index
-        count = self.count()
-
-        default = 6 if count <= 5000 else round(self.cells(count) / 16)
-        return self.setting("nprobe", default)
diff --git a/research/utils/ann/numpy.py b/research/utils/ann/numpy.py
deleted file mode 100755
index 2a977a95..00000000
--- a/research/utils/ann/numpy.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import numpy as np
-import pickle
-
-from research.utils.ann.base import ANN
-
-
-class NumPy(ANN):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.all, self.cat, self.dot, self.zeros = np.all, np.concatenate, np.dot, np.zeros
-
-    def load(self, path):
-        with open(path, "rb") as handle:
-            self.backend = self.tensor(pickle.load(handle))
-
-    def index(self, embeddings):
-        self.backend = self.tensor(embeddings)
-
-        self.config["offset"] = embeddings.shape[0]
-        self.metadata(self.settings())
-
-    def append(self, embeddings):
-        new = embeddings.shape[0]
-
-        self.backend = self.cat((self.backend, self.tensor(embeddings)), axis=0)
-
-        self.config["offset"] += new
-        self.metadata()
-
-    def delete(self, ids):
-        ids = [x for x in ids if x < self.backend.shape[0]]
-
-        self.backend[ids] = self.tensor(self.zeros((len(ids), self.backend.shape[1])))
-
-    def search(self, queries, limit):
-        scores = self.dot(self.tensor(queries), self.backend.T).tolist()
-        return [sorted(enumerate(score), key=lambda x: x[1], reverse=True)[:limit] for score in scores]
-
-    def count(self):
-        return self.backend[~self.all(self.backend == 0, axis=1)].shape[0]
-
-    def save(self, path):
-        with open(path, "wb") as handle:
-            pickle.dump(self.backend, handle)
-
-    def tensor(self, array):
-        return array
-
-    def settings(self):
-        return {"numpy": np.__version__}
diff --git a/research/utils/ann/torch.py b/research/utils/ann/torch.py
deleted file mode 100755
index 20181353..00000000
--- a/research/utils/ann/torch.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import numpy as np
-import torch
-from research.utils.ann.numpy import NumPy
-
-
-class Torch(NumPy):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.all, self.cat, self.dot, self.zeros = torch.all, torch.cat, torch.mm, torch.zeros
-
-    def tensor(self, array):
-        if isinstance(array, np.ndarray):
-            array = torch.from_numpy(array)
-
-        if torch.cuda.is_available():
-            return array.cuda()
-        return array
-
-    def settings(self):
-        return {"torch": torch.__version__}