QuivrHQ · StanGirard · Oct 14, 2024 · Oct 8, 2024 · Oct 14, 2024 · Oct 14, 2024
diff --git a/backend/api/quivr_api/modules/sync/utils/normalize.py b/backend/api/quivr_api/modules/sync/utils/normalize.py
@@ -1,3 +1,4 @@
+import os
 import re
 import unicodedata
 
@@ -15,3 +16,35 @@ def remove_special_characters(input):
     except Exception as e:
         logger.error(f"Error removing special characters: {e}")
         return input
+
+
+def sanitize_filename(filename: str) -> str:
+    """
+    Sanitize the filename to make it usable.
+
+    Args:
+        filename (str): The original filename (a bare name, not a path).
+
+    Returns:
+        str: The sanitized filename, never longer than 255 characters.
+
+    This function:
+    1. Removes every character except word characters, "-", "_", "." and " "
+       (path separators such as "/" are therefore removed as well)
+    2. Handles double extensions by turning dots in the stem into "_"
+    3. Ensures the stem is not empty (falls back to "unnamed")
+    4. Truncates the extension, then the stem, to fit 255 characters
+    """
+    valid_chars = re.sub(r"[^\w\-_\. ]", "", filename)
+    name, ext = os.path.splitext(valid_chars)
+
+    # Cap the extension first so the stem always keeps at least one character;
+    # otherwise an over-long extension makes the stem budget negative and the
+    # result exceeds the limit.
+    ext = ext[:254]
+    name = name.replace(".", "_")
+    if not name:
+        name = "unnamed"
+    name = name[: 255 - len(ext)]
+
+    return f"{name}{ext}"
diff --git a/backend/api/quivr_api/modules/sync/utils/syncutils.py b/backend/api/quivr_api/modules/sync/utils/syncutils.py
@@ -29,6 +29,7 @@
     ISyncService,
     ISyncUserService,
 )
+from quivr_api.modules.sync.utils.normalize import sanitize_filename
 from quivr_api.modules.sync.utils.sync import BaseSync
 from quivr_api.modules.upload.service.upload_file import (
     check_file_exists,
@@ -168,6 +169,8 @@ async def process_sync_file(
         ]:
             raise ValueError(f"Incompatible file extension for {downloaded_file}")
 
+        storage_path = sanitize_filename(storage_path)
+
         response = await upload_file_storage(
             downloaded_file.file_data,
             storage_path,

diff --git a/backend/api/quivr_api/modules/upload/controller/upload_routes.py b/backend/api/quivr_api/modules/upload/controller/upload_routes.py
@@ -30,6 +30,7 @@
 from quivr_api.modules.notification.service.notification_service import (
     NotificationService,
 )
+from quivr_api.modules.sync.utils.normalize import sanitize_filename
 from quivr_api.modules.upload.service.upload_file import (
     upload_file_storage,
 )
@@ -85,12 +86,14 @@ async def upload_file(
             brain_id=str(brain_id),
         )
     )
+    file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}"
 
     background_tasks.add_task(
-        maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename}
+        maybe_send_telemetry, "upload_file", {"file_name": file_name}
     )
 
-    filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
+    filename_with_brain_id = str(brain_id) + "/" + file_name
+    filename_with_brain_id = sanitize_filename(filename_with_brain_id)
 
     buff_reader = io.BufferedReader(uploadFile.file)  # type: ignore
     try:
@@ -110,9 +113,9 @@ async def upload_file(
 
     knowledge_to_add = CreateKnowledgeProperties(
         brain_id=brain_id,
-        file_name=uploadFile.filename,
+        file_name=file_name,
         extension=os.path.splitext(
-            uploadFile.filename  # pyright: ignore reportPrivateUsage=none
+            file_name  # pyright: ignore reportPrivateUsage=none
         )[-1].lower(),
         source=integration if integration else "local",
         source_link=integration_link,  # FIXME: Should return the s3 link @chloedia
@@ -127,7 +130,7 @@ async def upload_file(
         "process_file_task",
         kwargs={
             "file_name": filename_with_brain_id,
-            "file_original_name": uploadFile.filename,
+            "file_original_name": file_name,
             "brain_id": brain_id,
             "notification_id": upload_notification.id,
             "knowledge_id": knowledge.id,

diff --git a/backend/core/MegaParse/megaparse/Converter.py b/backend/core/MegaParse/megaparse/Converter.py
@@ -320,22 +320,28 @@ async def convert(
         else:
             raise ValueError(f"Method {self.method} not supported")
 
-        if not gpt4o_cleaner:
-            return LangChainDocument(
-                page_content=parsed_md,
-                metadata={"filename": file_path.name, "type": "pdf"},
-            )
-        else:
+        if gpt4o_cleaner:
             md_processor = MarkdownProcessor(
                 parsed_md,
                 strict=True,
                 remove_pagination=True,
             )
             md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
-            return LangChainDocument(
-                page_content=md_cleaned,
-                metadata={"filename": file_path.name, "type": "pdf"},
-            )
+            parsed_md = md_cleaned
+
+        if (
+            len(parsed_md) < 5
+            and file_path.stat().st_size > 100
+            and self.strategy == "fast"
+        ):
+            logger.info(f"Switching to auto strategy for {file_path.name}")
+            self.strategy = "auto"
+            return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)
+
+        return LangChainDocument(
+            page_content=parsed_md,
+            metadata={"filename": file_path.name, "type": "pdf"},
+        )
 
     def save_md(self, md_content: str, file_path: Path | str) -> None:
         with open(file_path, "w") as f:

diff --git a/backend/core/quivr_core/processor/implementations/megaparse_processor.py b/backend/core/quivr_core/processor/implementations/megaparse_processor.py
@@ -59,7 +59,6 @@ def processor_metadata(self):
     async def process_file_inner(self, file: QuivrFile) -> list[Document]:
         mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config)  # type: ignore
         document: Document = await mega_parse.aload()
-        print("\n\n document: ", document.page_content)
         if len(document.page_content) > self.splitter_config.chunk_size:
             docs = self.text_splitter.split_documents([document])
             for doc in docs:

diff --git a/backend/worker/quivr_worker/celery_monitor.py b/backend/worker/quivr_worker/celery_monitor.py
@@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool:
         running currently.
     """
     active_tasks = celery.control.inspect().active()
+    if not active_tasks:
+        return False
+
     for worker, running_tasks in active_tasks.items():
         for task in running_tasks:
             if task["name"] == task_name:  # type: ignore