Skip to content

Commit

Permalink
Fix issue where only the first page of PDFs was indexed (#30)
Browse files: browse the repository at this point in the history
  • Loading branch information
amarszowski authored Aug 28, 2024
1 parent 1324dd1 commit 4d8e4ba
Showing 1 changed file with 12 additions and 7 deletions.
19 changes: 12 additions & 7 deletions flask_react/index_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,22 @@ def query_index(query_text):
def insert_into_index(doc_file_path, doc_id=None):
"""Insert new document into global index."""
global index, stored_docs
document = SimpleDirectoryReader(input_files=[doc_file_path]).load_data()[0]
if doc_id is not None:
document.id_ = doc_id
documents = SimpleDirectoryReader(input_files=[doc_file_path]).load_data()

with lock:
# Keep track of stored docs
stored_docs[document.id_] = document.text[0:200] # only take the first 200 chars
for document in documents:
if doc_id is not None:
document.id_ = doc_id
index.insert(document)

stored_docs[document.id_] = document.text[0:200] # only take the first 200 chars

index.insert(document)
index.storage_context.persist(persist_dir=index_name)


first_document = documents[0]
# Keep track of stored docs -- llama_index doesn't make this easy
stored_docs[first_document.doc_id] = first_document.text[0:200] # only take the first 200 chars

with open(pkl_name, "wb") as f:
pickle.dump(stored_docs, f)

Expand Down

0 comments on commit 4d8e4ba

Please sign in to comment.