Update vector_store_retriever.py
Browse files
- vector_store_retriever.py +0 -20
vector_store_retriever.py
CHANGED
|
@@ -96,26 +96,6 @@ load_model("meta-llama/Llama-2-70b-chat-hf")
|
|
| 96 |
#####
|
| 97 |
#########
|
| 98 |
|
| 99 |
-
from langchain.document_loaders import PyPDFDirectoryLoader
|
| 100 |
-
from langchain.document_loaders.utils import RecursiveCharacterTextSplitter
|
| 101 |
-
from langchain.vectorstores import Chroma
|
| 102 |
-
|
| 103 |
-
def load_and_process_pdfs(directory_path: str, chunk_size: int = 500, chunk_overlap: int = 200, collection_name: str = "my-collection"):
|
| 104 |
-
# Load PDF files from the specified directory
|
| 105 |
-
loader = PyPDFDirectoryLoader(directory_path)
|
| 106 |
-
documents = loader.load()
|
| 107 |
-
|
| 108 |
-
# Split the text into chunks
|
| 109 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
| 110 |
-
texts = text_splitter.split_documents(documents)
|
| 111 |
-
|
| 112 |
-
# Create a Chroma vector store from the processed texts
|
| 113 |
-
db = Chroma.from_documents(texts, hf, collection_name=collection_name)
|
| 114 |
-
|
| 115 |
-
return db # You can return the Chroma vector store if needed
|
| 116 |
-
|
| 117 |
-
# Call the function with the desired directory path and parameters
|
| 118 |
-
load_and_process_pdfs("new_papers/")
|
| 119 |
|
| 120 |
###
|
| 121 |
###
|
|
|
|
| 96 |
#####
|
| 97 |
#########
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
###
|
| 101 |
###
|