Spaces:
Sleeping
Sleeping
chore: cleanup
Browse files
- app/app.py +2 -38
app/app.py
CHANGED
|
@@ -34,26 +34,14 @@ def process_file(*, file: AskFileResponse) -> list:
|
|
| 34 |
with NamedTemporaryFile() as tempfile:
|
| 35 |
tempfile.write(file.content)
|
| 36 |
|
| 37 |
-
######################################################################
|
| 38 |
-
#
|
| 39 |
-
# 1. Load the PDF
|
| 40 |
-
#
|
| 41 |
-
######################################################################
|
| 42 |
loader = PDFPlumberLoader(tempfile.name)
|
| 43 |
|
| 44 |
-
######################################################################
|
| 45 |
documents = loader.load()
|
| 46 |
|
| 47 |
-
######################################################################
|
| 48 |
-
#
|
| 49 |
-
# 2. Split the text
|
| 50 |
-
#
|
| 51 |
-
######################################################################
|
| 52 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 53 |
chunk_size=3000,
|
| 54 |
chunk_overlap=100
|
| 55 |
)
|
| 56 |
-
######################################################################
|
| 57 |
|
| 58 |
docs = text_splitter.split_documents(documents)
|
| 59 |
|
|
@@ -72,16 +60,10 @@ def create_search_engine(*, file: AskFileResponse) -> VectorStore:
|
|
| 72 |
docs = process_file(file=file)
|
| 73 |
cl.user_session.set("docs", docs)
|
| 74 |
|
| 75 |
-
##########################################################################
|
| 76 |
-
#
|
| 77 |
-
# 3. Set the Encoder model for creating embeddings
|
| 78 |
-
#
|
| 79 |
-
##########################################################################
|
| 80 |
encoder = OpenAIEmbeddings(
|
| 81 |
model="text-embedding-ada-002"
|
| 82 |
)
|
| 83 |
-
|
| 84 |
-
|
| 85 |
# Initialize Chromadb client and settings, reset to ensure we get a clean
|
| 86 |
# search engine
|
| 87 |
client = chromadb.EphemeralClient()
|
|
@@ -95,20 +77,12 @@ def create_search_engine(*, file: AskFileResponse) -> VectorStore:
|
|
| 95 |
)
|
| 96 |
search_engine._client.reset()
|
| 97 |
|
| 98 |
-
##########################################################################
|
| 99 |
-
#
|
| 100 |
-
# 4. Create the document search engine. Remember to add
|
| 101 |
-
# client_settings using the above settings.
|
| 102 |
-
#
|
| 103 |
-
##########################################################################
|
| 104 |
-
|
| 105 |
search_engine = Chroma.from_documents(
|
| 106 |
client=client,
|
| 107 |
documents=docs,
|
| 108 |
embedding=encoder,
|
| 109 |
client_settings=client_settings
|
| 110 |
)
|
| 111 |
-
##########################################################################
|
| 112 |
|
| 113 |
return search_engine
|
| 114 |
|
|
@@ -140,27 +114,17 @@ async def start():
|
|
| 140 |
streaming=True
|
| 141 |
)
|
| 142 |
|
| 143 |
-
##########################################################################
|
| 144 |
-
#
|
| 145 |
-
# 5. Create the chain / tool for RetrievalQAWithSourcesChain.
|
| 146 |
-
#
|
| 147 |
-
##########################################################################
|
| 148 |
chain = RetrievalQAWithSourcesChain.from_chain_type(
|
| 149 |
llm=llm,
|
| 150 |
chain_type="stuff",
|
| 151 |
retriever=search_engine.as_retriever(max_tokens_limit=4097),
|
| 152 |
-
|
| 153 |
-
# 6. Customize prompts to improve summarization and question
|
| 154 |
-
# answering performance. Perhaps create your own prompt in prompts.py?
|
| 155 |
-
######################################################################
|
| 156 |
chain_type_kwargs={
|
| 157 |
"prompt": PROMPT,
|
| 158 |
"document_prompt": EXAMPLE_PROMPT
|
| 159 |
},
|
| 160 |
)
|
| 161 |
-
##########################################################################
|
| 162 |
|
| 163 |
-
# await msg.update(content=f"`{file.name}` processed. You can now ask questions!")
|
| 164 |
msg.content = f"`{file.name}` processed. You can now ask questions!"
|
| 165 |
await msg.update()
|
| 166 |
|
|
|
|
| 34 |
with NamedTemporaryFile() as tempfile:
|
| 35 |
tempfile.write(file.content)
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
loader = PDFPlumberLoader(tempfile.name)
|
| 38 |
|
|
|
|
| 39 |
documents = loader.load()
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 42 |
chunk_size=3000,
|
| 43 |
chunk_overlap=100
|
| 44 |
)
|
|
|
|
| 45 |
|
| 46 |
docs = text_splitter.split_documents(documents)
|
| 47 |
|
|
|
|
| 60 |
docs = process_file(file=file)
|
| 61 |
cl.user_session.set("docs", docs)
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
encoder = OpenAIEmbeddings(
|
| 64 |
model="text-embedding-ada-002"
|
| 65 |
)
|
| 66 |
+
|
|
|
|
| 67 |
# Initialize Chromadb client and settings, reset to ensure we get a clean
|
| 68 |
# search engine
|
| 69 |
client = chromadb.EphemeralClient()
|
|
|
|
| 77 |
)
|
| 78 |
search_engine._client.reset()
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
search_engine = Chroma.from_documents(
|
| 81 |
client=client,
|
| 82 |
documents=docs,
|
| 83 |
embedding=encoder,
|
| 84 |
client_settings=client_settings
|
| 85 |
)
|
|
|
|
| 86 |
|
| 87 |
return search_engine
|
| 88 |
|
|
|
|
| 114 |
streaming=True
|
| 115 |
)
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
chain = RetrievalQAWithSourcesChain.from_chain_type(
|
| 118 |
llm=llm,
|
| 119 |
chain_type="stuff",
|
| 120 |
retriever=search_engine.as_retriever(max_tokens_limit=4097),
|
| 121 |
+
|
|
|
|
|
|
|
|
|
|
| 122 |
chain_type_kwargs={
|
| 123 |
"prompt": PROMPT,
|
| 124 |
"document_prompt": EXAMPLE_PROMPT
|
| 125 |
},
|
| 126 |
)
|
|
|
|
| 127 |
|
|
|
|
| 128 |
msg.content = f"`{file.name}` processed. You can now ask questions!"
|
| 129 |
await msg.update()
|
| 130 |
|