Spaces:
Paused
Paused
dev/add-metadata-url #1 by terapyon - opened
app.py
CHANGED
|
@@ -5,7 +5,7 @@ from langchain.llms import OpenAI
|
|
| 5 |
from langchain.vectorstores import Qdrant
|
| 6 |
from openai.error import InvalidRequestError
|
| 7 |
from qdrant_client import QdrantClient
|
| 8 |
-
from config import
|
| 9 |
|
| 10 |
|
| 11 |
PERSIST_DIR_NAME = "nvdajp-book"
|
|
@@ -13,7 +13,7 @@ PERSIST_DIR_NAME = "nvdajp-book"
|
|
| 13 |
|
| 14 |
def get_retrieval_qa() -> RetrievalQA:
|
| 15 |
embeddings = OpenAIEmbeddings()
|
| 16 |
-
db_url, db_api_key, db_collection_name =
|
| 17 |
client = QdrantClient(url=db_url, api_key=db_api_key)
|
| 18 |
db = Qdrant(client=client, collection_name=db_collection_name, embeddings=embeddings)
|
| 19 |
retriever = db.as_retriever()
|
|
@@ -22,21 +22,17 @@ def get_retrieval_qa() -> RetrievalQA:
|
|
| 22 |
)
|
| 23 |
|
| 24 |
|
| 25 |
-
def _remove_prefix_path(p: str):
|
| 26 |
-
prefix = "data/rtdocs/nvdajp-book.readthedocs.io/"
|
| 27 |
-
return p.removeprefix(prefix)
|
| 28 |
-
|
| 29 |
-
|
| 30 |
def get_related_url(metadata):
|
| 31 |
-
|
| 32 |
-
url = "https://nvdajp-book.readthedocs.io/"
|
| 33 |
for m in metadata:
|
| 34 |
-
p = m['source']
|
| 35 |
-
|
| 36 |
-
if
|
| 37 |
continue
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
def main(query: str):
|
|
|
|
| 5 |
from langchain.vectorstores import Qdrant
|
| 6 |
from openai.error import InvalidRequestError
|
| 7 |
from qdrant_client import QdrantClient
|
| 8 |
+
from config import DB_CONFIG
|
| 9 |
|
| 10 |
|
| 11 |
PERSIST_DIR_NAME = "nvdajp-book"
|
|
|
|
| 13 |
|
| 14 |
def get_retrieval_qa() -> RetrievalQA:
|
| 15 |
embeddings = OpenAIEmbeddings()
|
| 16 |
+
db_url, db_api_key, db_collection_name = DB_CONFIG
|
| 17 |
client = QdrantClient(url=db_url, api_key=db_api_key)
|
| 18 |
db = Qdrant(client=client, collection_name=db_collection_name, embeddings=embeddings)
|
| 19 |
retriever = db.as_retriever()
|
|
|
|
| 22 |
)
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def get_related_url(metadata):
|
| 26 |
+
urls = set()
|
|
|
|
| 27 |
for m in metadata:
|
| 28 |
+
# p = m['source']
|
| 29 |
+
url = m["url"]
|
| 30 |
+
if url in urls:
|
| 31 |
continue
|
| 32 |
+
urls.add(url)
|
| 33 |
+
category = m["category"]
|
| 34 |
+
# print(m)
|
| 35 |
+
yield f'<p>URL: <a href="{url}">{url}</a> (category: {category})</p>'
|
| 36 |
|
| 37 |
|
| 38 |
def main(query: str):
|
config.py
CHANGED
|
@@ -1,8 +1,21 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
|
|
|
|
|
|
|
|
|
|
| 4 |
def get_db_config():
|
| 5 |
url = os.environ["QDRANT_URL"]
|
| 6 |
api_key = os.environ["QDRANT_API_KEY"]
|
| 7 |
collection_name = "nvdajp-book"
|
| 8 |
return url, api_key, collection_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
|
| 4 |
+
SAAS = True
|
| 5 |
+
|
| 6 |
+
|
| 7 |
def get_db_config():
|
| 8 |
url = os.environ["QDRANT_URL"]
|
| 9 |
api_key = os.environ["QDRANT_API_KEY"]
|
| 10 |
collection_name = "nvdajp-book"
|
| 11 |
return url, api_key, collection_name
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_local_db_congin():
|
| 15 |
+
url = "localhost"
|
| 16 |
+
# api_key = os.environ["QDRANT_API_KEY"]
|
| 17 |
+
collection_name = "nvdajp-book"
|
| 18 |
+
return url, None, collection_name
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
DB_CONFIG = get_db_config() if SAAS else get_local_db_congin()
|
store.py
CHANGED
|
@@ -3,16 +3,29 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
| 3 |
from langchain.embeddings import OpenAIEmbeddings
|
| 4 |
from langchain.vectorstores import Qdrant
|
| 5 |
# from qdrant_client import QdrantClient
|
| 6 |
-
from config import
|
| 7 |
|
| 8 |
|
| 9 |
CHUNK_SIZE = 500
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def get_documents(path: str):
|
| 13 |
loader = ReadTheDocsLoader(path, encoding="utf-8")
|
| 14 |
docs = loader.load()
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def get_text_chunk(docs):
|
|
@@ -23,7 +36,7 @@ def get_text_chunk(docs):
|
|
| 23 |
|
| 24 |
def store(texts):
|
| 25 |
embeddings = OpenAIEmbeddings()
|
| 26 |
-
db_url, db_api_key, db_collection_name =
|
| 27 |
# client = QdrantClient(url=db_url, api_key=db_api_key, prefer_grpc=True)
|
| 28 |
_ = Qdrant.from_documents(
|
| 29 |
texts,
|
|
@@ -48,6 +61,9 @@ if __name__ == "__main__":
|
|
| 48 |
args = sys.argv
|
| 49 |
if len(args) != 2:
|
| 50 |
print("No args, you need two args for html_path")
|
|
|
|
|
|
|
|
|
|
| 51 |
else:
|
| 52 |
path = args[1]
|
| 53 |
# dir_name = args[2]
|
|
|
|
| 3 |
from langchain.embeddings import OpenAIEmbeddings
|
| 4 |
from langchain.vectorstores import Qdrant
|
| 5 |
# from qdrant_client import QdrantClient
|
| 6 |
+
from config import DB_CONFIG
|
| 7 |
|
| 8 |
|
| 9 |
CHUNK_SIZE = 500
|
| 10 |
|
| 11 |
|
| 12 |
+
def _remove_prefix_path(p: str):
|
| 13 |
+
prefix = "data/rtdocs/nvdajp-book.readthedocs.io/"
|
| 14 |
+
return p.removeprefix(prefix)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
def get_documents(path: str):
|
| 18 |
loader = ReadTheDocsLoader(path, encoding="utf-8")
|
| 19 |
docs = loader.load()
|
| 20 |
+
base_url = "https://nvdajp-book.readthedocs.io/"
|
| 21 |
+
add_meta = {"category": "ja-book"}
|
| 22 |
+
for doc in docs:
|
| 23 |
+
org_metadata = doc.metadata
|
| 24 |
+
source = _remove_prefix_path(org_metadata["source"])
|
| 25 |
+
add_meta = {"category": "ja-book", "source": source, "url": f"{base_url}{source}"}
|
| 26 |
+
doc.metadata = org_metadata | add_meta
|
| 27 |
+
yield doc
|
| 28 |
+
# return docs
|
| 29 |
|
| 30 |
|
| 31 |
def get_text_chunk(docs):
|
|
|
|
| 36 |
|
| 37 |
def store(texts):
|
| 38 |
embeddings = OpenAIEmbeddings()
|
| 39 |
+
db_url, db_api_key, db_collection_name = DB_CONFIG
|
| 40 |
# client = QdrantClient(url=db_url, api_key=db_api_key, prefer_grpc=True)
|
| 41 |
_ = Qdrant.from_documents(
|
| 42 |
texts,
|
|
|
|
| 61 |
args = sys.argv
|
| 62 |
if len(args) != 2:
|
| 63 |
print("No args, you need two args for html_path")
|
| 64 |
+
docs = get_documents("data/rtdocs/nvdajp-book.readthedocs.io/ja/latest")
|
| 65 |
+
print(type(docs))
|
| 66 |
+
breakpoint()
|
| 67 |
else:
|
| 68 |
path = args[1]
|
| 69 |
# dir_name = args[2]
|