rohit
committed on
Commit
·
761b11c
1
Parent(s):
9474afa
Deploy RAG Pipeline with FastAPI and ML capabilities
Browse files- app/__init__.py +1 -0
- app/__pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/config.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/__pycache__/pipeline.cpython-311.pyc +0 -0
- app/config.py +158 -0
- app/main.py +128 -0
- app/pipeline.py +133 -0
- requirements.txt +12 -0
- start.sh +25 -0
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
app/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (176 Bytes). View file
|
|
|
app/__pycache__/config.cpython-311.pyc
ADDED
|
Binary file (5.34 kB). View file
|
|
|
app/__pycache__/main.cpython-311.pyc
ADDED
|
Binary file (13.7 kB). View file
|
|
|
app/__pycache__/pipeline.cpython-311.pyc
ADDED
|
Binary file (7.24 kB). View file
|
|
|
app/config.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Optional, List
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from haystack.dataclasses import ChatMessage
|
| 4 |
+
|
| 5 |
+
@dataclass
class DatasetConfig:
    """Configuration describing how to load one HF dataset and prompt over it.

    Attributes:
        name: Hugging Face dataset identifier (e.g. "fka/awesome-chatgpt-prompts").
        split: Dataset split to load; defaults to "train".
        content_field: Dataset column used as the Document content.
        fields: Mapping of document-metadata key -> dataset column name;
            None means no extra metadata fields are extracted.
        prompt_template: Jinja-style chat prompt template for this dataset.
    """
    name: str
    split: str = "train"
    content_field: str = "content"
    # Fixed annotation: the default is None, not a dict, so the type is
    # Optional (also avoids any temptation toward a mutable default).
    # Consumers already guard with `if self.config.fields:`.
    fields: Optional[Dict[str, str]] = None  # Dictionary of field mappings
    prompt_template: Optional[str] = None
|
| 12 |
+
|
| 13 |
+
# Default configurations for different datasets
# NOTE(review): each prompt_template below is a runtime string rendered by
# ChatPromptBuilder; its exact whitespace and wording are part of program
# behavior and must not be reformatted.
DATASET_CONFIGS = {
    # Role-play prompts: each row carries an "act" (role name) and a "prompt".
    "awesome-chatgpt-prompts": DatasetConfig(
        name="fka/awesome-chatgpt-prompts",
        content_field="prompt",
        fields={
            "role": "act",
            "prompt": "prompt"
        },
        prompt_template="""
Given the following context where each document represents a prompt for a specific role,
please answer the question while considering both the role and the prompt content.

Available Contexts:
{% for document in documents %}
{% if document.meta.role %}Role: {{ document.meta.role }}{% endif %}
Content: {{ document.content }}
---
{% endfor %}

Question: {{question}}
Answer:
"""
    ),
    # Q/A dataset about software settings; answers follow a fixed
    # step-by-step "navigation path" format.
    "settings-dataset": DatasetConfig(
        name="syntaxhacker/rag_pipeline",
        content_field="context",
        fields={
            "question": "question",
            "answer": "answer",
            "context": "context"
        },
        prompt_template="""
Given the following context about software settings and configurations,
please answer the question accurately based on the provided information.

For each setting, provide a clear, step-by-step navigation path and include:
1. The exact location (Origin Type > Tab > Section > Setting name)
2. What the setting does
3. Available options/values
4. How to access and modify the setting
5. Reference screenshots (if available)

Format your answer as:
"To [accomplish task], follow these steps:

Location: [Origin Type] > [Tab] > [Section] > [Setting name]
Purpose: [describe what the setting does]
Options: [list available values/options]
How to set: [describe interaction method: toggle/select/input]

Visual Guide:
[Include reference image links if available]

For more details, you can refer to the screenshots above showing the exact location and interface."

Available Contexts:
{% for document in documents %}
Setting Info: {{ document.content }}
Reference Answer: {{ document.meta.answer }}
---
{% endfor %}

Question: {{question}}
Answer:
"""
    ),
    # Plain text passages; documents carry no extra metadata.
    "seven-wonders": DatasetConfig(
        name="bilgeyucel/seven-wonders",
        content_field="content",
        fields={},  # No additional fields needed
        prompt_template="""
Given the following information about the Seven Wonders, please answer the question.

Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
    ),
    # Paired-response dataset: each row has two candidate model responses.
    "psychology-dataset": DatasetConfig(
        name="jkhedri/psychology-dataset",
        split="train",
        content_field="question",  # Assuming we want to use the question as the content
        fields={
            "response_j": "response_j",  # Response from one model
            "response_k": "response_k"  # Response from another model
        },
        prompt_template="""
Given the following context where each document represents a psychological inquiry,
please answer the question based on the provided responses.

Available Contexts:
{% for document in documents %}
Question: {{ document.content }}
Response J: {{ document.meta.response_j }}
Response K: {{ document.meta.response_k }}
---
{% endfor %}

Question: {{question}}
Answer:
"""
    ),
    # Default dataset served by the API (see app/main.py); the answer column
    # is indexed as document content, question/context kept as metadata.
    "developer-portfolio": DatasetConfig(
        name="syntaxhacker/developer-portfolio-rag",
        split="train",
        content_field="answer",
        fields={
            "question": "question",
            "answer": "answer",
            "context": "context"
        },
        prompt_template="""
Given the following context about a software developer's skills, experience, and background,
please answer the question accurately based on the provided information.

For each query, provide detailed information about:
1. Technical skills and programming languages
2. Machine learning and AI experience
3. Projects and professional experience
4. Tools and frameworks used
5. Personal interests and learning approach

Available Contexts:
{% for document in documents %}
Question: {{ document.meta.question }}
Answer: {{ document.content }}
Context: {{ document.meta.context }}
---
{% endfor %}

Question: {{question}}
Answer:
"""
    ),
}
|
| 153 |
+
|
| 154 |
+
# Default configuration for embedding and LLM models
MODEL_CONFIG = {
    # Sentence-transformers model used for both document and query embeddings.
    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    # Google Gemini chat model used for answer generation.
    "llm_model": "gemini-2.0-flash-exp",
}
|
app/main.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
import sys
|
| 6 |
+
from .config import DATASET_CONFIGS
|
| 7 |
+
# Lazy imports to avoid blocking startup
|
| 8 |
+
# from .pipeline import RAGPipeline # Will import when needed
|
| 9 |
+
# import umap # Will import when needed for visualization
|
| 10 |
+
# import plotly.express as px # Will import when needed for visualization
|
| 11 |
+
# import plotly.graph_objects as go # Will import when needed for visualization
|
| 12 |
+
# from plotly.subplots import make_subplots # Will import when needed for visualization
|
| 13 |
+
# import numpy as np # Will import when needed for visualization
|
| 14 |
+
# from sklearn.preprocessing import normalize # Will import when needed for visualization
|
| 15 |
+
# import pandas as pd # Will import when needed for visualization
|
| 16 |
+
import json
|
| 17 |
+
|
| 18 |
+
# Configure logging
# Everything goes to stdout so container platforms capture it.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

app = FastAPI(title="RAG Pipeline API", description="Multi-dataset RAG API", version="1.0.0")

# Initialize pipelines for all datasets
# Maps dataset name -> RAGPipeline; populated later by load_datasets_background().
pipelines = {}
google_api_key = os.getenv("GOOGLE_API_KEY")

logger.info(f"Starting RAG Pipeline API")
logger.info(f"Port from env: {os.getenv('PORT', 'Not set - will use 8000')}")
logger.info(f"Google API Key present: {'Yes' if google_api_key else 'No'}")
logger.info(f"Available datasets: {list(DATASET_CONFIGS.keys())}")

# Don't load datasets during startup - do it asynchronously after server starts
logger.info("RAG Pipeline API is ready to serve requests - datasets will load in background")

# Visualization function disabled to speed up startup
# def create_3d_visualization(pipeline):
# ... (commented out for faster startup)
|
| 41 |
+
|
| 42 |
+
# Visualization function disabled to speed up startup
|
| 43 |
+
# def create_3d_visualization(pipeline):
|
| 44 |
+
# ... (commented out for faster startup)
|
| 45 |
+
|
| 46 |
+
class Question(BaseModel):
    """Request body for POST /answer."""
    # The user's question text.
    text: str
    # Which loaded pipeline to query; must be a key in `pipelines`.
    dataset: str = "developer-portfolio"  # Default dataset
|
| 49 |
+
|
| 50 |
+
@app.post("/answer")
async def get_answer(question: Question):
    """Answer a question with the RAG pipeline of the requested dataset.

    Returns the generated answer, or a "still loading" notice while
    background dataset loading has not yet finished.

    Raises:
        HTTPException 400: the requested dataset is not loaded/known.
        HTTPException 500: any unexpected pipeline failure.
    """
    try:
        # Check if any pipelines are loaded
        if not pipelines:
            return {
                "answer": "RAG Pipeline is running but datasets are still loading in the background. Please try again in a moment, or check /health for loading status.",
                "dataset": question.dataset,
                "status": "datasets_loading"
            }

        # Select the appropriate pipeline based on dataset
        if question.dataset not in pipelines:
            raise HTTPException(status_code=400, detail=f"Dataset '{question.dataset}' not available. Available datasets: {list(pipelines.keys())}")

        selected_pipeline = pipelines[question.dataset]
        answer = selected_pipeline.answer_question(question.text)
        return {"answer": answer, "dataset": question.dataset}
    except HTTPException:
        # Bug fix: the generic handler below used to catch the intentional
        # 400 raised above and re-wrap it as a 500. Let HTTP errors through.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 70 |
+
|
| 71 |
+
@app.get("/datasets")
async def list_datasets():
    """Return the names of every dataset whose pipeline is currently loaded."""
    # Iterating a dict yields its keys, in insertion order.
    loaded = list(pipelines)
    return {"datasets": loaded}
|
| 75 |
+
|
| 76 |
+
async def load_datasets_background():
    """Load datasets in background after server starts.

    Runs the blocking pipeline construction in a worker thread so the event
    loop (and endpoints such as /health) stays responsive while the dataset
    downloads and embeds. Failures are logged, never raised.
    """
    global pipelines
    if google_api_key:
        # Import heavy dependencies only when needed
        import asyncio
        from .pipeline import RAGPipeline
        # Only load developer-portfolio to save memory
        dataset_name = "developer-portfolio"
        try:
            logger.info(f"Loading dataset: {dataset_name}")
            # Bug fix: RAGPipeline.from_preset blocks (dataset download +
            # document embedding). Running it directly in this coroutine froze
            # the event loop; asyncio.to_thread keeps the server responsive.
            pipeline = await asyncio.to_thread(
                RAGPipeline.from_preset,
                google_api_key=google_api_key,
                preset_name=dataset_name,
            )
            pipelines[dataset_name] = pipeline
            logger.info(f"Successfully loaded {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to load {dataset_name}: {e}")
        logger.info(f"Background loading complete - {len(pipelines)} datasets loaded")
    else:
        logger.warning("No Google API key provided - running in demo mode without datasets")
|
| 97 |
+
|
| 98 |
+
@app.on_event("startup")
async def startup_event():
    """Log startup info and kick off background dataset loading."""
    logger.info("FastAPI application startup complete")
    logger.info(f"Server should be running on port: {os.getenv('PORT', '8000')}")

    # Start loading datasets in background (non-blocking)
    import asyncio
    # Bug fix: keep a reference to the task; the result of a bare
    # create_task() call can be garbage-collected before it finishes.
    app.state.dataset_load_task = asyncio.create_task(load_datasets_background())
|
| 106 |
+
|
| 107 |
+
@app.on_event("shutdown")
async def shutdown_event():
    """Log that the application is going down (no cleanup needed)."""
    logger.info("FastAPI application shutting down")
|
| 110 |
+
|
| 111 |
+
@app.get("/")
async def root():
    """Service banner: status, version, and currently loaded datasets."""
    payload = {
        "status": "ok",
        "message": "RAG Pipeline API",
        "version": "1.0.0",
        "datasets": list(pipelines.keys()),
    }
    return payload
|
| 115 |
+
|
| 116 |
+
@app.get("/health")
async def health_check():
    """Health check endpoint reporting background-loading progress."""
    logger.info("Health check called")
    # Loading is "complete" once the single target dataset is available.
    portfolio_ready = "developer-portfolio" in pipelines
    return {
        "status": "healthy",
        "datasets_loaded": len(pipelines),
        "total_datasets": 1,  # Only loading developer-portfolio
        "loading_status": "complete" if portfolio_ready else "loading",
        "port": os.getenv('PORT', '8000'),
    }
|
| 128 |
+
|
app/pipeline.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from haystack import Document, Pipeline
|
| 2 |
+
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
| 3 |
+
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
|
| 4 |
+
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
|
| 5 |
+
from haystack.components.builders import ChatPromptBuilder
|
| 6 |
+
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiChatGenerator
|
| 7 |
+
from datasets import load_dataset
|
| 8 |
+
from haystack.dataclasses import ChatMessage
|
| 9 |
+
from typing import Optional, List, Union, Dict
|
| 10 |
+
from .config import DatasetConfig, DATASET_CONFIGS, MODEL_CONFIG
|
| 11 |
+
|
| 12 |
+
class RAGPipeline:
    """Embeds a dataset into an in-memory store and answers questions with
    retrieval-augmented generation via Google Gemini."""

    def __init__(
        self,
        google_api_key: str,
        dataset_config: Union[str, DatasetConfig],
        documents: Optional[List[Union[str, Document]]] = None,
        embedding_model: Optional[str] = None,
        llm_model: Optional[str] = None
    ):
        """
        Initialize the RAG Pipeline.

        Args:
            google_api_key: API key for Google AI services
            dataset_config: Either a string key from DATASET_CONFIGS or a DatasetConfig object
            documents: Optional list of documents to use instead of loading from a dataset
            embedding_model: Optional override for embedding model
            llm_model: Optional override for LLM model

        Raises:
            ValueError: if a string ``dataset_config`` is not a known preset key.
        """
        # NOTE(review): google_api_key is accepted but never used below;
        # GoogleAIGeminiChatGenerator presumably reads GOOGLE_API_KEY from the
        # environment - confirm, otherwise this argument is dead.
        # Load configuration
        if isinstance(dataset_config, str):
            if dataset_config not in DATASET_CONFIGS:
                raise ValueError(f"Dataset config '{dataset_config}' not found. Available configs: {list(DATASET_CONFIGS.keys())}")
            self.config = DATASET_CONFIGS[dataset_config]
        else:
            self.config = dataset_config

        # Load documents either from provided list or dataset
        if documents is not None:
            self.documents = documents
        else:
            dataset = load_dataset(self.config.name, split=self.config.split)
            # Create documents with metadata based on configuration
            self.documents = []
            for doc in dataset:
                # Create metadata dictionary from configured fields
                # (missing dataset columns are silently skipped).
                meta = {}
                if self.config.fields:
                    for meta_key, dataset_field in self.config.fields.items():
                        if dataset_field in doc:
                            meta[meta_key] = doc[dataset_field]

                # Create document with content and metadata
                document = Document(
                    content=doc[self.config.content_field],
                    meta=meta
                )
                self.documents.append(document)

        # print 10 documents
        # NOTE(review): debug prints to stdout; consider routing through
        # logging instead - left untouched here.
        for doc in self.documents[:10]:
            print(f"Content: {doc.content}")
            print(f"Metadata: {doc.meta}")
            print("-"*100)

        # Initialize components
        self.document_store = InMemoryDocumentStore()
        self.doc_embedder = SentenceTransformersDocumentEmbedder(
            model=embedding_model or MODEL_CONFIG["embedding_model"]
        )
        self.text_embedder = SentenceTransformersTextEmbedder(
            model=embedding_model or MODEL_CONFIG["embedding_model"]
        )
        self.retriever = InMemoryEmbeddingRetriever(self.document_store)

        # Warm up the document embedder
        self.doc_embedder.warm_up()

        # Initialize prompt template
        # NOTE(review): prompt_template may be None for a custom DatasetConfig;
        # ChatMessage.from_user(None) would then fail - confirm callers always
        # provide a template.
        template = [
            ChatMessage.from_user(self.config.prompt_template)
        ]
        self.prompt_builder = ChatPromptBuilder(template=template)

        # Initialize the generator
        self.generator = GoogleAIGeminiChatGenerator(
            model=llm_model or MODEL_CONFIG["llm_model"]
        )

        # Index documents (embeds everything up front - blocking)
        self._index_documents(self.documents)

        # Build pipeline
        self.pipeline = self._build_pipeline()

    @classmethod
    def from_preset(cls, google_api_key: str, preset_name: str):
        """
        Create a pipeline from a preset configuration.

        Args:
            google_api_key: API key for Google AI services
            preset_name: Name of the preset configuration to use
        """
        return cls(google_api_key=google_api_key, dataset_config=preset_name)

    def _index_documents(self, documents):
        # Embed and index documents into the in-memory store.
        docs_with_embeddings = self.doc_embedder.run(documents)
        self.document_store.write_documents(docs_with_embeddings["documents"])

    def _build_pipeline(self):
        # Assemble the query-time graph:
        # text_embedder -> retriever -> prompt_builder -> llm.
        pipeline = Pipeline()
        pipeline.add_component("text_embedder", self.text_embedder)
        pipeline.add_component("retriever", self.retriever)
        pipeline.add_component("prompt_builder", self.prompt_builder)
        pipeline.add_component("llm", self.generator)

        # Connect components
        pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
        pipeline.connect("retriever", "prompt_builder")
        pipeline.connect("prompt_builder.prompt", "llm.messages")

        return pipeline

    def answer_question(self, question: str) -> str:
        """Run the RAG pipeline to answer a question"""
        # The question feeds both the embedder (for retrieval) and the
        # prompt builder (for the template's {{question}} slot).
        result = self.pipeline.run({
            "text_embedder": {"text": question},
            "prompt_builder": {"question": question}
        })
        # Return the text of the first LLM reply.
        return result["llm"]["replies"][0].text
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core RAG stack
haystack-ai==2.10.3
datasets==3.3.2
sentence-transformers==3.4.1
google-ai-haystack==5.1.0
# Web API
fastapi==0.115.4
uvicorn==0.31.0
beautifulsoup4==4.12.0 # Stable HTML parsing
# Visualization / analysis (imports are currently commented out in app/main.py)
umap-learn==0.5.4
plotly==5.22.0
scikit-learn==1.4.1.post1
numpy>=1.20.0
pandas>=1.3.0
|
start.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Container entrypoint: print diagnostics, then hand control to uvicorn.
set -e  # Exit on any error

echo "=== RAG Pipeline Startup ==="
echo "PORT environment variable: ${PORT:-'not set'}"
echo "Using port: ${PORT:-8000}"
if [ -n "$GOOGLE_API_KEY" ]; then
    echo "Google API Key present: Yes"
else
    echo "Google API Key present: No"
fi
echo "Starting uvicorn server..."
echo "=== End Startup Info ==="

# Debug the uvicorn command
echo "Current directory: $(pwd)"
echo "Python path: $PYTHONPATH"
echo "Contents of current directory:"
ls -la
echo "Contents of app directory:"
ls -la app/
echo "Testing Python import:"
python -c "import app.main; print('Import successful')" || echo "Import failed"
echo "Starting uvicorn..."
# Bug fix: exec replaces this shell with uvicorn so container signals
# (SIGTERM on shutdown) reach the server directly and graceful shutdown
# works; port is quoted defensively.
exec uvicorn app.main:app --host 0.0.0.0 --port "${PORT:-8000}" --log-level info
|