| """ | |
| utils.py | |
| """ | |
| # Standard imports | |
| import os | |
| from typing import List | |
| # Third party imports | |
| import numpy as np | |
| from openai import OpenAI | |
| client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) | |

# Maximum input tokens for the text-embedding-3 models. We don't use a tokenizer
# here, so we conservatively assume 1 character = 1 token when truncating.
MAX_TOKENS = 8191


def get_embeddings(
    texts: List[str], model: str = "text-embedding-3-large"
) -> List[List[float]]:
    """
    Generate embeddings for a list of texts using the OpenAI API synchronously.

    Args:
        texts: List of strings to embed.
        model: OpenAI embedding model to use (default: text-embedding-3-large).

    Returns:
        A list of embeddings (each embedding is a list of floats).

    Raises:
        Exception: If the OpenAI API call fails.
    """
    # Truncate texts to the (approximate) max token limit
    truncated_texts = [text[:MAX_TOKENS] for text in texts]

    # Make the API call
    response = client.embeddings.create(input=truncated_texts, model=model)

    # Extract embeddings from the response as plain lists of floats,
    # matching the declared List[List[float]] return type
    embeddings = [data.embedding for data in response.data]
    return embeddings
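

# --- Usage sketch (added for illustration; not part of the original utility) ---
# A minimal example of calling get_embeddings(). It assumes OPENAI_API_KEY is
# set in the environment (as required by the client above); the sample sentences
# and the cosine-similarity check are illustrative only.
if __name__ == "__main__":
    sample_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "OpenAI embedding models map text to dense float vectors.",
    ]
    vectors = get_embeddings(sample_texts)

    # Each embedding is a list of floats; text-embedding-3-large returns
    # 3072-dimensional vectors by default.
    print(f"Got {len(vectors)} embeddings of dimension {len(vectors[0])}")

    # Cosine similarity between the two embeddings, using the numpy import above.
    a, b = np.array(vectors[0]), np.array(vectors[1])
    cosine = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
    print(f"Cosine similarity: {cosine:.4f}")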