import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import onnxruntime as ort
import numpy as np
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download

MODEL_ID = "yuhueng/qwen3-4b-singlish-base-v3"  # replace with your model

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
)

REPO_ID = "govtech/lionguard-v1"
EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5" 
FILENAME = "models/lionguard-binary.onnx"

embedder = SentenceTransformer(EMBEDDING_MODEL)

model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
session = ort.InferenceSession(model_path)

def check_safety(text):
    # Embed the text with the BGE encoder, then score it with the
    # LionGuard binary ONNX classifier.
    embedding = embedder.encode([text], normalize_embeddings=True)

    # The ONNX session expects the embedding under its first input name.
    input_name = session.get_inputs()[0].name

    pred = session.run(None, {input_name: embedding.astype(np.float32)})[0]

    # Label 1 means the classifier flagged the text as unsafe.
    return "Unsafe" if pred[0] == 1 else "Safe"


@spaces.GPU(duration=60)
def inference(prompt: str, max_tokens: int = 256) -> dict:
    model.to("cuda")  # Move to GPU inside decorated function

    SYSTEM_PROMPT = """
    You are having a casual conversation with a user in Singapore.
    Keep responses helpful and friendly. Avoid sensitive topics like politics, religion, or race.
    If asked about harmful activities, politely decline.
    """
    
    messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # must add for generation
    )
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,  # required for temperature/top_p/top_k to take effect
        temperature=0.7,
        top_p=0.8,
        top_k=20,
    )

    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True
    )
    safety = check_safety(response)

    result = {"response": response, "safety": safety}
    return result
    
    # # Streaming variant (unreachable after the return above; kept for reference).
    # # Requires extra imports: TextIteratorStreamer from transformers and Thread
    # # from threading.
    # streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    
    # generation_kwargs = dict(
    #     **inputs,
    #     max_new_tokens=max_tokens,
    #     temperature=0.7,
    #     top_p=0.8,
    #     top_k=20,
    #     streamer=streamer,
    # )
    
    # # Run generation in separate thread
    # thread = Thread(target=model.generate, kwargs=generation_kwargs)
    # thread.start()
    
    # # Yield tokens as they come
    # generated_text = ""
    # for new_text in streamer:
    #     generated_text += new_text
    #     yield generated_text  # yield cumulative text for Gradio

demo = gr.Interface(
    fn=inference,
    inputs=[
        gr.Textbox(label="prompt"),
        gr.Number(value=256, label="max_tokens", precision=0)  # integer token budget
    ],
    outputs=gr.JSON(label="response"),  # inference() returns {"response": ..., "safety": ...}
    api_name="inference"  # explicit endpoint name: /inference
)

demo.launch()
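
# --- Example client call (sketch; not executed by this Space) ---
# Assumes the Space is deployed under a hypothetical ID "<username>/<space-name>";
# replace it with the real Space ID. Uses the official gradio_client package.
#
# from gradio_client import Client
#
# client = Client("<username>/<space-name>")
# result = client.predict(
#     "Recommend me a good chicken rice stall near Tanjong Pagar leh",  # prompt
#     256,                                                              # max_tokens
#     api_name="/inference",
# )
# print(result)  # {"response": "...", "safety": "Safe" or "Unsafe"}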