sirjosev committed on
Commit
8b988cb
·
1 Parent(s): 2553602

repair to v1.2

Browse files
Files changed (2) hide show
  1. app.py +108 -72
  2. requirements.txt +3 -2
app.py CHANGED
@@ -2,119 +2,155 @@ import gradio as gr
2
  import cv2
3
  import whisper
4
  import torch
5
- from fer import FER
 
6
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
7
 
8
- # --- 1. LOAD MODELS ---
9
- print("Sedang memuat model... Mohon tunggu.")
10
 
11
- # A. Model Otak: SmolLM (Gunakan versi Instruct agar bisa diajak chat/analisis)
12
- # Kita gunakan versi 135M atau 360M agar ringan di CPU Hugging Face Space gratis
13
- model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
14
  tokenizer = AutoTokenizer.from_pretrained(model_id)
15
  smol_lm = AutoModelForCausalLM.from_pretrained(model_id)
16
 
17
- # B. Model Telinga: Whisper untuk Transkripsi
18
- whisper_model = whisper.load_model("tiny") # Gunakan 'tiny' atau 'base' agar cepat
19
 
20
- # C. Model Mata: FER untuk Emosi Wajah
21
- face_detector = FER(mtcnn=True) # MTCNN lebih akurat
 
22
 
23
- # --- 2. FUNGSI PEMROSESAN ---
 
 
24
 
25
- def analyze_emotion(video_path):
26
- """
27
- Fungsi utama yang memproses video user.
28
- """
29
- if not video_path:
30
- return "Mohon upload video terlebih dahulu."
31
 
32
- # --- LANGKAH 1: Analisis Audio (Transkripsi) ---
33
- # Whisper otomatis ekstrak audio dari file video
34
- audio_result = whisper_model.transcribe(video_path)
35
- transcribed_text = audio_result["text"]
36
-
37
- # --- LANGKAH 2: Analisis Visual (Mimik Muka) ---
38
- # Kita ambil beberapa frame dari video untuk dicek emosinya
39
  cap = cv2.VideoCapture(video_path)
40
  emotions_list = []
41
-
42
  frame_count = 0
 
 
43
  while cap.isOpened():
44
  ret, frame = cap.read()
45
  if not ret:
46
  break
47
 
48
- # Cek setiap 30 frame (agar tidak terlalu berat)
49
  if frame_count % 30 == 0:
50
- # FER mendeteksi emosi dominan di frame ini
51
- top_emotion, score = face_detector.top_emotion(frame)
52
- if top_emotion:
53
- emotions_list.append(top_emotion)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  frame_count += 1
55
  cap.release()
 
 
 
 
 
 
56
 
57
- # Hitung emosi yang paling sering muncul (Modus)
58
- if emotions_list:
59
- dominant_facial_emotion = max(set(emotions_list), key=emotions_list.count)
60
- else:
61
- dominant_facial_emotion = "Netral/Tidak Terdeteksi"
62
 
63
- # --- LANGKAH 3: Analisis Agen (SmolLM3) ---
64
- # Kita buat prompt agar SmolLM bertindak sebagai psikolog/analis
 
 
 
 
 
 
 
 
 
65
 
66
- system_prompt = "You are an expert AI emotional analyst. Analyze the user's state based on their facial expression and spoken words."
 
67
 
68
  user_input = f"""
69
- DATA INPUT:
70
- 1. Transcribed Text: "{transcribed_text}"
71
- 2. Facial Expression Detected: {dominant_facial_emotion}
72
 
73
- TUGAS:
74
- Jelaskan emosi apa yang dirasakan orang ini? Apakah kata-katanya (teks) cocok dengan ekspresi wajahnya? Berikan kesimpulan singkat dalam Bahasa Indonesia.
 
 
75
  """
76
 
77
- # Format prompt sesuai template chat SmolLM
78
  messages = [
79
  {"role": "system", "content": system_prompt},
80
  {"role": "user", "content": user_input},
81
  ]
82
 
83
- input_text = tokenizer.apply_chat_template(messages, tokenize=False)
84
- inputs = tokenizer.encode(input_text, return_tensors="pt")
85
 
86
- # Generate jawaban
87
- outputs = smol_lm.generate(inputs, max_new_tokens=200, temperature=0.7, top_p=0.9)
88
- analysis_result = tokenizer.decode(outputs[0], skip_special_tokens=True)
89
 
90
- # Bersihkan output (hapus prompt asli dari hasil)
91
- final_response = analysis_result.split("assistant")[-1].strip()
 
 
 
 
92
 
93
- return final_response, transcribed_text, dominant_facial_emotion
94
 
95
- # --- 3. MEMBUAT UI DENGAN GRADIO ---
96
 
97
- with gr.Blocks() as demo:
98
- gr.Markdown("# 🧠 SmolLM3 Emotion Agent")
99
- gr.Markdown("Upload video pendek (berbicara ke kamera), AI akan mendeteksi: **Mimik Wajah + Teks Ucapan**.")
100
-
101
- with gr.Row():
102
- video_input = gr.Video(label="Upload Video atau Rekam via Webcam")
 
 
103
 
104
- submit_btn = gr.Button("Analisis Emosi")
105
-
106
- with gr.Row():
107
- output_analysis = gr.Textbox(label="Analisis SmolLM3 (Agent)", lines=5)
108
-
109
- with gr.Row():
110
- output_text = gr.Textbox(label="Teks Terdeteksi")
111
- output_face = gr.Textbox(label="Emosi Wajah Dominan")
 
112
 
113
  submit_btn.click(
114
- fn=analyze_emotion,
115
- inputs=video_input,
116
- outputs=[output_analysis, output_text, output_face]
117
  )
118
 
119
- # Jalankan aplikasi
120
- demo.launch()
 
2
  import cv2
3
  import whisper
4
  import torch
5
+ import numpy as np
6
+ from PIL import Image
7
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
8
 
9
# --- 1. SETUP & LOAD MODELS ---
# Everything below runs once at import time; the request handlers reuse
# these module-level objects instead of reloading models per call.
print("Sedang memuat model... Mohon tunggu sebentar.")

# A. "Brain": SmolLM agent.
# The Instruct checkpoint is used so the model can be prompted as a chat.
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
smol_lm = AutoModelForCausalLM.from_pretrained(model_id)

# B. "Ears": Whisper (audio to text).
whisper_model = whisper.load_model("tiny")

# C. "Eyes": image-classification pipeline for facial emotion.
# Replaces the earlier FER dependency with a Hugging Face hosted model.
emotion_classifier = pipeline(
    task="image-classification",
    model="dima806/facial_emotions_image_detection",
)

# D. Face detection: the frontal-face Haar cascade bundled with OpenCV,
# used to locate the face region before classification.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
)
28
 
29
+ # --- 2. FUNGSI LOGIKA ---
 
 
 
 
 
30
 
31
def get_dominant_emotion(video_path, sample_every=30):
    """Return the facial-emotion label that occurs most often in a video.

    Every ``sample_every``-th frame is searched for faces with the
    module-level ``face_cascade``. The first face in that frame that the
    module-level ``emotion_classifier`` labels without raising contributes
    one vote; remaining faces in the frame are ignored.

    Args:
        video_path: Path of the video file to analyse.
        sample_every: Analyse one frame out of this many. The default of 30
            is roughly one sample per second for 30 fps footage.

    Returns:
        The label with the most votes, or ``"Tidak ada wajah terdeteksi"``
        when no face was classified (including an empty ``video_path``).
        Ties are resolved in favour of the label that was seen first.

    Raises:
        ValueError: If ``sample_every`` is smaller than 1.
    """
    # Function-scope import so the file-level import block stays untouched.
    from collections import Counter

    if sample_every < 1:
        raise ValueError("sample_every must be >= 1")
    if not video_path:
        return "Tidak ada wajah terdeteksi"

    votes = Counter()
    cap = cv2.VideoCapture(video_path)
    try:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % sample_every == 0:
                # Haar cascades operate on single-channel images.
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                faces = face_cascade.detectMultiScale(gray, 1.1, 4)

                for (x, y, w, h) in faces:
                    # Crop the face and convert BGR -> RGB -> PIL, the input
                    # format handed to the classification pipeline.
                    face_roi = frame[y:y + h, x:x + w]
                    rgb_face = cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB)
                    pil_image = Image.fromarray(rgb_face)

                    try:
                        # results look like [{'label': 'happy', 'score': 0.9}, ...]
                        results = emotion_classifier(pil_image)
                        votes[results[0]['label']] += 1
                    except Exception as e:
                        # Best effort: log and try the next face in this frame.
                        print(f"Error detecting frame: {e}")
                        continue

                    # Only one successfully classified face per sampled frame.
                    break

            frame_count += 1
    finally:
        # Release the capture handle even if a frame blows up mid-loop.
        cap.release()

    if not votes:
        return "Tidak ada wajah terdeteksi"

    # most_common is stable, so the first-seen label wins a tie
    # (deterministic, unlike iterating a set of strings).
    return votes.most_common(1)[0][0]
76
 
77
def analyze_agent(video_path):
    """Analyse one video: transcribe speech, read the face, ask SmolLM.

    Args:
        video_path: Path of the uploaded or recorded video; a falsy value
            means nothing was provided.

    Returns:
        A 3-tuple ``(agent_response, transcribed_text, detected_emotion)``.
        When ``video_path`` is falsy the tuple is
        ``("Error", "Mohon upload video.", "N/A")``. When transcription
        fails, ``transcribed_text`` carries the failure message.
    """
    if not video_path:
        return "Error", "Mohon upload video.", "N/A"

    print(f"Processing video: {video_path}")

    # 1. Audio transcription ("ears").
    try:
        audio_result = whisper_model.transcribe(video_path)
        transcribed_text = audio_result["text"]
        speech_for_prompt = transcribed_text
    except Exception as e:
        # Show the failure to the user, but do not hand the error message to
        # the LLM as if it were something the speaker said.
        transcribed_text = f"Gagal transkripsi audio: {str(e)}"
        speech_for_prompt = "(transkripsi tidak tersedia)"

    # 2. Visual emotion detection ("eyes").
    detected_emotion = get_dominant_emotion(video_path)

    # 3. SmolLM analysis ("brain").
    system_prompt = "You are an expert AI psychological analyst. Analyze the user's emotion based on facial expression and text."

    user_input = f"""
    DATA DARI USER:
    - Teks Ucapan: "{speech_for_prompt}"
    - Ekspresi Wajah Dominan: {detected_emotion}

    INSTRUKSI:
    Analisis apakah ada kesesuaian antara ucapan dan ekspresi wajahnya.
    Jika wajah 'sad' tapi teks semangat, mungkin dia menyembunyikan sesuatu.
    Berikan kesimpulan singkat dalam Bahasa Indonesia.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input},
    ]

    # add_generation_prompt appends the assistant-turn header so the model
    # answers instead of continuing the user turn; return_dict also yields
    # the attention mask that generate() expects.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    prompt_len = inputs["input_ids"].shape[-1]

    # temperature only takes effect when sampling is enabled.
    outputs = smol_lm.generate(
        **inputs,
        max_new_tokens=250,
        do_sample=True,
        temperature=0.7,
    )

    # Decode only the newly generated tokens. Splitting the full text on the
    # word "assistant" breaks whenever the transcript or reply contains it.
    final_response = tokenizer.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    ).strip()

    return final_response, transcribed_text, detected_emotion
127
 
128
# --- 3. USER INTERFACE ---

# Centre the single-column layout and cap its width.
css = """
#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
"""

# NOTE(review): the UI text says "SmolLM3" while the loaded checkpoint is
# "HuggingFaceTB/SmolLM-135M-Instruct" — confirm which name is intended.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("## 🤖 SmolLM3 Multimodal Agent (Video Emotion)")
        gr.Markdown("Upload video Anda berbicara. AI akan melihat ekspresi wajah dan mendengar ucapan Anda.")

        # Input: an uploaded file or a webcam recording.
        video_input = gr.Video(sources=["upload", "webcam"])
        submit_btn = gr.Button("Analisis Emosi", variant="primary")

        gr.Markdown("### Hasil Analisis Agent")
        output_agent = gr.Textbox(label="Pendapat SmolLM3", lines=4)

        with gr.Row():
            output_text = gr.Textbox(label="Transkrip Suara")
            output_face = gr.Textbox(label="Deteksi Wajah")

    # Output order matches the tuple returned by analyze_agent:
    # (agent response, transcript, detected emotion).
    submit_btn.click(
        fn=analyze_agent,
        inputs=[video_input],
        outputs=[output_agent, output_text, output_face],
    )

# Only start the server when the file is run as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -3,8 +3,9 @@ torch
3
  torchaudio
4
  gradio
5
  opencv-python-headless
6
- fer
7
  openai-whisper
8
  numpy
9
  scipy
10
- accelerate
 
 
 
3
  torchaudio
4
  gradio
5
  opencv-python-headless
 
6
  openai-whisper
7
  numpy
8
  scipy
9
+ accelerate
10
+ pillow
11
+ timm