UMCU committed (verified)
Commit 772b2e0 · Parent: c2e82c6

Update app.py

Files changed (1):
  1. app.py +74 -12
app.py CHANGED
@@ -25,7 +25,7 @@ except ImportError:
 }
 MODEL_SETTINGS = {"max_length": 512}
 VIZ_SETTINGS = {
-    "max_perplexity_display": 1000.0,
+    "max_perplexity_display": 5000.0,
     "color_scheme": {
         "low_perplexity": {"r": 46, "g": 204, "b": 113},
         "medium_perplexity": {"r": 241, "g": 196, "b": 15},
@@ -97,6 +97,45 @@ cached_models = {}
 cached_tokenizers = {}
 
 
+def is_special_character(token):
+    """
+    Check if a token is only special characters/punctuation.
+
+    Args:
+        token: The token string to check
+
+    Returns:
+        True if token contains only special characters, False otherwise
+
+    Examples:
+        >>> is_special_character(".")
+        True
+        >>> is_special_character(",")
+        True
+        >>> is_special_character("hello")
+        False
+        >>> is_special_character("Ġ,")
+        True
+        >>> is_special_character("##!")
+        True
+    """
+    # Clean up common tokenizer artifacts
+    clean_token = (
+        token.replace("</w>", "")
+        .replace("##", "")
+        .replace("Ġ", "")
+        .replace("Ċ", "")
+        .strip()
+    )
+
+    # Check if empty after cleaning
+    if not clean_token:
+        return True
+
+    # Check if token contains only punctuation and special characters
+    return all(not c.isalnum() for c in clean_token)
+
+
 def load_model_and_tokenizer(model_name, model_type):
     """Load and cache model and tokenizer"""
     cache_key = f"{model_name}_{model_type}"
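Note: the docstring's doctest examples double as a quick sanity check for the new helper; a minimal sketch, assuming app.py is importable from the working directory:

from app import is_special_character  # assumption: app.py is on the import path

# Cases taken from the docstring above
assert is_special_character(".")
assert is_special_character(",")
assert not is_special_character("hello")
assert is_special_character("Ġ,")    # byte-level BPE space marker + comma
assert is_special_character("##!")   # WordPiece continuation marker + punctuation
print("is_special_character behaves as documented")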
@@ -184,17 +223,23 @@ def calculate_decoder_perplexity(text, model, tokenizer):
     # Get tokens (excluding the first one since we predict next tokens)
     tokens = tokenizer.convert_ids_to_tokens(input_ids[0][1:])
 
-    # Clean up tokens for display
+    # Clean up tokens for display and filter special characters
     cleaned_tokens = []
-    for token in tokens:
+    filtered_perplexities = []
+    for token, token_perp in zip(tokens, token_perplexities):
+        # Skip special characters
+        if is_special_character(token):
+            continue
+
         if token.startswith("Ġ"):
             cleaned_tokens.append(token[1:])  # Remove Ġ prefix
         elif token.startswith("##"):
             cleaned_tokens.append(token[2:])  # Remove ## prefix
         else:
             cleaned_tokens.append(token)
+        filtered_perplexities.append(token_perp)
 
-    return perplexity, cleaned_tokens, token_perplexities
+    return perplexity, cleaned_tokens, np.array(filtered_perplexities)
 
 
 def calculate_encoder_perplexity(
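Note: because the loop now zips tokens with token_perplexities and appends to both output lists in the same iteration, the i-th cleaned token keeps the i-th perplexity. A small illustration of that invariant (token strings and values invented for the example):

import numpy as np
from app import is_special_character  # assumption: app.py is importable

tokens = ["ĠThe", "Ġcat", "Ġ.", "Ġsat"]
token_perplexities = np.array([12.0, 35.5, 3.1, 48.2])

cleaned_tokens, filtered_perplexities = [], []
for token, token_perp in zip(tokens, token_perplexities):
    if is_special_character(token):  # drops "Ġ." and its perplexity together
        continue
    cleaned_tokens.append(token[1:] if token.startswith("Ġ") else token)
    filtered_perplexities.append(token_perp)

print(cleaned_tokens)                   # ['The', 'cat', 'sat']
print(np.array(filtered_perplexities))  # [12.  35.5 48.2]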
@@ -303,15 +348,23 @@ def calculate_encoder_perplexity(
303
  # Fallback if no samples collected (shouldn't happen with proper min_samples)
304
  token_perplexities.append(2.0)
305
 
306
- # Clean up tokens for display
307
  cleaned_tokens = []
308
- for token in tokens:
 
 
 
 
 
 
 
309
  if token.startswith("##"):
310
  cleaned_tokens.append(token[2:])
311
  else:
312
  cleaned_tokens.append(token)
 
313
 
314
- return overall_perplexity, cleaned_tokens, np.array(token_perplexities)
315
 
316
 
317
  def perplexity_to_color(perplexity, min_perp=1, max_perp=1000):
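Note: this hunk also checks input_ids against special_token_ids, which is built elsewhere in app.py and not shown in this diff; with a transformers tokenizer that set would typically come from all_special_ids. A sketch, assuming a BERT-style checkpoint:

from transformers import AutoTokenizer

# assumption: any encoder checkpoint; the actual model names live in app.py's config
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
special_token_ids = set(tokenizer.all_special_ids)  # ids of [CLS], [SEP], [PAD], [UNK], [MASK]
print(special_token_ids)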
@@ -365,7 +418,7 @@ def create_visualization(tokens, perplexities):
         return "<p>No tokens to visualize.</p>"
 
     # Cap perplexities for better visualization
-    max_perplexity = min(np.max(perplexities), VIZ_SETTINGS["max_perplexity_display"])
+    max_perplexity = np.max(perplexities)
 
     # Normalize perplexities to 0-1 range for color mapping
    normalized_perplexities = np.clip(perplexities / max_perplexity, 0, 1)
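Note: dropping the cap changes how outliers map to colors: previously any token above max_perplexity_display saturated at 1.0, while now the single worst token defines the top of the scale. A comparison on invented values:

import numpy as np

perplexities = np.array([5.0, 80.0, 7500.0])

# Old behaviour: cap at VIZ_SETTINGS["max_perplexity_display"] (5000.0 after this commit)
capped_max = min(np.max(perplexities), 5000.0)
print(np.clip(perplexities / capped_max, 0, 1))  # [0.001 0.016 1.   ] -- outlier saturates

# New behaviour: normalize by the observed maximum
print(np.clip(perplexities / np.max(perplexities), 0, 1))  # [~0.0007 ~0.0107 1.]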
@@ -389,20 +442,29 @@ def create_visualization(tokens, perplexities):
         if not token.strip():
             continue
 
+        # Skip special characters (already filtered in calculation functions)
+        if is_special_character(token):
+            continue
+
         # Clean token for display
+        # </w>, ##, Ġ, Ċ
         clean_token = (
-            token.replace("</w>", "").replace("##", "").replace("Ġ", "").strip()
+            token.replace("</w>", "")
+            .replace("##", "")
+            .replace("Ġ", "")
+            .replace("Ċ", "")
+            .strip()
         )
         if not clean_token:
             continue
 
         # Add space before token if needed
-        if i > 0 and not clean_token[0] in ".,!?;:":
+        if i > 0 and clean_token[0] not in ".,!?;:":
             html_parts.append(" ")
 
         # Get color thresholds from configuration
-        low_thresh = VIZ_SETTINGS.get("thresholds", {}).get("low_threshold", 0.3)
-        high_thresh = VIZ_SETTINGS.get("thresholds", {}).get("high_threshold", 0.7)
+        # low_thresh = VIZ_SETTINGS.get("thresholds", {}).get("low_threshold", 0.3)
+        # high_thresh = VIZ_SETTINGS.get("thresholds", {}).get("high_threshold", 0.7)
 
         # Get colors from configuration
         # low_color = VIZ_SETTINGS["color_scheme"]["low_perplexity"]
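Note: the four markers stripped by the chained replace calls come from different tokenizer families: Ġ (byte-level BPE word boundary) and Ċ (byte-level BPE newline) as in GPT-2, ## (WordPiece continuation) as in BERT, and </w> (word-end marker in classic BPE vocabularies). A quick demonstration of the cleanup:

for raw in ["Ġhello", "Ċ", "##ing", "world</w>"]:
    clean = (
        raw.replace("</w>", "")
        .replace("##", "")
        .replace("Ġ", "")
        .replace("Ċ", "")
        .strip()
    )
    print(repr(raw), "->", repr(clean))
# 'Ġhello' -> 'hello', 'Ċ' -> '', '##ing' -> 'ing', 'world</w>' -> 'world'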