Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -43,18 +43,6 @@ class Inference:
|
|
| 43 |
self.sae = sae
|
| 44 |
self.cfg_dict = cfg_dict
|
| 45 |
|
| 46 |
-
def get_feature_info(self):
|
| 47 |
-
projection_onto_unembed = self.sae.W_dec @ self.model.W_U
|
| 48 |
-
# get the top ten words associated with the given feature
|
| 49 |
-
WORD_COUNT = 10
|
| 50 |
-
_, inds = torch.topk(projection_onto_unembed, WORD_COUNT, dim=1)
|
| 51 |
-
|
| 52 |
-
_, sv_feature_acts = self._get_sae_out_and_feature_activations()
|
| 53 |
-
features = self._get_features(sv_feature_acts)
|
| 54 |
-
breakpoint();
|
| 55 |
-
associated_words = [self.model.to_str_tokens(inds[f]) for f in features]
|
| 56 |
-
return associated_words
|
| 57 |
-
|
| 58 |
def _get_sae_out_and_feature_activations(self):
|
| 59 |
# given the words in steering_vector_prompt, the SAE predicts that the neurons (aka features) in activationCache will be activated
|
| 60 |
sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
|
|
@@ -79,7 +67,9 @@ class Inference:
|
|
| 79 |
# return torch.topk(sv_feature_acts, 1).indices.tolist()
|
| 80 |
features = torch.topk(sv_feature_activations, 1).indices
|
| 81 |
print(f'features that align with the text prompt: {features}')
|
| 82 |
-
|
|
|
|
|
|
|
| 83 |
|
| 84 |
def _get_steering_hook(self, feature, sae_out):
|
| 85 |
coeff = self.coeff
|
|
@@ -101,7 +91,7 @@ class Inference:
|
|
| 101 |
# and not use the separate function _get_steering_hook()
|
| 102 |
sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
|
| 103 |
features = self._get_features(sv_feature_acts)
|
| 104 |
-
steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features]
|
| 105 |
|
| 106 |
return steering_hooks
|
| 107 |
|
|
@@ -159,15 +149,6 @@ def slow_echo_steering(message, history):
|
|
| 159 |
time.sleep(0.01)
|
| 160 |
yield result[: i + 1]
|
| 161 |
|
| 162 |
-
def populate_related_features():
|
| 163 |
-
features = chatbot_model.get_feature_info()
|
| 164 |
-
print(features)
|
| 165 |
-
return features[0]
|
| 166 |
-
# for feature in features:
|
| 167 |
-
# for i in range(len(feature)):
|
| 168 |
-
# time.sleep(0.01)
|
| 169 |
-
# yield feature[: i + 1]
|
| 170 |
-
|
| 171 |
with gr.Blocks() as demo:
|
| 172 |
with gr.Row():
|
| 173 |
gr.Markdown("*STANDARD HEXTER BOT*")
|
|
@@ -197,9 +178,6 @@ with gr.Blocks() as demo:
|
|
| 197 |
)
|
| 198 |
with gr.Row():
|
| 199 |
steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
|
| 200 |
-
found_features = gr.Textbox(label="Found Features")
|
| 201 |
-
find_features = gr.Button("Find Related Features")
|
| 202 |
-
find_features.click(fn=populate_related_features,inputs=None, outputs=found_features)
|
| 203 |
with gr.Row():
|
| 204 |
coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
|
| 205 |
with gr.Row():
|
|
|
|
| 43 |
self.sae = sae
|
| 44 |
self.cfg_dict = cfg_dict
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
def _get_sae_out_and_feature_activations(self):
|
| 47 |
# given the words in steering_vector_prompt, the SAE predicts that the neurons (aka features) in activationCache will be activated
|
| 48 |
sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
|
|
|
|
| 67 |
# return torch.topk(sv_feature_acts, 1).indices.tolist()
|
| 68 |
features = torch.topk(sv_feature_activations, 1).indices
|
| 69 |
print(f'features that align with the text prompt: {features}')
|
| 70 |
+
print("pump the features into the tool that gives you the words associated with each feature")
|
| 71 |
+
return features
|
| 72 |
+
|
| 73 |
|
| 74 |
def _get_steering_hook(self, feature, sae_out):
|
| 75 |
coeff = self.coeff
|
|
|
|
| 91 |
# and not use the separate function _get_steering_hook()
|
| 92 |
sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
|
| 93 |
features = self._get_features(sv_feature_acts)
|
| 94 |
+
steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features[0]]
|
| 95 |
|
| 96 |
return steering_hooks
|
| 97 |
|
|
|
|
| 149 |
time.sleep(0.01)
|
| 150 |
yield result[: i + 1]
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
with gr.Blocks() as demo:
|
| 153 |
with gr.Row():
|
| 154 |
gr.Markdown("*STANDARD HEXTER BOT*")
|
|
|
|
| 178 |
)
|
| 179 |
with gr.Row():
|
| 180 |
steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
|
|
|
|
|
|
|
|
|
|
| 181 |
with gr.Row():
|
| 182 |
coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
|
| 183 |
with gr.Row():
|