Az-r-ow commited on
Commit
ee9d38f
·
1 Parent(s): d914a99

chore(deepl): padding sentences and labels

Browse files
Files changed (1) hide show
  1. deepl_ner.ipynb +230 -14
deepl_ner.ipynb CHANGED
@@ -6,26 +6,38 @@
6
  "source": [
7
  "# Deep learning NER\n",
8
  "\n",
9
- "In this notebook, we will discover two deep learning techniques for Named Entity Recognition (or NER). \n",
10
  "\n",
11
  "- LSTM (Long Short Term Memory)\n",
12
- "- Transformers"
13
  ]
14
  },
15
  {
16
  "cell_type": "code",
17
- "execution_count": 5,
18
  "metadata": {},
19
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
20
  "source": [
21
  "from app.travel_resolver.libs.nlp import data_processing as dp\n",
22
  "\n",
23
- "sentences, labels, vocab, unique_labels = dp.from_bio_file_to_examples('./data/bio/fr.bio/fr.sentences.bio')"
 
 
24
  ]
25
  },
26
  {
27
  "cell_type": "code",
28
- "execution_count": 6,
29
  "metadata": {},
30
  "outputs": [],
31
  "source": [
@@ -33,18 +45,62 @@
33
  "processed_labels = []\n",
34
  "\n",
35
  "for sentence, label in zip(sentences, labels):\n",
36
- " sentence, label = dp.process_sentence(sentence, stemming=True, return_tokens=True, labels_to_adapt=label)\n",
 
 
37
  " processed_sentences.append(sentence)\n",
38
- " processed_labels.append(label)\n"
39
  ]
40
  },
41
  {
42
  "cell_type": "code",
43
- "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "metadata": {},
45
  "outputs": [],
46
  "source": [
47
- "def encode_sentence(sentence: str, vocab: list[str]):\n",
 
 
 
48
  " \"\"\"\n",
49
  " Encode a sentence into a list of integers\n",
50
  "\n",
@@ -55,10 +111,170 @@
55
  " Returns:\n",
56
  " list: The list of integers\n",
57
  " \"\"\"\n",
58
- " return [\n",
59
  " vocab.index(word) if word in vocab else vocab.index(\"<UNK>\")\n",
60
- " for word in sentence.split(\" \")\n",
61
- " ]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  ]
63
  }
64
  ],
@@ -78,7 +294,7 @@
78
  "name": "python",
79
  "nbconvert_exporter": "python",
80
  "pygments_lexer": "ipython3",
81
- "version": "3.10.1"
82
  }
83
  },
84
  "nbformat": 4,
 
6
  "source": [
7
  "# Deep learning NER\n",
8
  "\n",
9
+ "In this notebook, we will discover two deep learning techniques for Named Entity Recognition (or NER).\n",
10
  "\n",
11
  "- LSTM (Long Short Term Memory)\n",
12
+ "- Transformers\n"
13
  ]
14
  },
15
  {
16
  "cell_type": "code",
17
+ "execution_count": null,
18
  "metadata": {},
19
+ "outputs": [
20
+ {
21
+ "name": "stderr",
22
+ "output_type": "stream",
23
+ "text": [
24
+ "[nltk_data] Downloading package punkt_tab to /Users/az-r-\n",
25
+ "[nltk_data] ow/nltk_data...\n",
26
+ "[nltk_data] Package punkt_tab is already up-to-date!\n"
27
+ ]
28
+ }
29
+ ],
30
  "source": [
31
  "from app.travel_resolver.libs.nlp import data_processing as dp\n",
32
  "\n",
33
+ "sentences, labels, vocab, unique_labels = dp.from_bio_file_to_examples(\n",
34
+ " \"./data/bio/fr.bio/10k_samples.bio\"\n",
35
+ ")"
36
  ]
37
  },
38
  {
39
  "cell_type": "code",
40
+ "execution_count": null,
41
  "metadata": {},
42
  "outputs": [],
43
  "source": [
 
45
  "processed_labels = []\n",
46
  "\n",
47
  "for sentence, label in zip(sentences, labels):\n",
48
+ " sentence, label = dp.process_sentence(\n",
49
+ " sentence, stemming=True, return_tokens=True, labels_to_adapt=label\n",
50
+ " )\n",
51
  " processed_sentences.append(sentence)\n",
52
+ " processed_labels.append(label)"
53
  ]
54
  },
55
  {
56
  "cell_type": "code",
57
+ "execution_count": 31,
58
+ "metadata": {},
59
+ "outputs": [
60
+ {
61
+ "data": {
62
+ "image/png": "",
63
+ "text/plain": [
64
+ "<Figure size 640x480 with 1 Axes>"
65
+ ]
66
+ },
67
+ "metadata": {},
68
+ "output_type": "display_data"
69
+ }
70
+ ],
71
+ "source": [
72
+ "import matplotlib.pyplot as plt\n",
73
+ "\n",
74
+ "plt.hist([len(sentence) for sentence in processed_sentences], bins=50)\n",
75
+ "plt.title(\"Histogram of sentence lengths\")\n",
76
+ "\n",
77
+ "plt.show()"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 48,
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "\"\"\"\n",
87
+ " This variable will control the maximum length of the sentence \n",
88
+ " as well as the embedding size\n",
89
+ "\"\"\"\n",
90
+ "\n",
91
+ "MAX_LEN = 30"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 49,
97
  "metadata": {},
98
  "outputs": [],
99
  "source": [
100
+ "import tensorflow as tf\n",
101
+ "\n",
102
+ "\n",
103
+ "def encode_and_pad_sentence(sentence: str, vocab: list[str], max_length: int = MAX_LEN):\n",
104
  " \"\"\"\n",
105
  " Encode a sentence into a list of integers\n",
106
  "\n",
 
111
  " Returns:\n",
112
  " list: The list of integers\n",
113
  " \"\"\"\n",
114
+ " encoded_sentence = [\n",
115
  " vocab.index(word) if word in vocab else vocab.index(\"<UNK>\")\n",
116
+ " for word in sentence\n",
117
+ " ]\n",
118
+ "\n",
119
+ " return tf.keras.utils.pad_sequences(\n",
120
+ " [encoded_sentence], maxlen=max_length, padding=\"post\", value=0\n",
121
+ " )[0]"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 25,
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "get_vocab_from_corpus = lambda corpus: list(\n",
131
+ " set([word for sentence in corpus for word in sentence])\n",
132
+ ") + [\"<UNK>\"]"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 26,
138
+ "metadata": {},
139
+ "outputs": [],
140
+ "source": [
141
+ "vocab = get_vocab_from_corpus(processed_sentences)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 29,
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "encoded_sentences = [\n",
151
+ " encode_and_pad_sentence(sentence, vocab) for sentence in processed_sentences\n",
152
+ "]"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 50,
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "padded_labels = tf.keras.preprocessing.sequence.pad_sequences(\n",
162
+ " processed_labels, maxlen=MAX_LEN, padding=\"post\", value=-1\n",
163
+ ")"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 47,
169
+ "metadata": {},
170
+ "outputs": [
171
+ {
172
+ "name": "stderr",
173
+ "output_type": "stream",
174
+ "text": [
175
+ "2024-11-09 16:56:24.038756: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence\n"
176
+ ]
177
+ }
178
+ ],
179
+ "source": [
180
+ "dataset = tf.data.Dataset.from_tensor_slices((encoded_sentences, padded_labels))\n",
181
+ "\n",
182
+ "# Split the dataset into a training and testing dataset\n",
183
+ "train_dataset, test_dataset = tf.keras.utils.split_dataset(dataset, left_size=0.8)"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 51,
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "lstm = tf.keras.models.Sequential(\n",
193
+ " layers=[\n",
194
+ " tf.keras.layers.Embedding(len(vocab) + 1, MAX_LEN, mask_zero=True),\n",
195
+ " tf.keras.layers.LSTM(MAX_LEN, return_sequences=True),\n",
196
+ " tf.keras.layers.Dense(len(unique_labels), activation=tf.nn.log_softmax),\n",
197
+ " ]\n",
198
+ ")"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "markdown",
203
+ "metadata": {},
204
+ "source": [
205
+ "## Masked loss and metrics\n",
206
+ "\n",
207
+ "Before training the model, we need to create our own function to compute the accuracy. TensorFlow has built-in accuracy metrics, but they do not let us specify values to be ignored. This matters for the calculations, since the padded positions must be excluded.\n",
208
+ "\n",
209
+ "Usually, the metric that takes true labels and predicted labels as input and outputs how often they match is called accuracy. In some cases, however, there is one more step before getting the predicted labels. This may happen if, instead of passing the predicted labels, a vector of probabilities is passed. In such a case, there is a need to perform an `argmax` for each prediction to find the appropriate predicted label. Such situations happen very often, therefore TensorFlow has a set of functions, with the prefix `Sparse`, that perform this operation in the backend. Unfortunately, it does not provide a way to ignore values in the accuracy case. This is what we will work on now.\n",
210
+ "\n",
211
+ "Note that the model's prediction has 3 axes:\n",
212
+ "\n",
213
+ "- the number of examples (batch size)\n",
214
+ "- the number of words in each example (every sentence is padded or truncated to `MAX_LEN` tokens)\n",
215
+ "- the number of possible targets (one per named entity tag, i.e. `len(unique_labels)`).\n",
216
+ "\n",
217
+ "Another important function is the loss function. In this case, we will use the Cross Entropy loss, but we need a multiclass implementation of it, and more precisely its Sparse version. TensorFlow provides a `SparseCategoricalCrossentropy` loss function, available as `tf.keras.losses.SparseCategoricalCrossentropy`.\n",
218
+ "\n",
219
+ "SparseCategoricalCrossentropy: The Sparse Categorical Crossentropy Loss Function.\n",
220
+ "\n",
221
+ "The arguments you will need:\n",
222
+ "\n",
223
+ "1. `from_logits`: This indicates if the values are raw values or normalized values (probabilities). Since the last layer of the model finishes with a LogSoftMax call, the results are not normalized - they do not lie between 0 and 1.\n",
224
+ "2. `ignore_class`: This indicates which class should be ignored when computing the crossentropy. Remember that the labels are padded with the value -1, so this is the class to ignore.\n"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "class CustomSparseCategoricalCrossentropy(tf.keras.losses.Loss):\n",
234
+ " def __init__(self, from_logits=False, ignore_class=-1):\n",
235
+ " super().__init__()\n",
236
+ " self.from_logits = from_logits\n",
237
+ " self.ignore_class = ignore_class\n",
238
+ "\n",
239
+ " def call(self, y_true, y_pred):\n",
240
+ " # Ensure inputs are tensors\n",
241
+ " y_true = tf.convert_to_tensor(y_true)\n",
242
+ " y_pred = tf.convert_to_tensor(y_pred)\n",
243
+ "\n",
244
+ " # Generate a mask that is False where y_true equals ignore_class and True elsewhere\n",
245
+ " mask = tf.not_equal(y_true, self.ignore_class)\n",
246
+ "\n",
247
+ " # Use this mask to filter out ignored values from y_true and y_pred\n",
248
+ " y_true_filtered = tf.boolean_mask(y_true, mask)\n",
249
+ " y_pred_filtered = tf.boolean_mask(y_pred, mask)\n",
250
+ "\n",
251
+ " # Compute the sparse categorical crossentropy on filtered targets and predictions\n",
252
+ " loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
253
+ " y_true_filtered, y_pred_filtered, from_logits=self.from_logits\n",
254
+ " )\n",
255
+ "\n",
256
+ " # Return the mean loss value\n",
257
+ " return tf.reduce_mean(loss)\n",
258
+ "\n",
259
+ "\n",
260
+ "def masked_loss(y_true, y_pred):\n",
261
+ " \"\"\"\n",
262
+ " Calculate the masked sparse categorical cross-entropy loss.\n",
263
+ "\n",
264
+ " Parameters:\n",
265
+ " y_true (tensor): True labels.\n",
266
+ " y_pred (tensor): Predicted logits.\n",
267
+ "\n",
268
+ " Returns:\n",
269
+ " loss (tensor): Calculated loss.\n",
270
+ " \"\"\"\n",
271
+ "\n",
272
+ " # Calculate the loss for each item in the batch. Remember to pass the right arguments, as discussed above!\n",
273
+ " loss_fn = CustomSparseCategoricalCrossentropy(from_logits=True, ignore_class=-1)\n",
274
+ " # Use the previous defined function to compute the loss\n",
275
+ " loss = loss_fn(y_true, y_pred)\n",
276
+ "\n",
277
+ " return loss"
278
  ]
279
  }
280
  ],
 
294
  "name": "python",
295
  "nbconvert_exporter": "python",
296
  "pygments_lexer": "ipython3",
297
+ "version": "3.12.4"
298
  }
299
  },
300
  "nbformat": 4,