Az-r-ow committed
Commit 48a770b · 1 Parent(s): c644122

fix(camembert): camembert fine-tuning

assets/bilstm_vs_blstm_pos.png ADDED

Git LFS Details

  • SHA256: fbc29ff9a7dbf0bef0f29b4bb6c67e852167dba5e420bf6fc819850ec57c187f
  • Pointer size: 130 Bytes
  • Size of remote file: 30.1 kB
assets/lstm.png ADDED

Git LFS Details

  • SHA256: 446b747b2ae6433775f6394d42cb043fd45f7e3a16fd271887840574bccb00f0
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB
assets/lstm_vs_lstm_with_pos.png ADDED

Git LFS Details

  • SHA256: 825bf06526fec22f499d802568d1eea3020f5d7b2a892b1126ac883daded6f70
  • Pointer size: 130 Bytes
  • Size of remote file: 30.3 kB
camemBERT_finetuning.ipynb CHANGED
@@ -11,30 +11,39 @@
11
  },
12
  {
13
  "cell_type": "code",
14
- "execution_count": 20,
15
  "metadata": {},
16
  "outputs": [
17
  {
18
  "name": "stdout",
19
  "output_type": "stream",
20
  "text": [
21
- "Requirement already satisfied: transformers in ./venv/lib/python3.12/site-packages (4.46.3)\n",
22
  "Requirement already satisfied: tf-keras in ./venv/lib/python3.12/site-packages (2.18.0)\n",
23
- "Collecting focal-loss\n",
24
- " Downloading focal_loss-0.0.7-py3-none-any.whl.metadata (5.1 kB)\n",
25
  "Requirement already satisfied: filelock in ./venv/lib/python3.12/site-packages (from transformers) (3.16.1)\n",
26
- "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in ./venv/lib/python3.12/site-packages (from transformers) (0.26.3)\n",
27
  "Requirement already satisfied: numpy>=1.17 in ./venv/lib/python3.12/site-packages (from transformers) (1.26.4)\n",
28
- "Requirement already satisfied: packaging>=20.0 in ./venv/lib/python3.12/site-packages (from transformers) (24.1)\n",
29
  "Requirement already satisfied: pyyaml>=5.1 in ./venv/lib/python3.12/site-packages (from transformers) (6.0.2)\n",
30
- "Requirement already satisfied: regex!=2019.12.17 in ./venv/lib/python3.12/site-packages (from transformers) (2024.9.11)\n",
31
  "Requirement already satisfied: requests in ./venv/lib/python3.12/site-packages (from transformers) (2.32.3)\n",
32
- "Requirement already satisfied: tokenizers<0.21,>=0.20 in ./venv/lib/python3.12/site-packages (from transformers) (0.20.3)\n",
33
  "Requirement already satisfied: safetensors>=0.4.1 in ./venv/lib/python3.12/site-packages (from transformers) (0.4.5)\n",
34
  "Requirement already satisfied: tqdm>=4.27 in ./venv/lib/python3.12/site-packages (from transformers) (4.66.5)\n",
35
  "Requirement already satisfied: tensorflow<2.19,>=2.18 in ./venv/lib/python3.12/site-packages (from tf-keras) (2.18.0)\n",
36
- "Requirement already satisfied: fsspec>=2023.5.0 in ./venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (2024.10.0)\n",
37
- "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
38
  "Requirement already satisfied: absl-py>=1.0.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.1.0)\n",
39
  "Requirement already satisfied: astunparse>=1.6.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.6.3)\n",
40
  "Requirement already satisfied: flatbuffers>=24.3.25 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (24.3.25)\n",
@@ -43,11 +52,11 @@
43
  "Requirement already satisfied: libclang>=13.0.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (18.1.1)\n",
44
  "Requirement already satisfied: opt-einsum>=2.3.2 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.4.0)\n",
45
  "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (4.25.5)\n",
46
- "Requirement already satisfied: setuptools in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (75.2.0)\n",
47
- "Requirement already satisfied: six>=1.12.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.16.0)\n",
48
  "Requirement already satisfied: termcolor>=1.1.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.5.0)\n",
49
- "Requirement already satisfied: wrapt>=1.11.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.16.0)\n",
50
- "Requirement already satisfied: grpcio<2.0,>=1.24.3 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.67.0)\n",
51
  "Requirement already satisfied: tensorboard<2.19,>=2.18 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.18.0)\n",
52
  "Requirement already satisfied: keras>=3.5.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.7.0)\n",
53
  "Requirement already satisfied: h5py>=3.11.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.12.1)\n",
@@ -56,23 +65,17 @@
56
  "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.12/site-packages (from requests->transformers) (3.10)\n",
57
  "Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.12/site-packages (from requests->transformers) (2.2.3)\n",
58
  "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.12/site-packages (from requests->transformers) (2024.8.30)\n",
59
- "Requirement already satisfied: wheel<1.0,>=0.23.0 in ./venv/lib/python3.12/site-packages (from astunparse>=1.6.0->tensorflow<2.19,>=2.18->tf-keras) (0.44.0)\n",
60
- "Requirement already satisfied: rich in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (13.9.2)\n",
61
  "Requirement already satisfied: namex in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.0.8)\n",
62
- "Requirement already satisfied: optree in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.13.0)\n",
63
  "Requirement already satisfied: markdown>=2.6.8 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.7)\n",
64
  "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (0.7.2)\n",
65
- "Requirement already satisfied: werkzeug>=1.0.1 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.0.4)\n",
66
  "Requirement already satisfied: MarkupSafe>=2.1.1 in ./venv/lib/python3.12/site-packages (from werkzeug>=1.0.1->tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.0.2)\n",
67
  "Requirement already satisfied: markdown-it-py>=2.2.0 in ./venv/lib/python3.12/site-packages (from rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (3.0.0)\n",
68
  "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./venv/lib/python3.12/site-packages (from rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (2.18.0)\n",
69
- "Requirement already satisfied: mdurl~=0.1 in ./venv/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.1.2)\n",
70
- "Downloading focal_loss-0.0.7-py3-none-any.whl (19 kB)\n",
71
- "Installing collected packages: focal-loss\n",
72
- "Successfully installed focal-loss-0.0.7\n",
73
- "\n",
74
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
75
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
76
  ]
77
  }
78
  ],
@@ -82,7 +85,7 @@
82
  },
83
  {
84
  "cell_type": "code",
85
- "execution_count": 21,
86
  "metadata": {},
87
  "outputs": [],
88
  "source": [
@@ -93,7 +96,7 @@
93
  },
94
  {
95
  "cell_type": "code",
96
- "execution_count": 22,
97
  "metadata": {},
98
  "outputs": [],
99
  "source": [
@@ -102,7 +105,7 @@
102
  },
103
  {
104
  "cell_type": "code",
105
- "execution_count": 23,
106
  "metadata": {},
107
  "outputs": [],
108
  "source": [
@@ -113,46 +116,41 @@
113
  ")\n",
114
  "\n",
115
  "# To avoid overfitting the model on sentences that don't have any labels\n",
116
- "# lambda_sentences, lambda_labels, _, __ = dp.from_bio_file_to_examples(\n",
117
- "# \"./data/bio/fr.bio/1k_train_unlabeled_samples.bio\"\n",
118
- "# )\n",
119
- "\n",
120
- "large_sentences, large_labels, _, __ = dp.from_bio_file_to_examples(\n",
121
- " \"./data/bio/fr.bio/1k_train_large_samples.bio\"\n",
122
  ")\n",
123
  "\n",
124
- "sentences = sentences + large_sentences\n",
125
- "labels = labels + large_labels"
 
126
  ]
127
  },
128
  {
129
  "cell_type": "code",
130
- "execution_count": 24,
131
  "metadata": {},
132
  "outputs": [],
133
  "source": [
134
- "import app.travel_resolver.libs.nlp.data_processing as dp\n",
135
- "\n",
136
- "processed_sentences, processed_labels = dp.process_sentences_and_labels(\n",
137
- " sentences, labels, return_tokens=True, stemming=False\n",
138
- ")"
139
  ]
140
  },
141
  {
142
  "cell_type": "code",
143
- "execution_count": 25,
144
  "metadata": {},
145
  "outputs": [],
146
  "source": [
147
- "for i in range(len(processed_sentences)):\n",
148
- " for j in range(len(processed_sentences[i])):\n",
149
- " if processed_labels[i][j] > 0:\n",
150
- " processed_sentences[i][j] = processed_sentences[i][j].title()"
 
151
  ]
152
  },
153
  {
154
  "cell_type": "code",
155
- "execution_count": 7,
156
  "metadata": {},
157
  "outputs": [],
158
  "source": [
@@ -161,12 +159,12 @@
161
  " as well as the embedding size\n",
162
  "\"\"\"\n",
163
  "\n",
164
- "MAX_LEN = 100"
165
  ]
166
  },
167
  {
168
  "cell_type": "code",
169
- "execution_count": 26,
170
  "metadata": {},
171
  "outputs": [],
172
  "source": [
@@ -177,25 +175,26 @@
177
  },
178
  {
179
  "cell_type": "code",
180
- "execution_count": 27,
181
  "metadata": {},
182
  "outputs": [],
183
  "source": [
184
- "from transformers import TFAutoModelForTokenClassification, CamembertTokenizer\n",
185
  "import numpy as np\n",
186
  "\n",
187
- "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")"
188
  ]
189
  },
190
  {
191
  "cell_type": "code",
192
- "execution_count": 28,
193
  "metadata": {},
194
  "outputs": [],
195
  "source": [
196
  "tokenized_sentences = tokenizer(\n",
197
  " processed_sentences,\n",
198
  " is_split_into_words=True,\n",
 
199
  " truncation=True,\n",
200
  " padding=\"max_length\",\n",
201
  " max_length=MAX_LEN,\n",
@@ -204,7 +203,62 @@
204
  },
205
  {
206
  "cell_type": "code",
207
- "execution_count": 33,
208
  "metadata": {},
209
  "outputs": [],
210
  "source": [
@@ -220,14 +274,14 @@
220
  ") = train_test_split(\n",
221
  " tokenized_sentences[\"input_ids\"],\n",
222
  " tokenized_sentences[\"attention_mask\"],\n",
223
- " padded_labels,\n",
224
  " test_size=0.2,\n",
225
  ")"
226
  ]
227
  },
228
  {
229
  "cell_type": "code",
230
- "execution_count": 39,
231
  "metadata": {},
232
  "outputs": [],
233
  "source": [
@@ -254,7 +308,7 @@
254
  },
255
  {
256
  "cell_type": "code",
257
- "execution_count": 40,
258
  "metadata": {},
259
  "outputs": [],
260
  "source": [
@@ -290,37 +344,16 @@
290
  },
291
  {
292
  "cell_type": "code",
293
- "execution_count": 14,
294
- "metadata": {},
295
- "outputs": [],
296
- "source": [
297
- "class_weights = {0: 0.1, 1: 20.0, 2: 20.0}\n",
298
- "\n",
299
- "\n",
300
- "def weighted_loss(y_true, y_pred):\n",
301
- " weights = tf.constant(\n",
302
- " [class_weights[i] for i in range(len(class_weights))], dtype=tf.float32\n",
303
- " )\n",
304
- " weights = tf.gather(\n",
305
- " weights, tf.cast(y_true, tf.int32)\n",
306
- " ) # Get weights for true labels\n",
307
- " loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
308
- " y_true, y_pred, from_logits=True\n",
309
- " )\n",
310
- " return loss * weights"
311
- ]
312
- },
313
- {
314
- "cell_type": "code",
315
- "execution_count": 61,
316
  "metadata": {},
317
  "outputs": [
318
  {
319
  "name": "stderr",
320
  "output_type": "stream",
321
  "text": [
322
- "All PyTorch model weights were used when initializing TFCamembertForTokenClassification.\n",
323
- "\n",
 
324
  "Some weights or buffers of the TF 2.0 model TFCamembertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
325
  "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
326
  ]
@@ -330,73 +363,77 @@
330
  "from focal_loss import SparseCategoricalFocalLoss\n",
331
  "\n",
332
  "camembert = TFAutoModelForTokenClassification.from_pretrained(\n",
333
- " \"camembert-base\", num_labels=len(unique_labels)\n",
334
  ")\n",
335
  "\n",
336
  "loss_func = SparseCategoricalFocalLoss(\n",
337
- " gamma=2, class_weight=[0.1, 2, 2], from_logits=True\n",
338
  ")\n",
339
  "\n",
340
  "camembert.compile(\n",
341
- " optimizer=tf.keras.optimizers.legacy.Adam(5e-5),\n",
342
- " loss=loss_func,\n",
343
- " metrics=[\"accuracy\", entity_accuracy],\n",
344
  ")"
345
  ]
346
  },
347
  {
348
  "cell_type": "code",
349
- "execution_count": 46,
350
  "metadata": {},
351
  "outputs": [],
352
  "source": [
353
- "train_dataset = train_dataset.batch(32)\n",
354
- "test_dataset = test_dataset.batch(32)"
355
  ]
356
  },
357
  {
358
  "cell_type": "code",
359
- "execution_count": 62,
360
  "metadata": {},
361
  "outputs": [
362
  {
363
  "name": "stdout",
364
  "output_type": "stream",
365
  "text": [
366
- "Epoch 1/4\n",
367
- "272/272 [==============================] - 1596s 6s/step - loss: 0.0124 - accuracy: 0.9677 - entity_accuracy: 0.8099 - val_loss: 0.0038 - val_accuracy: 0.9799 - val_entity_accuracy: 0.9682\n",
368
- "Epoch 2/4\n",
369
- "272/272 [==============================] - 1560s 6s/step - loss: 0.0031 - accuracy: 0.9852 - entity_accuracy: 0.9684 - val_loss: 0.0019 - val_accuracy: 0.9885 - val_entity_accuracy: 0.9820\n",
370
- "Epoch 3/4\n",
371
- "272/272 [==============================] - 1560s 6s/step - loss: 0.0020 - accuracy: 0.9907 - entity_accuracy: 0.9767 - val_loss: 0.0016 - val_accuracy: 0.9941 - val_entity_accuracy: 0.9775\n",
372
- "Epoch 4/4\n",
373
- "272/272 [==============================] - 1605s 6s/step - loss: 0.0016 - accuracy: 0.9923 - entity_accuracy: 0.9789 - val_loss: 0.0017 - val_accuracy: 0.9920 - val_entity_accuracy: 0.9831\n"
374
  ]
375
  },
376
  {
377
- "data": {
378
- "text/plain": [
379
- "<tf_keras.src.callbacks.History at 0x2dab031a0>"
380
- ]
381
- },
382
- "execution_count": 62,
383
- "metadata": {},
384
- "output_type": "execute_result"
385
  }
386
  ],
387
  "source": [
388
- "callback = tf.keras.callbacks.EarlyStopping(\n",
389
- " monitor=\"val_loss\", patience=0, restore_best_weights=True\n",
390
  ")\n",
391
  "\n",
392
  "camembert.fit(\n",
393
- " train_dataset, validation_data=test_dataset, epochs=4, callbacks=[callback]\n",
394
  ")"
395
  ]
396
  },
397
  {
398
  "cell_type": "code",
399
- "execution_count": null,
400
  "metadata": {},
401
  "outputs": [
402
  {
@@ -405,7 +442,7 @@
405
  "<tf.Tensor: shape=(), dtype=float32, numpy=0.1186538115143776>"
406
  ]
407
  },
408
- "execution_count": 54,
409
  "metadata": {},
410
  "output_type": "execute_result"
411
  }
@@ -421,26 +458,18 @@
421
  },
422
  {
423
  "cell_type": "code",
424
- "execution_count": 63,
425
  "metadata": {},
426
  "outputs": [],
427
  "source": [
428
- "camembert.save_pretrained(\"./models/camembert\")"
429
  ]
430
  },
431
  {
432
  "cell_type": "code",
433
- "execution_count": null,
434
  "metadata": {},
435
- "outputs": [
436
- {
437
- "name": "stderr",
438
- "output_type": "stream",
439
- "text": [
440
- "tf_model.h5: 100%|██████████| 440M/440M [00:20<00:00, 21.8MB/s] \n"
441
- ]
442
- }
443
- ],
444
  "source": [
445
  "# camembert.push_to_hub(\"CamemBERT-NER-Travel\")"
446
  ]
@@ -462,7 +491,7 @@
462
  "name": "python",
463
  "nbconvert_exporter": "python",
464
  "pygments_lexer": "ipython3",
465
- "version": "3.12.4"
466
  }
467
  },
468
  "nbformat": 4,
 
11
  },
12
  {
13
  "cell_type": "code",
14
+ "execution_count": 86,
15
  "metadata": {},
16
  "outputs": [
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
22
+ "To disable this warning, you can either:\n",
23
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
24
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
25
+ ]
26
+ },
27
  {
28
  "name": "stdout",
29
  "output_type": "stream",
30
  "text": [
31
+ "Requirement already satisfied: transformers in ./venv/lib/python3.12/site-packages (4.47.1)\n",
32
  "Requirement already satisfied: tf-keras in ./venv/lib/python3.12/site-packages (2.18.0)\n",
33
+ "Requirement already satisfied: focal-loss in ./venv/lib/python3.12/site-packages (0.0.7)\n",
 
34
  "Requirement already satisfied: filelock in ./venv/lib/python3.12/site-packages (from transformers) (3.16.1)\n",
35
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in ./venv/lib/python3.12/site-packages (from transformers) (0.26.5)\n",
36
  "Requirement already satisfied: numpy>=1.17 in ./venv/lib/python3.12/site-packages (from transformers) (1.26.4)\n",
37
+ "Requirement already satisfied: packaging>=20.0 in ./venv/lib/python3.12/site-packages (from transformers) (24.2)\n",
38
  "Requirement already satisfied: pyyaml>=5.1 in ./venv/lib/python3.12/site-packages (from transformers) (6.0.2)\n",
39
+ "Requirement already satisfied: regex!=2019.12.17 in ./venv/lib/python3.12/site-packages (from transformers) (2024.11.6)\n",
40
  "Requirement already satisfied: requests in ./venv/lib/python3.12/site-packages (from transformers) (2.32.3)\n",
41
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in ./venv/lib/python3.12/site-packages (from transformers) (0.21.0)\n",
42
  "Requirement already satisfied: safetensors>=0.4.1 in ./venv/lib/python3.12/site-packages (from transformers) (0.4.5)\n",
43
  "Requirement already satisfied: tqdm>=4.27 in ./venv/lib/python3.12/site-packages (from transformers) (4.66.5)\n",
44
  "Requirement already satisfied: tensorflow<2.19,>=2.18 in ./venv/lib/python3.12/site-packages (from tf-keras) (2.18.0)\n",
45
+ "Requirement already satisfied: fsspec>=2023.5.0 in ./venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (2024.10.0)\n",
46
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (4.12.2)\n",
47
  "Requirement already satisfied: absl-py>=1.0.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.1.0)\n",
48
  "Requirement already satisfied: astunparse>=1.6.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.6.3)\n",
49
  "Requirement already satisfied: flatbuffers>=24.3.25 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (24.3.25)\n",
 
52
  "Requirement already satisfied: libclang>=13.0.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (18.1.1)\n",
53
  "Requirement already satisfied: opt-einsum>=2.3.2 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.4.0)\n",
54
  "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (4.25.5)\n",
55
+ "Requirement already satisfied: setuptools in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (75.6.0)\n",
56
+ "Requirement already satisfied: six>=1.12.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.17.0)\n",
57
  "Requirement already satisfied: termcolor>=1.1.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.5.0)\n",
58
+ "Requirement already satisfied: wrapt>=1.11.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.17.0)\n",
59
+ "Requirement already satisfied: grpcio<2.0,>=1.24.3 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.68.1)\n",
60
  "Requirement already satisfied: tensorboard<2.19,>=2.18 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.18.0)\n",
61
  "Requirement already satisfied: keras>=3.5.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.7.0)\n",
62
  "Requirement already satisfied: h5py>=3.11.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.12.1)\n",
 
65
  "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.12/site-packages (from requests->transformers) (3.10)\n",
66
  "Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.12/site-packages (from requests->transformers) (2.2.3)\n",
67
  "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.12/site-packages (from requests->transformers) (2024.8.30)\n",
68
+ "Requirement already satisfied: wheel<1.0,>=0.23.0 in ./venv/lib/python3.12/site-packages (from astunparse>=1.6.0->tensorflow<2.19,>=2.18->tf-keras) (0.45.1)\n",
69
+ "Requirement already satisfied: rich in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (13.9.4)\n",
70
  "Requirement already satisfied: namex in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.0.8)\n",
71
+ "Requirement already satisfied: optree in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.13.1)\n",
72
  "Requirement already satisfied: markdown>=2.6.8 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.7)\n",
73
  "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (0.7.2)\n",
74
+ "Requirement already satisfied: werkzeug>=1.0.1 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.1.3)\n",
75
  "Requirement already satisfied: MarkupSafe>=2.1.1 in ./venv/lib/python3.12/site-packages (from werkzeug>=1.0.1->tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.0.2)\n",
76
  "Requirement already satisfied: markdown-it-py>=2.2.0 in ./venv/lib/python3.12/site-packages (from rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (3.0.0)\n",
77
  "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./venv/lib/python3.12/site-packages (from rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (2.18.0)\n",
78
+ "Requirement already satisfied: mdurl~=0.1 in ./venv/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.1.2)\n"
79
  ]
80
  }
81
  ],
 
85
  },
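Note on the stderr warning above: the tokenizers library disables its own parallelism when the process forks after the tokenizer has already been used. A minimal sketch of the workaround the warning itself suggests — assuming it runs before the tokenizer is first used:

    import os

    # Silence the fork warning by disabling tokenizer parallelism up front.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"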
86
  {
87
  "cell_type": "code",
88
+ "execution_count": 87,
89
  "metadata": {},
90
  "outputs": [],
91
  "source": [
 
96
  },
97
  {
98
  "cell_type": "code",
99
+ "execution_count": 88,
100
  "metadata": {},
101
  "outputs": [],
102
  "source": [
 
105
  },
106
  {
107
  "cell_type": "code",
108
+ "execution_count": 89,
109
  "metadata": {},
110
  "outputs": [],
111
  "source": [
 
116
  ")\n",
117
  "\n",
118
  "# To avoid overfitting the model on sentences that don't have any labels\n",
119
+ "lambda_sentences, lambda_labels, _, __ = dp.from_bio_file_to_examples(\n",
120
+ " \"./data/bio/fr.bio/1k_train_unlabeled_samples.bio\"\n",
121
  ")\n",
122
  "\n",
123
+ "long_sentences, long_labels, _, __ = dp.from_bio_file_to_examples(\n",
124
+ " \"./data/bio/fr.bio/1k_train_large_samples.bio\"\n",
125
+ ")"
126
  ]
127
  },
128
  {
129
  "cell_type": "code",
130
+ "execution_count": 90,
131
  "metadata": {},
132
  "outputs": [],
133
  "source": [
134
+ "sentences = sentences + lambda_sentences + long_sentences\n",
135
+ "labels = labels + lambda_labels + long_labels"
136
  ]
137
  },
138
  {
139
  "cell_type": "code",
140
+ "execution_count": 91,
141
  "metadata": {},
142
  "outputs": [],
143
  "source": [
144
+ "import app.travel_resolver.libs.nlp.data_processing as dp\n",
145
+ "\n",
146
+ "processed_sentences, processed_labels = dp.process_sentences_and_labels(\n",
147
+ " sentences, labels, return_tokens=True, stemming=False\n",
148
+ ")"
149
  ]
150
  },
151
  {
152
  "cell_type": "code",
153
+ "execution_count": 92,
154
  "metadata": {},
155
  "outputs": [],
156
  "source": [
 
159
  " as well as the embedding size\n",
160
  "\"\"\"\n",
161
  "\n",
162
+ "MAX_LEN = 150"
163
  ]
164
  },
165
  {
166
  "cell_type": "code",
167
+ "execution_count": 93,
168
  "metadata": {},
169
  "outputs": [],
170
  "source": [
 
175
  },
176
  {
177
  "cell_type": "code",
178
+ "execution_count": 94,
179
  "metadata": {},
180
  "outputs": [],
181
  "source": [
182
+ "from transformers import TFAutoModelForTokenClassification, CamembertTokenizerFast\n",
183
  "import numpy as np\n",
184
  "\n",
185
+ "tokenizer = CamembertTokenizerFast.from_pretrained(\"cmarkea/distilcamembert-base\")"
186
  ]
187
  },
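The switch from CamembertTokenizer to CamembertTokenizerFast matters for the label alignment further down: only fast tokenizers expose word_ids(). A small sketch of what that gives you (the printed values are illustrative):

    from transformers import CamembertTokenizerFast

    tok = CamembertTokenizerFast.from_pretrained("cmarkea/distilcamembert-base")
    enc = tok(["je", "vais", "à", "Paris"], is_split_into_words=True)
    # None marks special tokens (<s>, </s>); integers index the source words.
    print(enc.word_ids())  # e.g. [None, 0, 1, 2, 3, None]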
188
  {
189
  "cell_type": "code",
190
+ "execution_count": 95,
191
  "metadata": {},
192
  "outputs": [],
193
  "source": [
194
  "tokenized_sentences = tokenizer(\n",
195
  " processed_sentences,\n",
196
  " is_split_into_words=True,\n",
197
+ " return_offsets_mapping=True,\n",
198
  " truncation=True,\n",
199
  " padding=\"max_length\",\n",
200
  " max_length=MAX_LEN,\n",
 
203
  },
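Since truncation=True silently cuts sentences longer than MAX_LEN, a hedged sanity check — assuming padding fills unused positions with pad_token_id:

    # Sentences whose last position is not padding filled all MAX_LEN slots
    # and may therefore have been truncated.
    n_full = sum(
        ids[-1] != tokenizer.pad_token_id
        for ids in tokenized_sentences["input_ids"]
    )
    print(f"{n_full} sentences reached the {MAX_LEN}-token limit")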
204
  {
205
  "cell_type": "code",
206
+ "execution_count": 96,
207
+ "metadata": {},
208
+ "outputs": [],
209
+ "source": [
210
+ "def align_labels_with_tokens(encodings, labels):\n",
211
+ " \"\"\"\n",
212
+ " Aligns the labels to match the tokenized outputs.\n",
213
+ "\n",
214
+ " Args:\n",
215
+ " encodings (BatchEncoding): Tokenized outputs from the Hugging Face tokenizer (must use a fast tokenizer).\n",
216
+ " labels (List[List[int]]): Original labels for each sentence before tokenization. Each inner list corresponds to one sentence.\n",
217
+ "\n",
218
+ " Returns:\n",
219
+ " List[List[int]]: Aligned labels, where each inner list corresponds to the aligned labels for the tokenized sentence.\n",
220
+ " Special tokens and padding are assigned a value of -100.\n",
221
+ " \"\"\"\n",
222
+ " adapted_labels = []\n",
223
+ "\n",
224
+ " for i, label in enumerate(labels):\n",
225
+ " word_ids = encodings.word_ids(\n",
226
+ " batch_index=i\n",
227
+ " ) # Get word IDs for the i-th sentence\n",
228
+ " aligned_labels = []\n",
229
+ " previous_word_id = None\n",
230
+ "\n",
231
+ " for word_id in word_ids:\n",
232
+ " if word_id is None:\n",
233
+ " # Special tokens (e.g., [CLS], [SEP], or padding)\n",
234
+ " aligned_labels.append(-100)\n",
235
+ " elif word_id != previous_word_id:\n",
236
+ " # New word\n",
237
+ " aligned_labels.append(label[word_id])\n",
238
+ " else:\n",
239
+ " # Subword token (same word)\n",
240
+ " aligned_labels.append(\n",
241
+ " label[word_id]\n",
242
+ " ) # Or append -100 to ignore subwords\n",
243
+ " previous_word_id = word_id\n",
244
+ "\n",
245
+ " adapted_labels.append(aligned_labels)\n",
246
+ "\n",
247
+ " return adapted_labels"
248
+ ]
249
+ },
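One caveat worth flagging: the -100 sentinels produced by align_labels_with_tokens are ignored automatically by the Hugging Face models' built-in loss, but a plain Keras loss such as the SparseCategoricalCrossentropy compiled below has no ignore_index. A sketch of a manually masked variant, assuming integer label tensors:

    import tensorflow as tf

    def masked_sparse_ce(y_true, y_pred):
        # Keep only positions with real labels (drop the -100 sentinels).
        mask = tf.not_equal(y_true, -100)
        y_true_safe = tf.where(mask, y_true, tf.zeros_like(y_true))
        loss = tf.keras.losses.sparse_categorical_crossentropy(
            y_true_safe, y_pred, from_logits=True
        )
        mask = tf.cast(mask, loss.dtype)
        return tf.reduce_sum(loss * mask) / tf.reduce_sum(mask)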
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 97,
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": [
256
+ "readapted_labels = align_labels_with_tokens(tokenized_sentences, padded_labels)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 98,
262
  "metadata": {},
263
  "outputs": [],
264
  "source": [
 
274
  ") = train_test_split(\n",
275
  " tokenized_sentences[\"input_ids\"],\n",
276
  " tokenized_sentences[\"attention_mask\"],\n",
277
+ " readapted_labels,\n",
278
  " test_size=0.2,\n",
279
  ")"
280
  ]
281
  },
282
  {
283
  "cell_type": "code",
284
+ "execution_count": 99,
285
  "metadata": {},
286
  "outputs": [],
287
  "source": [
 
308
  },
309
  {
310
  "cell_type": "code",
311
+ "execution_count": 100,
312
  "metadata": {},
313
  "outputs": [],
314
  "source": [
 
344
  },
345
  {
346
  "cell_type": "code",
347
+ "execution_count": 101,
348
  "metadata": {},
349
  "outputs": [
350
  {
351
  "name": "stderr",
352
  "output_type": "stream",
353
  "text": [
354
+ "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFCamembertForTokenClassification: ['roberta.embeddings.position_ids']\n",
355
+ "- This IS expected if you are initializing TFCamembertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
356
+ "- This IS NOT expected if you are initializing TFCamembertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
357
  "Some weights or buffers of the TF 2.0 model TFCamembertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
358
  "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
359
  ]
 
363
  "from focal_loss import SparseCategoricalFocalLoss\n",
364
  "\n",
365
  "camembert = TFAutoModelForTokenClassification.from_pretrained(\n",
366
+ " \"cmarkea/distilcamembert-base\", num_labels=len(unique_labels)\n",
367
  ")\n",
368
  "\n",
369
  "loss_func = SparseCategoricalFocalLoss(\n",
370
+ " gamma=2, class_weight=[1, 10, 10], from_logits=True\n",
371
  ")\n",
372
  "\n",
373
  "camembert.compile(\n",
374
+ " optimizer=tf.keras.optimizers.legacy.Adam(8e-4),\n",
375
+ " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
376
+ " metrics=[entity_accuracy],\n",
377
  ")"
378
  ]
379
  },
380
  {
381
  "cell_type": "code",
382
+ "execution_count": 102,
383
  "metadata": {},
384
  "outputs": [],
385
  "source": [
386
+ "train_dataset = train_dataset.batch(64)\n",
387
+ "test_dataset = test_dataset.batch(64)"
388
  ]
389
  },
390
  {
391
  "cell_type": "code",
392
+ "execution_count": 103,
393
  "metadata": {},
394
  "outputs": [
395
  {
396
  "name": "stdout",
397
  "output_type": "stream",
398
  "text": [
399
+ "Epoch 1/10\n"
400
  ]
401
  },
402
  {
403
+ "ename": "TypeError",
404
+ "evalue": "in user code:\n\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py\", line 1398, in train_function *\n return step_function(self, iterator)\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py\", line 1381, in step_function **\n outputs = model.distribute_strategy.run(run_step, args=(data,))\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py\", line 1370, in run_step **\n outputs = model.train_step(data)\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/transformers/modeling_tf_utils.py\", line 1672, in train_step\n y_pred = self(x, training=True)\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/utils/traceback_utils.py\", line 70, in error_handler\n raise e.with_traceback(filtered_tb) from None\n File \"/var/folders/3h/5n6s9rcj3sx0gpncsxbq_99m0000gn/T/__autograph_generated_filepc984rni.py\", line 40, in tf__run_call_with_unpacked_inputs\n raise\n\n TypeError: Exception encountered when calling layer 'tf_camembert_for_token_classification_5' (type TFCamembertForTokenClassification).\n \n in user code:\n \n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/transformers/modeling_tf_utils.py\", line 1393, in run_call_with_unpacked_inputs *\n return func(self, **unpacked_inputs)\n \n TypeError: outer_factory.<locals>.inner_factory.<locals>.tf__call() got an unexpected keyword argument 'offset_mapping'\n \n \n Call arguments received by layer 'tf_camembert_for_token_classification_5' (type TFCamembertForTokenClassification):\n • input_ids={'input_ids': 'tf.Tensor(shape=(None, 150), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, 150), dtype=int32)', 'offset_mapping': 'tf.Tensor(shape=(None, 150, 2), dtype=int32)'}\n • attention_mask=None\n • token_type_ids=None\n • position_ids=None\n • head_mask=None\n • inputs_embeds=None\n • output_attentions=None\n • output_hidden_states=None\n • return_dict=None\n • labels=None\n • training=True\n",
405
+ "output_type": "error",
406
+ "traceback": [
407
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
408
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
409
+ "Cell \u001b[0;32mIn[103], line 7\u001b[0m\n\u001b[1;32m 1\u001b[0m early_stopping \u001b[38;5;241m=\u001b[39m tf\u001b[38;5;241m.\u001b[39mkeras\u001b[38;5;241m.\u001b[39mcallbacks\u001b[38;5;241m.\u001b[39mEarlyStopping(\n\u001b[1;32m 2\u001b[0m monitor\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mval_loss\u001b[39m\u001b[38;5;124m\"\u001b[39m, min_delta\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.001\u001b[39m, patience\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, restore_best_weights\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 3\u001b[0m )\n\u001b[1;32m 5\u001b[0m csv_logger \u001b[38;5;241m=\u001b[39m tf\u001b[38;5;241m.\u001b[39mkeras\u001b[38;5;241m.\u001b[39mcallbacks\u001b[38;5;241m.\u001b[39mCSVLogger(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtraining.log\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 7\u001b[0m \u001b[43mcamembert\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrain_dataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidation_data\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest_dataset\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mearly_stopping\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcsv_logger\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m)\u001b[49m\n",
410
+ "File \u001b[0;32m~/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/transformers/modeling_tf_utils.py:1229\u001b[0m, in \u001b[0;36mTFPreTrainedModel.fit\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1226\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(keras\u001b[38;5;241m.\u001b[39mModel\u001b[38;5;241m.\u001b[39mfit)\n\u001b[1;32m 1227\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfit\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 1228\u001b[0m args, kwargs \u001b[38;5;241m=\u001b[39m convert_batch_encoding(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m-> 1229\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
411
+ "File \u001b[0;32m~/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/utils/traceback_utils.py:70\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 67\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[1;32m 68\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[1;32m 69\u001b[0m \u001b[38;5;66;03m# `tf.debugging.disable_traceback_filtering()`\u001b[39;00m\n\u001b[0;32m---> 70\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 71\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
412
+ "File \u001b[0;32m/var/folders/3h/5n6s9rcj3sx0gpncsxbq_99m0000gn/T/__autograph_generated_filelw03dryu.py:15\u001b[0m, in \u001b[0;36mouter_factory.<locals>.inner_factory.<locals>.tf__train_function\u001b[0;34m(iterator)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 14\u001b[0m do_return \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m retval_ \u001b[38;5;241m=\u001b[39m ag__\u001b[38;5;241m.\u001b[39mconverted_call(ag__\u001b[38;5;241m.\u001b[39mld(step_function), (ag__\u001b[38;5;241m.\u001b[39mld(\u001b[38;5;28mself\u001b[39m), ag__\u001b[38;5;241m.\u001b[39mld(iterator)), \u001b[38;5;28;01mNone\u001b[39;00m, fscope)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m 17\u001b[0m do_return \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
413
+ "File \u001b[0;32m~/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/transformers/modeling_tf_utils.py:1672\u001b[0m, in \u001b[0;36mTFPreTrainedModel.train_step\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 1670\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m(x, training\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_loss\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1672\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraining\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 1673\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_using_dummy_loss:\n\u001b[1;32m 1674\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompiled_loss(y_pred\u001b[38;5;241m.\u001b[39mloss, y_pred\u001b[38;5;241m.\u001b[39mloss, sample_weight, regularization_losses\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlosses)\n",
414
+ "File \u001b[0;32m/var/folders/3h/5n6s9rcj3sx0gpncsxbq_99m0000gn/T/__autograph_generated_filepc984rni.py:37\u001b[0m, in \u001b[0;36mouter_factory.<locals>.inner_factory.<locals>.tf__run_call_with_unpacked_inputs\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 36\u001b[0m do_return \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m retval_ \u001b[38;5;241m=\u001b[39m ag__\u001b[38;5;241m.\u001b[39mconverted_call(ag__\u001b[38;5;241m.\u001b[39mld(func), (ag__\u001b[38;5;241m.\u001b[39mld(\u001b[38;5;28mself\u001b[39m),), \u001b[38;5;28mdict\u001b[39m(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mag__\u001b[38;5;241m.\u001b[39mld(unpacked_inputs)), fscope)\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m 39\u001b[0m do_return \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
415
+ "\u001b[0;31mTypeError\u001b[0m: in user code:\n\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py\", line 1398, in train_function *\n return step_function(self, iterator)\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py\", line 1381, in step_function **\n outputs = model.distribute_strategy.run(run_step, args=(data,))\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py\", line 1370, in run_step **\n outputs = model.train_step(data)\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/transformers/modeling_tf_utils.py\", line 1672, in train_step\n y_pred = self(x, training=True)\n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/utils/traceback_utils.py\", line 70, in error_handler\n raise e.with_traceback(filtered_tb) from None\n File \"/var/folders/3h/5n6s9rcj3sx0gpncsxbq_99m0000gn/T/__autograph_generated_filepc984rni.py\", line 40, in tf__run_call_with_unpacked_inputs\n raise\n\n TypeError: Exception encountered when calling layer 'tf_camembert_for_token_classification_5' (type TFCamembertForTokenClassification).\n \n in user code:\n \n File \"/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/transformers/modeling_tf_utils.py\", line 1393, in run_call_with_unpacked_inputs *\n return func(self, **unpacked_inputs)\n \n TypeError: outer_factory.<locals>.inner_factory.<locals>.tf__call() got an unexpected keyword argument 'offset_mapping'\n \n \n Call arguments received by layer 'tf_camembert_for_token_classification_5' (type TFCamembertForTokenClassification):\n • input_ids={'input_ids': 'tf.Tensor(shape=(None, 150), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, 150), dtype=int32)', 'offset_mapping': 'tf.Tensor(shape=(None, 150, 2), dtype=int32)'}\n • attention_mask=None\n • token_type_ids=None\n • position_ids=None\n • head_mask=None\n • inputs_embeds=None\n • output_attentions=None\n • output_hidden_states=None\n • return_dict=None\n • labels=None\n • training=True\n"
416
+ ]
417
  }
418
  ],
419
  "source": [
420
+ "early_stopping = tf.keras.callbacks.EarlyStopping(\n",
421
+ " monitor=\"val_loss\", min_delta=0.001, patience=0, restore_best_weights=True\n",
422
  ")\n",
423
  "\n",
424
+ "csv_logger = tf.keras.callbacks.CSVLogger(\"training.log\")\n",
425
+ "\n",
426
  "camembert.fit(\n",
427
+ " train_dataset,\n",
428
+ " validation_data=test_dataset,\n",
429
+ " epochs=10,\n",
430
+ " callbacks=[early_stopping, csv_logger],\n",
431
  ")"
432
  ]
433
  },
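The TypeError above comes from the tokenizer's offset_mapping (added via return_offsets_mapping=True) leaking into the model inputs, which TFCamembertForTokenClassification's call() does not accept. A sketch of one fix — dropping the key before building the dataset (the dataset-construction cell is not shown in this hunk, so the exact shape is an assumption):

    import tensorflow as tf

    features = dict(tokenized_sentences)
    features.pop("offset_mapping", None)  # call() has no such keyword
    dataset = tf.data.Dataset.from_tensor_slices((features, readapted_labels))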
434
  {
435
  "cell_type": "code",
436
+ "execution_count": 76,
437
  "metadata": {},
438
  "outputs": [
439
  {
 
442
  "<tf.Tensor: shape=(), dtype=float32, numpy=0.1186538115143776>"
443
  ]
444
  },
445
+ "execution_count": 76,
446
  "metadata": {},
447
  "output_type": "execute_result"
448
  }
 
458
  },
459
  {
460
  "cell_type": "code",
461
+ "execution_count": 79,
462
  "metadata": {},
463
  "outputs": [],
464
  "source": [
465
+ "camembert.save_pretrained(\"./models/distilcamembert-base-ner-cross-entropy-11\")"
466
  ]
467
  },
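For completeness, the checkpoint saved above can be reloaded with the standard from_pretrained call — a sketch using the same local path:

    from transformers import TFAutoModelForTokenClassification

    reloaded = TFAutoModelForTokenClassification.from_pretrained(
        "./models/distilcamembert-base-ner-cross-entropy-11"
    )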
468
  {
469
  "cell_type": "code",
470
+ "execution_count": 78,
471
  "metadata": {},
472
+ "outputs": [],
473
  "source": [
474
  "# camembert.push_to_hub(\"CamemBERT-NER-Travel\")"
475
  ]
 
491
  "name": "python",
492
  "nbconvert_exporter": "python",
493
  "pygments_lexer": "ipython3",
494
+ "version": "3.12.8"
495
  }
496
  },
497
  "nbformat": 4,
deepl_ner.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -13,4 +13,6 @@ hmmlearn # for hidden markov models
13
  ipykernel
14
  tabulate
15
  transformers
16
- sentencepiece
16
+ sentencepiece
17
+ stanza
18
+ pydot