code for creating dataset
dataset_creator.py +50 -0
dataset_creator.py
ADDED
@@ -0,0 +1,50 @@
# Read the JSON file
import json
import re
import unicodedata

def unicode_to_ascii(text):
    # Normalize to decomposed form (separate characters and combining marks)
    normalized = unicodedata.normalize('NFKD', text)

    # Remove non-ASCII chars (keeps only ASCII)
    ascii_text = normalized.encode('ascii', 'ignore').decode('ascii')

    return ascii_text
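
# For example (illustrative, not part of the original script):
#   unicode_to_ascii("Rabbi Ḥanina")  ->  "Rabbi Hanina"
# NFKD decomposes "Ḥ" into "H" plus a combining dot below, and the
# ASCII encode with 'ignore' then drops the combining mark.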

def clean_html_tags(html_string):
    """
    Remove all HTML tags from the input string.

    Args:
        html_string (str): String containing HTML tags

    Returns:
        str: String with all HTML tags removed
    """
    # This pattern matches HTML tags: < followed by anything except >, then >
    pattern = re.compile(r'<[^>]+>')

    # Replace all occurrences of HTML tags with empty string
    clean_text = re.sub(pattern, '', html_string)
    super_clean_text = unicode_to_ascii(clean_text)
    return super_clean_text
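
# For example (illustrative):
#   clean_html_tags("<b>MISHNA:</b> One who reads the <i>Megilla</i>")
#   ->  "MISHNA: One who reads the Megilla"
# Note that the regex only strips tags; HTML entities such as &amp; pass through.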

with open("Megillah_map_to_english.json", "r", encoding="utf-8") as file:
    megillah_data = file.readlines()
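
# Note: the file is consumed line by line, so each line is assumed to hold a
# complete JSON object (JSON Lines style), despite the .json extension.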

with open("megillah_sugyot.json", "w", encoding="utf-8") as output_file:
    # Loop through each line in the file
    for line in megillah_data:
        full_talmud = json.loads(line)

        for sugya, texts in full_talmud.items():
            metadata = {"sugya": sugya, "sections": []}
            content = ""
            for text in texts:
                cleaned_text = clean_html_tags(text['english'])
                content += f"{cleaned_text} "
                metadata["sections"].append(text['sefaria_id'])
            output = {"id": sugya, "metadata": metadata, "content": content}
            output_file.write(f"{json.dumps(output)}\n")
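
For reference, the loop above assumes each input line maps a sugya identifier to a list of section objects carrying "english" and "sefaria_id" fields; the field names come from the code, but the sample values below are invented for illustration.

# Hypothetical input line:
{"Megillah 2a:1-2a:7": [{"sefaria_id": "Megillah 2a:1", "english": "<b>MISHNA:</b> The Megilla is read"}]}

# Record the script would write to megillah_sugyot.json (note the trailing
# space left in "content" by the concatenation loop):
{"id": "Megillah 2a:1-2a:7", "metadata": {"sugya": "Megillah 2a:1-2a:7", "sections": ["Megillah 2a:1"]}, "content": "MISHNA: The Megilla is read "}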