Spaces:

TURKCELL
/

offensive-lang-detection-tr

Runtime error

App Files Files Community

offensive-lang-detection-tr / app.py

zeynepgulhan

app file created

79bbdf9 verified almost 2 years ago

raw

history blame contribute delete

3.1 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch
	import numpy as np
	import re

	from turkish.deasciifier import Deasciifier

	# Model and tokenizer initialization.
	# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")

	# The tokenizer and the sequence-classification model come from the same checkpoint.
	_CHECKPOINT = "TURKCELL/bert-offensive-lang-detection-tr"
	tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
	model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT)
	model.to(device)


	def deasciifier(text):
	    """Return *text* after passing it through ``Deasciifier.convert_to_turkish()``.

	    Args:
	        text: The string handed to the ``Deasciifier`` constructor.

	    Returns:
	        Whatever ``convert_to_turkish()`` produces for that input.
	    """
	    # Use a distinct local name so the function's own name is not rebound
	    # inside its body.
	    converter = Deasciifier(text)
	    return converter.convert_to_turkish()


	def remove_circumflex(text):
	    """Replace circumflexed vowels with their plain counterparts.

	    Maps â, î, û, ô (and the capitals Â, Î, Û, Ô) to a, i, u, o
	    (A, I, U, O). Every other character is returned unchanged.

	    Args:
	        text: The string to process.

	    Returns:
	        A new string of the same length with the circumflexes dropped.
	    """
	    # One C-level pass instead of a per-character dict lookup and join.
	    circumflex_table = str.maketrans('âîûôÂÎÛÔ', 'aiuoAIUO')
	    return text.translate(circumflex_table)


	def turkish_lower(text):
	    """Lowercase *text* using Turkish casing rules for the letter I.

	    Python's default ``str.lower()`` turns ``'I'`` into ``'i'`` and ``'İ'``
	    into ``'i'`` followed by a combining dot (U+0307). In Turkish the
	    correct results are ``'ı'`` (dotless) and ``'i'`` respectively, so those
	    two letters are translated first and the rest is left to ``str.lower()``.

	    Args:
	        text: The string to lowercase.

	    Returns:
	        The lowercased string.
	    """
	    # Ç, Ş, Ğ, Ü and Ö already lowercase correctly with str.lower(); only
	    # the dotted/dotless I pair needs explicit handling.
	    dotted_i_table = str.maketrans('Iİ', 'ıi')
	    return text.translate(dotted_i_table).lower()


	# Patterns used by clean_text(), compiled once at import time.
	_MENTION_RE = re.compile(r"@\S*")          # "@user" mentions (a bare "@" too)
	_HASHTAG_RE = re.compile(r"#\S+")          # "#topic" hashtags
	_URL_RE = re.compile(r"(?:http|www)\S+")   # http..., https... and www... links
	# Text emoticons come BEFORE the generic punctuation class so they are
	# consumed as a unit; otherwise only the ":" would be removed and the letter
	# of ":d" / ":p" / ":o" would stay behind. The letters are lowercase because
	# the text has already been lowercased when this pattern runs, and the
	# look-ahead keeps ":d" in e.g. "not:dikkat" from eating the first letter of
	# a real word.
	_EMOTICON_OR_PUNCT_RE = re.compile(r"(?::[()dpo]|;\))(?!\w)|[^\w\s]")
	# NOTE(review): the last range (U+24C2-U+1F251) is very broad and also spans
	# non-emoji scripts such as CJK ideographs; kept exactly as in the original.
	_EMOJI_RE = re.compile(
	    "["
	    "\U0001F600-\U0001F64F"  # emoticons
	    "\U0001F300-\U0001F5FF"  # symbols & pictographs
	    "\U0001F680-\U0001F6FF"  # transport & map symbols
	    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
	    "\U00002702-\U000027B0"
	    "\U000024C2-\U0001F251"
	    "]+",
	    flags=re.UNICODE,
	)
	_WHITESPACE_RE = re.compile(r"\s+")


	def clean_text(text):
	    """Normalize a raw sentence before it is tokenized.

	    Steps, in order: drop circumflexes, lowercase with Turkish rules,
	    run the deasciifier, then replace mentions, hashtags, URLs, text
	    emoticons, punctuation and emoji with spaces, and finally collapse
	    whitespace runs to a single space.

	    The alternations in the URL and punctuation patterns previously used
	    ``\\|``, which matches a literal pipe character, so neither URLs nor
	    punctuation were actually removed. They now use real alternation.

	    Args:
	        text: The raw input string.

	    Returns:
	        The cleaned string with no leading or trailing whitespace.
	    """
	    # Remove circumflexed letters.
	    text = remove_circumflex(text)
	    # Convert to lowercase (Turkish-aware).
	    text = turkish_lower(text)
	    # Run the deasciifier.
	    text = deasciifier(text)
	    # Remove user names.
	    text = _MENTION_RE.sub(" ", text)
	    # Remove hashtags.
	    text = _HASHTAG_RE.sub(" ", text)
	    # Remove URLs.
	    text = _URL_RE.sub(" ", text)
	    # Remove text-based emoticons and punctuation.
	    text = _EMOTICON_OR_PUNCT_RE.sub(" ", text)
	    # Remove emoji.
	    text = _EMOJI_RE.sub(" ", text)
	    # Collapse runs of whitespace into a single space.
	    return _WHITESPACE_RE.sub(" ", text).strip()


	def is_offensive(sentence):
	    """Classify *sentence* as ``'offensive'`` or ``'non-offensive'``.

	    The sentence is cleaned with ``clean_text``, tokenized (truncated to at
	    most 256 tokens), moved to the module-level ``device`` and scored by the
	    module-level ``model``; the class with the highest logit decides the label.

	    Args:
	        sentence: The raw input text.

	    Returns:
	        ``'non-offensive'`` for class 0, ``'offensive'`` for class 1.

	    Raises:
	        KeyError: If the model predicts a class index other than 0 or 1.
	    """
	    normalized = clean_text(sentence)

	    encoded = tokenizer(normalized, padding=True, truncation=True, max_length=256, return_tensors='pt')
	    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

	    # Inference only: skip autograd bookkeeping so no graph is built or kept
	    # alive for each request.
	    with torch.no_grad():
	        logits = model(**encoded).logits

	    # A single sentence is passed in, so only the first row is read; convert to
	    # a plain int for the label lookup.
	    predicted_class = int(logits.argmax(dim=1)[0])

	    labels = {0: 'non-offensive', 1: 'offensive'}
	    return labels[predicted_class]


	# Build the Gradio UI: a two-line text box in, the predicted label out as text.
	sentence_box = gr.Textbox(lines=2, placeholder="Enter sentence here...")
	iface = gr.Interface(
	    fn=is_offensive,
	    inputs=sentence_box,
	    outputs="text",
	    title="Offensive Language Detection",
	    description="Offensive language detection for Turkish",
	)

	# Start serving the interface.
	iface.launch()