Skip to content
Snippets Groups Projects
Commit bffa5b43 authored by Samuel Innes's avatar Samuel Innes
Browse files

Fix file path in detect_language.py in model

parent 2a90e913
No related branches found
No related tags found
No related merge requests found
File deleted
This diff is collapsed.
# This module provides the main feature of the model: predicting the language of a given text.
# Run this module and follow the prompts to find out the predicted language.
# There is also the option to view the probabilities for the other languages.
import numpy as np
import re
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
# The data file is resolved relative to the current working directory, so the
# script has to be started from the directory that contains "project/".
CWD = os.getcwd()
# Language abbreviations (as used on Wikipedia) that the model distinguishes.
LANG_LIST = ["ru", "pl", "uk", "sl", "sk", "sr", "bg", "be", "mk", "cs", "hr", "szl", "dsb", "hsb", "cu", "rue"]

# data.txt holds tab-separated data points; each one contains a language label
# and a parenthesised, comma-separated list of integer feature counts.
with open(os.path.join(CWD, "project", "model", "data.txt"), "r", encoding="utf-8") as f:
    data = f.read().strip().split("\t")

# Raw strings keep "\(" and "\)" as regex escapes instead of (invalid) string
# escapes; compiling once hoists the pattern lookup out of the loop.
_VECTOR_RE = re.compile(r"\([^a-zA-Z]+\)")
_LABEL_RE = re.compile(r"[a-zA-Z]+")

# extract data from data.txt
X = []  # one integer feature vector per data point
y = []  # the language label belonging to each vector
for dp in data:
    # First parenthesised run without letters, minus its opening bracket.
    with_bracket = _VECTOR_RE.search(dp).group()[1:]
    # Drop any remaining surrounding brackets and split into the single counts.
    no_bracket = with_bracket.strip("(").strip(")").split(", ")
    X.append(np.array(no_bracket, dtype=np.int32))
    # The first run of ASCII letters in the data point is its language label.
    y.append(_LABEL_RE.search(dp).group())

# Both classifiers are trained on import, with scikit-learn's default settings
# apart from the tanh activation of the neural network.
rf = RandomForestClassifier()
rf.fit(X, y)
nn = MLPClassifier(activation="tanh")
nn.fit(X, y)
# Hard-coded list of features: every entry is a substring whose number of
# occurrences in the lower-cased input text becomes one component of the
# feature vector (see predict_lang). Entries are intentionally repeated, so the
# same substring can contribute to several positions.
# Presumably the order has to match the columns of the vectors stored in
# data.txt -- TODO confirm against the code that generated that file.
# NOTE(review): several entries are empty strings, and str.count('') returns
# len(text) + 1 rather than a character count. They may be characters that were
# lost in an encoding step -- verify before changing anything here.
feature_list = ['в', 'и', 'на', 'с', 'года', 'по', 'из', 'что', 'а', 'году', 'w', 'i', 'z', 'na', 'do', 'się', \
'roku', 'a', 'od', 'po', 'в', 'у', 'на', 'з', 'і', 'та', 'до', 'року', 'за', 'що', 'je', 'in', 'v', 'na', \
'so', 'se', 'za', 'ki', 'leta', 'z', 'v', 'a', 'na', 'je', 'sa', 'z', 's', 'roku', 'm', 'ako', 'је', 'у', 'и', \
'на', 'се', 'су', 'од', 'за', 'да', 'године', 'на', 'и', 'в', 'е', 'от', 'се', 'за', 'с', 'г', 'да', 'і', 'у', \
'ў', 'з', 'на', 'года', 'да', 'па', 'а', 'г', 'на', 'во', 'и', 'од', 'се', 'за', 'е', 'со', 'да', 'година', 'v', \
'a', 'na', 'se', 'je', 's', 'z', 'do', 've', 'byl', 'je', 'i', 'u', 'na', 'se', 'su', 'od', 'za', 'a', 's', 'we', \
'je', 'i', 'do', 'co', 'przipisy', 'nŏleży', 'zorty', 'go', 'ôpisoł', 'a', 'jo', 'w', 'na', 'se', 'z', 'wót', \
'su', 'do', '', 'a', 'w', 'je', 'na', 'wot', 'z', 'so', 'do', 'k', 'za', 'и', 'ѥстъ', 'лѣта', 'бѣ', 'градъ', \
'жє', 'въ', 'ꙁьри', 'такождє', 'людии', 'в', 'и', 'на', 'ся', 'і', 'з', 'є', 'до', 'року', 'а', 'а', 'б', 'в', \
'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', \
'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'a', 'ą', 'b', 'c', 'ć', 'd', 'e', 'ę', 'f', 'g', 'h', 'i', 'j', 'k', \
'l', 'ł', 'm', 'n', 'ń', 'o', 'ó', 'p', 'q', 'r', 's', 'ś', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ź', 'ż', 'ґ', \
'є', 'і', 'ї', 'č', 'š', 'ž', 'á', 'ä', 'ď', 'dz', '', 'é', 'ch', 'í', 'ĺ', 'ľ', 'ň', 'ô', 'ŕ', 'ť', 'ú', 'ý', \
'ђ', 'ј', 'љ', 'њ', 'ћ', 'џ', 'ў', 'ѓ', 'ѕ', 'ќ', 'ě', 'ř', 'ů', 'ã', 'ŏ', 'ō', 'õ', '', '', '', 'оу', '', \
'ѡ', 'ѿ', '', 'ъи', 'ѣ', '', 'ѥ', 'ѧ', 'ѩ', 'ѫ', 'ѭ', 'ѯ', 'ѱ', 'ѳ', 'ѵ', 'đ']
def _extract_features(text):
    """Return how often each entry of feature_list occurs in the lower-cased text."""
    lowered = text.lower()
    return [lowered.count(feature) for feature in feature_list]


def _wikipedia_url(abbrev, full_name):
    """Return the English Wikipedia URL for the language with the given abbreviation."""
    # Both Serbo-Croatian variants share one article; their display names
    # ("Serbo-Croatian: Latin" / "Serbo-Croat: Cyrillic") are not page titles.
    if abbrev in ("hr", "sr"):
        return "https://en.wikipedia.org/wiki/Dialects_of_Serbo-Croatian"
    return "https://en.wikipedia.org/wiki/" + full_name.replace(" ", "_") + "_language"


def predict_lang():
    """
    Ask the user for a text on the terminal and print the most likely language.

    The prediction of the neural network and of the random forest are both
    printed. The user can then ask for a Wikipedia link (for the random forest
    prediction, which is made last), view the neural network's probabilities
    for all languages, and enter further texts until answering anything other
    than "y".

    Parameters:
        None
    Returns:
        None
    """
    # create key between names of languages and their abbreviations on wikipedia for better readability
    # (the names are listed in the alphabetical order of their abbreviations)
    lang_key_list = ["Belarusian", "Bulgarian", "Czech", "Old Church Slavonic", "Lower Sorbian", "Serbo-Croatian: Latin", "Upper Sorbian",
                     "Macedonian", "Polish", "Russian", "Rusyn", "Slovak", "Slovene", "Serbo-Croat: Cyrillic", "Silesian", "Ukrainian"]
    lang_key = dict(zip(sorted(LANG_LIST, key=str.lower), lang_key_list))

    # A loop instead of self-recursion, so long sessions cannot hit the recursion limit.
    while True:
        text = str(input("\nEnter text to be classified: "))
        # extract feature vector from text (the classifiers expect a list of samples)
        test_data = [_extract_features(text)]

        print("\nNeural Network prediction:")
        prediction_abrev = str(nn.predict(test_data)[0])
        # Fall back to the raw label if it has no readable name.
        prediction_full = lang_key.get(prediction_abrev, prediction_abrev)
        print(prediction_full)

        print("\nRandom Forest prediction:")
        prediction_abrev = str(rf.predict(test_data)[0])
        prediction_full = lang_key.get(prediction_abrev, prediction_abrev)
        print(prediction_full)

        if input("\nWould you like find out more about this language? (y/n) ") == "y":
            wikipediapage = _wikipedia_url(prediction_abrev, prediction_full)
            print("Here is the wikipedia page for the language: " + wikipediapage)

        if input("\nWould you like the see the probabilites for other languages? (y/n) ") == "y":
            # predict_proba orders its columns like nn.classes_, so pair each
            # probability with that label instead of assuming a fixed order.
            for label, prob in zip(nn.classes_, nn.predict_proba(test_data)[0]):
                lang = lang_key.get(str(label), str(label))
                print(lang + ": " + str(prob))

        if input("\nWould you like to enter another text? (y/n) ") != "y":
            break
# Start the interactive prompt only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    predict_lang()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment