Fix file path in detect_language.py in model

bffa5b43 · Samuel Innes · 2a90e913 · 2a90e913 · bffa5b43 · bffa5b43
Commit bffa5b43 authored 3 years ago by Samuel Innes
--- a/project/model.zip
+++ b/project/model.zip
--- a/project/model/data.txt
+++ b/project/model/data.txt
--- a/project/model/delect_language.py
+++ b/project/model/delect_language.py
+# This module provides the main feature of the model: predicting the language of a given text.
+# Run this module and follow the prompts to find out the predicted language.
+# There is also the option to view the probabilities for the other languages.
+
+import numpy as np
+import re
+import os
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.neural_network import MLPClassifier
+
+# working directory at import time; the data path below is built from it, so the
+# script has to be started from the directory that contains "project/"
+CWD = os.getcwd()
+# language abbreviations (as used on wikipedia) that predict_lang maps to full names
+LANG_LIST = ["ru", "pl", "uk", "sl", "sk", "sr", "bg", "be", "mk", "cs", "hr", "szl", "dsb", "hsb", "cu", "rue"]
+
+# read the whole data file and split it into tab-separated data points
+with open(CWD + "/project/model/data.txt", "r") as f:
+    data = f.read().strip().split("\t")
+
+# extract data from data.txt
+# Every data point supplies a label (its first run of ASCII letters) and a
+# parenthesised list of integers separated by ", " (the feature vector).
+# Raw strings avoid the invalid "\(" escape; compiling once keeps the loop cheap.
+_VECTOR_RE = re.compile(r"\([^a-zA-Z]+\)")
+_LABEL_RE = re.compile(r"[a-zA-Z]+")
+
+X = []
+y = []
+for dp in data:
+    if not dp.strip():
+        # an empty file or a stray tab produces empty entries; skip them
+        continue
+    vector_match = _VECTOR_RE.search(dp)
+    label_match = _LABEL_RE.search(dp)
+    if vector_match is None or label_match is None:
+        # fail with a readable message instead of "'NoneType' has no attribute 'group'"
+        raise ValueError("malformed data point in data.txt: " + repr(dp))
+    with_bracket = vector_match.group()[1:]
+    no_bracket = with_bracket.strip("(").strip(")").split(", ")
+    X.append(np.array(no_bracket, dtype=np.int32))
+    y.append(label_match.group())
+
+#print("length of data set: " + str(len(X)))
+
+# train both classifiers on every data point read above; fit() returns the
+# estimator itself, so construction and training can be chained
+rf = RandomForestClassifier().fit(X, y)
+
+nn = MLPClassifier(activation="tanh").fit(X, y)
+
+
+# hardcoded list of features: short words, single letters and digraphs whose
+# occurrences in the lower-cased input text are counted by predict_lang
+# NOTE(review): several entries repeat (e.g. 'на', 'a'); presumably the order and
+# the duplicates mirror the columns of the vectors in data.txt -- confirm before
+# deduplicating or reordering
+feature_list = ['в', 'и', 'на', 'с', 'года', 'по', 'из', 'что', 'а', 'году', 'w', 'i', 'z', 'na', 'do', 'się', \
+    'roku', 'a', 'od', 'po', 'в', 'у', 'на', 'з', 'і', 'та', 'до', 'року', 'за', 'що', 'je', 'in', 'v', 'na', \
+    'so', 'se', 'za', 'ki', 'leta', 'z', 'v', 'a', 'na', 'je', 'sa', 'z', 's', 'roku', 'm', 'ako', 'је', 'у', 'и', \
+    'на', 'се', 'су', 'од', 'за', 'да', 'године', 'на', 'и', 'в', 'е', 'от', 'се', 'за', 'с', 'г', 'да', 'і', 'у', \
+    'ў', 'з', 'на', 'года', 'да', 'па', 'а', 'г', 'на', 'во', 'и', 'од', 'се', 'за', 'е', 'со', 'да', 'година', 'v', \
+    'a', 'na', 'se', 'je', 's', 'z', 'do', 've', 'byl', 'je', 'i', 'u', 'na', 'se', 'su', 'od', 'za', 'a', 's', 'we', \
+    'je', 'i', 'do', 'co', 'przipisy', 'nŏleży', 'zorty', 'go', 'ôpisoł', 'a', 'jo', 'w', 'na', 'se', 'z', 'wót', \
+    'su', 'do', 'až', 'a', 'w', 'je', 'na', 'wot', 'z', 'so', 'do', 'k', 'za', 'и', 'ѥстъ', 'лѣта', 'бѣ', 'градъ', \
+    'жє', 'въ', 'ꙁьри', 'такождє', 'людии', 'в', 'и', 'на', 'ся', 'і', 'з', 'є', 'до', 'року', 'а', 'а', 'б', 'в', \
+    'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', \
+    'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'a', 'ą', 'b', 'c', 'ć', 'd', 'e', 'ę', 'f', 'g', 'h', 'i', 'j', 'k', \
+    'l', 'ł', 'm', 'n', 'ń', 'o', 'ó', 'p', 'q', 'r', 's', 'ś', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ź', 'ż', 'ґ', \
+    'є', 'і', 'ї', 'č', 'š', 'ž', 'á', 'ä', 'ď', 'dz', 'dž', 'é', 'ch', 'í', 'ĺ', 'ľ', 'ň', 'ô', 'ŕ', 'ť', 'ú', 'ý', \
+    'ђ', 'ј', 'љ', 'њ', 'ћ', 'џ', 'ў', 'ѓ', 'ѕ', 'ќ', 'ě', 'ř', 'ů', 'ã', 'ŏ', 'ō', 'õ', 'ꙃ', 'ꙁ', 'ꙉ', 'оу', 'ꙋ', \
+    'ѡ', 'ѿ', 'ꙑ', 'ъи', 'ѣ', 'ꙗ', 'ѥ', 'ѧ', 'ѩ', 'ѫ', 'ѭ', 'ѯ', 'ѱ', 'ѳ', 'ѵ', 'đ']
+
+def predict_lang():
+    """
+    Interactively classify texts entered in the terminal.
+
+    For every text the function prints the language predicted by the neural
+    network and by the random forest. The user may then ask for a wikipedia
+    link (built from the random forest prediction, which is printed last),
+    for the neural network's probability of every language, and whether to
+    classify another text. The function returns once the user declines.
+
+            Parameters:
+                    None
+
+            Returns:
+                    None
+    """
+    # create key between names of languages and their abbreviations on wikipedia for better readability;
+    # lang_key_list is ordered to line up with the alphabetically sorted LANG_LIST
+    lang_key_list = ["Belarusian", "Bulgarian", "Czech", "Old Church Slavonic", "Lower Sorbian", "Serbo-Croatian: Latin",  "Upper Sorbian",
+        "Macedonian", "Polish", "Russian", "Rusyn", "Slovak", "Slovene", "Serbo-Croat: Cyrillic", "Silesian", "Ukrainian"]
+    lang_key = dict(zip(sorted(LANG_LIST, key=str.lower), lang_key_list))
+
+    # loop instead of recursing so that long sessions do not grow the call stack
+    while True:
+        text_low = input("\nEnter text to be classified: ").lower()
+
+        # extract feature vector from text: one row holding the number of
+        # occurrences of every feature substring
+        test_data = [[text_low.count(feature) for feature in feature_list]]
+
+        print("\nNeural Network prediction:")
+        prediction_abrev = str(nn.predict(test_data)[0])
+        # fall back to the raw label if it has no entry in the key
+        prediction_full = lang_key.get(prediction_abrev, prediction_abrev)
+        print(prediction_full)
+
+        print("\nRandom Forest prediction:")
+        prediction_abrev = str(rf.predict(test_data)[0])
+        prediction_full = lang_key.get(prediction_abrev, prediction_abrev)
+        print(prediction_full)
+
+        if input("\nWould you like find out more about this language? (y/n) ") == "y":
+            # the original test `!= "hr" or "sr"` was always true, which made the
+            # Serbo-Croatian branch unreachable; test membership explicitly
+            if prediction_abrev in ("hr", "sr"):
+                wikipediapage = "https://en.wikipedia.org/wiki/Dialects_of_Serbo-Croatian"
+            else:
+                wikipediapage = "https://en.wikipedia.org/wiki/" + prediction_full.replace(" ", "_") + "_language"
+            print("Here is the wikipedia page for the language: " + wikipediapage)
+
+        if input("\nWould you like the see the probabilites for other languages? (y/n) ") == "y":
+            # predict_proba columns follow nn.classes_, so label each probability
+            # from there rather than assuming every language occurs in the data
+            for abrev, prob in zip(nn.classes_, nn.predict_proba(test_data)[0]):
+                print(lang_key.get(str(abrev), str(abrev)) + ": " + str(prob))
+
+        if input("\nWould you like to enter another text? (y/n) ") != "y":
+            break
+
+# start the interactive prompt only when this file is run as a script
+if __name__ == "__main__":
+    predict_lang()
\ No newline at end of file