
Commit

added name map extractor
WissamAntoun committed Feb 11, 2021
1 parent 3dfa88c commit 2e5d280
Showing 9 changed files with 369 additions and 75 deletions.
3 changes: 1 addition & 2 deletions AnalyzeDB.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     df = pd.read_csv("Data/all_engineers.csv", encoding="utf-8")
     # df.Links = "https://www.oea.org.lb/Arabic/"+df.Links
     # df.to_csv("Data/all_engineers.csv", index=False)
@@ -20,4 +20,3 @@

     # results=df.groupby("Field").agg({'Engineer_ID': ['min','median', 'max']})
     # results.sort_values([('Engineer_ID', 'median')], ascending=False)
-
51 changes: 35 additions & 16 deletions GetTheFieldsAndSubfields.py
@@ -4,16 +4,18 @@
 import sys
 import codecs
 
-sys.stdout.reconfigure(encoding='utf-8')
+sys.stdout.reconfigure(encoding="utf-8")
 
-if __name__ == '__main__':
-    subfields_df = pd.read_csv('subfields.csv', encoding="utf-8", index_col=0)
-    fields_df = pd.read_csv('fields.csv', encoding="utf-8", index_col=0)
+if __name__ == "__main__":
+    subfields_df = pd.read_csv("subfields.csv", encoding="utf-8", index_col=0)
+    fields_df = pd.read_csv("fields.csv", encoding="utf-8", index_col=0)
 
-    fields = np.delete(fields_df.index.to_numpy(),
-                       np.where(fields_df.index.to_numpy() == -1))
-    subfields = np.delete(subfields_df.index.to_numpy(),
-                          np.where(subfields_df.index.to_numpy() == -1))
+    fields = np.delete(
+        fields_df.index.to_numpy(), np.where(fields_df.index.to_numpy() == -1)
+    )
+    subfields = np.delete(
+        subfields_df.index.to_numpy(), np.where(subfields_df.index.to_numpy() == -1)
+    )
     print("Hello")
 
     # "fstname":,
@@ -33,20 +35,37 @@
                 "spec": field,
                 "spec1": subfield,
                 "searchoption": "And",
-                "rand": 0.055286690143709905
+                "rand": 0.055286690143709905,
             }
             r = requests.get(
-                "https://www.oea.org.lb/Arabic/GetMembers.aspx", params=parameters)
+                "https://www.oea.org.lb/Arabic/GetMembers.aspx", params=parameters
+            )
 
             response = r.text
 
-            if("لا يوجد أي نتيجة" in response):
+            if "لا يوجد أي نتيجة" in response:
                 print("wrong issue")
             else:
-                print(field, fields_df.loc[field].Field, subfield,
-                      subfields_df.loc[subfield].Subfield, sep=', ')
+                print(
+                    field,
+                    fields_df.loc[field].Field,
+                    subfield,
+                    subfields_df.loc[subfield].Subfield,
+                    sep=", ",
+                )
 
                 # Writing data to a file
-                file1.write(", ".join(map(str, [field, fields_df.loc[field].Field, subfield,
-                                                subfields_df.loc[subfield].Subfield])))
-                file1.write('\n')
+                file1.write(
+                    ", ".join(
+                        map(
+                            str,
+                            [
+                                field,
+                                fields_df.loc[field].Field,
+                                subfield,
+                                subfields_df.loc[subfield].Subfield,
+                            ],
+                        )
+                    )
+                )
+                file1.write("\n")
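For readers who do not read Arabic: the string checked in the response, "لا يوجد أي نتيجة", means "there is no result", i.e. the search returned no members for that field/subfield pair.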
210 changes: 210 additions & 0 deletions get_name_mapping.py
@@ -0,0 +1,210 @@
# %%
import json
import re

import editdistance
import pandas as pd
from tqdm import tqdm

# %%
engineers_df = pd.read_csv("Data/all_engineers.csv", encoding="utf-8")
half_names = [
    x.split(",")
    for x in open("half_names.csv", "r", encoding="utf-8").read().split("\n")
    if x != ""
]
# %%
# %%
full_name_map = []
for row in engineers_df[["Arabic_Names", "Latin_Names"]].iterrows():
    full_name_map.append((row[0], row[1]["Arabic_Names"], row[1]["Latin_Names"]))
# %%

rejected_chars_regex = r"[^\u0621-\u063A\u0641-\u064Aa-z ]"


def clean(name: str):
    name = name.lower()
    name = name.replace("آ", "ا")
    name = name.replace("أ", "ا")
    name = name.replace("إ", "ا")
    name = re.sub(rejected_chars_regex, "", name)
    return name


def join_name_list(arabic_full_name: str, latin_full_name: str):
    """
    Splits the string by whitespace, then tries to find
    composite names using a predefined list of prefixes and suffixes
    """
    arabic_name_full_list_tmp = [clean(x) for x in arabic_full_name.split()]

    arabic_name_full_list = []
    for name in arabic_name_full_list_tmp:
        if name != "متاهلة":
            arabic_name_full_list.append(name)
        else:
            break

    latin_full_name_list = [clean(x) for x in latin_full_name.split()]
    flag = False
    for ar_hf_name, la_hf_name, is_start in half_names:
        new_ar_name_list = []
        if ar_hf_name in arabic_name_full_list and la_hf_name in " ".join(
            latin_full_name_list
        ):
            flag = True
            index = arabic_name_full_list.index(ar_hf_name)
            if is_start == "start":
                new_ar_name_list.extend(arabic_name_full_list[:index])
                new_ar_name_list.append(
                    ar_hf_name + "-" + arabic_name_full_list[index + 1]
                )
                new_ar_name_list.extend(arabic_name_full_list[index + 2 :])
                arabic_name_full_list = new_ar_name_list
            else:
                new_ar_name_list.extend(arabic_name_full_list[: index - 1])
                new_ar_name_list.append(
                    arabic_name_full_list[index - 1] + "-" + ar_hf_name
                )
                new_ar_name_list.extend(arabic_name_full_list[index + 1 :])
                arabic_name_full_list = new_ar_name_list
        new_la_name_list = []
        if la_hf_name in latin_full_name_list and ar_hf_name in " ".join(
            arabic_name_full_list
        ):
            flag = True
            index = latin_full_name_list.index(la_hf_name)
            if is_start == "start":
                new_la_name_list.extend(latin_full_name_list[:index])
                new_la_name_list.append(
                    la_hf_name + "-" + latin_full_name_list[index + 1]
                )
                new_la_name_list.extend(latin_full_name_list[index + 2 :])
                latin_full_name_list = new_la_name_list
            else:
                new_la_name_list.extend(latin_full_name_list[: index - 1])
                new_la_name_list.append(
                    latin_full_name_list[index - 1] + "-" + la_hf_name
                )
                new_la_name_list.extend(latin_full_name_list[index + 1 :])
                latin_full_name_list = new_la_name_list
    if flag:
        return join_name_list(
            " ".join(arabic_name_full_list), " ".join(latin_full_name_list)
        )
    else:
        return arabic_name_full_list, latin_full_name_list


def add_name_to_dict(source_name: str, target_name: str, name_dict: dict):
    if source_name in name_dict:
        if target_name in name_dict[source_name]:
            name_dict[source_name][target_name] += 1
        else:
            name_dict[source_name][target_name] = 1
    else:
        name_dict[source_name] = {target_name: 1}
    return name_dict


latin_to_arabic_name_map = {}
arabic_to_latin_name_map = {}
weird_names = []
fix_erros = []
for name_tuple in tqdm(full_name_map):
    id = name_tuple[0]
    arabic_full_name = name_tuple[1]
    latin_full_name = name_tuple[2]
    try:
        arabic_name_full_list, latin_full_name_list = join_name_list(
            arabic_full_name, latin_full_name
        )
    except:
        fix_erros.append(name_tuple)
        continue

    if len(arabic_name_full_list) == len(latin_full_name_list):
        for arabic_name, latin_name in zip(arabic_name_full_list, latin_full_name_list):
            latin_to_arabic_name_map = add_name_to_dict(
                latin_name, arabic_name, latin_to_arabic_name_map
            )
            arabic_to_latin_name_map = add_name_to_dict(
                arabic_name, latin_name, arabic_to_latin_name_map
            )
    else:
        weird_names.append((id, arabic_name_full_list, latin_full_name_list))


# %%
cleaned_latin_to_arabic_name_map = {}
wrong_count = 0
wrong_names = [("source_name", "target_name", "top_target_name")]
for source_name, value in latin_to_arabic_name_map.items():
    value = dict(sorted(value.items(), key=lambda item: item[1], reverse=True))
    cleaned_value = {}
    for idx, (target_name, count) in enumerate(value.items()):
        if idx == 0:
            cleaned_value[target_name] = count
            continue
        top_target_name = list(
            dict(
                sorted(
                    arabic_to_latin_name_map[target_name].items(),
                    key=lambda item: item[1],
                    reverse=True,
                )
            ).keys()
        )[0]
        if editdistance.eval(top_target_name, source_name) > 2:
            wrong_names.append((source_name, target_name, top_target_name))
            wrong_count += 1
            continue
        else:
            cleaned_value[target_name] = count
    cleaned_latin_to_arabic_name_map[source_name] = cleaned_value

print("Cleaned Words: ", wrong_count)
# %%
cleaned_arabic_to_latin_name_map = {}
wrong_count = 0
wrong_names = [("source_name", "target_name", "top_target_name")]
for source_name, value in arabic_to_latin_name_map.items():
    value = dict(sorted(value.items(), key=lambda item: item[1], reverse=True))
    cleaned_value = {}
    for idx, (target_name, count) in enumerate(value.items()):
        if idx == 0:
            cleaned_value[target_name] = count
            continue
        top_target_name = list(
            dict(
                sorted(
                    latin_to_arabic_name_map[target_name].items(),
                    key=lambda item: item[1],
                    reverse=True,
                )
            ).keys()
        )[0]
        if editdistance.eval(top_target_name, source_name) > 2:
            wrong_names.append((source_name, target_name, top_target_name))
            wrong_count += 1
            continue
        else:
            cleaned_value[target_name] = count
    cleaned_arabic_to_latin_name_map[source_name] = cleaned_value

print("Cleaned Words: ", wrong_count)

# %%
json.dump(
    cleaned_latin_to_arabic_name_map,
    open("name_maps/latin_to_arabic_map.json", "w", encoding="utf-8"),
)
json.dump(
    cleaned_arabic_to_latin_name_map,
    open("name_maps/arabic_to_latin_map.json", "w", encoding="utf-8"),
)
# %%
print("Total Arabic Names: ", len(cleaned_arabic_to_latin_name_map))
print("Total Latin Names: ", len(cleaned_latin_to_arabic_name_map))
# %%
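The script above ends by writing two JSON files, name_maps/latin_to_arabic_map.json and name_maps/arabic_to_latin_map.json, where each key is a name token and each value is a dict of counterpart spellings with their occurrence counts. A rough sketch of how those maps might be consumed (not part of this commit; the top_candidate helper and the "khaled" lookup are illustrative assumptions):

import json

# Load the Latin -> Arabic map produced by get_name_mapping.py above.
with open("name_maps/latin_to_arabic_map.json", encoding="utf-8") as f:
    latin_to_arabic = json.load(f)


def top_candidate(token: str) -> str:
    """Return the Arabic spelling observed most often for a Latin token."""
    candidates = latin_to_arabic.get(token, {})  # {arabic_spelling: count}
    if not candidates:
        return token  # unseen token: fall back to the input unchanged
    return max(candidates, key=candidates.get)


print(top_candidate("khaled"))  # hypothetical lookup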
37 changes: 37 additions & 0 deletions half_names.csv
@@ -0,0 +1,37 @@
ابي,abi,start
ابي,abil,start
الله,allah,end
الله,alla,end
ذو,zoul,start
عبد,abdul,start
عبد,abdel,start
عبد,abd,start
ابو,abou,start
ابو,abu,start
ابو,abo,start
بو,abou,start
ابو,aboul,start
ابو,abouel,start
ال,el,start
ال,al,start
الدين,eddine,end
الدين,ddine,end
الدين,ddin,end
الدين,din,end
الدين,dine,end
الدين,deen,end
الدين,edeen,end
الدين,eddeen,end
الدين,eddin,end
الدين,eddine,end
الدين,eldin,end
الدين,eldeen,end
الدين,eddeen,end
الدين,eldine,end
الدين,eldeen,end
الدين,aldine,end
الدين,alddine,end
الدين,aldeen,end
الدين,addeen,end
ديل,del,start

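Each row of half_names.csv is an (arabic_half, latin_half, position) triple: "start" particles such as عبد are glued to the token that follows them, and "end" particles such as الدين to the token that precedes them, which is how join_name_list above keeps composite names as a single token on both sides. A small sketch of reading the file the same way get_name_mapping.py does (the start_particles/end_particles names and the sample checks are illustrative only):

# Parse half_names.csv into (arabic_half, latin_half, position) triples,
# mirroring the list comprehension in get_name_mapping.py.
rows = [
    line.split(",")
    for line in open("half_names.csv", encoding="utf-8").read().split("\n")
    if line != ""
]
start_particles = {ar for ar, _, pos in rows if pos == "start"}  # glued to the next token
end_particles = {ar for ar, _, pos in rows if pos == "end"}  # glued to the previous token
print("عبد" in start_particles, "الدين" in end_particles)  # expected output: True True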
2 changes: 1 addition & 1 deletion mergeAllFiles.py
@@ -2,7 +2,7 @@
 import pandas as pd
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     files = glob.glob("Data/*[0-9].csv")
     print(files)
     print(len(files))
1 change: 1 addition & 0 deletions name_maps/arabic_to_latin_map.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions name_maps/latin_to_arabic_map.json

Large diffs are not rendered by default.

