-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_generator_v1.py
81 lines (77 loc) · 3.7 KB
/
index_generator_v1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
class Index_generator():
    """Hold the search-index definition for the nutrient dataset and build its documents.

    ``index_name`` and ``settings`` describe the index (shard count, a Korean
    analyzer and the field mappings).  ``get_parsed_docs`` reads the dataset
    CSV and turns every row into a document dict via ``parse_data2doc``.
    """

    # Document keys in dataset column order: the key at position i receives
    # the value of column i of a row.
    _DOC_FIELDS = (
        "code",
        "name",
        "manufacturer",
        "distributor",
        "data_classification_code",
        "data_classification_name",
        "food_classification_code",
        "food_classification_name",
        "food_representative_code",
        "food_representative_name",
        "food_category_code",
        "food_category_name",
        "food_subcategory_code",
        "food_subcategory_name",
        # NOTE(review): the mapping in __init__ declares "weight_original" and
        # "nutrient_reference_weight_original", but documents carry "weight"
        # and "nutrient_reference_weight".  The declared fields are therefore
        # never populated by these documents -- confirm which name is intended.
        "weight",
        "nutrient_reference_weight",
        "energy",
        "protein",
        "fat",
        "carbohydrate",
        "sugar",
    )

    def __init__(self):
        """Set the index name and the index body (settings + mappings)."""
        self.index_name = "nutrient_ver_01"
        self.settings = {
            "settings": {
                "number_of_shards": 1,
                "analysis": {
                    "analyzer": {
                        "korean": {
                            "type": "custom",
                            "tokenizer": "nori_tokenizer",
                            # NOTE(review): decompound_mode is documented as a
                            # nori tokenizer option; placed on the analyzer it
                            # may have no effect -- confirm against the
                            # search-engine documentation.
                            "decompound_mode": "mixed"
                        }
                    }
                }
            },
            "mappings": {
                "properties": {
                    "code": {"type": "keyword"},  # food code
                    "name": {"type": "text", "analyzer": "korean"},  # food name
                    "manufacturer": {"type": "keyword"},  # manufacturer name
                    "distributor": {"type": "keyword"},  # distributor name
                    "data_classification_code": {"type": "keyword"},  # data classification code
                    "data_classification_name": {"type": "keyword"},  # data classification name
                    "food_classification_code": {"type": "keyword"},  # food major-category code
                    "food_classification_name": {"type": "keyword"},  # food major-category name
                    "food_representative_code": {"type": "keyword"},  # representative food code
                    "food_representative_name": {"type": "keyword"},  # representative food name
                    "food_category_code": {"type": "keyword"},  # food middle-category code
                    "food_category_name": {"type": "keyword"},  # food middle-category name
                    "food_subcategory_code": {"type": "keyword"},  # food sub-category code
                    "food_subcategory_name": {"type": "keyword"},  # food sub-category name
                    "weight_original": {"type": "keyword"},  # food weight
                    "nutrient_reference_weight_original": {"type": "keyword"},  # reference amount for nutrient content
                    "energy": {"type": "double"},  # energy (kcal)
                    "protein": {"type": "double"},  # protein (g)
                    "fat": {"type": "double"},  # fat (g)
                    "carbohydrate": {"type": "double"},  # carbohydrate (g)
                    "sugar": {"type": "double"}  # sugars (g)
                }
            }
        }

    def remove_last_g(self, value):
        """Strip a trailing ``g`` from *value* and return the rest as a float.

        Args:
            value: A string such as ``"100g"``.

        Returns:
            The numeric part converted with ``float``.

        Raises:
            ValueError: If *value* is not a string ending in ``g`` (including
                the empty string), or if the remaining text is not a number.
        """
        # endswith() is safe on the empty string, unlike value[-1].
        if isinstance(value, str) and value.endswith('g'):
            return float(value[:-1])
        raise ValueError("Cannot remove last g from {}".format(value))

    def parse_data2doc(self, data):
        """Convert one dataset row into a document dict.

        Args:
            data: An indexable row holding at least 21 values in dataset
                column order.

        Returns:
            A dict keyed by the names in ``_DOC_FIELDS``, or ``None`` when the
            row cannot be indexed (too short, ``None``, ...).  The failure is
            printed rather than raised so one bad row does not stop a bulk
            conversion.
        """
        try:
            return {
                field: data[position]
                for position, field in enumerate(self._DOC_FIELDS)
            }
        except (IndexError, KeyError, TypeError) as e:
            print(e)
            return None

    def get_parsed_docs(self, csv_path='datasets/nutrient_dataset_ver_230924.csv'):
        """Read the dataset CSV and return one parsed document per row.

        Args:
            csv_path: Location of the CSV file; its first line is treated as
                the header.  Defaults to the bundled dataset path.

        Returns:
            A list with one entry per data row, each being the result of
            ``parse_data2doc`` (so an entry is ``None`` for a row that failed).
            Cell values are passed through as pandas produced them; empty
            cells are presumably NaN -- verify before serialising to JSON.
        """
        rows = pd.read_csv(csv_path, header=0).values.tolist()
        return [self.parse_data2doc(row) for row in rows]