# index_generator_v1.py

import pandas as pd

class Index_generator():
    """Hold the nutrient index definition and turn dataset rows into documents.

    Attributes:
        index_name: Name of the target index.
        settings: Dict with a ``settings`` section (shard count and a Korean
            analyzer built on the nori tokenizer) and a ``mappings`` section
            (field types). Shaped like an Elasticsearch create-index body.
    """

    # Document field names. Position ``i`` is filled from element ``i`` of a
    # dataset row, so the order here must follow the CSV column order.
    # NOTE(review): the mapping below declares "weight_original" and
    # "nutrient_reference_weight_original", while documents carry "weight" and
    # "nutrient_reference_weight" -- confirm which names are intended.
    _DOC_FIELDS = (
        "code",
        "name",
        "manufacturer",
        "distributor",
        "data_classification_code",
        "data_classification_name",
        "food_classification_code",
        "food_classification_name",
        "food_representative_code",
        "food_representative_name",
        "food_category_code",
        "food_category_name",
        "food_subcategory_code",
        "food_subcategory_name",
        "weight",
        "nutrient_reference_weight",
        "energy",
        "protein",
        "fat",
        "carbohydrate",
        "sugar",
    )

    # Dataset read by get_parsed_docs() when no explicit path is given.
    _DEFAULT_DATASET = 'datasets/nutrient_dataset_ver_230924.csv'

    def __init__(self):
        self.index_name = "nutrient_ver_01"
        self.settings = {
            "settings": {
                "number_of_shards": 1,
                "analysis": {
                    # ``decompound_mode`` is a nori_tokenizer parameter, so it
                    # is declared on a named tokenizer; placed directly on a
                    # "custom" analyzer it would not reach the tokenizer.
                    "tokenizer": {
                        "nori_mixed": {
                            "type": "nori_tokenizer",
                            "decompound_mode": "mixed"
                        }
                    },
                    "analyzer": {
                        "korean": {
                            "type": "custom",
                            "tokenizer": "nori_mixed"
                        }
                    }
                }
            },
            "mappings": {
                "properties": {
                    "code": {"type": "keyword"},  # food code
                    "name": {"type": "text", "analyzer": "korean"},  # food name
                    "manufacturer": {"type": "keyword"},  # manufacturer name
                    "distributor": {"type": "keyword"},  # distributor name
                    "data_classification_code": {"type": "keyword"},  # data classification code
                    "data_classification_name": {"type": "keyword"},  # data classification name
                    "food_classification_code": {"type": "keyword"},  # food major-category code
                    "food_classification_name": {"type": "keyword"},  # food major-category name
                    "food_representative_code": {"type": "keyword"},  # representative food code
                    "food_representative_name": {"type": "keyword"},  # representative food name
                    "food_category_code": {"type": "keyword"},  # food middle-category code
                    "food_category_name": {"type": "keyword"},  # food middle-category name
                    "food_subcategory_code": {"type": "keyword"},  # food sub-category code
                    "food_subcategory_name": {"type": "keyword"},  # food sub-category name
                    "weight_original": {"type": "keyword"},  # food weight
                    "nutrient_reference_weight_original": {"type": "keyword"},  # reference amount for nutrient content
                    "energy": {"type": "double"},  # energy (kcal)
                    "protein": {"type": "double"},  # protein (g)
                    "fat": {"type": "double"},  # fat (g)
                    "carbohydrate": {"type": "double"},  # carbohydrate (g)
                    "sugar": {"type": "double"}  # sugars (g)
                }
            }
        }

    def remove_last_g(self, value):
        """Strip a trailing ``g`` from *value* and return the rest as a float.

        Not called from within this class.

        Args:
            value: String such as ``"100g"``.

        Returns:
            The text before the final ``g`` converted with ``float``.

        Raises:
            Exception: If *value* is not a string ending in ``g`` (this
                includes the empty string).
            ValueError: If the text before the ``g`` is not a valid float.
        """
        # endswith() is safe on "" where value[-1] would raise IndexError.
        if isinstance(value, str) and value.endswith('g'):
            return float(value[:-1])
        raise Exception("Cannot remove last g from {}".format(value))

    def parse_data2doc(self, data):
        """Convert one dataset row into a document dict.

        Args:
            data: Indexable row; elements 0..20 are read in the order given by
                ``_DOC_FIELDS``. Values are copied as-is, with no conversion.

        Returns:
            Dict keyed by the names in ``_DOC_FIELDS``, or ``None`` when the
            row cannot be indexed (for example, it has fewer than 21
            elements); in that case the error is printed.
        """
        try:
            return {field: data[i] for i, field in enumerate(self._DOC_FIELDS)}
        except (IndexError, KeyError, TypeError) as e:
            # Best effort: report the malformed row and let the caller decide
            # what to do with the None placeholder.
            print(e)
            return None

    def get_parsed_docs(self, path=None):
        """Read the dataset CSV and return one document per data row.

        Args:
            path: CSV file to read. Defaults to the bundled dataset path,
                resolved relative to the current working directory.

        Returns:
            List with one entry per CSV row (the first line is treated as the
            header): a document dict, or ``None`` for rows that
            ``parse_data2doc`` could not convert.
        """
        if path is None:
            path = self._DEFAULT_DATASET
        # NOTE(review): empty CSV cells come back from pandas as float NaN and
        # are passed through unchanged -- confirm the indexing side accepts
        # that.
        rows = pd.read_csv(path, header=0).values.tolist()
        return [self.parse_data2doc(row) for row in rows]