Skip to content

Commit

Permalink
Parameters curation fix (#77)
Browse files Browse the repository at this point in the history
* tcr8 parameter curation

* some change

---------

Co-authored-by: hujiatao <[email protected]>
Co-authored-by: Shipeng Qi <[email protected]>
  • Loading branch information
3 people authored Jan 25, 2024
1 parent 0b0a9b5 commit 3a59172
Show file tree
Hide file tree
Showing 8 changed files with 635 additions and 0 deletions.
15 changes: 15 additions & 0 deletions scripts/paramgen/factor_table.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Rebuild the factor tables under ../../out/factor_table.
#
# The four generator scripts are started in parallel. The script exits with
# status 1 if any of them fails, instead of unconditionally reporting success.
# All paths are relative, so run this from the directory that holds the
# generator scripts.

rm -rf ../../out/factor_table

pids=()

python3 generate_account.py &
pids+=("$!")

python3 time_split.py &
pids+=("$!")

python3 split_amount.py &
pids+=("$!")

python3 loan.py &
pids+=("$!")

# A bare `wait` always returns 0, which hides generator failures; waiting on
# each PID individually surfaces every job's exit status.
failed=0
for pid in "${pids[@]}"; do
    if ! wait "$pid"; then
        failed=1
    fi
done

if [ "$failed" -ne 0 ]; then
    echo "Factor generation failed." >&2
    exit 1
fi

echo "All factors have been generated."
43 changes: 43 additions & 0 deletions scripts/paramgen/generate_account.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os
import pandas as pd


def process_csv(file_path):
    """Load one pipe-delimited CSV file and return it as a DataFrame."""
    return pd.read_csv(file_path, sep='|')


# Input folders with the raw pipe-delimited CSV files and the folder that
# receives the generated factor tables. All paths are relative to the current
# working directory.
account_folder_path = '../../out/raw/account'
transfer_folder_path = '../../out/raw/transfer'
output_folder = '../../out/factor_table'
withdraw_folder_path = '../../out/raw/withdraw'

account_files = [os.path.join(account_folder_path, file) for file in os.listdir(account_folder_path) if file.endswith('.csv')]
transfer_files = [os.path.join(transfer_folder_path, file) for file in os.listdir(transfer_folder_path) if file.endswith('.csv')]
withdraw_files = [os.path.join(withdraw_folder_path, file) for file in os.listdir(withdraw_folder_path) if file.endswith('.csv')]

account_df = pd.concat([process_csv(file) for file in account_files])
transfer_df = pd.concat([process_csv(file) for file in transfer_files])
withdraw_df = pd.concat([process_csv(file) for file in withdraw_files])

# amount.csv: for every account id, the sum of 'amount' over the transfers
# whose 'toId' is that account. The left merge keeps accounts that receive no
# transfer, and their sum is 0.
merged_df = pd.merge(account_df, transfer_df, left_on='id', right_on='toId', how='left')

result_amount_df = merged_df.groupby('id')['amount'].sum().reset_index().fillna(0)

# account_items.csv: for every account id, a list of [toId, max amount] pairs
# over its outgoing transfer and withdraw rows. The maximum per
# (fromId, toId) pair is computed once for all rows instead of re-filtering
# both frames for every account; pairs come out ordered by toId.
edge_columns = ['fromId', 'toId', 'amount']
outgoing_df = pd.concat([transfer_df[edge_columns], withdraw_df[edge_columns]], ignore_index=True)
max_amounts = outgoing_df.groupby(['fromId', 'toId'])['amount'].max().reset_index()

items_by_account = {
    from_id: [[to_id, max_amount] for to_id, max_amount in zip(group['toId'], group['amount'])]
    for from_id, group in max_amounts.groupby('fromId')
}

# Accounts without any outgoing row get an empty item list.
account_items = [[account_id, items_by_account.get(account_id, [])] for account_id in account_df['id']]

os.makedirs(output_folder, exist_ok=True)

result_amount_df.to_csv(os.path.join(output_folder, 'amount.csv'), sep='|', index=False, header=['account_id', 'amount'])
result_df = pd.DataFrame(account_items, columns=['account_id', 'items'])
result_df.to_csv(os.path.join(output_folder, 'account_items.csv'), sep='|', index=False)
27 changes: 27 additions & 0 deletions scripts/paramgen/loan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import pandas as pd

def process_csv(file_path):
    """Read a single '|'-separated CSV file into a pandas DataFrame."""
    return pd.read_csv(file_path, sep='|')

# Input folders with the raw pipe-delimited CSV files and the folder that
# receives the generated factor table. All paths are relative to the current
# working directory.
loan_folder_path = '../../out/raw/loan'
deposit_folder_path = '../../out/raw/deposit'
output_folder = '../../out/factor_table'

loan_files = [os.path.join(loan_folder_path, file) for file in os.listdir(loan_folder_path) if file.endswith('.csv')]
deposit_files = [os.path.join(deposit_folder_path, file) for file in os.listdir(deposit_folder_path) if file.endswith('.csv')]

loan_df = pd.concat([process_csv(file) for file in loan_files])
deposit_df = pd.concat([process_csv(file) for file in deposit_files])

# Distinct 'accountId' values per 'loanId', in order of first appearance.
# Grouping the deposits once replaces re-filtering the whole deposit frame
# for every loan id.
accounts_by_loan = {
    loan_id: accounts.tolist()
    for loan_id, accounts in deposit_df.groupby('loanId')['accountId'].unique().items()
}

# Loans without any deposit row get an empty account list.
result_list = [[loan_id, accounts_by_loan.get(loan_id, [])] for loan_id in loan_df['id'].unique()]

result_df = pd.DataFrame(result_list, columns=['loan_id', 'account_list'])

os.makedirs(output_folder, exist_ok=True)
# Build the path from output_folder so the file lands in the folder created above.
result_df.to_csv(os.path.join(output_folder, 'loan_account_list.csv'), sep='|', index=False)
182 changes: 182 additions & 0 deletions scripts/paramgen/parameter_curation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#!/usr/bin/env python3

from ast import literal_eval
from calendar import timegm
import pandas as pd
import numpy as np
import search_params
import time_select
import os
import codecs
from datetime import date

# Multiplier used by find_neighbors: an outgoing item is kept only when its
# second element is greater than the account's 'amount' value times this
# factor. With 0, every item whose second element is positive passes.
THRESH_HOLD = 0
# Used by find_neighbors: bucket columns are summed from the last header
# backwards, and the header at which the running sum first reaches this value
# becomes the cut-off below which items are dropped.
TRUNCATION_LIMIT = 10000

def process_csv(file_path):
    """Parse the pipe-separated CSV at file_path and return the resulting DataFrame."""
    return pd.read_csv(file_path, sep='|')


class CSVSerializer:
    """Write pipe-separated rows whose columns come from registered handlers.

    Every handler is paired with its own input sequence. Output row ``i`` is
    built by applying each handler to element ``i`` of its input and joining
    the resulting strings with ``|``.
    """

    def __init__(self):
        self.handlers = []
        self.inputs = []

    def setOutputFile(self, outputFile):
        """Set the path of the file that writeCSV() creates."""
        self.outputFile = outputFile

    def registerHandler(self, handler, inputParams, header):
        """Register one output column.

        Args:
            handler: Callable that turns one element of ``inputParams`` into
                a string. The header is stored on it as ``handler.header``.
            inputParams: Indexable sequence of values for this column.
            header: Header text written for this column.
        """
        handler.header = header
        self.handlers.append(handler)
        self.inputs.append(inputParams)

    def writeCSV(self):
        """Write the header line and one line per element of the first input.

        Missing parent directories of the output file are created. When no
        handler is registered an empty file is produced.
        """
        dir_path = os.path.dirname(self.outputFile)
        # dirname is '' for a bare file name, and os.makedirs('') would raise.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        # The context manager closes the file on the early return and when a
        # handler raises, not only on the normal path.
        with codecs.open(self.outputFile, "w", encoding="utf-8") as output:
            if not self.inputs:
                return

            output.write("|".join(handler.header for handler in self.handlers))
            output.write("\n")

            # The row count is taken from the first registered input.
            for i in range(len(self.inputs[0])):
                csvLine = [handler(column[i]) for handler, column in zip(self.handlers, self.inputs)]
                output.write("|".join(csvLine))
                output.write("\n")


def find_neighbors(account_list, account_account_df, account_amount_df, amount_bucket_df, num_list,
                   threshold=None, truncation_limit=None):
    """Collect the distinct first elements of the items that survive filtering.

    For every account in ``account_list`` its items are filtered in two steps:

    1. An item is kept only if its second element is greater than the
       account's 'amount' value multiplied by ``threshold``.
    2. The account's bucket values are summed from the last entry of
       ``num_list`` backwards. If the running sum reaches
       ``truncation_limit``, the header reached becomes a cut-off and items
       whose second element is below it are dropped.

    Candidates are kept per account, so one account's cut-off is never
    applied to another account's items.

    Args:
        account_list: Account ids to expand; each must be present in the
            index of all three frames.
        account_account_df: Frame indexed by account id with an 'items'
            column holding sequences of two-element items.
        account_amount_df: Frame indexed by account id with an 'amount' column.
        amount_bucket_df: Frame indexed by account id whose columns are named
            ``str(col)`` for every ``col`` in ``num_list``.
        num_list: Bucket headers; presumably in ascending order, since they
            are walked in reverse -- verify against the caller.
        threshold: Multiplier for step 1; defaults to ``THRESH_HOLD``.
        truncation_limit: Limit for step 2; defaults to ``TRUNCATION_LIMIT``.

    Returns:
        List of the distinct first elements of all surviving items, in
        arbitrary order.
    """
    if threshold is None:
        threshold = THRESH_HOLD
    if truncation_limit is None:
        truncation_limit = TRUNCATION_LIMIT

    result = set()

    for item in account_list:
        ata_list = account_account_df.loc[item]['items']
        rows_amount_bucket = amount_bucket_df.loc[item]
        transfer_in_amount = account_amount_df.loc[item]['amount']

        # edge amount > upstream * threshold
        min_amount = transfer_in_amount * threshold
        candidates = [ata for ata in ata_list if ata[1] > min_amount]

        # truncate at truncation_limit: -1 means the limit was never reached
        sum_num = 0
        header_at_limit = -1
        for col in reversed(num_list):
            sum_num += rows_amount_bucket[str(col)]
            if sum_num >= truncation_limit:
                header_at_limit = col
                break

        for ata in candidates:
            if header_at_limit != -1 and ata[1] < header_at_limit:
                continue
            result.add(ata[0])

    return list(result)


def get_next_neighbor_list(neighbors_df, account_account_df, account_amount_df, amount_bucket_df):
    """Replace every 'account_list' entry with the neighbors of its accounts.

    The 'account_list' column of ``neighbors_df`` is overwritten in place and
    that same frame object is returned.
    """
    # Bucket headers as ints: the labels of the first row of amount_bucket_df,
    # leaving out the first label.
    bucket_labels = amount_bucket_df.iloc[0].index.tolist()
    num_list = list(map(int, bucket_labels[1:]))

    def expand(accounts):
        return find_neighbors(accounts, account_account_df, account_amount_df, amount_bucket_df, num_list)

    neighbors_df['account_list'] = neighbors_df['account_list'].apply(expand)
    return neighbors_df


def get_next_sum_table(neighbors_df, basic_sum_df):
    """Sum the rows of basic_sum_df that belong to each loan's account list.

    For every row of ``neighbors_df`` the rows of ``basic_sum_df`` whose index
    appears in that row's 'account_list' are added up column by column and
    cast to int. The returned frame has one row per input row: the column
    sums plus a 'loan_id' column.
    """
    def summed_row(loan_id, account_list):
        members = basic_sum_df.index.isin(account_list)
        totals = basic_sum_df.loc[members].sum(axis=0).astype(int)
        totals['loan_id'] = loan_id
        return totals.to_dict()

    records = [
        summed_row(row['loan_id'], row['account_list'])
        for _, row in neighbors_df.iterrows()
    ]
    return pd.DataFrame(records)


def handleLoanParam(loan):
    """Render a loan parameter as the text of its CSV cell."""
    cell = str(loan)
    return cell


def handleTimeDurationParam(timeParam):
    """Format a time window as ``"<start>|<end>"`` in epoch milliseconds.

    ``timeParam`` must expose ``year``, ``month``, ``day`` and ``duration``.
    The start is midnight UTC of the given date; the end lies ``duration``
    days after the start.
    """
    first_day = date(
        year=int(timeParam.year),
        month=int(timeParam.month),
        day=int(timeParam.day),
    )
    start = timegm(first_day.timetuple()) * 1000
    end = start + timeParam.duration * 3600 * 24 * 1000
    return "|".join((str(start), str(end)))


def main():
    """Build the tcr8 substitution parameters from the factor tables.

    Reads the factor tables, walks the neighbor lists for a fixed number of
    steps while collecting one column of per-loan bucket totals per step,
    passes the collected matrix to ``search_params.generate`` and the last
    step's time buckets to ``time_select.findTimeParams``, and writes the
    outcome to ``tcr8.txt``.
    """
    loan_account_path = '../../out/factor_table/loan_account_list.csv'
    account_account_path = '../../out/factor_table/account_items.csv'
    account_amount_path = '../../out/factor_table/amount.csv'
    amount_bucket_path = '../../out/factor_table/amount_bucket.csv'
    time_bucket_path = '../../out/factor_table/month.csv'
    output_path = '../../out/substitute_parameters/'

    loan_account_df = process_csv(loan_account_path)
    account_account_df = process_csv(account_account_path)
    account_amount_df = process_csv(account_amount_path)
    amount_bucket_df = process_csv(amount_bucket_path)
    time_bucket_df = process_csv(time_bucket_path)

    # The list-valued columns are stored as text in the CSV files.
    account_account_df['items'] = account_account_df['items'].apply(literal_eval)
    loan_account_df['account_list'] = loan_account_df['account_list'].apply(literal_eval)

    # Every per-account table is looked up by account id.
    for frame in (account_account_df, amount_bucket_df, time_bucket_df, account_amount_df):
        frame.set_index('account_id', inplace=True)

    steps = 3
    neighbors_df = loan_account_df
    final_array = neighbors_df['loan_id'].to_numpy()
    next_time_bucket = None

    for current_step in range(steps):
        # One extra column per step: the row-wise total of the summed amount buckets.
        next_amount_bucket = get_next_sum_table(neighbors_df, amount_bucket_df)
        next_amount_bucket.set_index('loan_id', inplace=True)
        step_totals = next_amount_bucket.to_numpy().sum(axis=1)
        final_array = np.column_stack((final_array, step_totals))

        if current_step != steps - 1:
            neighbors_df = get_next_neighbor_list(neighbors_df, account_account_df, account_amount_df, amount_bucket_df)
            continue

        # Last step: no further expansion, only the time buckets are summed.
        next_time_bucket = get_next_sum_table(neighbors_df, time_bucket_df)
        next_time_bucket.set_index('loan_id', inplace=True)

    result = search_params.generate(final_array, 0.01)
    time_list = time_select.findTimeParams(result, next_time_bucket)

    csvWriter = CSVSerializer()
    csvWriter.setOutputFile(output_path + "tcr8.txt")
    csvWriter.registerHandler(handleLoanParam, result, "loanId")
    csvWriter.registerHandler(handleTimeDurationParam, time_list, "startDate|endDate")

    csvWriter.writeCSV()


if __name__ == "__main__":
    main()
Loading

0 comments on commit 3a59172

Please sign in to comment.