-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path301_df_rm_candidate.py
executable file
·58 lines (50 loc) · 2.51 KB
/
301_df_rm_candidate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import sys
import argparse
import pandas as pd
from tqdm import tqdm
def process_file(file_path, output_path):
    """
    Filter one CSV file and write the surviving rows to a new CSV file.

    A row is dropped when the value in its first column starts with 'PDC-' or
    'HEC-', or is exactly equal to 'DMS_other'. Every other row, and every
    column, is written out as pandas read it.

    Args:
        file_path: Path of the CSV file to read.
        output_path: Path the filtered CSV is written to (index omitted).

    Returns:
        A tuple ``(rows_removed, output_path)``.
    """
    df = pd.read_csv(file_path)
    original_size = len(df)
    # Match against a string view of the first column. pandas infers numeric
    # or entirely-empty columns as non-string dtypes, and the `.str` accessor
    # raises AttributeError on those. The cast is used for matching only, so
    # the values that get written out are untouched.
    first_column = df.iloc[:, 0].astype(str)
    # na=False makes a missing value count as "no match", which keeps the
    # mask strictly boolean before it is inverted.
    unwanted = (
        first_column.str.startswith('PDC-', na=False)
        | first_column.str.startswith('HEC-', na=False)
        | (first_column == 'DMS_other')
    )
    df = df[~unwanted]
    # Save the filtered DataFrame to the output file
    df.to_csv(output_path, index=False)
    return original_size - len(df), output_path
def process_directory(directory, output_directory):
    """
    Filter every CSV file found directly inside a directory.

    Each regular file in ``directory`` whose name ends with ".csv" is passed
    to ``process_file`` and the result is written under the same file name in
    ``output_directory``. Sub-directories are not descended into.

    Args:
        directory: Directory containing the input CSV files.
        output_directory: Directory the filtered files are written to; it is
            created if it does not exist yet.

    Returns:
        The total number of rows removed across all processed files.
    """
    # Create the destination up front so the function also works when it is
    # called directly rather than through the command-line entry point.
    os.makedirs(output_directory, exist_ok=True)
    # Select the inputs before starting the progress bar so that it counts
    # CSV files only. Sorting gives a reproducible order (os.listdir order is
    # arbitrary), and the isfile check keeps a sub-directory that happens to
    # be named "*.csv" away from the CSV reader.
    csv_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith(".csv")
        and os.path.isfile(os.path.join(directory, name))
    )
    total_rows_removed = 0
    for file in tqdm(csv_files):
        file_path = os.path.join(directory, file)
        output_path = os.path.join(output_directory, file)
        rows_removed, _ = process_file(file_path, output_path)
        total_rows_removed += rows_removed
        # tqdm.write prints the message without corrupting the active
        # progress bar, which a plain print() would do.
        tqdm.write(f"Removed {rows_removed} rows from {file_path} and saved to {output_path}")
    return total_rows_removed
def main():
    """
    Command-line entry point.

    Parses ``-i/--input`` and ``-o/--output`` and dispatches on the kind of
    input path:

    * a file is filtered with ``process_file``; if the output path is an
      existing directory the result is written inside it under the input's
      file name, otherwise the output path is used as the file name and its
      parent directory is created when missing;
    * a directory is filtered with ``process_directory`` into the output
      directory, which is created when missing;
    * anything else prints an error to stderr and exits with status 1.
    """
    parser = argparse.ArgumentParser(description='Remove rows from CSV files where the first column starts with "PDC-" or "HEC-", or equals "DMS_other".')
    parser.add_argument('-i', '--input', required=True, help='Input file or directory path')
    parser.add_argument('-o', '--output', required=True, help='Output file or directory path')
    args = parser.parse_args()
    input_path = args.input
    output_path = args.output
    if os.path.isfile(input_path):
        if os.path.isdir(output_path):
            # Writing a CSV onto a directory path would fail; place the
            # result inside that directory instead.
            output_path = os.path.join(output_path, os.path.basename(input_path))
        else:
            parent_directory = os.path.dirname(output_path)
            # dirname is '' for a bare file name, which makedirs rejects.
            if parent_directory:
                os.makedirs(parent_directory, exist_ok=True)
        rows_removed, _ = process_file(input_path, output_path)
        print(f"Removed {rows_removed} rows from {input_path} and saved to {output_path}")
    elif os.path.isdir(input_path):
        os.makedirs(output_path, exist_ok=True)
        total_rows_removed = process_directory(input_path, output_path)
        print(f"Removed a total of {total_rows_removed} rows from all CSV files in {input_path} and saved to {output_path}")
    else:
        # Diagnostics go to stderr so they are not mixed into piped output.
        print("Invalid input path. Please provide a valid file or directory path.", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()