Commit 27443da

Change the string append logic to use StringIO

1 parent a797c91 commit 27443da

File tree

4 files changed: +47, -35 lines
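
For context on the change: building a long string with repeated "+=" can re-copy the accumulated text on every append, while io.StringIO writes into a growable internal buffer and materializes the string once via getvalue(). A minimal sketch of the two patterns (illustrative only, not part of this commit):

    from io import StringIO
    import timeit

    def concat(parts):
        # naive accumulation: each += may copy the whole string built so far
        text = ""
        for p in parts:
            text += p
        return text

    def buffered(parts):
        # StringIO appends into a single buffer; one copy at getvalue()
        buf = StringIO()
        for p in parts:
            buf.write(p)
        return buf.getvalue()

    parts = ["line\n"] * 100_000
    print(timeit.timeit(lambda: concat(parts), number=10))
    print(timeit.timeit(lambda: buffered(parts), number=10))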

src/SimpleReplay/audit_logs_parsing.py

+17, -5
@@ -3,7 +3,7 @@
 
 This module parses various auditlogs
 """
-
+from io import StringIO
 logger = None
 
 

@@ -16,7 +16,19 @@ def __init__(self):
         self.database_name = ""
         self.pid = ""
         self.xid = ""
-        self.text = ""
+        self.text = StringIO()
+
+    def clear_and_set_text(self, new_value):
+        # Better to create a new instance, rather than truncate and seek - because it’s faster
+        self.text.close()
+        self.text = StringIO()
+        self.text.write(new_value)
+
+    def append_text(self, value):
+        self.text.write(value)
+
+    def get_text_value(self):
+        return self.text.getvalue()
 
     def get_filename(self):
         base_name = (
@@ -44,7 +56,7 @@ def __str__(self):
                 self.database_name,
                 self.pid,
                 self.xid,
-                self.text,
+                self.get_text_value(),
             )
         )
 

@@ -58,11 +70,11 @@ def __eq__(self, other):
             and self.database_name == other.database_name
             and self.pid == other.pid
             and self.xid == other.xid
-            and self.text == other.text
+            and self.get_text_value() == other.get_text_value()
         )
 
     def __hash__(self):
-        return hash((str(self.pid), str(self.xid), self.text.strip("\n")))
+        return hash((str(self.pid), str(self.xid), self.get_text_value().strip("\n")))
 
 
 class ConnectionLog:
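
A subtlety the __eq__ and __hash__ hunks above handle: once self.text is a StringIO, comparisons must go through get_text_value(), because two StringIO objects holding identical text still compare unequal (object identity, not content). A quick illustration:

    from io import StringIO

    a, b = StringIO("select 1"), StringIO("select 1")
    assert a != b                           # StringIO has no content-based __eq__
    assert a.getvalue() == b.getvalue()     # compare the materialized strings instead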

src/SimpleReplay/extract/extractor/extract_parser.py

+10, -10
@@ -64,11 +64,11 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
         if filename in logs:
             # Check if duplicate. This happens with JDBC connections.
             prev_query = logs[filename][-1]
-            if not is_duplicate(prev_query.text, user_activity_log.text):
+            if not is_duplicate(prev_query.get_text_value(), user_activity_log.get_text_value()):
                 if fetch_pattern.search(
-                    prev_query.text
-                ) and fetch_pattern.search(user_activity_log.text):
-                    user_activity_log.text = f"--{user_activity_log.text}"
+                    prev_query.get_text_value()
+                ) and fetch_pattern.search(user_activity_log.get_text_value()):
+                    user_activity_log.clear_and_set_text(f"--{user_activity_log.get_text_value()}")
                 logs[filename].append(user_activity_log)
         else:
             logs[filename].append(user_activity_log)
@@ -87,9 +87,9 @@ def _parse_user_activity_log(file, logs, databases, start_time, end_time):
             user_activity_log.database_name = query_information[3][3:]
             user_activity_log.pid = query_information[5][4:]
             user_activity_log.xid = query_information[7][4:]
-            user_activity_log.text = line_split[1]
+            user_activity_log.clear_and_set_text(line_split[1])
         else:
-            user_activity_log.text += line
+            user_activity_log.append_text(line)
 
 
 def _parse_start_node_log(file, logs, databases, start_time, end_time):
def _parse_start_node_log(file, logs, databases, start_time, end_time):
@@ -107,7 +107,7 @@ def _parse_start_node_log(file, logs, databases, start_time, end_time):
         if filename in logs:
             # Check if duplicate. This happens with JDBC connections.
             prev_query = logs[filename][-1]
-            if not is_duplicate(prev_query.text, start_node_log.text):
+            if not is_duplicate(prev_query.get_text_value(), start_node_log.get_text_value()):
                 logs[filename].append(start_node_log)
         else:
             logs[filename] = [start_node_log]
@@ -132,14 +132,14 @@ def _parse_start_node_log(file, logs, databases, start_time, end_time):
             start_node_log.username = query_information[4][3:].split(":")[0]
             start_node_log.pid = query_information[5][4:]
             start_node_log.xid = query_information[7][4:]
-            start_node_log.text = line_split[1].strip()
+            start_node_log.clear_and_set_text(line_split[1].strip())
         else:
-            start_node_log.text += line
+            start_node_log.append_text(line)
 
 
 def _parse_connection_log(file, connections, last_connections, start_time, end_time):
     for line in file.readlines():
-
+
         line = line.decode("utf-8")
 
         connection_information = line.split("|")
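
The pattern in both parsers: the first line of a record replaces the buffer via clear_and_set_text, and continuation lines stream in through append_text. A condensed, self-contained sketch of that control flow (the "LOG:" record marker and the function name are illustrative, not the actual audit-log format):

    from io import StringIO

    def parse_records(lines):
        # a header line starts a new record; every other line is
        # appended to the buffer of the record currently being built
        records, buf = [], None
        for line in lines:
            if line.startswith("LOG:"):          # illustrative record-start marker
                if buf is not None:
                    records.append(buf.getvalue())
                buf = StringIO()
                buf.write(line[len("LOG:"):])
            elif buf is not None:
                buf.write(line)
        if buf is not None:
            records.append(buf.getvalue())
        return records

    assert parse_records(["LOG:select\n", " 1;\n", "LOG:select 2;\n"]) == ["select\n 1;\n", "select 2;\n"]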

src/SimpleReplay/extract/extractor/extractor.py

+15, -15
@@ -200,33 +200,33 @@ def get_sql_connections_replacements(self, last_connections, log_items):
                 )
                 continue
 
-            query.text = remove_line_comments(query.text).strip()
+            query.clear_and_set_text(remove_line_comments(query.get_text_value()).strip())
 
-            if "copy " in query.text.lower() and "from 's3:" in query.text.lower():
+            if "copy " in query.get_text_value().lower() and "from 's3:" in query.get_text_value().lower():
                 bucket = re.search(
-                    r"from 's3:\/\/[^']*", query.text, re.IGNORECASE
+                    r"from 's3:\/\/[^']*", query.get_text_value(), re.IGNORECASE
                 ).group()[6:]
                 replacements.add(bucket)
-                query.text = re.sub(
+                query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text,
+                    query.get_text_value(),
                     flags=re.IGNORECASE,
-                )
-            if "unload" in query.text.lower() and "to 's3:" in query.text.lower():
-                query.text = re.sub(
+                ))
+            if "unload" in query.get_text_value().lower() and "to 's3:" in query.get_text_value().lower():
+                query.clear_and_set_text(re.sub(
                     r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
                     f" IAM_ROLE ''",
-                    query.text,
+                    query.get_text_value(),
                     flags=re.IGNORECASE,
-                )
+                ))
 
-            query.text = f"{query.text.strip()}"
-            if not len(query.text) == 0:
-                if not query.text.endswith(";"):
-                    query.text += ";"
+            query.clear_and_set_text(f"{query.get_text_value().strip()}")
+            if not len(query.get_text_value()) == 0:
+                if not query.get_text_value().endswith(";"):
+                    query.append_text(";")
 
-            query_info["text"] = query.text
+            query_info["text"] = query.get_text_value()
             sql_json["transactions"][query.xid]["queries"].append(query_info)
 
             if not hash((query.database_name, query.username, query.pid)) in last_connections:
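
The substitution routed through clear_and_set_text above blanks out account-specific IAM roles in COPY and UNLOAD statements. The same regex isolated as a standalone function (the function name is ours; the pattern is taken from the hunk):

    import re

    def scrub_iam_role(sql):
        # blank the IAM role ARN so replayed COPY/UNLOAD statements
        # neither leak nor depend on the source account
        return re.sub(
            r"IAM_ROLE 'arn:aws:iam::\d+:role/\S+'",
            "IAM_ROLE ''",
            sql,
            flags=re.IGNORECASE,
        )

    sql = "copy t from 's3://bucket/key' IAM_ROLE 'arn:aws:iam::123456789012:role/loader'"
    assert scrub_iam_role(sql) == "copy t from 's3://bucket/key' IAM_ROLE ''"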

src/SimpleReplay/log_validation.py

+5, -5
@@ -44,18 +44,18 @@ def is_valid_log(log, start_time, end_time):
     if end_time and log.record_time > end_time:
         return False
 
-    if any(word in log.text for word in problem_keywords):
+    if any(word in log.get_text_value() for word in problem_keywords):
         return False
 
-    if any(word in log.text for word in potential_problem_keywords) and not any(word in log.text for word in not_problem_keywords):
+    if any(word in log.get_text_value() for word in potential_problem_keywords) and not any(word in log.get_text_value() for word in not_problem_keywords):
         return False
 
     # filter internal statement rewrites with parameter markers
-    if re.search('\$\d',log.text):
+    if re.search('\$\d',log.get_text_value()):
         # remove \$\d in string literals ( select '$1' ) or comment blocks ( */ $1 */ )
-        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.text,flags=re.DOTALL)
+        text_without_valid_parameter_markers = re.sub("""'.*\\$\\d.*'|\\/\\*.*\\$\\d.*\\*\\/""",'',log.get_text_value(),flags=re.DOTALL)
         # remove \$\d in single line quotes ( -- $1 )
-        if '--' in log.text:
+        if '--' in log.get_text_value():
             text_without_valid_parameter_markers = re.sub('^\s*--.*\$\d','',text_without_valid_parameter_markers)
         # if there are still parameter markers remaining after removing them from valid cases, the query text is invalid
         if re.search('\$\d',text_without_valid_parameter_markers):
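
The marker check above, reduced to a standalone predicate (the function name and structure are ours; the regexes are the ones in the hunk): it masks $N markers inside string literals and comment blocks, then rejects the text if any marker survives.

    import re

    def has_bare_parameter_markers(text):
        if not re.search(r"\$\d", text):
            return False
        # mask $N inside string literals ('$1') and /* */ comment blocks
        masked = re.sub(r"'.*\$\d.*'|/\*.*\$\d.*\*/", "", text, flags=re.DOTALL)
        # mask $N in single-line comments ( -- $1 )
        if "--" in text:
            masked = re.sub(r"^\s*--.*\$\d", "", masked)
        # anything left is a real parameter marker, so the query is invalid
        return bool(re.search(r"\$\d", masked))

    assert not has_bare_parameter_markers("select '$1'")
    assert has_bare_parameter_markers("select * from t where id = $1")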
