22
33from __future__ import annotations
44
5- import itertools
65import logging
76import re
87from urllib .parse import urlparse
@@ -198,17 +197,28 @@ def get_leaders(self):
198197
199198 leaders = []
200199 for line in content .split ("\n " ):
201- leaders .extend (
202- [
203- name
204- for name in itertools .chain (
205- * re .findall (
206- r"[-*]\s*\[\s*([^(]+?)\s*(?:\([^)]*\))?\]|\*\s*([\w\s]+)" , line .strip ()
207- )
208- )
209- if name .strip ()
210- ]
200+ stripped_line = line .strip ()
201+ names = []
202+
203+ # Match [name] or [name (optional_info)] patterns
204+ # Using bounded quantifiers {0,3} to prevent catastrophic backtracking
205+ bracketed_pattern = (
206+ r"[-*]\s{0,3}\[\s{0,3}([^\]\(]{1,200})"
207+ r"(?:\s{0,3}\([^)]{0,100}\))?\s{0,3}\]"
211208 )
209+ names .extend (re .findall (bracketed_pattern , stripped_line ))
210+
211+ # Match * name patterns
212+ names .extend (re .findall (r"\*\s{0,3}([\w\s]{1,200})" , stripped_line ))
213+
214+ # Clean names by removing parenthetical info (avoid PLW2901)
215+ cleaned_names = []
216+ for raw_name in names :
217+ if raw_name .strip ():
218+ cleaned = re .sub (r"\s*\([^)]{0,100}\)\s*$" , "" , raw_name ).strip ()
219+ cleaned_names .append (cleaned )
220+
221+ leaders .extend (cleaned_names )
212222
213223 return leaders
214224
@@ -220,23 +230,31 @@ def get_leaders_emails(self):
220230
221231 leaders = {}
222232 for line in content .split ("\n " ):
223- matches = re .findall (
224- r"^[-*]\s*\[([^\]]+)\]\((?:mailto:)?([^)]+)(\)|([^[<\n]))" , line .strip ()
233+ stripped_line = line .strip ()
234+ # Use bounded quantifiers to prevent backtracking on malformed input
235+ # Match [name](email) or [name](mailto:email) patterns
236+ email_pattern = (
237+ r"^[-*]\s{0,3}\[([^\]]{1,200})\]"
238+ r"\((?:mailto:)?([^)]{1,300})\)"
225239 )
240+ matches = re .findall (email_pattern , stripped_line )
226241
227- for match in matches :
228- if match [0 ] and match [1 ]: # Name with email
229- leaders [match [0 ].strip ()] = match [1 ].strip ()
230- elif match [2 ]: # Name without email
231- leaders [match [2 ].strip ()] = None
242+ for raw_name , raw_email in matches :
243+ name = raw_name .strip ()
244+ email = raw_email .strip ()
245+ if name and email :
246+ leaders [name ] = email
247+ elif name :
248+ leaders [name ] = None
232249
233250 return leaders
234251
235252 def get_metadata (self ):
236253 """Get entity metadata."""
237254 try :
255+ metadata_pattern = r"^---\s{0,3}((?:.|\n){0,10000}?)\s{0,3}---"
238256 yaml_content = re .search (
239- r"^---\s*(.*?)\s*---" ,
257+ metadata_pattern ,
240258 get_repository_file_content (self .index_md_url ),
241259 re .DOTALL ,
242260 )
@@ -282,7 +300,8 @@ def get_urls(self, domain=None):
282300
283301 urls = set ()
284302
285- markdown_links = re .findall (r"\[([^\]]*)\]\((https?://[^\s\)]+)\)" , content )
303+ markdown_pattern = r"\[([^\]]*)\]\((https?://[^\s\)]+)\)"
304+ markdown_links = re .findall (markdown_pattern , content )
286305 for _text , url in markdown_links :
287306 cleaned_url = clean_url (url )
288307 if cleaned_url and validate_url (cleaned_url ):
@@ -311,11 +330,10 @@ def parse_tags(self, tags) -> list[str]:
311330 if not tags :
312331 return []
313332
314- return (
315- [tag .strip (", " ) for tag in tags .split ("," if "," in tags else " " )]
316- if isinstance (tags , str )
317- else tags
318- )
333+ if isinstance (tags , str ):
334+ separator = "," if "," in tags else " "
335+ return [tag .strip (", " ) for tag in tags .split (separator )]
336+ return tags
319337
320338 def sync_leaders (self , leaders_emails ):
321339 """Sync Leaders data.
0 commit comments