Skip to content

Commit

Permalink
⏪ Remove change in warc reader
Browse files (browse the repository at this point in the history)
  • Loading branch information
alexchapeaux committed Jul 11, 2023
1 parent e8279a0 commit 7d18f66
Showing 1 changed file with 2 additions and 6 deletions.
8 changes: 2 additions & 6 deletions src/datatrove/pipeline/readers/warc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,12 +17,8 @@ def __init__(self, *args, gzip_mode: bool = False, **kwargs):

     def read_file(self, datafile: InputDataFile):
         with datafile.open(gzip=self.gzip_mode, binary=True) as f:
-            for i, record in enumerate(ArchiveIterator(f)):
-                with self.stats.time_manager:
-                    document = process_record(record)
-
-                if i > 1000:
-                    return
+            for record in ArchiveIterator(f):
+                document = process_record(record)
                 if document:
                     document.metadata["file_path"] = datafile.path
                     yield document
Expand Down

0 comments on commit 7d18f66

Please sign in to comment.