Skip to content

Commit

Permalink
Normalize line endings before reading csv
Browse files Browse the repository at this point in the history
  • Loading branch information
TiraelSedai committed Jun 23, 2024
1 parent e9db480 commit 119df5f
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion ClubDoorman/SpamHamClassifier.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,10 @@ private async Task Train()
var stopWords = (await File.ReadAllTextAsync("data/exclude-tokens.txt")).Split(',').Select(x => x.Trim()).ToArray();

List<MessageData> dataset;
using (var reader = new StreamReader(SpamHamDataset))
// The dataset is currently around 160KB; if it ever becomes huge, line endings will need to be normalized without loading the whole file into memory
var csvContent = await File.ReadAllTextAsync(SpamHamDataset);
var normalizedCsvContent = NormalizeLineEndings(csvContent);
using (var reader = new StringReader(normalizedCsvContent))
using (var csv = new CsvReader(reader, CultureInfo.InvariantCulture))
{
dataset = csv.GetRecords<MessageData>().ToList();
Expand Down Expand Up @@ -129,4 +132,6 @@ private async Task Train()
_logger.LogError(e, "Exception during training");
}
}

/// <summary>
/// Converts CRLF pairs and lone CR characters in <paramref name="input"/> to LF.
/// </summary>
/// <param name="input">Text whose line endings should be unified.</param>
/// <returns>A copy of <paramref name="input"/> in which every "\r\n" and every remaining "\r" is replaced by "\n".</returns>
private static string NormalizeLineEndings(string input)
{
    // Collapse CRLF pairs first; otherwise the lone-CR pass would turn each pair into two line feeds.
    var withoutCrLf = input.Replace("\r\n", "\n");
    return withoutCrLf.Replace('\r', '\n');
}
}

0 comments on commit 119df5f

Please sign in to comment.