brat2iob.py
import re
from collections import defaultdict
import os
import csv
import string

# Folder containing the paired .ann/.txt files for the split to convert
# data_folder = 'RareDis-v1/dev'
# data_folder = 'RareDis-v1/test'
data_folder = 'RareDis-v1/train'

ann_files = [f for f in os.listdir(data_folder) if f.endswith('.ann')]
txt_files = [f for f in os.listdir(data_folder) if f.endswith('.txt')]

# Verify that the numbers of annotation files and text files match
if len(ann_files) != len(txt_files):
    raise Exception("The number of annotation files and text files do not match.")

# Sort files to ensure matching pairs
ann_files.sort()
txt_files.sort()
def tokenize_text_with_sentences(text, sentence_num_global):
    """
    Tokenize the text into words, treating punctuation as separate tokens,
    and track sentence boundaries.
    """
    tokens = []
    current_pos = 0
    sentence_num = 1  # Starting with sentence number 1
    text_length = len(text)
    for word in text.split():
        start = text.find(word, current_pos)
        # Handle punctuation at the start of the word
        while word and word[0] in string.punctuation:
            tokens.append((word[0], start, start + 1, sentence_num))
            word = word[1:]
            start += 1
        end = start + len(word)
        # Temporarily store word tokens if they have punctuation at the end
        word_tokens = []
        while word and word[-1] in string.punctuation:
            word_tokens.insert(0, (word[-1], end - 1, end, sentence_num))
            word = word[:-1]
            end -= 1
        # Add the main word token
        if word:
            tokens.append((word, start, end, sentence_num))
        # Add the stored punctuation tokens at the end
        tokens.extend(word_tokens)
        # Update sentence number if punctuation indicates end of sentence
        if tokens and tokens[-1][0] in {'.', '?', '!'}:
            next_char_index = tokens[-1][2]
            if next_char_index >= text_length or text[next_char_index].isspace():
                sentence_num += 1
                sentence_num_global += 1  # Increment global sentence counter
        current_pos = end + 1
    return tokens, sentence_num_global
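
# Illustrative trace of the tokenizer (not part of the pipeline; the sample
# string is made up, offsets and sentence numbers follow from the logic above):
#   tokens, n = tokenize_text_with_sentences("Mild pain. It fades!", 0)
#   tokens -> [('Mild', 0, 4, 1), ('pain', 5, 9, 1), ('.', 9, 10, 1),
#              ('It', 11, 13, 2), ('fades', 14, 19, 2), ('!', 19, 20, 2)]
#   n      -> 2  # two sentence-ending marks seen, so the global counter grew by 2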
def annotate_tokens_with_sentences(tokens, entities, sentence_num_global):
    """
    Annotate tokens with IOB tags based on the entity annotations and include sentence information.
    """
    annotated_tokens = []
    for token, start, end, sentence_num in tokens:
        tag = 'O'  # Default tag is 'Outside'
        for entity_type, spans in entities.items():
            for span_start, span_end in spans:
                if start == span_start:
                    tag = f'B-{entity_type}'  # Beginning of an entity
                    break
                elif start > span_start and end <= span_end:
                    tag = f'I-{entity_type}'  # Inside an entity
                    break
            if tag != 'O':
                break  # Stop scanning other entity types once a tag is assigned
        annotated_tokens.append((token, tag, sentence_num, sentence_num_global))
    return annotated_tokens
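
# Illustrative example (entity type and offsets are made up): for the text
# "cystic fibrosis is rare." with entities = {'RAREDISEASE': [(0, 15)]},
# the tokens tag as:
#   cystic      -> B-RAREDISEASE  (token start 0 == span start 0)
#   fibrosis    -> I-RAREDISEASE  (start 7 > 0 and end 15 <= 15)
#   is, rare, . -> O              (outside the span, or end past span end)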
def parse_ann_file(ann_file_path):
    """
    Parse the .ann file to extract entity annotations.
    Returns a dictionary where keys are entity types and values are lists of (start, end) character positions.
    """
    entities = defaultdict(list)
    with open(ann_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            # Only text-bound annotations (IDs starting with 'T') carry character
            # offsets; relation, event, and annotator-note lines are skipped.
            if len(parts) == 3 and parts[0].startswith('T'):
                entity_id, entity_info, _ = parts
                # For discontinuous spans ("start end;start end"), only the first fragment is kept
                entity_type, start, end = re.split(r'[\s;]+', entity_info)[:3]
                entities[entity_type].append((int(start), int(end)))
    return entities
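
# Illustrative example (ID, type, and offsets are made up): a brat line such as
#   T1<TAB>RAREDISEASE 0 15<TAB>cystic fibrosis
# yields entities == {'RAREDISEASE': [(0, 15)]}; relation lines ("R1 ...") and
# note lines ("#1 ...") are skipped by the guard above.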
def tokenize_text(text):
    """
    Tokenize the text into words and return a list of tuples (word, start, end).
    Kept for reference; the pipeline below uses the sentence-aware tokenizer above.
    """
    # Using simple whitespace tokenization, can be replaced with more sophisticated tokenizers if needed
    tokens = []
    current_pos = 0
    for word in text.split():
        start = text.find(word, current_pos)
        end = start + len(word)
        tokens.append((word, start, end))
        current_pos = end
    return tokens
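
# Quick sketch of the simpler tokenizer (illustrative input):
#   tokenize_text("cystic fibrosis") -> [('cystic', 0, 6), ('fibrosis', 7, 15)]
# Unlike the sentence-aware tokenizer, punctuation stays attached to words here.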
# Initialize the global sentence counter, carried across all documents
sentence_num_all = 0

# Process each file pair and generate IOB-formatted data
iob_data = {}
for ann_file, txt_file in zip(ann_files, txt_files):
    ann_file_path = os.path.join(data_folder, ann_file)
    txt_file_path = os.path.join(data_folder, txt_file)

    # Parse the annotation file
    entities = parse_ann_file(ann_file_path)

    # Read and tokenize the text file
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    tokens, sentence_num_all = tokenize_text_with_sentences(text, sentence_num_all)

    # Annotate the tokens; every token of this file carries the cumulative
    # sentence count reached after tokenizing the file
    annotated_tokens = annotate_tokens_with_sentences(tokens, entities, sentence_num_all)

    # Store the IOB data, keyed by text file name
    iob_data[txt_file] = annotated_tokens
# Define the CSV file path
csv_file_path = os.path.join(data_folder, "combined_IOB_data.csv")

# Number of sentences seen in all previously written files; added to the
# per-file sentence number to recover each row's global sentence number
prev_sentence_global = 0

# Create and write data to the CSV file
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Writing header
    writer.writerow(['Token', 'Tag', 'Sentence_Num', 'Sentence_Num_Global', 'File'])
    # Writing data
    for file_name, annotated_tokens in iob_data.items():
        for token, tag, sentence_num, file_sentence_total in annotated_tokens:
            writer.writerow([token, tag, sentence_num,
                             prev_sentence_global + sentence_num,
                             file_name.replace('_IOB.txt', '')])
        # All tokens of a file share the same cumulative total, so after the
        # file is written it becomes the offset for the next file
        prev_sentence_global = file_sentence_total
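
# Optional sanity check (a sketch; pandas is an extra dependency not used above):
# read the CSV back and confirm the global sentence numbering increases across files.
#   import pandas as pd
#   df = pd.read_csv(csv_file_path)
#   print(df.groupby('File')['Sentence_Num_Global'].agg(['min', 'max']))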