-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathStripeSkippingParser.py
186 lines (146 loc) · 5.6 KB
/
StripeSkippingParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python
"""
Still a rough draft, but it works on
test/stripe-skipping/[1-4].xml.
For now it just runs standalone, writing to
one particular sink; that needs reworking.
@@@ Switch to Span objects with plain literal values for all
character content.
Leaf elements have a .text, which is a string literal.
Tree elements have first/rest, which are trees or leaves.
Nah, use literals directly...
DO use <author><sandro/></author>
"""
__version__ = "$Revision$"
# $Id$
import sys
import inspect
import re
import xml.sax
from NTriplesSink import FormulaSink
class Error(RuntimeError):
    """Error raised by this module's parser; a RuntimeError subclass."""
################################################################
## counter = 1
## def Class(uri=None):
## global counter
## result = "_:g" + str(counter)
## counter += 1
## if uri:
## print "("+result+" rdf:type <"+uri+">)"
## return result
## def Property(uri=None):
## return "[has "+str(uri)+"]"
## class Sink:
## def add(self, *arg):
## print arg
#
#
class docHandler(xml.sax.ContentHandler):
    """
    Parse an XML file into triples, reading plain XML as striped RDF.

    Every triple is handed to the sink as a 3-tuple
    (subject, property, value) via sink.insert(); terms are obtained
    from sink.termFor().

    An XML element whose name (after ns prefix) starts with an uppercase
    letter is taken to stand for an individual instance of the class named
    by the element.

    An XML element whose name (after ns prefix) starts with a lowercase
    letter is a property.  Its values are the individuals and the chunks
    of character data found inside it:
       * if it has no child elements and no text content: an Error is
         raised when the element ends
       * otherwise each value is linked from the property; when a text
         value is followed by a further value, an individual stripe and
         an rdf:li property stripe are inferred to hold them

    Stripes may be skipped: a missing individual stripe is filled in with
    a fresh anonymous instance, a missing property stripe with rdf:li.

    Character data is passed to sink.termFor() one SAX chunk at a time.
    NOTE(review): whitespace-only chunks (e.g. indentation between
    elements) are treated like any other text -- confirm that is wanted.
    """

    def __init__(self, sink=None):
        """
        Set up an empty parse state.

        sink -- object providing termFor() and insert(); when omitted a
                new FormulaSink is created, as before.
        """
        xml.sax.ContentHandler.__init__(self)
        if sink is None:
            sink = FormulaSink()
        self.sink = sink
        self.document = self.sink.termFor(uri="")
        # Stack of open stripes, top at index 0.  It alternates
        # property / individual, so an even length means the top is a
        # property and an odd length means the top is an individual.
        self.parents = ["root", self.document]
        # For each open XML element (innermost first), the length
        # self.parents had when that element started.
        self.parentsLenStack = []
        self.expectingIndividual = 1
        self.rdftype = self.sink.termFor(uri="http://...#type")
        self.rdfli = self.sink.termFor(uri="http://...#li")
        # Set when a property element has just opened and has no value yet.
        self.awaitingFirstValue = 0
        # First value of the current property, held back until we know
        # whether more values follow.
        self.firstValueBuffer = None

    def instanceOf(self, uri=None):
        """
        Return a fresh anonymous term; if uri is given, also record
        that the term has rdf:type <uri>.
        """
        result = self.sink.termFor()
        if uri is not None:
            self.sink.insert((result, self.rdftype, self.sink.termFor(uri=uri)))
        return result

    def _flushFirstValue(self):
        """
        Emit the buffered first value, if any, as a value of the property
        on top of the stack, and clear the buffer.

        Must be called while self.parents[0] is still the property the
        value belongs to and self.parents[1] its subject.
        """
        if self.firstValueBuffer is not None:
            self.sink.insert((self.parents[1], self.parents[0],
                              self.firstValueBuffer))
            self.firstValueBuffer = None

    def startElementNS(self, name, qname, attrs):
        """
        Open an element.  name is the SAX (namespace, localname) pair;
        qname and attrs are ignored.

        Raises RuntimeError if the local name starts with neither an
        uppercase nor a lowercase letter.
        """
        self.parentsLenStack.insert(0, len(self.parents))
        namespace, local = name[0], name[1]
        if namespace is None:
            # SAX reports None for elements in no namespace; the original
            # code crashed here.  Use the bare local name as the URI
            # (assumed acceptable as a relative URI -- TODO confirm).
            uri = local
        elif namespace.endswith(("/", "#")):
            uri = namespace + local
        else:
            uri = namespace + "/" + local
        char1 = local[0:1]
        if char1.isupper():
            self.prepareForIndividual()
            me = self.instanceOf(uri)
            self.gotIndividual(me)
            # If `me` was only buffered as a first value, link it to its
            # property now: once it is pushed, parents[1]/parents[0] no
            # longer name its subject and property, and a nested property
            # would overwrite the buffer.
            self._flushFirstValue()
            self.parents.insert(0, me)
        elif char1.islower():
            # A pending text value must be emitted before the stack
            # changes and before the buffer is reused by this property.
            self._flushFirstValue()
            self.prepareForProperty()
            me = self.sink.termFor(uri=uri)
            self.parents.insert(0, me)
            self.awaitingFirstValue = 1
        else:
            raise RuntimeError("not upper or lower?")

    def prepareForIndividual(self):
        """
        Make sure the top of the stack is a property; if it is an
        individual, infer an rdf:li property stripe.
        """
        if len(self.parents) % 2 == 1:
            print("# need to infer a property stripe")
            self.parents.insert(0, self.rdfli)

    def prepareForProperty(self):
        """
        Make sure the top of the stack is an individual; if it is a
        property, infer an anonymous individual stripe and link it in.
        """
        if len(self.parents) % 2 == 0:
            print("# need to infer an individual stripe")
            i = self.instanceOf()
            self.parents.insert(0, i)
            self.sink.insert((self.parents[2], self.parents[1], self.parents[0]))

    def endElementNS(self, name, qname):
        """
        Close an element: emit any buffered value and pop every stripe
        pushed since the element started.

        Raises Error if a property element ends without any value.
        """
        if self.awaitingFirstValue:
            raise Error("property with no value given")   # use this syntax for named things?
        self._flushFirstValue()
        finalLen = self.parentsLenStack[0]
        del self.parentsLenStack[0]
        while len(self.parents) > finalLen:
            del self.parents[0]

    def characters(self, content):
        """Treat a chunk of character data as one individual (a literal)."""
        self.prepareForIndividual()
        #for char in content:
        #    self.gotIndividual(self.sink.termFor(char))
        self.gotIndividual(self.sink.termFor(content))
        # append it to the string buffer, which gets
        # converted at the end or next individual.

    def gotIndividual(self, term):
        """
        Record term as a value of the current property.

        The first value after a property opens is only buffered.  If a
        value arrives while one is still buffered, extra stripes are
        inferred and both values are emitted under them.
        """
        if self.awaitingFirstValue:
            self.awaitingFirstValue = 0
            self.firstValueBuffer = term
            return
        if self.firstValueBuffer is not None:
            # we have multiple values; we need a list!
            self.prepareForProperty()     # say it's a List?
            self.prepareForIndividual()
            self._flushFirstValue()
        self.sink.insert((self.parents[1], self.parents[0], term))
if __name__ == '__main__':
    # Script entry point: parse XML from standard input with namespace
    # processing enabled, so the handler receives startElementNS /
    # endElementNS events.
    handler = docHandler()
    reader = xml.sax.make_parser()
    reader.setFeature(xml.sax.handler.feature_namespaces, True)
    reader.setContentHandler(handler)
    reader.parse(sys.stdin)
#if __name__ == "__main__":
# import doctest, sys
# doctest.testmod(sys.modules[__name__])
# $Log$
# Revision 1.3 2003-08-01 15:35:03 sandro
# added some comments
#
# Revision 1.2 2003/04/04 12:52:10 sandro
# added quoting of newlines
# made characters be RDF literals, grouped together
#
# Revision 1.1 2003/04/03 21:55:06 sandro
# First cut prototype of a parser which reads plain XML as if it were RDF.
#