-
Notifications
You must be signed in to change notification settings - Fork 1
/
ext-fulldumpxml-parserevision.rb
executable file
·225 lines (206 loc) · 6.89 KB
/
ext-fulldumpxml-parserevision.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
# Usage:
# % gzip -cd 20110918-savemlak-full.xml.gz | ./ext-fulldumpxml-parserevision.rb [--section|--revision] [--standard]
require "time"
require "rubygems"
require "libxml"
# Maps a pattern tested against the title of a "==heading==" line to the
# symbol under which parse_sections files the lines of that section.
# Entries are tried in the order written; the first matching pattern wins.
# Frozen so the table cannot be modified by accident at run time.
SECTION_MAPPING = {
  # damage status / disaster status / damage information / disaster information
  /被害状況|被災状況|被害情報|被災情報/o => :damage,
  # operation status / operation information
  /運営状況|運営情報/o => :operation,
  # relief status / relief information
  /救援状況|救援情報/o => :rescue,
  # evacuee acceptance status / information
  /避難受入状況|避難受入情報/o => :refuge,
  # other / free-form notes
  /その他|自由記述/o => :other,
  # information source / person who entered it / original information
  /情報源|記入者|元情報/o => :source,
}.freeze

# Matches a {{施設 ...}} (facility), {{図書館 ...}} (library) or
# {{博物館 ...}} (museum) template call. Capture 1 is the template name and
# capture 2 is everything between the name and the first following "}}".
# The /m flag lets the template body span several lines.
FACILITY_REGEXP = /\{\{(施設|図書館|博物館)(.*?)\}\}/mo.freeze
# Returns the body of the first facility template found in +text+, i.e. the
# second capture group of FACILITY_REGEXP (the part after the template name,
# up to the closing "}}"). Returns nil when +text+ contains no such template.
def parse_facility( text )
  found = FACILITY_REGEXP.match( text )
  found ? found[ 2 ] : nil
end
# Splits the wikitext of one revision into buckets of lines.
#
# text:: the wikitext as a String.
#
# Returns a Hash with these possible keys:
# * :basic    - the full text of the facility template (FACILITY_REGEXP
#   match); when several templates match, the last one is kept. The template
#   is removed from the text before the line-by-line pass.
# * :category - every non-empty line containing at least one category link.
# * :first    - non-empty lines that appear before any recognised heading.
# * any value of SECTION_MAPPING - non-empty lines following a "==heading=="
#   whose title matches the corresponding pattern.
# Values other than :basic are Arrays of lines in document order. A key is
# present only if something was filed under it.
#
# The heading line itself is stored in the section it opens. A heading whose
# title matches no SECTION_MAPPING pattern does not open a new section: it and
# the lines after it stay in the current one, and a warning is written to
# standard error so that it cannot mix with data printed on standard output.
def parse_sections( text )
  section = :first
  result = {}
  text = text.gsub( FACILITY_REGEXP ) do |m|
    result[ :basic ] = m
    ""
  end
  text.split( /\r?\n/ ).each do |line|
    # Only level-2 headings count: "[^=]" rejects "===sub-heading===".
    if /\A==([^=].*?)==\s*\Z/o =~ line
      section_str = $1.strip
      found = SECTION_MAPPING.find { |pattern, _name| pattern =~ section_str }
      if found
        section = found[ 1 ]
      else
        $stderr.puts "WARNING: unknown section: '#{ section_str }'. ignored."
      end
    end
    next if line.empty?
    # Category links are collected separately, whatever section they sit in.
    key = parse_category( line ).empty? ? section : :category
    ( result[ key ] ||= [] ) << line
  end
  result
end
# Collects the names of all category links in +text+.
#
# A category link is "[[Category:NAME]]" or "[[カテゴリ:NAME]]"; the prefix is
# matched case-insensitively. Returns the NAME parts as an Array of Strings in
# order of appearance, or an empty Array when there is no such link.
def parse_category( text )
  links = text.scan( /\[\[(?:Category|カテゴリ)\:(.+?)\]\]/io )
  # scan yields one single-element capture array per link; unwrap each.
  links.map { |captures| captures[ 0 ] }
end
# Command-line entry point (runs only when this file is executed directly).
#
# Reads an XML document from ARGF (the files named on the command line, or
# standard input), looks at every <page> whose <ns> is "0", and keeps the
# revisions whose text contains a {{施設 or {{図書館 template and does not
# start with a redirect marker.
#
# Options (must precede any file names):
#   -section / --section / -revision / --revision
#       print one tab-separated line per kept revision:
#       date, title, changed sections, user, categories
#   -standard / --standard
#       ignore revisions whose timestamp is "2012-07-01" or later
# Without --section, one line per page (title, revision count, categories) is
# printed to standard output and summary statistics to standard error.
if $0 == __FILE__
  opt_mode = :page
  opt_til_201207 = false

  # Consume leading options. Every iteration shifts ARGV, so the loop always
  # terminates; an unrecognised option aborts instead of looping forever.
  # A lone "-" is not treated as an option and is left in ARGV for ARGF.
  while ARGV[0] and ARGV[0] =~ /\A-./
    case ARGV[0]
    when /\A--?(section|revision)/
      opt_mode = :section
    when /\A--?standard/
      opt_til_201207 = true
    else
      abort "unknown option: #{ ARGV[0] }"
    end
    ARGV.shift
  end

  parser = LibXML::XML::Parser.io( ARGF )
  doc = parser.parse
  # Bind the document's default namespace to the "mw" prefix used in the
  # XPath expressions below.
  doc.root.namespaces.default_prefix = 'mw'
  pages = doc.find( "//mw:page" )

  # Two counters of identical shape: one counts pages, one counts revisions.
  new_counter = lambda do
    {
      :total => 0,
      :type => {
        :museum => 0,
        :library => 0,
        :archives => 0,
        :kominkan => 0,
      },
      :pref => Hash.new( 0 ),
    }
  end
  count_page = new_counter.call
  count_revision = new_counter.call

  # Category name => facility-type counter key.
  facility_types = {
    "図書館" => :library,
    "博物館" => :museum,
    "美術館" => :museum,
    "公民館" => :kominkan,
    "文書館" => :archives,
  }
  # Edit comment identifying one particular automated edit; such revisions get
  # ":yomi" appended to the user name in the output.
  yomi_comment = "ロボットによる編集: check Yomi field"

  pages.each do |page|
    next if not page.find( "./mw:ns" )[0].content == "0"
    title = page.find( "./mw:title" )[0].content
    revisions = page.find( "./mw:revision" ).to_a
    revisions = revisions.find_all do |r|
      text = r.find( "./mw:text" )[0].content
      text =~ /\{\{(施設|図書館)\s*/o and not text =~ /\A#(REDIRECT|転送)/o
    end
    if opt_til_201207
      # Ignore revisions dated 2012-07-01 or later (plain string comparison
      # against the timestamp text).
      revisions = revisions.find_all do |r|
        r.find( "./mw:timestamp" )[0].content < "2012-07-01"
      end
    end
    next if revisions.empty?

    count_page[ :total ] += 1
    count_revision[ :total ] += revisions.size

    # Categories are taken from the last kept revision only.
    latest_text = revisions[-1].find( "./mw:text" )[0].content
    categories = parse_category( latest_text )
    categories.each do |cat|
      type = facility_types[ cat ]
      if type
        # Count by facility type.
        count_page[ :type ][ type ] += 1
        count_revision[ :type ][ type ] += revisions.size
      elsif cat =~ /(都|道|府|県)\Z/
        # Count by prefecture: category names ending in a prefecture suffix.
        count_page[ :pref ][ cat ] += 1
        count_revision[ :pref ][ cat ] += revisions.size
      end
    end

    if opt_mode == :section
      prev_text = nil
      revisions.each_with_index do |revision, i|
        edit_type = []
        timestamp = revision.find( "./mw:timestamp" )[0].content
        date = Time.parse( timestamp ).localtime.iso8601[ 0, 10 ]

        # User label: username if present (with ":yomi" for the automated
        # edit above), else the IP address, else the contributor's raw text.
        contributor = revision.find( "./mw:contributor" )[0]
        username = contributor.find( "./mw:username" )[0]
        ip = contributor.find( "./mw:ip" )[0]
        comment = revision.find( "./mw:comment" )[0]
        if username
          user = username.content
          user += ":yomi" if comment and comment.content == yomi_comment
        elsif ip
          user = ip.content
        else
          user = contributor.content
        end

        text = revision.find( "./mw:text" )[0].content
        if i == 0
          edit_type << :new
        else
          # Report every section of this revision whose lines differ from the
          # same section in the previous kept revision. Sections that exist
          # only in the previous revision are not reported.
          sections = parse_sections( text )
          prev_sections = parse_sections( prev_text )
          sections.each do |sec, content|
            edit_type << sec if content != prev_sections[ sec ]
          end
        end
        puts [ date, title,
               edit_type.join(","),
               user,
               categories.join(",") ].join( "\t" )
        prev_text = text
      end
    end

    if opt_mode == :page
      puts [ title, revisions.size, categories.join(",") ].join( "\t" )
    end
  end

  if opt_mode == :page
    [ [ "Page statistics:", count_page ],
      [ "Revision statistics:", count_revision ] ].each do |heading, counts|
      STDERR.puts heading
      STDERR.puts "- Total: #{ counts[ :total ] }"
      STDERR.puts "- Subtotal by Type:"
      counts[ :type ].each do |k,v|
        STDERR.puts [ k, v ].join( "\t" )
      end
      STDERR.puts "Subtotal by Prefecture:"
      counts[ :pref ].each do |k,v|
        STDERR.puts [ k, v ].join( "\t" )
      end
    end
  end
end