-
Notifications
You must be signed in to change notification settings - Fork 1
/
BEIC-ia2commons.rb
executable file
·85 lines (75 loc) · 3.73 KB
/
BEIC-ia2commons.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env ruby
# encoding: utf-8
##################################################################################
# Script to read a list of BEIC.it PIDs and upload archive.org books to Commons #
# via the https://tools.wmflabs.org/ia-upload tool, adding {{BEIC-IA|pid= X }}. #
# Scrapes some data from digitool's MetadataManager (webclient), which is in #
# MARC21 (http://www.loc.gov/MARC21/slim). #
# #
# How to use: #
# * write the list of BEIC PIDs in a file BEIC-pids.txt in same directory; #
# * "login" to ia-upload with OAuth and get the PHPSESSID cookie; #
# * pass the cookie value as only argument to this script. #
# #
# MIT license, (C) Federico Leva e Fondazione BEIC, 2014 #
# #
##################################################################################
require 'rubygems'
require 'mechanize'
require 'uri'
# The ia-upload PHPSESSID cookie has to be passed manually as the only
# commandline argument for now. Stop early when it is missing: without it the
# cookie below would be built with a nil value and the uploads could only fail
# later, one PID at a time.
session_id = ARGV[0]
if session_id.nil? || session_id.strip.empty?
  abort "Usage: #{$PROGRAM_NAME} PHPSESSID"
end

# HTTP agent shared by every request the script makes.
a = Mechanize.new { |agent|
  agent.user_agent_alias = 'Linux Firefox'
}

# Attach the session cookie for the ia-upload host to the agent's cookie jar.
cookie = Mechanize::Cookie.new :domain => ".tools.wmflabs.org", :name => "PHPSESSID", :value => session_id, :path => "/"
a.cookie_jar << cookie

# PIDs for which an upload is attempted are appended here by the main loop.
pids = []
# Digitool MetadataManager page for one record; the PID is appended.
METADATA_URL = "http://131.175.183.1/webclient/MetadataManager?descriptive_only=true&pid="
# ia-upload page that starts an archive.org -> Commons upload.
IA_UPLOAD_URL = "https://tools.wmflabs.org/ia-upload/commons/init"
# Matches the "| Author = ..." line of the upload description; group 1 is the value.
AUTHOR_LINE = /\| *Author *= *(.+)$/

# Builds the Commons file name "Name - Title, Year".
#
# name  - text of the "Personal Name" field; only the part before the first
#         "," or " :" is kept, and "AAVV" is used when it is empty.
# title - text of the "Main Title" field.
# year  - text taken from the "Imprint" rows.
#
# Returns the file name with the characters <>[]|{} removed.
def commons_file_name(name, title, year)
  short_name = name.to_s.empty? ? "AAVV" : name.split(/(,| :)/)[0].to_s
  (short_name + " - " + title + ", " + year).gsub(/[<>\[\]|{}]/, "")
end

# Builds the ia-upload URL for an archive.org identifier and a Commons name.
# Each value is form-encoded on its own, so "&", "=", "#" or "+" inside a title
# cannot break the query string (URI.escape left them alone and no longer
# exists in Ruby 3).
def ia_upload_url(ia, commons)
  IA_UPLOAD_URL + "?" + URI.encode_www_form([["iaId", ia], ["commonsName", commons]])
end

# Rewrites "Surname, Name, dates" style author strings as "Name Surname".
# Reused from Primo.js.
def normalize_author(author)
  author = author.gsub(/, [0-9-]+/, '')
  # Shuffle names with disambiguation and remove numbering.
  author = author.gsub(/ *([^,;]+), +([^,;0-9]+) *[0-9]*, +([^,;]+) */, '\2 \1 (\3)')
  # Same, other names.
  author.gsub(/ *([^,;]+), +([^,;0-9]+) *[0-9]* */, '\2 \1')
end

# Turns the description proposed by ia-upload into the BEIC one: {{IA|...}}
# becomes {{BEIC-IA|pid = PID |ia=...}}, the Author value is normalized and a
# category named after the author is appended.
#
# Raises RuntimeError when the description has no {{IA| template or no Author
# line, so the caller skips the upload with a readable reason.
def beic_description(description, pid)
  description = description.to_s
  raise "no {{IA| template in the upload description" unless description.include?("{{IA|")
  description = description.sub("{{IA|", "{{BEIC-IA|pid = " + pid + " |ia=")
  found = description.match(AUTHOR_LINE)
  raise "no Author line in the upload description" if found.nil?
  author = normalize_author(found[1])
  # Block form: the author text is inserted literally, never read as a backreference.
  description = description.sub(AUTHOR_LINE) { '| Author = ' + author }
  description + "\n" + '[[Category:' + author + ']]'
end

# One PID per line. `a` is the Mechanize agent and `pids` the array created
# during setup.
File.foreach("BEIC-pids.txt") do |line|
  # Drop the line terminator once, up front. String#strip! returns nil when
  # there is nothing to strip, so it must not be used inside an expression.
  pid = line.strip
  next if pid.empty?

  begin
    metadata = a.get(METADATA_URL + URI.encode_www_form_component(pid))
    if metadata.search("//td[contains(text(),'CaSfIA')]").empty?
      puts "Nothing to do with " + pid
      next
    end
    puts "There is hope for " + pid
    pids << pid

    # http://www.w3.org/TR/xpath/#path-abbrev
    ia = metadata.parser.xpath('//td[text()="Other ID"]/../td[5]').text
    title = metadata.parser.xpath('//td[text()="Main Title"]/../td[5]').text
    # The pseudo-field "a" is not always in the same position (row)
    # For following-sibling etc. see http://www.w3.org/TR/xpath/#section-Location-Steps
    if metadata.parser.xpath('//td[text()="Personal Name"]/../td[4]').text == 'a'
      name = metadata.parser.xpath('//td[text()="Personal Name"]/../td[5]').text
    else
      name = metadata.parser.xpath('//td[text()="Personal Name"]/../following::td[text()="a"][1]/../td[last()]').text
    end
    year = metadata.parser.xpath('//td[text()="Imprint"]/../following-sibling::tr[position()=2]/td[3]').text

    # We can finally produce a filename it.wikisource likes!
    commons = commons_file_name(name, title, year)
    tool = ia_upload_url(ia, commons)
    puts "Will try: " + tool

    upload = a.get(tool).form
    raise "ia-upload did not return an upload form" if upload.nil?
    confirm = a.submit(upload, upload.buttons.first)
    form = confirm.forms.first
    raise "ia-upload did not return a confirmation form" if form.nil?
    form['description'] = beic_description(form['description'], pid)
    form.submit
  rescue StandardError => e
    # Keep going with the next PID, but say why this one failed.
    puts "Something went wrong for PID: #{pid} (#{e.class}: #{e.message})"
  end
end