Skip to content

Commit

Permalink
Add more fine-grained merge rules for SAB classification (#1322)
Browse files Browse the repository at this point in the history
* Add more fine-grained merge rules for SAB classification

* Add rules file

* Fix incomplete code

* Hua --> Hua*

* Add test

* Clarify

* Correct comment

* Remove dependency added by mistake
  • Loading branch information
kwahlin authored Nov 9, 2023
1 parent 9c32aa1 commit 5a364c4
Show file tree
Hide file tree
Showing 5 changed files with 256 additions and 18 deletions.
8 changes: 8 additions & 0 deletions librisworks/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ sourceSets {
scripts {
groovy { srcDir 'scripts' }
}
test {
groovy { srcDir 'src/test/groovy/' }
}
}

repositories {
Expand All @@ -17,6 +20,11 @@ dependencies {
compileOnly project(':whelk-core')
scriptsCompileOnly sourceSets.main.output
scriptsCompileOnly project(':whelk-core')
testImplementation "org.spockframework:spock-core:${spockVersion}"
}

// Configure the 'test' task to run tests via the JUnit Platform.
test {
useJUnitPlatform()
}

jar {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ class Doc {
}

boolean isSabFiction() {
classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code =~ /^(H|uH|ufH|ugH)/ }
classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code =~ /^(H|h|uH|ufH|ugH)/ }
}

boolean isNotFiction() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
package se.kb.libris.mergeworks.compare

class Classification extends StuffSet {
// Terms that will be merged (values precede keys)
private static def norm = [
'uHc' : ['Hc,u'],
'uHce' : ['Hce,u'],
'Hc' : ['Hc.01', 'Hc.02', 'Hc.03'],
'Hc,u' : ['Hcf', 'Hcg']
]
private static def sabPrecedenceRules = loadSabPrecedenceRules()

@Override
Object merge(Object a, Object b) {
Expand All @@ -17,17 +11,17 @@ class Classification extends StuffSet {
if (!code1 || !code2) {
return
}
code1 = code1.replaceAll(/\s+/, "")
code2 = code2.replaceAll(/\s+/, "")

if (isSab(c1) && isSab(c2)) {
def code = code1 == code2 || n(code2, code1)
? code1
: (n(code1, code2) ? code2 : null)
if (code) {
code1 = normalizeSabCode(code1)
code2 = normalizeSabCode(code2)

def mergedCode = tryMergeSabCodes(code1, code2)

if (mergedCode) {
def result = [
'@type' : 'Classification',
'code' : code1,
'code' : mergedCode,
inScheme: [
'@type': 'ConceptScheme',
'code' : 'kssb'
Expand Down Expand Up @@ -56,7 +50,7 @@ class Classification extends StuffSet {
}

boolean isSab(Map c) {
c['inScheme'] && c['inScheme']['code'] == 'kssb'
c['inScheme'] && c['inScheme']['code'] =~ 'kssb'
}

String maxSabVersion(c1, c2) {
Expand All @@ -79,7 +73,114 @@ class Classification extends StuffSet {
Integer.parseInt((edition ?: "0").replaceAll("[^0-9]", ""))
}

boolean n(a, b) {
norm[a]?.any { it == b || n(it, b) }
/**
 * Normalizes a SAB code before comparison.
 *
 * A leading lowercase 'h' is upper-cased. All whitespace is then removed, unless the
 * code contains 'bf:' or 'z', in which case whitespace is left untouched.
 *
 * @param sab the raw SAB code (must not be null)
 * @return the normalized code
 */
static String normalizeSabCode(String sab) {
    def capitalized = sab.replaceFirst(~/^h/, 'H')
    if (capitalized =~ /bf:|z/) {
        // Keep whitespace as-is in codes containing 'bf:' or 'z'
        return capitalized
    }
    return capitalized.replaceAll(~/\s+/, '')
}

/**
 * Tries to merge two SAB codes into one.
 *
 * @param a first (normalized) SAB code
 * @param b second (normalized) SAB code
 * @return a if the codes are equal or a takes precedence over b,
 *         b if b takes precedence over a,
 *         otherwise null (the codes cannot be merged)
 */
static String tryMergeSabCodes(String a, String b) {
    if (a == b || sabPrecedes(a, b)) {
        return a
    }
    return sabPrecedes(b, a) ? b : null
}

/**
 * Tells whether SAB code a should be picked over (overwrite) SAB code b according to
 * the loaded precedence rules.
 *
 * Every rule set that applies to b is consulted: the exact-match entry for b (if any)
 * and all prefix entries whose key b starts with. An exact entry must not hide a prefix
 * rule; e.g. 'He.03' is both an exact key and covered by the prefix rule 'He.03*'.
 *
 * @param a the candidate for being preferred
 * @param b the code that would be overwritten
 * @return true if some applicable rule lists a (exactly or by prefix) as preferred over b
 */
static boolean sabPrecedes(String a, String b) {
    // Codes starting with Hcb or Hdab should never overwrite another code
    if (a =~ /^Hcb|^Hdab/) {
        return false
    }

    def (equal, startsWith) = sabPrecedenceRules

    def applicable = []
    def exactEntry = equal[b]
    if (exactEntry) {
        applicable << exactEntry
    }
    applicable.addAll(startsWith.findAll { b.startsWith(it.key) }.values())

    return applicable.any { Map preferred ->
        def exactCodes = preferred['equals']
        def prefixes = preferred['startsWith']
        (exactCodes != null && exactCodes.contains(a)) ||
                (prefixes != null && prefixes.any { a.startsWith(it) })
    }
}

/**
 * Loads rules for how to merge SAB codes from the classpath resource
 * 'merge-works/sab-precedence-rules.tsv' (one rule per row, tab-separated columns).
 * The code in the first column is preferred over the other codes in the same row.
 * The codes can contain wildcard characters '?' (anywhere in the string) or '*' (at the end).
 * The asterisk represents any sequence of characters (zero or more).
 * The question mark represents zero or one of the characters '6', '7' and '8'.
 * Examples:
 *  Hcd* | Hcbd*
 *      --> Any code starting with Hcd is picked over any code starting with Hcbd
 *  Hda.01?=c | Hda.01? | Hda=c
 *      --> Hda.01=c, Hda.016=c, Hda.017=c and Hda.018=c are all picked over Hda.01, Hda.016, Hda.017, Hda.018 and Hda=c
 *  Hcee.03 | Hce.03 | Hcee
 *      --> Hcee.03 is picked over Hce.03 and Hcee
 *
 * The rules are loaded into two different maps, 'equal' and 'startsWith'.
 * The top-level keys of these maps are the codes that can possibly be overwritten.
 *
 * In the 'equal' map we can directly look up a code (key) to see if there are preferred codes that should overwrite it,
 * while in the 'startsWith' map we check if the code starts with any of the keys. For example, if the code is 'Hce'
 * and we have startsWith = ['He': [:], 'Hm': [:], 'Hc': [:]] we iterate over the entries until 'Hc' is found.
 *
 * The value is in turn also a Map containing the codes that are preferred over the code matching the key.
 * The map at this second level can have two keys, 'equals' and 'startsWith', and the values are sets of preferred codes.
 *
 * Example:
 *  [
 *      'Hc.01': ['equals': ['Hc.01', 'Hc.016', 'Hc.017', 'Hc.018', 'Hcd.01', 'Hcd.016', 'Hcd.017', 'Hcd.018']],
 *      'Hce': ['startsWith': ['Hce']]
 *  ]
 *
 * This means that any code starting with 'Hce' is preferred over just 'Hce' and any of 'Hc.01', 'Hc.016', 'Hc.017'...
 * is preferred over just 'Hc.01'.
 *
 * Columns are trimmed, and blank rows, empty columns and rows with fewer than two codes are ignored.
 *
 * @return a tuple (equal, startsWith) of the two rule maps described above
 * @throws IllegalStateException if the rules file cannot be found on the classpath
 */
static Tuple2<Map<String, Map>, Map<String, Map>> loadSabPrecedenceRules() {
    def resourcePath = 'merge-works/sab-precedence-rules.tsv'
    def rulesStream = Classification.class.getClassLoader().getResourceAsStream(resourcePath)
    if (rulesStream == null) {
        // Fail with a clear message instead of a bare NullPointerException
        throw new IllegalStateException("SAB precedence rules not found on classpath: ${resourcePath}")
    }

    Map equal = [:]
    Map startsWith = [:]

    // '?' stands for zero or one of '6', '7', '8'; the empty string covers the "zero" case
    def questionMarkSubstitutes = ['6', '7', '8', '']

    // splitEachLine closes the stream when done
    rulesStream.splitEachLine('\t') { columns ->
        // Trimming guards against stray spaces and CRLF line endings
        def codes = columns.collect { it.trim() }.findAll { it }
        if (codes.size() < 2) {
            // Blank row, or a row with nothing to overwrite
            return
        }

        def preferred = codes.first()
        def preferredStartsWith = preferred.endsWith('*')
                ? preferred.substring(0, preferred.length() - 1)
                : null
        def preferredEquals = preferred.contains('?')
                ? questionMarkSubstitutes.collect { preferred.replace('?', it) }
                : (preferredStartsWith ? null : [preferred])

        // Registers the row's preferred code(s) in the second-level map of an overwritable code
        def addPreferred = { Map pref ->
            if (preferredStartsWith) {
                pref.computeIfAbsent('startsWith', f -> [] as Set).add(preferredStartsWith)
            }
            if (preferredEquals) {
                pref.computeIfAbsent('equals', f -> [] as Set).addAll(preferredEquals)
            }
        }

        codes.drop(1).each { String overwritable ->
            if (overwritable.endsWith('*')) {
                def leading = overwritable.substring(0, overwritable.length() - 1)
                addPreferred(startsWith.computeIfAbsent(leading, f -> [:]))
            } else if (overwritable.contains('?')) {
                questionMarkSubstitutes.each { substitute ->
                    def substituted = overwritable.replace('?', substitute)
                    addPreferred(equal.computeIfAbsent(substituted, f -> [:]))
                }
            } else {
                addPreferred(equal.computeIfAbsent(overwritable, f -> [:]))
            }
        }
    }

    return new Tuple2(equal, startsWith)
}
}
107 changes: 107 additions & 0 deletions librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
H* H
Hc* Hc
Hcd* Hcbd*
Hcq* Hcbq*
Hc,u H,u uHc uH
Hce* Hce
Hce,u H,u Hc,u uHce
Hcf* Hc,u uHc Hcf
Hcg* Hc,u uHc Hcg
Hci,u Hci
Hd* Hd
Hda* Hda
Hdb* Hdb
He* He
Hf* Hf
Hg* Hg
Hi* Hi
Hj* Hj
Hk* Hk
Hl* Hl
Hm* Hm
Hma* Hma
Hmb* Hmb
Hmc* Hmc
Hmd* Hmd
Hsg* Hsg
Hub* Hua* Hub
Hva* Hva
Hxj* Hxj
Hc.01? Hc.01
Hcd.01? Hcd.01 Hc.01 Hcd
Hcd.03 Hc.03 Hcd
Hce.01? Hce.01
Hceda.01? Hceda.01 Hce.01 Hced Hceda
Hceda.03 Hce.03 Hced Hceda
Hcedb.01? Hcedb.01 Hce.01? Hced Hcedb
Hcedb.03 Hce.03 Hced Hcedb
Hcee.01? Hcee.01 Hce.01? Hcee
Hcee.03 Hce.03 Hcee
Hceeq.01? Hceeq.01 Hcee.01? Hce.01? Hcee Hceeq
Hceeq.03 Hcee.03 Hce.03 Hcee Hceeq
Hcef.01? Hcef.01 Hce.01? Hcef
Hcef.03 Hce.03 Hcef
Hceg.01? Hceg.01 Hce.01? Hceg
Hceg.03 Hce.03 Hceg
Hcei.01? Hcei.01 Hce.01? Hcei
Hcei.03 Hce.03 Hcei
Hcej.01? Hcej.01 Hce.01? Hcej
Hcej.03 Hce.03 Hcej
Hcek.01? Hcek.01 Hce.01? Hcek
Hcek.03 Hce.03 Hcek
Hcekq.01? Hcekq.01 Hcek.01? Hce.01? Hcek Hcekq
Hcekq.03 Hcek.03 Hce.03 Hcek Hcekq
Hcel.01? Hcel.01 Hce.01? Hcel
Hcel.03 Hce.03 Hcel
Hcema.01? Hcema.01 Hce.01? Hcem Hcema
Hcema.03 Hce.03 Hcem Hcema
Hcemb.01? Hcemb.01 Hce.01? Hcem Hcemb
Hcemb.03 Hce.03 Hcem Hcemb
Hcemc.01? Hcemc.01 Hce.01? Hcem Hcemc
Hcemc.03 Hce.03 Hcem Hcemc
Hcemd.01? Hcemd.01 Hce.01? Hcem Hcemd
Hcemd.03 Hce.03 Hcem Hcemd
Hcesg.01? Hcesg.01 Hce.01? Hcesg
Hcesg.03 Hce.03 Hces Hcesg
Hceub.01? Hceub.01 Hce.01? Hceub
Hceub.03 Hce.03 Hceu Hceub
Hceva.01? Hceva.01 Hce.01? Hceva
Hceva.03 Hce.03 Hcev Hceva
Hcexj.01? Hcexj.01 Hce.01? Hcexj
Hcexj.03 Hce.03 Hcex Hcexj
Hda.01?=c Hda.01? Hda=c
Hda.03=c Hda.03 Hda=c
Hdb.01?=c Hdb.01? Hdb=c
Hdb.03=c Hdb.03 Hdb=c
He.01?=c He.01? He=c
He.03=c He.03 He=c
Heq.01?=c Heq.01=c Heq.01? Heq=c He.01? He=c He.01?=c
Heq.03=c Heq.03 Heq=c He.03* He=c
Hf.01?=c Hf.01? Hf=c
Hf.03=c Hf.03 Hf=c
Hi.01?=c Hi.01? Hi=c
Hi.03=c Hi.03 Hi=c
Hj.01?=c Hj.01? Hj=c
Hj.03=c Hj.03 Hj=c
Hk.01?=c Hk.01? Hk=c
Hk.03=c Hk.03 Hk=c
Hkq.01?=c Hkq.01=c Hkq.01? Hkq=c Hk.01? Hk=c Hk.01?=c
Hkq.03=c Hkq.03 Hkq=c Hk.03* Hk=c
Hl.01?=c Hl.01? Hl=c
Hl.03=c Hl.03 Hl=c
Hma.01?=c Hma.01? Hma=c
Hma.03=c Hma.03 Hma=c
Hmb.01?=c Hmb.01? Hmb=c
Hmb.03=c Hmb.03 Hmb=c
Hmc.01?=c Hmc.01? Hmc=c
Hmc.03=c Hmc.03 Hmc=c
Hmd.01?=c Hmd.01? Hmd=c
Hmd.03=c Hmd.03 Hmd=c
Hsg.01?=c Hsg.01? Hsg=c
Hsg.03=c Hsg.03 Hsg=c
Hub.01?=c Hub.01? Hub=c
Hub.03=c Hub.03 Hub=c
Hva.01?=c Hva.01? Hva=c
Hva.03=c Hva.03 Hva=c
Hxj.01?=c Hxj.01? Hxj=c
Hxj.03=c Hxj.03 Hxj=c
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package se.kb.libris.mergeworks.compare

import spock.lang.Specification

/**
 * Data-driven checks of Classification.tryMergeSabCodes.
 * A null result means that neither code takes precedence, i.e. the codes cannot be merged.
 */
class ClassificationSpec extends Specification {
    def "merge SAB codes"() {
        when:
        def merged = Classification.tryMergeSabCodes(a, b)

        then:
        merged == result

        where:
        a           | b         || result
        'H'         | 'H'       || 'H'
        'Haaa'      | 'H'       || 'Haaa'
        'Hcqaa'     | 'Hcbqbbb' || 'Hcqaa'
        'Hcb'       | 'Hc'      || null
        'Hci'       | 'Hci,u'   || 'Hci,u'
        'Hcd.016'   | 'Hcd.01'  || 'Hcd.016'
        'Hc.01'     | 'Hcd.01'  || 'Hcd.01'
        'Hda.017=c' | 'Hda.018' || 'Hda.017=c'
        'He'        | 'Hc'      || null
    }
}

0 comments on commit 5a364c4

Please sign in to comment.