From 2a063f493f147beace7dde8025a083ff73a76f08 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 6 Sep 2023 16:35:27 +0200 Subject: [PATCH 1/4] Exclude works with marc:PreAdolescent in intendedAudience from selection --- librisworks/scripts/swedish-fiction.groovy | 1 + .../src/main/groovy/se/kb/libris/mergeworks/Doc.groovy | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/librisworks/scripts/swedish-fiction.groovy b/librisworks/scripts/swedish-fiction.groovy index dacab96ea7..4e6ec15086 100644 --- a/librisworks/scripts/swedish-fiction.groovy +++ b/librisworks/scripts/swedish-fiction.groovy @@ -17,6 +17,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) {cluster -> && !doc.isDrama() && !doc.isThesis() && !doc.isInSb17Bibliography() + && !doc.intendedForMarcPreAdolescent() } def swedish = { Doc doc -> diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index 11bb47af05..88f9e65618 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -144,6 +144,10 @@ class Doc { asList(workData['genreForm']) } + List intendedAudience() { + asList(workData['intendedAudience']) + } + List publication() { asList(instanceData?.publication) } @@ -211,6 +215,10 @@ class Doc { || hasRelationshipWithContribution() } + boolean intendedForMarcPreAdolescent() { + intendedAudience().contains(['@id': 'https://id.kb.se/marc/PreAdolescent']) + } + boolean hasPart() { workData['hasPart'] || instanceData['hasTitle'].findAll { it['@type'] == 'Title' }.any { it.hasPart?.size() > 1 || it.hasPart?.any { p -> asList(p.partName).size() > 1 || asList(p.partNumber).size() > 1 } From 4bd0e723ebc3da758bbf6dd0fffcd8af7ffc47d7 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 7 Sep 2023 14:20:44 +0200 Subject: [PATCH 2/4] Allow Text, StillImage or empty when matching contentType --- .../se/kb/libris/mergeworks/WorkComparator.groovy | 2 +- .../kb/libris/mergeworks/compare/ContentType.groovy | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/ContentType.groovy diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy index 4d6ca48c00..96c2864c1d 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy @@ -11,7 +11,7 @@ class WorkComparator { Map comparators = [ 'classification' : new Classification(), - 'contentType' : new SameOrEmpty('https://id.kb.se/term/rda/Text'), + 'contentType' : new ContentType(), 'genreForm' : new GenreForm(), 'hasTitle' : new WorkTitle(), 'intendedAudience': new SameOrEmpty('https://id.kb.se/marc/Juvenile'), diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/ContentType.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/ContentType.groovy new file mode 100644 index 0000000000..4d0d3e498b --- /dev/null +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/ContentType.groovy @@ -0,0 +1,12 @@ +package se.kb.libris.mergeworks.compare + +import static se.kb.libris.mergeworks.Util.asList + +class ContentType extends StuffSet { + private static def allowedValues = ['https://id.kb.se/term/rda/StillImage', 'https://id.kb.se/term/rda/Text'] + + @Override + boolean isCompatible(Object a, Object b) { + asList(a).every { it['@id'] in allowedValues } && asList(b).every { it['@id'] in allowedValues } + } +} From cd1d9264db7d52263b9ef2457f7ad17c3949fbb0 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 7 Sep 2023 14:47:49 +0200 Subject: [PATCH 3/4] Refine intendedAudience matching - marc:Adult can only be matched with empty or marc:General - Match empty/marc:General with marc:Adult primarily --- librisworks/scripts/display-clusters.groovy | 3 +++ librisworks/scripts/display-works.groovy | 3 +++ librisworks/scripts/merge-works.groovy | 3 ++- .../se/kb/libris/mergeworks/Util.groovy | 11 ++++++++++ .../libris/mergeworks/WorkComparator.groovy | 2 +- .../compare/IntendedAudience.groovy | 15 +++++++++++++ .../mergeworks/compare/SameOrEmpty.groovy | 21 ------------------- 7 files changed, 35 insertions(+), 23 deletions(-) create mode 100644 librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy delete mode 100644 librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/SameOrEmpty.groovy diff --git a/librisworks/scripts/display-clusters.groovy b/librisworks/scripts/display-clusters.groovy index 353f763101..204dc7d7a3 100644 --- a/librisworks/scripts/display-clusters.groovy +++ b/librisworks/scripts/display-clusters.groovy @@ -1,6 +1,8 @@ import se.kb.libris.mergeworks.Doc import se.kb.libris.mergeworks.Html +import static se.kb.libris.mergeworks.Util.sortByIntendedAudience + htmlReport = getReportWriter('clusters.html') htmlReport.println(Html.START) @@ -16,6 +18,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> } docs.each { it.addComparisonProps() } + .with { sortByIntendedAudience(it) } htmlReport.println(Html.clusterTable(docs) + Html.HORIZONTAL_RULE) } diff --git a/librisworks/scripts/display-works.groovy b/librisworks/scripts/display-works.groovy index 281ab8ca7a..29031b6e5c 100644 --- a/librisworks/scripts/display-works.groovy +++ b/librisworks/scripts/display-works.groovy @@ -3,6 +3,7 @@ import se.kb.libris.mergeworks.Html import se.kb.libris.mergeworks.WorkComparator import static se.kb.libris.mergeworks.Util.partition +import static se.kb.libris.mergeworks.Util.sortByIntendedAudience htmlReport = getReportWriter('works.html') @@ -18,6 +19,8 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> ?.with { docs.add(new Doc(getWhelk(), it)) } } + sortByIntendedAudience(docs) + WorkComparator c = new WorkComparator(WorkComparator.allFields(docs)) def workClusters = workClusters(docs, c).findAll { it.size() > 1 } diff --git a/librisworks/scripts/merge-works.groovy b/librisworks/scripts/merge-works.groovy index 2afa00ccbd..99d5e7a290 100644 --- a/librisworks/scripts/merge-works.groovy +++ b/librisworks/scripts/merge-works.groovy @@ -3,6 +3,7 @@ import se.kb.libris.mergeworks.WorkComparator import se.kb.libris.mergeworks.Doc import static se.kb.libris.mergeworks.Util.partition +import static se.kb.libris.mergeworks.Util.sortByIntendedAudience maybeDuplicates = getReportWriter("maybe-duplicate-linked-works.tsv") multiWorkReport = getReportWriter("multi-work-clusters.html") @@ -107,7 +108,7 @@ Collection> workClusters(Collection docs, WorkComparator c) if (it.instanceData) { it.addComparisonProps() } - } + }.with { sortByIntendedAudience(it) } def workClusters = partition(docs, { Doc a, Doc b -> c.sameWork(a, b) }) .each { work -> work.each { doc -> doc.removeComparisonProps() } } diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index f28a486b59..2f02340b6a 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -5,6 +5,9 @@ import whelk.Whelk import whelk.util.DocumentUtil import whelk.util.Unicode +import static se.kb.libris.mergeworks.compare.IntendedAudience.GENERAL +import static se.kb.libris.mergeworks.compare.IntendedAudience.ADULT + class Util { static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle'] @@ -308,4 +311,12 @@ class Util { ? normalize("${agent.givenName} ${agent.familyName}") : agent.name ? normalize("${agent.name}") : null } + + static void sortByIntendedAudience(Collection docs) { + docs.sort { Doc d -> + d.intendedAudience().with { + it.isEmpty() || it == [GENERAL] || it == [ADULT] + } + }.reverse(true) + } } \ No newline at end of file diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy index 96c2864c1d..0e6e064fed 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy @@ -14,7 +14,7 @@ class WorkComparator { 'contentType' : new ContentType(), 'genreForm' : new GenreForm(), 'hasTitle' : new WorkTitle(), - 'intendedAudience': new SameOrEmpty('https://id.kb.se/marc/Juvenile'), + 'intendedAudience': new IntendedAudience(), '_numPages' : new Extent(), 'subject' : new Subject(), 'summary' : new StuffSet(), diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy new file mode 100644 index 0000000000..de095405ae --- /dev/null +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy @@ -0,0 +1,15 @@ +package se.kb.libris.mergeworks.compare + + +import static se.kb.libris.mergeworks.Util.asList + +class IntendedAudience extends StuffSet { + static def GENERAL = ['@id': 'https://id.kb.se/marc/General'] + static def ADULT = ['@id': 'https://id.kb.se/marc/Adult'] + + @Override + boolean isCompatible(Object a, Object b) { + !a || !b || asList(a) == [GENERAL] || asList(b) == [GENERAL] + || !(asList(a) + asList(b)).findResults { it == ADULT }.containsAll([true, false]) + } +} diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/SameOrEmpty.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/SameOrEmpty.groovy deleted file mode 100644 index 10826079b3..0000000000 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/SameOrEmpty.groovy +++ /dev/null @@ -1,21 +0,0 @@ -package se.kb.libris.mergeworks.compare - -import static se.kb.libris.mergeworks.Util.asList - -class SameOrEmpty implements FieldHandler { - Object link - - SameOrEmpty(String iri) { - this.link = [['@id': iri]] - } - - @Override - boolean isCompatible(Object a, Object b) { - (!a && asList(b) == link) || (!b && asList(a) == link) - } - - @Override - Object merge(Object a, Object b) { - return a ?: b - } -} \ No newline at end of file From 12639b756d57e133fe2572d6fd59284e790b42f1 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Fri, 8 Sep 2023 09:07:14 +0200 Subject: [PATCH 4/4] Refactor --- librisworks/scripts/display-clusters.groovy | 3 --- librisworks/scripts/display-works.groovy | 14 +------------- librisworks/scripts/merge-works.groovy | 18 ++---------------- .../groovy/se/kb/libris/mergeworks/Util.groovy | 18 +++++++++++------- .../mergeworks/compare/IntendedAudience.groovy | 13 +++++++++++-- 5 files changed, 25 insertions(+), 41 deletions(-) diff --git a/librisworks/scripts/display-clusters.groovy b/librisworks/scripts/display-clusters.groovy index 204dc7d7a3..353f763101 100644 --- a/librisworks/scripts/display-clusters.groovy +++ b/librisworks/scripts/display-clusters.groovy @@ -1,8 +1,6 @@ import se.kb.libris.mergeworks.Doc import se.kb.libris.mergeworks.Html -import static se.kb.libris.mergeworks.Util.sortByIntendedAudience - htmlReport = getReportWriter('clusters.html') htmlReport.println(Html.START) @@ -18,7 +16,6 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> } docs.each { it.addComparisonProps() } - .with { sortByIntendedAudience(it) } htmlReport.println(Html.clusterTable(docs) + Html.HORIZONTAL_RULE) } diff --git a/librisworks/scripts/display-works.groovy b/librisworks/scripts/display-works.groovy index 29031b6e5c..c26eb0ddf9 100644 --- a/librisworks/scripts/display-works.groovy +++ b/librisworks/scripts/display-works.groovy @@ -2,8 +2,7 @@ import se.kb.libris.mergeworks.Doc import se.kb.libris.mergeworks.Html import se.kb.libris.mergeworks.WorkComparator -import static se.kb.libris.mergeworks.Util.partition -import static se.kb.libris.mergeworks.Util.sortByIntendedAudience +import static se.kb.libris.mergeworks.Util.workClusters htmlReport = getReportWriter('works.html') @@ -19,8 +18,6 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> ?.with { docs.add(new Doc(getWhelk(), it)) } } - sortByIntendedAudience(docs) - WorkComparator c = new WorkComparator(WorkComparator.allFields(docs)) def workClusters = workClusters(docs, c).findAll { it.size() > 1 } @@ -31,15 +28,6 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> htmlReport.println(Html.END) -Collection> workClusters(Collection docs, WorkComparator c) { - docs.each { it.addComparisonProps() } - - def workClusters = partition(docs, { Doc a, Doc b -> c.sameWork(a, b) }) - .each { work -> work.each { doc -> doc.removeComparisonProps() } } - - return workClusters -} - Doc createNewWork(Map workData) { workData['@id'] = "TEMPID#it" Map data = [ diff --git a/librisworks/scripts/merge-works.groovy b/librisworks/scripts/merge-works.groovy index 99d5e7a290..6f9fb5bf99 100644 --- a/librisworks/scripts/merge-works.groovy +++ b/librisworks/scripts/merge-works.groovy @@ -2,8 +2,7 @@ import se.kb.libris.mergeworks.Html import se.kb.libris.mergeworks.WorkComparator import se.kb.libris.mergeworks.Doc -import static se.kb.libris.mergeworks.Util.partition -import static se.kb.libris.mergeworks.Util.sortByIntendedAudience +import static se.kb.libris.mergeworks.Util.workClusters maybeDuplicates = getReportWriter("maybe-duplicate-linked-works.tsv") multiWorkReport = getReportWriter("multi-work-clusters.html") @@ -47,7 +46,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> uniqueWorksAndTheirInstances.add(new Tuple2(linkedWorks.find(), localWorks)) } else { maybeDuplicates.println(linkedWorks.collect { it.shortId() }.join('\t')) - System.err.println("Local works ${localWorks.collect { it.shortId() }} match multiple linked works: ${linkedWorks.collect { it.shortId() }}. Duplicate linked works?") + System.err.println("Local works ${localWorks.collect { it.shortId() }} match multiple linked works: ${linkedWorks.collect { it.shortId() }}. Duplicated linked works?") } } @@ -103,19 +102,6 @@ void saveAndLink(Doc workDoc, Collection instanceDocs = [], boolean existsI } } -Collection> workClusters(Collection docs, WorkComparator c) { - docs.each { - if (it.instanceData) { - it.addComparisonProps() - } - }.with { sortByIntendedAudience(it) } - - def workClusters = partition(docs, { Doc a, Doc b -> c.sameWork(a, b) }) - .each { work -> work.each { doc -> doc.removeComparisonProps() } } - - return workClusters -} - Doc createNewWork(Map workData) { workData['@id'] = "TEMPID#it" Map data = [ diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index 2f02340b6a..dfc94ed985 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -5,8 +5,7 @@ import whelk.Whelk import whelk.util.DocumentUtil import whelk.util.Unicode -import static se.kb.libris.mergeworks.compare.IntendedAudience.GENERAL -import static se.kb.libris.mergeworks.compare.IntendedAudience.ADULT +import static se.kb.libris.mergeworks.compare.IntendedAudience.preferredComparisonOrder class Util { static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle'] @@ -312,11 +311,16 @@ class Util { : agent.name ? normalize("${agent.name}") : null } - static void sortByIntendedAudience(Collection docs) { - docs.sort { Doc d -> - d.intendedAudience().with { - it.isEmpty() || it == [GENERAL] || it == [ADULT] + static Collection> workClusters(Collection docs, WorkComparator c) { + docs.each { + if (it.instanceData) { + it.addComparisonProps() } - }.reverse(true) + }.with { preferredComparisonOrder(it) } + + def workClusters = partition(docs, { Doc a, Doc b -> c.sameWork(a, b) }) + .each { work -> work.each { doc -> doc.removeComparisonProps() } } + + return workClusters } } \ No newline at end of file diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy index de095405ae..ce2569f866 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/IntendedAudience.groovy @@ -1,15 +1,24 @@ package se.kb.libris.mergeworks.compare +import se.kb.libris.mergeworks.Doc import static se.kb.libris.mergeworks.Util.asList class IntendedAudience extends StuffSet { - static def GENERAL = ['@id': 'https://id.kb.se/marc/General'] - static def ADULT = ['@id': 'https://id.kb.se/marc/Adult'] + private static def GENERAL = ['@id': 'https://id.kb.se/marc/General'] + private static def ADULT = ['@id': 'https://id.kb.se/marc/Adult'] @Override boolean isCompatible(Object a, Object b) { !a || !b || asList(a) == [GENERAL] || asList(b) == [GENERAL] || !(asList(a) + asList(b)).findResults { it == ADULT }.containsAll([true, false]) } + + static void preferredComparisonOrder(Collection docs) { + docs.sort { Doc d -> + d.intendedAudience().with { + it.isEmpty() || it == [GENERAL] || it == [ADULT] + } + }.reverse(true) + } }