Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/softer work matching #1302

Merged
merged 4 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 1 addition & 10 deletions librisworks/scripts/display-works.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import se.kb.libris.mergeworks.Doc
import se.kb.libris.mergeworks.Html
import se.kb.libris.mergeworks.WorkComparator

import static se.kb.libris.mergeworks.Util.partition
import static se.kb.libris.mergeworks.Util.workClusters

htmlReport = getReportWriter('works.html')

Expand All @@ -28,15 +28,6 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->

htmlReport.println(Html.END)

/**
 * Groups the given docs into clusters using the comparator's sameWork check.
 *
 * Comparison props are added to every doc before partitioning and removed
 * again from every doc in the resulting clusters before they are returned.
 *
 * @param docs the docs to cluster
 * @param c comparator whose sameWork(a, b) is used as the pairing predicate
 * @return the clusters produced by partition
 */
Collection<Collection<Doc>> workClusters(Collection<Doc> docs, WorkComparator c) {
    for (doc in docs) {
        doc.addComparisonProps()
    }

    def sameWork = { Doc a, Doc b -> c.sameWork(a, b) }
    def clusters = partition(docs, sameWork)

    // The temporary comparison props are only needed while partitioning.
    clusters.each { cluster ->
        cluster.each { member -> member.removeComparisonProps() }
    }

    return clusters
}

Doc createNewWork(Map workData) {
workData['@id'] = "TEMPID#it"
Map data = [
Expand Down
17 changes: 2 additions & 15 deletions librisworks/scripts/merge-works.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import se.kb.libris.mergeworks.Html
import se.kb.libris.mergeworks.WorkComparator
import se.kb.libris.mergeworks.Doc

import static se.kb.libris.mergeworks.Util.partition
import static se.kb.libris.mergeworks.Util.workClusters

maybeDuplicates = getReportWriter("maybe-duplicate-linked-works.tsv")
multiWorkReport = getReportWriter("multi-work-clusters.html")
Expand Down Expand Up @@ -46,7 +46,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster ->
uniqueWorksAndTheirInstances.add(new Tuple2(linkedWorks.find(), localWorks))
} else {
maybeDuplicates.println(linkedWorks.collect { it.shortId() }.join('\t'))
System.err.println("Local works ${localWorks.collect { it.shortId() }} match multiple linked works: ${linkedWorks.collect { it.shortId() }}. Duplicate linked works?")
System.err.println("Local works ${localWorks.collect { it.shortId() }} match multiple linked works: ${linkedWorks.collect { it.shortId() }}. Duplicated linked works?")
}
}

Expand Down Expand Up @@ -102,19 +102,6 @@ void saveAndLink(Doc workDoc, Collection<Doc> instanceDocs = [], boolean existsI
}
}

/**
 * Groups the given docs into clusters using the comparator's sameWork check.
 *
 * Comparison props are added only to docs that have instanceData; after
 * partitioning they are removed from every doc in the resulting clusters.
 *
 * @param docs the docs to cluster
 * @param c comparator whose sameWork(a, b) is used as the pairing predicate
 * @return the clusters produced by partition
 */
Collection<Collection<Doc>> workClusters(Collection<Doc> docs, WorkComparator c) {
    for (doc in docs) {
        if (!doc.instanceData) {
            continue
        }
        doc.addComparisonProps()
    }

    def sameWork = { Doc a, Doc b -> c.sameWork(a, b) }
    def clusters = partition(docs, sameWork)

    // The temporary comparison props are only needed while partitioning.
    clusters.each { cluster ->
        cluster.each { member -> member.removeComparisonProps() }
    }

    return clusters
}

Doc createNewWork(Map workData) {
workData['@id'] = "TEMPID#it"
Map data = [
Expand Down
1 change: 1 addition & 0 deletions librisworks/scripts/swedish-fiction.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) {cluster ->
&& !doc.isDrama()
&& !doc.isThesis()
&& !doc.isInSb17Bibliography()
&& !doc.intendedForMarcPreAdolescent()
}

def swedish = { Doc doc ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ class Doc {
asList(workData['genreForm'])
}

/**
 * Returns the work's 'intendedAudience' value passed through asList
 * (presumably so that callers always get a list -- confirm against asList).
 */
List<Map> intendedAudience() {
    def audience = workData['intendedAudience']
    return asList(audience)
}

/**
 * Returns the instance's 'publication' value passed through asList.
 * A missing instanceData is tolerated: asList then receives null.
 */
List<Map> publication() {
    def statements = instanceData?.publication
    return asList(statements)
}
Expand Down Expand Up @@ -211,6 +215,10 @@ class Doc {
|| hasRelationshipWithContribution()
}

/**
 * Tells whether the intended audience includes a link to the MARC
 * PreAdolescent term, i.e. an entry equal to a map holding only that '@id'.
 */
boolean intendedForMarcPreAdolescent() {
    def preAdolescent = ['@id': 'https://id.kb.se/marc/PreAdolescent']
    return preAdolescent in intendedAudience()
}

boolean hasPart() {
workData['hasPart'] || instanceData['hasTitle'].findAll { it['@type'] == 'Title' }.any {
it.hasPart?.size() > 1 || it.hasPart?.any { p -> asList(p.partName).size() > 1 || asList(p.partNumber).size() > 1 }
Expand Down
15 changes: 15 additions & 0 deletions librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import whelk.Whelk
import whelk.util.DocumentUtil
import whelk.util.Unicode

import static se.kb.libris.mergeworks.compare.IntendedAudience.preferredComparisonOrder

class Util {
static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle']

Expand Down Expand Up @@ -308,4 +310,17 @@ class Util {
? normalize("${agent.givenName} ${agent.familyName}")
: agent.name ? normalize("${agent.name}") : null
}

/**
 * Groups the given docs into clusters using the comparator's sameWork check.
 *
 * Steps: add comparison props to docs that have instanceData, put the docs in
 * the preferred comparison order, partition them with c.sameWork as the
 * pairing predicate, and finally remove the comparison props from every doc
 * in the resulting clusters.
 *
 * @param docs the docs to cluster; when this is a List it is reordered in place
 * @param c comparator whose sameWork(a, b) is used as the pairing predicate
 * @return the clusters produced by partition
 */
static Collection<Collection<Doc>> workClusters(Collection<Doc> docs, WorkComparator c) {
    // preferredComparisonOrder returns nothing and depends on sort(Closure)
    // reordering its argument in place, which Groovy only does for a List.
    // Work on a List so the ordering is never silently lost; a List argument
    // is used as-is, so existing callers see the same in-place reordering.
    List<Doc> ordered = docs instanceof List ? (List<Doc>) docs : docs.toList()

    ordered.each { Doc doc ->
        if (doc.instanceData) {
            doc.addComparisonProps()
        }
    }
    preferredComparisonOrder(ordered)

    def clusters = partition(ordered, { Doc a, Doc b -> c.sameWork(a, b) })

    // The temporary comparison props are only needed while partitioning.
    clusters.each { work -> work.each { doc -> doc.removeComparisonProps() } }

    return clusters
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ class WorkComparator {

Map<String, FieldHandler> comparators = [
'classification' : new Classification(),
'contentType' : new SameOrEmpty('https://id.kb.se/term/rda/Text'),
'contentType' : new ContentType(),
'genreForm' : new GenreForm(),
'hasTitle' : new WorkTitle(),
'intendedAudience': new SameOrEmpty('https://id.kb.se/marc/Juvenile'),
'intendedAudience': new IntendedAudience(),
'_numPages' : new Extent(),
'subject' : new Subject(),
'summary' : new StuffSet(),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package se.kb.libris.mergeworks.compare

import static se.kb.libris.mergeworks.Util.asList

/**
 * StuffSet variant for content type values. Two values are considered
 * compatible only when every entry on both sides links to one of the
 * allowed content types (Text or StillImage).
 */
class ContentType extends StuffSet {
    private static def allowedValues = ['https://id.kb.se/term/rda/StillImage', 'https://id.kb.se/term/rda/Text']

    /**
     * @param a one content type value (single entry or list of entries)
     * @param b the other content type value
     * @return true when all entries of both a and b have an '@id' among the allowed values
     */
    @Override
    boolean isCompatible(Object a, Object b) {
        // Checked side by side, stopping at the first side that has a disallowed entry.
        [a, b].every { side ->
            asList(side).every { entry -> entry['@id'] in allowedValues }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package se.kb.libris.mergeworks.compare

import se.kb.libris.mergeworks.Doc

import static se.kb.libris.mergeworks.Util.asList

/**
 * StuffSet variant for intended audience values, plus a helper that orders
 * docs by how specific their intended audience is.
 */
class IntendedAudience extends StuffSet {
    private static def GENERAL = ['@id': 'https://id.kb.se/marc/General']
    private static def ADULT = ['@id': 'https://id.kb.se/marc/Adult']

    /**
     * Two audience values are compatible when
     * - either of them is missing/empty, or
     * - either of them is exactly [General], or
     * - their combined entries are not a mix of Adult and non-Adult entries.
     *
     * @param a one intended audience value (single entry or list of entries)
     * @param b the other intended audience value
     * @return true when the two values are compatible as described above
     */
    @Override
    boolean isCompatible(Object a, Object b) {
        if (!a || !b) {
            return true
        }

        def left = asList(a)
        def right = asList(b)
        if (left == [GENERAL] || right == [GENERAL]) {
            return true
        }

        def combined = left + right
        boolean hasAdult = combined.any { it == ADULT }
        boolean hasOther = combined.any { it != ADULT }
        return !(hasAdult && hasOther)
    }

    /**
     * Reorders docs so that those whose intended audience is empty, exactly
     * [General] or exactly [Adult] come first.
     *
     * Nothing is returned: the method relies on sort(Closure) reordering docs
     * in place, which Groovy only does when docs is a List.
     *
     * @param docs the docs to reorder
     */
    static void preferredComparisonOrder(Collection<Doc> docs) {
        def isUnspecific = { Doc d ->
            def audience = d.intendedAudience()
            audience.isEmpty() || audience == [GENERAL] || audience == [ADULT]
        }

        // Ascending sort puts false before true; the in-place reverse flips that.
        def sorted = docs.sort(isUnspecific)
        sorted.reverse(true)
    }
}

This file was deleted.