Skip to content

Commit

Permalink
Merge pull request #1281 from libris/feature/lxl-4150-deduplicate-con…
Browse files Browse the repository at this point in the history
…tribution

Feature/lxl 4150 deduplicate contribution
  • Loading branch information
kwahlin authored Aug 31, 2023
2 parents 9964501 + 7ef9f27 commit 465925b
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 5 deletions.
17 changes: 12 additions & 5 deletions librisworks/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@ NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations.tsv

LANGUAGE_IN_TITLE=$NORMALIZATIONS_DIR/1-titles-with-language
ELIB_DESIGNERS=$NORMALIZATIONS_DIR/2-elib-cover-designer
CONTRIBUTION=$NORMALIZATIONS_DIR/3-contribution
ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/4-roles-to-instance
DEDUPLICATE_CONTRIBUTIONS=$NORMALIZATIONS_DIR/3-deduplicate-contributions
ADD_MISSING_CONTRIBUTION_DATA=$NORMALIZATIONS_DIR/4-add-missing-contribution-data
ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/5-roles-to-instance

# Clustering step 1 TODO: run only on recently updated records after first run
echo "Finding new clusters..."
Expand Down Expand Up @@ -96,10 +97,16 @@ time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
echo "$(count_lines $ELIB_DESIGNERS/MODIFIED.txt) records affected, report in $ELIB_DESIGNERS"

echo
echo "Normalizing contribution..."
echo "Merging contribution objects with same agent..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION -jar $JAR_FILE \
$ARGS --report $CONTRIBUTION $SCRIPTS_DIR/normalize-contribution.groovy 2>/dev/null
echo "$(count_lines $CONTRIBUTION/MODIFIED.txt) records affected, report in $CONTRIBUTION"
$ARGS --report $DEDUPLICATE_CONTRIBUTIONS $SCRIPTS_DIR/lxl-4150-deduplicate-contribution.groovy 2>/dev/null
echo "$(count_lines $DEDUPLICATE_CONTRIBUTIONS/MODIFIED.txt) records affected, report in $DEDUPLICATE_CONTRIBUTIONS"

echo
echo "Adding missing contribution data..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION -jar $JAR_FILE \
$ARGS --report $ADD_MISSING_CONTRIBUTION_DATA $SCRIPTS_DIR/add-missing-contribution-data.groovy 2>/dev/null
echo "$(count_lines $ADD_MISSING_CONTRIBUTION_DATA/MODIFIED.txt) records affected, report in $ADD_MISSING_CONTRIBUTION_DATA"

echo
echo "Moving roles to instance..."
Expand Down
35 changes: 35 additions & 0 deletions librisworks/scripts/lxl-4150-deduplicate-contribution.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
 * Merges contribution objects that refer to the same agent within a work.
 *
 * Input: the system property 'clusters' names a file whose lines hold
 * tab-separated record ids. Every id found in the file is processed.
 *
 * For each selected record, contributions on the work (graph[1].instanceOf)
 * are grouped by agent. When an agent occurs more than once, all of its
 * contribution objects are collapsed into a single one:
 *   - the PrimaryContribution for that agent is kept if there is one,
 *     otherwise the first contribution for that agent is kept;
 *   - the kept object receives the union of all roles from the merged objects;
 *   - the kept object stays at the position it had relative to the
 *     contributions of other agents.
 * Properties other than 'role' on the dropped objects are discarded.
 */

// Flatten all lines of the clusters file into a single list of trimmed ids.
def ids = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }.flatten()

selectByIds(ids) { bib ->
    def work = bib.graph[1].instanceOf
    def contribution = work?.contribution

    // Nothing to do when the work is missing or carries no contributions.
    if (!contribution) return

    // Agents (normalised to lists so single values and lists compare alike) that occur more than once.
    // NOTE(review): if asList(null) yields an empty list, contributions lacking an agent
    // are grouped under the same key and merged with each other -- confirm this is intended.
    def duplicates = contribution.countBy { asList(it.agent) }.findResults { it.value > 1 ? it.key : null }

    duplicates.each { d ->
        def hasSameAgent = { asList(it.agent) == d }

        // Prefer merging into the primary contribution; fall back to the first occurrence.
        def primaryContributionIdx = contribution.findIndexOf { hasSameAgent(it) && it['@type'] == 'PrimaryContribution' }
        def mergeIntoIdx = primaryContributionIdx > -1
                ? primaryContributionIdx
                : contribution.findIndexOf(hasSameAgent)
        def mergeInto = contribution[mergeIntoIdx]

        // Same-agent entries located before the merge target are removed below, which shifts
        // the target's slot to the left. Without this correction the re-insertion index can be
        // stale: it either misplaces the merged entry or exceeds the list size and makes
        // List.add(int, E) throw IndexOutOfBoundsException (e.g. [A, A(primary)]).
        def removedBeforeTarget = contribution.take(mergeIntoIdx).count(hasSameAgent)

        def roles = []

        // Drop every contribution for this agent (including the merge target) while collecting roles.
        contribution.removeAll {
            if (hasSameAgent(it)) {
                roles += asList(it.role)
                return true
            }
            return false
        }

        if (roles) mergeInto['role'] = roles.unique()

        // Put the merged contribution back where the target sat among the remaining entries.
        contribution.add(mergeIntoIdx - removedBeforeTarget, mergeInto)
    }

    if (duplicates) {
        bib.scheduleSave()
    }
}

0 comments on commit 465925b

Please sign in to comment.