Skip to content

Commit

Permalink
Merge pull request #17 from cardin/nested-fields
Browse files: browse the repository at this point in the history
Allow nested fields
  • Loading branch information
deric authored Oct 16, 2023
2 parents 99403ee + d99321c commit 9548354
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
8 changes: 5 additions & 3 deletions esdedupe/esdedupe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import requests
import sys

from benedict import benedict
from elasticsearch import Elasticsearch, helpers
from elasticsearch.helpers import parallel_bulk
from elasticsearch.helpers import streaming_bulk
Expand All @@ -28,15 +29,16 @@ def __init__(self):
# Process documents returned by the current search/scroll
def build_index(self, docs_hash, unique_fields, hit):
hashval = None
_id = hit["_id"]
hit_benedict = benedict(hit)
_id = hit_benedict["_id"]
# there's no need to hash if we have just a single unique key
if len(unique_fields) > 1:
combined_key = ""
for field in unique_fields:
combined_key += str(hit['_source'][field])
combined_key += str(hit_benedict['_source'][field])
hashval = hashlib.md5(combined_key.encode('utf-8')).digest()
else:
hashval = str(hit['_source'][unique_fields[0]])
hashval = str(hit_benedict['_source'][unique_fields[0]])

docs_hash.setdefault(hashval, []).append(_id)

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ tqdm
psutil
elasticsearch>=8.0.0
requests
urllib3>=1.26.2,<2
urllib3>=1.26.2,<2
python-benedict

0 comments on commit 9548354

Please sign in to comment.