Skip to content

Commit b6b835b

Browse files
committed
script to import equivalence dbk kg
1 parent 877518a commit b6b835b

File tree

1 file changed

+121
-0
lines changed

1 file changed

+121
-0
lines changed
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
#lang racket/base
2+
3+
(require json
4+
racket/string
5+
racket/pretty
6+
"../../dbKanren/test/equivalence-database.rkt"
7+
"../../dbKanren/dbk/database.rkt"
8+
"../../dbKanren/dbk/enumerator.rkt"
9+
"../transform-2tsv-to-4tsv-kgs/transform-utils.rkt"
10+
racket/runtime-path
11+
racket/file
12+
racket/system)
13+
14+
;; Root of the raw KGE archive downloads. This is a plain relative string, so
;; it is resolved against the current directory of the running process.
(define BASE "../../neo-data/raw_downloads_from_kge_archive/")

;; Directory (with trailing slash) that holds the NodeNorm .jsonl files.
(define NODE-NORM-DIRECTORY
  (string-append BASE "NodeNorm-24oct/"))

;; Location of the RTX-KG2 v2.10.0 edges TSV underneath BASE.
(define RTX-KG2-EDGE
  (let ((kg-directory "rtx-kg2-v2.10.0/")
        (edges-file "data_01_RAW_KGs_rtx_kg2_v2.10.0_validated_rtx-kg2_2.10.0_edges.tsv"))
    (string-append BASE kg-directory edges-file)))
17+
18+
#|
19+
***
20+
extract the "same_as" edges from RTX-KG2
21+
***
22+
|#
23+
;; Extract the "biolink:same_as" edges from the RTX-KG2 edges TSV and write
;; them to rtx-kg2-same-as.jsonl in NODE-NORM-DIRECTORY, one JSON object with
;; "subject" and "object" keys per line.
;;
;; The whole step is disabled as ONE form. Previously only the output port and
;; the loop were disabled while the input port was still opened on every run:
;; that port was never closed, and the script failed when the TSV was missing
;; even though nothing read from it. Remove the leading #; to run the
;; extraction again; `begin` splices at module level, so both port names are
;; then defined exactly as before.
#;(begin
    (define rtx-kg2-edges-in (open-input-file RTX-KG2-EDGE))
    (define same-as-export-out
      (open-output-file (string-append NODE-NORM-DIRECTORY "rtx-kg2-same-as.jsonl")))
    (let* ((header (read-line rtx-kg2-edges-in 'any))
           (header (string-split header "\t" #:trim? #f))
           ;; Column positions depend only on the header, so look them up once
           ;; instead of once per edge line.
           (predicate-index (find-index header "predicate"))
           (subject-index (find-index header "subject"))
           (object-index (find-index header "object")))
      (let loop ((id 0)
                 (line-str (read-line rtx-kg2-edges-in 'any)))
        ;; Progress report every million lines (also fires for id 0).
        (when (zero? (modulo id 1000000))
          (printf "processing edges line ~s\n" id))
        (cond
          ((eof-object? line-str)
           (close-input-port rtx-kg2-edges-in)
           ;; Close the export so the file is complete on disk before the
           ;; later steps read the directory.
           (close-output-port same-as-export-out)
           (printf "finished extracting same_as edges from RTX-KG2\n"))
          (else
           (let ((line (efficient-no-trim-tab-string-split line-str)))
             (when (equal? (list-ref line predicate-index) "biolink:same_as")
               (let* ((h (hash 'subject (list-ref line subject-index)
                               'object (list-ref line object-index)))
                      (js (jsexpr->string h)))
                 (fprintf same-as-export-out "~a\n" js)))
             (loop (add1 id) (read-line rtx-kg2-edges-in 'any))))))))
47+
48+
#|
49+
***
50+
1. remove the empty lines from each .jsonl in the Node Norm KG directory
51+
2. merge the non-empty cleaned .jsonl files into one
52+
***
53+
|#
54+
55+
;; Find all .jsonl files in the Node Norm KG directory.
;; The result holds entry names as strings (not full paths), in the order
;; directory-list returns them.
(define input-files
  (for/list ((entry (in-list (directory-list NODE-NORM-DIRECTORY)))
             #:when (string-suffix? (path->string entry) ".jsonl"))
    (path->string entry)))
60+
61+
;; Directory that receives the cleaned copies of the .jsonl files; it is
;; created on first use and left alone when it already exists.
(define cleaned-dir (string-append NODE-NORM-DIRECTORY "cleaned"))
(when (not (directory-exists? cleaned-dir))
  (make-directory cleaned-dir))
65+
66+
;; Clean each non-empty .jsonl file with jq, writing "<name>.cleaned" into
;; cleaned-dir; zero-byte files are reported and skipped. The jq filter reads
;; raw lines, keeps only those with length > 0 and re-emits each as compact
;; JSON. This step is currently disabled; remove the leading #; to run it.
#;(for ((in-file-str (in-list input-files)))
    (define in-file-full-str (string-append NODE-NORM-DIRECTORY in-file-str))
    (cond
      ((> (file-size (string->path in-file-full-str)) 0)
       (define out-file-str (string-append cleaned-dir "/" in-file-str ".cleaned"))
       (define jq-cmd
         (format "jq -R -c 'select(length > 0) | fromjson' ~a > ~a" in-file-full-str out-file-str))
       ;; Run through /bin/sh because the command relies on shell redirection.
       (system* "/bin/sh" "-c" jq-cmd)
       (printf "Cleaned ~a -> ~a\n" in-file-full-str out-file-str))
      (else
       (printf "Skipping empty file ~a\n" in-file-full-str))))
77+
78+
79+
;; Merge all cleaned files into one final file.
(define merged-file (string-append cleaned-dir "/" "NodeNorm-RTXKG2-Merged.jsonl"))

;; Full paths (cleaned-dir + "/" + name) of every entry in cleaned-dir whose
;; name ends in ".cleaned", i.e. the files a cleaning run actually produced.
(define cleaned-files
  (for/list ((entry (in-list (directory-list cleaned-dir)))
             #:when (string-suffix? (path->string entry) ".cleaned"))
    (string-append cleaned-dir "/" (path->string entry))))

(printf "There are ~a cleaned files. Starting to merge these files.\n" (length cleaned-files))

;; Shell command that concatenates every cleaned file into merged-file.
(define cat-cmd
  (let ((sources (string-join cleaned-files " ")))
    (format "cat ~a > ~a" sources merged-file)))

;; The merge itself is currently disabled; remove the leading #; to run it.
#;(system* "/bin/sh" "-c" cat-cmd)

;; NOTE(review): this message is printed even while the command above is
;; disabled, so it does not prove that a merge happened in this run.
(printf "Merged cleaned files into ~a\n" merged-file)
96+
97+
98+
#|
99+
***
100+
import the equivalence dbk from the merged .jsonl generated from above
101+
***
102+
|#
103+
;; Path to "../../neo-data/", declared with define-runtime-path so it is
;; resolved relative to this source file (unlike BASE, which is a plain
;; relative string). The equivalence database file is created under it.
(define-runtime-path path.here "../../neo-data/")
104+
105+
;; (json-port-enumerator in) returns a procedure that takes `yield` and drives
;; it over the JSON-lines input port `in`.
;;
;; Each line is parsed as a JSON object whose "subject" and "object" fields
;; are strings; `yield` is called once per line with
;; (list subject-bytes object-bytes), both encoded as UTF-8 byte strings.
;; When the port is exhausted it is closed.
;;
;; Lines with no JSON value on them (empty or whitespace-only) are skipped.
;; bytes->jsexpr returns eof for such a line, which previously reached
;; hash-ref and raised a contract error. A blank line between concatenated
;; files therefore no longer aborts the import.
;;
;; Malformed JSON and objects missing either key still raise, as before.
(define ((json-port-enumerator in) yield)
  (let loop ()
    (let ((line (read-bytes-line in)))
      (cond
        [(eof-object? line) (close-input-port in)]
        [else
         (let ((js (bytes->jsexpr line)))
           ;; eof here means the line held no JSON value at all.
           (unless (eof-object? js)
             (yield (list (string->bytes/utf-8 (hash-ref js 'subject))
                          (string->bytes/utf-8 (hash-ref js 'object)))))
           (loop))]))))
117+
118+
;; Result of build-equivalence-database for the file "test-equivalence.db"
;; under path.here, fed with the (subject object) byte-string pairs that
;; json-port-enumerator reads from merged-file.
;; NOTE(review): merged-file is opened here, so it must already exist from an
;; earlier run of the (currently disabled) merge step.
(define db.equiv
  (let* ((db-path (build-path path.here "test-equivalence.db"))
         (pairs (json-port-enumerator (open-input-file merged-file))))
    (build-equivalence-database db-path pairs)))

0 commit comments

Comments
 (0)