1
+ #lang racket/base
2
+
3
+ (require json
4
+ racket/string
5
+ racket/pretty
6
+ "../../dbKanren/test/equivalence-database.rkt "
7
+ "../../dbKanren/dbk/database.rkt "
8
+ "../../dbKanren/dbk/enumerator.rkt "
9
+ "../transform-2tsv-to-4tsv-kgs/transform-utils.rkt "
10
+ racket/runtime-path
11
+ racket/file
12
+ racket/system)
13
+
14
;; Root of the raw KGE-archive downloads.  This is a relative path; every
;; path below is built from it by plain string concatenation, so each
;; directory fragment must end in "/" and must not carry stray whitespace.
(define BASE "../../neo-data/raw_downloads_from_kge_archive/")

;; Directory holding the Node Norm .jsonl files (and, once created, the
;; "cleaned" sub-directory).
(define NODE-NORM-DIRECTORY (string-append BASE "NodeNorm-24oct/"))

;; RTX-KG2 edges TSV.
(define RTX-KG2-EDGE
  (string-append BASE
                 "rtx-kg2-v2.10.0/"
                 "data_01_RAW_KGs_rtx_kg2_v2.10.0_validated_rtx-kg2_2.10.0_edges.tsv"))
17
+
18
#|
***
extract the "same_as" edges from RTX-KG2
***
|#

;; copy-same-as-edges : input-port output-port -> void
;; Reads a tab-separated edges file from `edges-in` (first line = header) and
;; writes one JSON object per line, {"subject": ..., "object": ...}, to
;; `same-as-out` for every row whose "predicate" column is "biolink:same_as".
;; Prints a progress message every 1,000,000 rows.
(define (copy-same-as-edges edges-in same-as-out)
  (define header (string-split (read-line edges-in 'any) "\t" #:trim? #f))
  ;; The header is fixed for the whole file, so resolve the column positions
  ;; once instead of searching the header again on every row.
  (define predicate-col (find-index header "predicate"))
  (define subject-col (find-index header "subject"))
  (define object-col (find-index header "object"))
  (let loop ((id 0)
             (line-str (read-line edges-in 'any)))
    (when (zero? (modulo id 1000000))
      (printf "processing edges line ~s\n" id))
    (unless (eof-object? line-str)
      (let ((line (efficient-no-trim-tab-string-split line-str)))
        (when (equal? (list-ref line predicate-col) "biolink:same_as")
          (fprintf same-as-out "~a\n"
                   (jsexpr->string
                    (hash 'subject (list-ref line subject-col)
                          'object (list-ref line object-col))))))
      (loop (add1 id) (read-line edges-in 'any)))))

;; extract-same-as-edges! : -> void
;; Runs the extraction from RTX-KG2-EDGE into
;; NODE-NORM-DIRECTORY/rtx-kg2-same-as.jsonl.
;;
;; Both ports are opened here rather than at module level, so nothing is
;; opened while this step is disabled, and both are closed (the output
;; therefore flushed) as soon as the copy returns -- before any later step
;; reads the .jsonl files in NODE-NORM-DIRECTORY.
;; As with a bare `open-output-file`, an already existing output file is an
;; error; delete it first to regenerate.
(define (extract-same-as-edges!)
  (define out-file (string-append NODE-NORM-DIRECTORY "rtx-kg2-same-as.jsonl"))
  (call-with-input-file RTX-KG2-EDGE
    (lambda (edges-in)
      (call-with-output-file out-file
        (lambda (same-as-out)
          (copy-same-as-edges edges-in same-as-out)))))
  (printf "finished extracting same_as edges from RTX-KG2\n"))

;; This step is disabled; remove the `#;` to (re)generate the file.
#; (extract-same-as-edges!)
47
+
48
+ #|
49
+ ***
50
+ 1. remove the empty lines from each .jsonl in the Node Norm KG directory
51
+ 2. merge the non-empty cleaned .jsonl files into one
52
+ ***
53
+ |#
54
+
55
;; Find all .jsonl files in the Node Norm KG directory.
;; The result is a list of bare file names (strings), not full paths:
;; `directory-list` is called without #:build?, so callers prepend
;; NODE-NORM-DIRECTORY themselves.
(define input-files
  (for/list ((name (in-list (map path->string (directory-list NODE-NORM-DIRECTORY))))
             #:when (string-suffix? name ".jsonl"))
    name))
60
+
61
;; Create a directory to store cleaned files.
;; NODE-NORM-DIRECTORY already ends in "/", so the name is appended directly.
(define cleaned-dir (string-append NODE-NORM-DIRECTORY "cleaned"))
;; `make-directory*` (racket/file) succeeds when the directory already exists
;; and creates missing intermediate directories, so no separate
;; `directory-exists?` check is needed.
(make-directory* cleaned-dir)
65
+
66
;; Clean each .jsonl file using jq if the file is not empty.
;;
;; clean-jsonl-file! : string -> void
;; `in-file-str` is a bare file name inside NODE-NORM-DIRECTORY.  For a
;; non-empty file, runs
;;     jq -R -c 'select(length > 0) | fromjson' <in>
;; with stdout redirected to <cleaned-dir>/<name>.cleaned, overwriting any
;; previous output.  jq is executed directly rather than through `sh -c`, so
;; file names containing spaces or shell metacharacters are passed through
;; verbatim, and its exit status decides which message is printed.
(define (clean-jsonl-file! in-file-str)
  (define jq
    (or (find-executable-path "jq")
        (error 'clean-jsonl-file! "jq executable not found on PATH")))
  (define in-file-full-str (string-append NODE-NORM-DIRECTORY in-file-str))
  (define out-file-str (string-append cleaned-dir "/" in-file-str ".cleaned"))
  (cond
    ((zero? (file-size in-file-full-str))
     (printf "Skipping empty file ~a\n" in-file-full-str))
    ;; `system*` returns #t only when jq exits with status 0.
    ((with-output-to-file out-file-str #:exists 'truncate
       (lambda ()
         (system* jq "-R" "-c" "select(length > 0) | fromjson" in-file-full-str)))
     (printf "Cleaned ~a -> ~a\n" in-file-full-str out-file-str))
    (else
     (eprintf "jq failed on ~a; ~a may be incomplete\n" in-file-full-str out-file-str))))

;; This step is disabled; remove the `#;` to (re)generate the cleaned files.
#; (for-each clean-jsonl-file! input-files)
77
+
78
+
79
;; Merge all cleaned files into one final file.
;; The merged file lives inside the cleaned directory; its name does not end
;; in ".cleaned", so a ".cleaned" suffix filter over that directory will not
;; pick it up as an input.
(define merged-file (string-append cleaned-dir "/" "NodeNorm-RTXKG2-Merged.jsonl"))
81
;; Collect only those cleaned files that were created (exist).
;; Each entry is the path string <cleaned-dir>/<name> for a directory entry
;; whose name ends in ".cleaned".
(define cleaned-files
  (for/list ((name (in-list (map path->string (directory-list cleaned-dir))))
             #:when (string-suffix? name ".cleaned"))
    (string-append cleaned-dir "/" name)))
88
(printf "There are ~a cleaned files. Starting to merge these files.\n" (length cleaned-files))

;; Shell command that concatenates every cleaned file into `merged-file`.
;; The file names are joined with single spaces and are not shell-quoted.
(define cat-cmd
  (format "cat ~a > ~a"
          (string-join cleaned-files " ")
          merged-file))

;; The merge itself is disabled; remove the `#;` to rebuild the merged file.
;; With no input files `cat` would read from stdin and block, so the command
;; is only run when there is something to merge.
#; (unless (null? cleaned-files)
     (system* "/bin/sh" "-c" cat-cmd))

(printf "Merged cleaned files into ~a\n" merged-file)
96
+
97
+
98
+ #|
99
+ ***
100
+ import the equivalence dbk from the merged .jsonl generated from above
101
+ ***
102
+ |#
103
;; Directory that receives the equivalence database.  `define-runtime-path`
;; resolves it relative to this source file rather than the current directory.
(define-runtime-path path.here "../../neo-data/")
104
+
105
;; ((json-port-enumerator in) yield) -> void
;;   in    : input port producing JSON Lines; each object needs string-valued
;;           "subject" and "object" members
;;   yield : procedure called once per record with (list subject object),
;;           both converted to UTF-8 byte strings
;;
;; Reads `in` line by line until EOF and then closes it, so the enumerator
;; returned by (json-port-enumerator in) can be run only once.
(define ((json-port-enumerator in) yield)
  (let loop ()
    (let ((line (read-bytes-line in)))
      (cond
        [(eof-object? line) (close-input-port in)]
        ;; An empty line is not a JSON document and has no 'subject/'object
        ;; to extract; skip it instead of aborting the whole import.
        [(bytes=? #"" line) (loop)]
        [else
         (let* ((js (bytes->jsexpr line))
                (subject (string->bytes/utf-8 (hash-ref js 'subject)))
                (object (string->bytes/utf-8 (hash-ref js 'object))))
           (yield (list subject object))
           (loop))]))))
117
+
118
;; Build the equivalence database from the merged JSON Lines file.
;; The database path is <path.here>/test-equivalence.db; the (subject object)
;; pairs are streamed from `merged-file` through `json-port-enumerator`,
;; which closes the port when it reaches EOF.
;; NOTE(review): `build-equivalence-database` comes from dbKanren's
;; equivalence-database.rkt; what it does when the .db already exists is not
;; visible here -- confirm before re-running.
(define db.equiv
  (build-equivalence-database
   (build-path path.here "test-equivalence.db")
   (json-port-enumerator (open-input-file merged-file))))
0 commit comments