
Musicbrainz 1st step: JSON conversion to RDF #221

Open: wants to merge 32 commits into main

Commits (32)
dfdd52b
feat: first draft of conversion script, need to be tested
Yueqiao12Zhang Nov 8, 2024
d11cd7a
refactor: apply to musicbrainz, need to be tested
Yueqiao12Zhang Nov 8, 2024
c479d68
feat: new script similar to get_relations.py for json
Yueqiao12Zhang Nov 8, 2024
d412340
Create namespace_mapping.json
Yueqiao12Zhang Nov 8, 2024
313dc2c
mapping: empty pred_mapping.json
Yueqiao12Zhang Nov 8, 2024
6897b03
test: recording.jsonl
Yueqiao12Zhang Nov 8, 2024
760c285
refactor: optimize ignore ambiguous column and Namespace matching
Yueqiao12Zhang Nov 8, 2024
d054437
mapping: complete Namespace uri
Yueqiao12Zhang Nov 8, 2024
a0b7924
refactor: ignore empty values optimization
Yueqiao12Zhang Nov 8, 2024
2dca21b
refactor: optimize namespace binding
Yueqiao12Zhang Nov 8, 2024
4c42eda
test: first stage test file for MB
Yueqiao12Zhang Nov 8, 2024
a58f315
test: example mapping.json
Yueqiao12Zhang Nov 8, 2024
33dc1c8
feat: temporary script for filling the mapping with arbitrary values
Yueqiao12Zhang Nov 8, 2024
bbc792c
doc: daily log
Yueqiao12Zhang Nov 8, 2024
a643a66
refactor: restructure the folder
Yueqiao12Zhang Nov 8, 2024
09cd09f
refactor: customize for testing
Yueqiao12Zhang Nov 15, 2024
40caacf
test: one record for testing reconciliation
Yueqiao12Zhang Nov 15, 2024
ce7e631
refactor: update ID URI
Yueqiao12Zhang Nov 22, 2024
cdae2c0
doc: update log.md for 11/15
Yueqiao12Zhang Nov 22, 2024
91393f9
feat: merge.py for reconciliation
Yueqiao12Zhang Nov 22, 2024
15d2fd5
refactor: fix blank node bug
Yueqiao12Zhang Nov 22, 2024
6ca9fc7
Update log.md
Yueqiao12Zhang Nov 22, 2024
dd37724
feat: relocating files and outputting correct rdf
Yueqiao12Zhang Dec 13, 2024
1da723f
test: testing for merge rdf and reconciled csv
Yueqiao12Zhang Dec 13, 2024
f432b27
feat: correctly merges the reconciled CSV into the raw RDF
Yueqiao12Zhang Jan 10, 2025
4a41944
Update log.md
Yueqiao12Zhang Jan 10, 2025
cc307d6
feat: recognize URI
Yueqiao12Zhang Jan 13, 2025
789a265
test: expanded test data set
Yueqiao12Zhang Jan 13, 2025
fdf5439
test: new complete output on 10 records
Yueqiao12Zhang Jan 13, 2025
d3f7cff
Update log.md
Yueqiao12Zhang Jan 13, 2025
5e205de
refactor: renew input name
Yueqiao12Zhang Jan 13, 2025
d290afb
Update log.md
Yueqiao12Zhang Jan 17, 2025
174 changes: 174 additions & 0 deletions json_2rdf/convert.py
@@ -0,0 +1,174 @@
"""
This module reads a JSON Lines file and converts each record into an RDF graph in
Turtle (.ttl) format using the rdflib library. It processes complex JSON structures
with nested dictionaries and lists, creating corresponding RDF triples.

The JSON structure should include:
- A unique ID field, which serves as the main subject URI.
- Key-value pairs where keys are predicates and values are objects.
- Nested dictionaries or lists as values, which are recursively processed into
RDF blank nodes or additional subjects.

Usage:
- Provide a JSON Lines input file (one JSON object per line); the script currently
  reads "./musicbrainz/recording_test_2".
- Run this script to generate an "output.ttl" file with RDF triples.

Dependencies:
- rdflib: Required to create and manipulate the RDF graph.

Example JSON input:
{
"id": "http://example/id.org",
"pred1": "example literal obj 1",
"pred2": {
"pred2_1": "example literal obj 2_1",
"pred2_2": "http://example/obj2.org"
},
"pred3": [
{"nested_pred1": "nested_value1"},
{"nested_pred2": "http://example/nested_value2.org"}
]
}
"""

import json
from datetime import datetime  # used only by the commented-out datatype handling below
from rdflib import Graph, URIRef, Literal, Namespace, BNode
from rdflib.namespace import XSD, GEO  # likewise referenced only by the commented-out handling


# Initialize the RDF graph
g = Graph()

# Define a namespace
with open("namespace_mapping.json", "r", encoding="utf-8") as ns_mp:
NS = json.load(ns_mp)

with open("./musicbrainz/pred_mapping.json", "r", encoding="utf-8") as pd_mp:
PD = json.load(pd_mp)
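
# Expected shapes (illustrative examples, not the shipped mapping files):
# namespace_mapping.json maps a prefix to a namespace URI, e.g.
#     {"mb": "https://musicbrainz.org/"}
# and pred_mapping.json maps each JSON key to a one-entry dict of
# {prefix: local_name}, e.g.
#     {"title": {"mb": "title"}}
# add_triples() below resolves these two lookups into a full predicate URI.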

NUM_COLUMN = []
LOC_COLUMN = []
DATE_COLUMN = []
IGNORE_COLUMN = [
"annotation",
"ended",
"video",
"isrcs",
"aliases",
# "tags",
"rating",
]


# Function to add triples to the graph
def add_triples(subject, predicates):
"""
Recursively adds RDF triples to the graph from a dictionary of predicates
and objects. Handles nested dictionaries as blank nodes and lists as sequences
of blank nodes, allowing for complex JSON structures to be represented in RDF.

Parameters:
subject (URIRef or BNode): The RDF subject node to which triples are added.
predicates (dict): A dictionary where each key is a predicate and each value
can be a literal, URI, dictionary, or list:
- Literal values are added directly as RDF literals.
- URI strings (starting with 'http') are converted to URIRefs.
- Nested dictionaries are treated as blank nodes and recursively processed.
- Lists create a blank node for each item; nested dictionaries in lists are
recursively processed as additional RDF triples.

Returns:
None. Modifies the global RDF graph by adding triples based on the predicates dictionary.
"""

for predicate, obj in predicates.items():

if predicate in IGNORE_COLUMN:
continue

if obj == "" or obj is None:
continue

# Define the predicate URI
namespace_string = NS[list(PD[predicate].keys())[0]]
namespace = Namespace(namespace_string)
g.bind(list(PD[predicate].keys())[0], namespace=namespace)
value = list(PD[predicate].values())[0]
pred_uri = URIRef(namespace[value])

if isinstance(obj, str) and obj.startswith("http"):
# If the object is a URI (starts with http), treat it as URIRef
obj = URIRef(obj)
elif isinstance(obj, dict):
# For nested dictionaries, create a new blank node or subject
nested_subject = BNode() # Use a blank node for anonymous nodes
g.add((subject, pred_uri, nested_subject))
add_triples(nested_subject, obj) # Recursively add nested triples
continue
elif isinstance(obj, list):
# Handle lists by creating a blank node for each item in the list
for item in obj:
list_node = BNode()
g.add((subject, pred_uri, list_node))
if isinstance(item, dict):
add_triples(list_node, item)
else:
# If the item is not a dictionary, treat it as a literal or URI
g.add(
(
list_node,
pred_uri,
(
Literal(item)
if not str(item).startswith("http")
else URIRef(item)
),
)
)
continue
else:
            # Otherwise, treat it as a literal
# if obj == "True" or obj == "False":
# obj = Literal(obj, datatype=XSD.boolean)
# elif pred_uri in NUM_COLUMN:
# obj = Literal(obj, datatype=XSD.integer)
# elif pred_uri in LOC_COLUMN:
# obj = Literal(obj.upper(), datatype=GEO.wktLiteral)
# elif pred_uri in DATE_COLUMN:
# datetime_obj = datetime.strptime(obj, "%Y-%m-%d %H:%M:%S")

# day_of_week = datetime_obj.strftime("%A")
# day_of_week_obj = Literal(day_of_week)
# g.add(
# (
# subject,
# URIRef("http://www.wikidata.org/prop/direct/P2894"),
# day_of_week_obj,
# )
# )

# day_str = datetime_obj.strftime("%Y-%m-%dT%H:%M:%S")
# obj = Literal(day_str, datatype=XSD.dateTime)
# else:
obj = Literal(obj)

# Add the triple
g.add((subject, pred_uri, obj))


with open("./musicbrainz/recording_test_2", "r", encoding="utf-8") as f:
for line in f:
data = json.loads(line) # Parse each JSON object in the file

# Define the main subject based on the "id" field
        main_subject = URIRef(f"https://musicbrainz.org/recording/{data['id']}")

# Add triples for each JSON object, excluding the "id" field
add_triples(main_subject, {k: v for k, v in data.items() if k != "id"})


# Serialize the graph to a Turtle (.ttl) file
g.serialize("output.ttl", format="turtle")

print("RDF data successfully saved in Turtle format.")
123 changes: 123 additions & 0 deletions json_2rdf/extract_pred.py
@@ -0,0 +1,123 @@
"""
This module reads a JSON Lines (JSONL) file, extracts unique predicates from the data,
and appends them to an existing `pred_mapping.json` file without modifying any
pre-existing mappings. The `pred_mapping.json` file is structured as a dictionary where
each unique predicate serves as a key with a placeholder {"": ""} entry as its value,
unless a value already exists.

This script is designed to incrementally build and maintain a mapping of predicates
encountered across multiple JSON objects in a JSON Lines file, allowing for dynamic
data schema updates without overwriting previous mappings.

Usage:
- Provide a JSON Lines input file; the script currently reads a file named "recording".
- Run the script. If `pred_mapping.json` exists, new predicates will be appended to it;
  if not, a new `pred_mapping.json` file will be created with all unique predicates
  from the input.

Dependencies:
- json: Required for reading, parsing, and writing JSON data.
- os: Used to check for the existence of `pred_mapping.json`.

Functions:
- extract_predicates(data, predicates): Recursively extracts unique predicates
from a dictionary. If a predicate is nested within a dictionary or list, it
is added to the set of predicates.

File I/O:
- `recording`: The JSON Lines input file, where each line is a JSON object.
- `pred_mapping.json`: The output mapping file, which stores unique predicates as keys
  with their associated values. New predicates are added with a placeholder {"": ""}
  value unless already present.

Example:
    If `pred_mapping.json` contains:
    {
        "pred1": {"prefix": "local_name"}
    }
    and the input file has a new predicate "pred2", then after running the script,
    `pred_mapping.json` will contain:
    {
        "pred1": {"prefix": "local_name"},
        "pred2": {"": ""}
    }
"""

import json
import os


# Function to recursively extract predicates
def extract_predicates(data, predicates):
"""
Recursively extracts unique predicate names from a JSON object and adds them to a set.

This function traverses the JSON structure to find all keys (predicates) in
the provided `data` dictionary. It handles nested dictionaries and lists by
recursively searching through each level, ensuring that all unique predicates
are added to the provided set. The "id" field is ignored, as it is assumed
to represent the main subject rather than a predicate.

Parameters:
data (dict): A JSON object represented as a dictionary, which may contain
nested dictionaries or lists.
predicates (set): A set to store unique predicate names (keys) found in the
JSON object.

Returns:
None. The function modifies the `predicates` set in place, adding any new
unique predicate names it encounters.

Example:
If `data` is:
{
"id": "http://example.org/id",
"pred1": "value1",
"pred2": {
"pred2_1": "value2"
},
"pred3": [
{"pred3_1": "value3"},
"simple_value"
]
}

Then after calling `extract_predicates(data, predicates)`, the `predicates` set
will contain:
{"pred1", "pred2", "pred2_1", "pred3", "pred3_1"}
"""
    for key, value in data.items():
        if key != "id":  # Ignore the "id" field, as it represents the main subject
            predicates.add(key)
            if isinstance(value, dict):
                extract_predicates(value, predicates)  # Recurse for nested dictionaries
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        # Recurse for dictionaries within lists
                        extract_predicates(item, predicates)


# Load the JSON Lines (JSONL) file and extract unique predicates
predicates = set()
with open("recording", "r", encoding="utf-8") as f:
for line in f:
data = json.loads(line)
extract_predicates(data, predicates)

# Load existing mapping.json if it exists
if os.path.exists("pred_mapping.json"):
with open("pred_mapping.json", "r", encoding="utf-8") as infile:
predicate_mapping = json.load(infile)
else:
predicate_mapping = {}

# Add new predicates to the mapping without modifying old entries
for predicate in predicates:
if predicate not in predicate_mapping:
predicate_mapping[predicate] = {"": ""}

# Write the updated mapping back to mapping.json
with open("pred_mapping.json", "w", encoding="utf-8") as outfile:
json.dump(predicate_mapping, outfile, indent=4)

print("New predicates appended to mapping.json without modifying old mappings.")
75 changes: 75 additions & 0 deletions json_2rdf/log.md
@@ -0,0 +1,75 @@
# 11-8-2024

**RISM**:
- No progress. Andrew responded under the issue, indicating that my data is incorrect. I have paused experimentation with this data, so progress remains the same as last week.

**MusicBrainz & Other Potential Databases**:
- Applied a new approach using JSON logic
- Merged with old CSV2RDF logic for parsing JSON file

#### Advantages of Using JSON Logic:
1. **Data Structure Preservation**: RDF closely aligns with JSON's structure, preserving complex, nested layouts that CSV struggles to represent.
2. **Simplified Reconciliation**: CSV files introduced excessive, nested columns due to the JSON structure, complicating reconciliation efforts. With RDF, we avoid this, making reconciliation more straightforward.
3. **Data Integrity**: Unlike CSV, where data might be truncated or result in numerous blank cells, RDF maintains full data integrity.
4. **Direct RDF Import for Reconciliation**: RDF files can be directly imported into OpenRefine for reconciliation, allowing us to skip the additional CSV conversion step.
5. **Old Functions Preserved**: We can reuse the same functions from the old CSV2RDF pipeline, such as language tagging and datatype detection.

#### Disadvantage:
1. **Query Complexity**: The conversion relies heavily on blank nodes, which can make querying the data more challenging (see the sketch below).
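
For illustration, a minimal rdflib sketch of the extra hop that a nested value requires; the `ex:` predicate URIs are hypothetical, the real ones come from `pred_mapping.json`:

```python
from rdflib import Graph

g = Graph()
g.parse("output.ttl", format="turtle")

# Hypothetical predicate URIs for illustration only.
q = """
PREFIX ex: <http://example.org/>
SELECT ?recording ?name WHERE {
    ?recording ex:artist_credit ?node .   # hop 1: reach the blank node
    ?node ex:name ?name .                 # hop 2: read the nested value
}
"""
for row in g.query(q):
    print(row.recording, row.name)
```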

# 11-15-2024

**RISM**:
- No progress was made this week, as the challenges from the previous week remain unresolved.

**RDF Reconciliation**:
- Adjustments to the process included:
- Removing type recognition.
- Adding a "tags" column for testing purposes.

#### Challenges Encountered:
- **RDF Reconciliation**:
- **Issue**: OpenRefine allows RDF input but does not support RDF output, making it challenging to preserve the reconciliation data.
- **Potential Solution**: Exploring alternative methods to merge OpenRefine data with the original RDF could address this issue.
- **Possible Advantage**: Retaining both the old reconciliation data and the original RDF file may streamline database updates in the long term.

# 11-22-2024

**RISM**:
- No progress was made this week due to unresolved challenges from the previous week.

**RDF Reconciliation**:
- **Issue**:
- Encountered a problem during the process of merging reconciled CSV data into RDF. Some blank nodes were not correctly linked, resulting in inaccurate RDF output.
- Efforts are ongoing to debug and resolve the issue.
- If an object is a list of blank nodes, only the first item refers to the correct parent.

# 01-10-2025

**RDF CSV Merging New Features**:
**Steps**:
1. Retrieve all raw JSON from MusicBrainz.
2. Convert the JSON into RDF; this preserves the full schema and structure. Also extract the predicates so they can be reconciled manually.
> Note: Junjun said that due to the structure of JSON, there might be many redundant blank nodes in the converted RDF.
3. Upload these RDFs onto OpenRefine and reconcile following the old processes.
4. Run merge.py to merge the output CSV from OpenRefine into the raw RDF. This step modifies the raw RDF, replacing all reconcilable literal objects with URIs (a minimal sketch follows below).
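
A minimal sketch of step 4, assuming the reconciled CSV has `subject`, `predicate`, `literal`, and `uri` columns (a hypothetical layout; merge.py's actual columns may differ):

```python
import csv
from rdflib import Graph, URIRef, Literal

g = Graph()
g.parse("raw.ttl", format="turtle")

# Assumed CSV layout: subject, predicate, original literal, reconciled URI.
with open("reconciled.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        old = (URIRef(row["subject"]), URIRef(row["predicate"]), Literal(row["literal"]))
        if old in g:
            # Swap the reconciled literal for its URI, keeping subject and predicate.
            g.remove(old)
            g.add((old[0], old[1], URIRef(row["uri"])))
        # Note: triples whose subject is a blank node cannot be addressed by a
        # string ID across parses; see the blank-node difficulty below.

g.serialize("merged.ttl", format="turtle")
```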

**Difficulties Encountered**:
- The number of distinct predicates is effectively unbounded, which makes reconciliation extremely difficult. We can group similar predicates under a single URI to categorize them.
- Merging blank nodes is difficult because a blank node's internal identifier changes on every parse of the RDF, so the reconciled CSV must be traced while iterating over the raw RDF.
- A stack data structure can be used to traverse the RDF and trace blank nodes by their predicate paths, as sketched below.
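
A sketch of that idea (cycle handling omitted), assuming each blank node is keyed by the predicate path leading to it from a named subject, which is stable across parses, unlike the internal blank-node IDs:

```python
from rdflib import Graph, BNode

g = Graph()
g.parse("raw.ttl", format="turtle")

# Depth-first traversal with an explicit stack; each entry carries the
# predicate path from a named root down to the current node.
for root in {s for s in g.subjects() if not isinstance(s, BNode)}:
    stack = [(root, ())]
    while stack:
        node, path = stack.pop()
        for p, o in g.predicate_objects(node):
            if isinstance(o, BNode):
                stack.append((o, path + (p,)))  # descend through the blank node
            else:
                # (root, path + (p,)) identifies this leaf stably for CSV matching.
                print(root, [str(x) for x in path + (p,)], o)
```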

# 01-13-2025

- Finished the short JSON-to-RDF example.
- Discussed with Junjun; we concluded it might not perform better than our old method and decided to abandon it.
- Gained an initial understanding of RISM's data structure.

**Reconciliation Discussion For MusicBrainz**:
- **Countries and Citizenship**: Should countries and citizenships appearing in tags and genres for artists or recordings be interpreted as the artist's language, culture, or citizenship?
- **Genres like "Death" and "Hate"**: Should these be interpreted in their original context or as special literature genres?
- **Entity Selection**:
- For "Person": Use Q5 or Q215627?
- For "Artist" as a musician: Use Q639669?
- For "Work": Use Q386724 or Q268378?
