From 7177723306ffe353433052824571abd3147816f7 Mon Sep 17 00:00:00 2001 From: Spyros Date: Sun, 10 Apr 2022 09:48:04 +0100 Subject: [PATCH 01/13] new command --- cre.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/cre.py b/cre.py index 31794f9b1..6fac238e6 100644 --- a/cre.py +++ b/cre.py @@ -142,7 +142,21 @@ def main() -> None: default=None, help="export all data into yaml files under the directory pointed to by this argument", ) - + parser.add_argument( + "--compare_datasets", + action="store_true", + help="compare the CRE datasets pointed to by --dataset1 and --dataset2", + ) + parser.add_argument( + "--dataset1", + default=None, + help="used with --compare_datasets, dataset1", + ) + parser.add_argument( + "--dataset2", + default=None, + help="used with --compare_datasets, dataset2", + ) args = parser.parse_args() from application.cmd import cre_main From a80ccd0866ac88d0e79e3f909ad88aa3ed3fb66a Mon Sep 17 00:00:00 2001 From: Spyros Date: Sun, 10 Apr 2022 09:48:35 +0100 Subject: [PATCH 02/13] skeleton for new command --- application/cmd/cre_main.py | 48 +++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/application/cmd/cre_main.py b/application/cmd/cre_main.py index 462dc712d..b8805bfb3 100644 --- a/application/cmd/cre_main.py +++ b/application/cmd/cre_main.py @@ -375,6 +375,8 @@ def run(args: argparse.Namespace) -> None: # pragma: no cover if args.owasp_proj_meta: owasp_metadata_to_cre(args.owasp_proj_meta) + if args.compare_datasets: + compare_datasets(args.dataset1,args.dataset2) def db_connect(path: str) -> db.Node_collection: @@ -457,6 +459,52 @@ def export_to_osib(file_loc: str, cache: str) -> None: with open(file_loc, "w") as f: f.write(json.dumps(tree.todict())) +def compare_datasets(db1:str,db2:str)->Dict: + """ + Given two CRE datasets in databases with connection strings db1 and db2 + Print their differences. 
+ + (make db load descriptions etc in memory) + ensure that both graphs have same number of nodes and edges + + for every cre node in g1 + find node in g2 with same external_id + compare metadata dicts + get g1 edges and compare to g2 edges <-- this will be interesting, need to compare which "unique" info it leads to and whatS the edge typs as i can't trust ids + + do the same for g2 + """ + # def graph_nodes_equal(g1,node1,g2,node2): + # if node1.startswith("CRE"): + # if g1.nodes[node1]["external_id"] != g2.nodes[node2]["external_id"]: + # return False + # elif node1.startswith("Node"): + # if g1.nodes[node1]["name"] != g2.nodes[node2]["name"] or\ + # g1.nodes[node1]["section"] != g2.nodes[node2]["section"] or \ + # g1.nodes[node1]["subsection"] != g2.nodes[node2]["subsection"] or \ + # g1.nodes[node1]["description"] != g2.nodes[node2]["description"] or \ + # g1.nodes[node1]["version"] != g2.nodes[node2]["version"] or\ + # g1.nodes[node1]["infosum"] != g2.nodes[node2]["infosum"]: + # return False + # # TODO: i think i need to change the way i tag nodes, + # # currently it's DB ids which are mutable + # # it needs to be infosums of nodes so that I can compare nodes and edges simply + # # but then i need to update the node and all it's edges when I import CREs or nodes and the infosum changes + + # [ (edge) for edge in g1.edges(node1) + # if g1.get_edge_data(*edge)["infosum"] == g2.get_edge_data + + # print('connecting db1') + # database1 = db_connect(path=db1) + # print('connecting db2') + # database2 = db_connect(path=db2) + # import networkx as nx + # from pprint import pprint + # pprint([node for node in database1.graph.graph ]) + # pprint([node for node in database2.graph.graph ]) + # print("calculating equality") + # pprint(graphs_equal(database1.graph.graph,database2.graph.graph)) + input() def owasp_metadata_to_cre(meta_file: str): """given a file with entries like below From c03fd03786b364c0efea00130dfa1f11b277091f Mon Sep 17 00:00:00 2001 From: Spyros Date: Sun, 10 Apr 2022 12:19:45 +0100 Subject: [PATCH 03/13] progress --- application/cmd/cre_main.py | 119 +++++++++++++++++++---------- application/database/db.py | 27 ++++++- application/tests/cre_main_test.py | 37 ++++++++- 3 files changed, 140 insertions(+), 43 deletions(-) diff --git a/application/cmd/cre_main.py b/application/cmd/cre_main.py index b8805bfb3..d0694ae0e 100644 --- a/application/cmd/cre_main.py +++ b/application/cmd/cre_main.py @@ -376,7 +376,8 @@ def run(args: argparse.Namespace) -> None: # pragma: no cover owasp_metadata_to_cre(args.owasp_proj_meta) if args.compare_datasets: - compare_datasets(args.dataset1,args.dataset2) + compare_datasets(args.dataset1, args.dataset2) + def db_connect(path: str) -> db.Node_collection: @@ -459,52 +460,88 @@ def export_to_osib(file_loc: str, cache: str) -> None: with open(file_loc, "w") as f: f.write(json.dumps(tree.todict())) -def compare_datasets(db1:str,db2:str)->Dict: + +def compare_datasets(db1: str, db2: str) -> List[Dict]: """ Given two CRE datasets in databases with connection strings db1 and db2 - Print their differences. + Print their differefnces. 
(make db load descriptions etc in memory) ensure that both graphs have same number of nodes and edges - - for every cre node in g1 - find node in g2 with same external_id - compare metadata dicts - get g1 edges and compare to g2 edges <-- this will be interesting, need to compare which "unique" info it leads to and whatS the edge typs as i can't trust ids - - do the same for g2 """ - # def graph_nodes_equal(g1,node1,g2,node2): - # if node1.startswith("CRE"): - # if g1.nodes[node1]["external_id"] != g2.nodes[node2]["external_id"]: - # return False - # elif node1.startswith("Node"): - # if g1.nodes[node1]["name"] != g2.nodes[node2]["name"] or\ - # g1.nodes[node1]["section"] != g2.nodes[node2]["section"] or \ - # g1.nodes[node1]["subsection"] != g2.nodes[node2]["subsection"] or \ - # g1.nodes[node1]["description"] != g2.nodes[node2]["description"] or \ - # g1.nodes[node1]["version"] != g2.nodes[node2]["version"] or\ - # g1.nodes[node1]["infosum"] != g2.nodes[node2]["infosum"]: - # return False - # # TODO: i think i need to change the way i tag nodes, - # # currently it's DB ids which are mutable - # # it needs to be infosums of nodes so that I can compare nodes and edges simply - # # but then i need to update the node and all it's edges when I import CREs or nodes and the infosum changes - - # [ (edge) for edge in g1.edges(node1) - # if g1.get_edge_data(*edge)["infosum"] == g2.get_edge_data - - # print('connecting db1') - # database1 = db_connect(path=db1) - # print('connecting db2') - # database2 = db_connect(path=db2) - # import networkx as nx - # from pprint import pprint - # pprint([node for node in database1.graph.graph ]) - # pprint([node for node in database2.graph.graph ]) - # print("calculating equality") - # pprint(graphs_equal(database1.graph.graph,database2.graph.graph)) - input() + + database1 = db_connect(path=db1) + database2 = db_connect(path=db2) + + def make_hashtable(graph): + nodes = {} + edges = {} + for node in graph.nodes(): + if node.startswith("CRE"): + nodes[graph.nodes[node]["external_id"]] = node + elif node.startswith("Node"): + nodes[graph.nodes[node]["infosum"]] = node + else: + logger.fatal("Graph seems corrupted") + + for edge in graph.edges(): + key = graph.nodes[edge[0]]["external_id"] + if edge[1].startswith("CRE"): + key = key + "-" + graph.nodes[edge[1]]["external_id"] + else: + key = key + "-" + graph.nodes[edge[1]]["infosum"] + edges[key] = edge + return nodes, edges + + def node_differences(nodes1, nodes2, db2): + # get n1 nodes not in n2 and n1 nodes with different attrs than n2 + differences = {} + for node, attrs in nodes1.items(): + if node not in nodes2: + logger.error(f"{node} not present in {db2}") + differences["not_present"] = (node, db2) + elif nodes2[node] != attrs: + logger.error( + f"Dataset 2 {db2} node:{node} has different data from dataset 1 equivalent, data1 is {attrs} data 2 is {nodes2[node]} " + ) + differences["different data"] = { + "node": node, + "attributes1": attrs, + "attributes2": nodes2[node], + } + return differences + + def edge_differences(edges1, edges2, db2): + # get n1 nodes not in n2 and n1 nodes with different attrs than n2 + differences = {} + for edge, attrs in edges1.items(): + if edge not in edges2: + logger.error(f"{edge} not present in {db2}") + differences["not_present"] = (edge, db2) + else: + if edges2[edge] != attrs: + logger.error( + f"Dataset 2{db2} edge:{edge} has different data from dataset 1 equivalent, data1 is {attrs} data 2 is {edges2[edge]}" + ) + differences["different data"] = { + "edge": edge, + 
"attributes1": attrs, + "attributes2": edges2[edge], + } + return differences + + g1 = database1.graph.graph + g2 = database2.graph.graph + n1, e1 = make_hashtable(g1) + n2, e2 = make_hashtable(g2) + + d1 = node_differences(n1, n2, db2) + d2 = node_differences(n2, n1, db1) + + ed1 = edge_differences(e1, e2, db2) + ed2 = edge_differences(e2, e1, db1) + return [d1, d2, ed1, ed2] + def owasp_metadata_to_cre(meta_file: str): """given a file with entries like below diff --git a/application/database/db.py b/application/database/db.py index 7915ce694..0fb93cc7e 100644 --- a/application/database/db.py +++ b/application/database/db.py @@ -1,4 +1,5 @@ import logging +import hashlib import re from collections import Counter from itertools import permutations @@ -28,6 +29,18 @@ def generate_uuid(): class Node(BaseModel): # type: ignore + def serialise(self): + return "".join( + [ + self.name, + self.section or "", + self.subsection or "", + self.tags or "", + self.ntype, + self.description or "", + self.version or "", + ] + ).encode() __tablename__ = "node" id = sqla.Column(sqla.String, primary_key=True, default=generate_uuid) @@ -58,6 +71,10 @@ class Node(BaseModel): # type: ignore class CRE(BaseModel): # type: ignore + def serialise(self): + return "".join( + [self.name, self.external_id or "", self.description or "", self.tags or ""] + ).encode() __tablename__ = "cre" id = sqla.Column(sqla.String, primary_key=True, default=generate_uuid) @@ -152,11 +169,19 @@ def add_cre(cls, dbcre: CRE, graph: nx.DiGraph) -> nx.DiGraph: @classmethod def add_dbnode(cls, dbnode: Node, graph: nx.DiGraph) -> nx.DiGraph: if dbnode: + sum = hashlib.sha256( + dbnode.serialise() + ) # using md5 would have been way more performant but then I'd have to triage every beg-hunter's SAST scanner results graph.add_node( - "Node: " + str(dbnode.id), + f"Node: {dbnode.id}", internal_id=dbnode.id, name=dbnode.name, section=dbnode.section, + subsection=dbnode.subsection, + type=dbnode.ntype, + description=dbnode.description, + version=dbnode.version, + infosum=sum.hexdigest(), ) else: logger.error("Called with dbnode being none") diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py index f0447c7ac..3eeaf8204 100644 --- a/application/tests/cre_main_test.py +++ b/application/tests/cre_main_test.py @@ -4,7 +4,7 @@ import tempfile import unittest from pprint import pprint -from typing import Any, Dict, List +from typing import Any, Dict, List, NamedTuple from unittest import mock from unittest.mock import Mock, patch @@ -671,6 +671,41 @@ def test_export_to_osib( mocked_db_connect.assert_called_with(path=cache) mocked_cre2osib.assert_called_with([defs.CRE(name="c0")]) + @patch("application.cmd.cre_main.db_connect") + def test_compare_datasets(self, mock_connect): + import networkx as nx + + g1 = nx.DiGraph() + + c0 = db.CRE(external_id="111-000", description="CREdesc", name="CREname") + c1 = db.CRE(external_id="111-001", description="Groupdesc", name="GroupName") + s456 = db.Node( + ntype="Standard", + subsection="4.5.6", + section="FooStand", + name="BarStand", + link="https://example.com", + tags="a,b,c", + ) + s_unlinked = db.Node( + ntype="Standard", + subsection="4.5.6", + section="Unlinked", + name="Unlinked", + link="https://example.com", + ) + g1 = db.CRE_Graph.add_cre(c0, g1) + g1 = db.CRE_Graph.add_cre(c1, g1) + g1 = db.CRE_Graph.add_node(s456, g1) + g1 = db.CRE_Graph.add_node(s_unlinked, g1) + + CREGraph = NamedTuple("CREGraph", ("graph")) + Graph = NamedTuple("Graph", ("graph")) + graph = 
Graph(graph=CREGraph(graph=g1)) + mock_connect.return_value = graph + + self.assertEqual(main.compare_datasets("foo", "bar"), []) + # def test_prepare_for_Review(self): # raise NotImplementedError From c48f9329c056e7d26c6d5b78f863b997225003e9 Mon Sep 17 00:00:00 2001 From: Spyros Date: Sun, 10 Apr 2022 22:17:19 +0100 Subject: [PATCH 04/13] temporarily rm test --- application/tests/cre_main_test.py | 35 ------------------------------ 1 file changed, 35 deletions(-) diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py index 3eeaf8204..69bce5b20 100644 --- a/application/tests/cre_main_test.py +++ b/application/tests/cre_main_test.py @@ -671,41 +671,6 @@ def test_export_to_osib( mocked_db_connect.assert_called_with(path=cache) mocked_cre2osib.assert_called_with([defs.CRE(name="c0")]) - @patch("application.cmd.cre_main.db_connect") - def test_compare_datasets(self, mock_connect): - import networkx as nx - - g1 = nx.DiGraph() - - c0 = db.CRE(external_id="111-000", description="CREdesc", name="CREname") - c1 = db.CRE(external_id="111-001", description="Groupdesc", name="GroupName") - s456 = db.Node( - ntype="Standard", - subsection="4.5.6", - section="FooStand", - name="BarStand", - link="https://example.com", - tags="a,b,c", - ) - s_unlinked = db.Node( - ntype="Standard", - subsection="4.5.6", - section="Unlinked", - name="Unlinked", - link="https://example.com", - ) - g1 = db.CRE_Graph.add_cre(c0, g1) - g1 = db.CRE_Graph.add_cre(c1, g1) - g1 = db.CRE_Graph.add_node(s456, g1) - g1 = db.CRE_Graph.add_node(s_unlinked, g1) - - CREGraph = NamedTuple("CREGraph", ("graph")) - Graph = NamedTuple("Graph", ("graph")) - graph = Graph(graph=CREGraph(graph=g1)) - mock_connect.return_value = graph - - self.assertEqual(main.compare_datasets("foo", "bar"), []) - # def test_prepare_for_Review(self): # raise NotImplementedError From 3a17e958eb20b054f800a2710fb249e53176a2b1 Mon Sep 17 00:00:00 2001 From: Spyros Date: Mon, 11 Apr 2022 15:07:37 +0100 Subject: [PATCH 05/13] get diff to return one on diffs --- application/cmd/cre_main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/application/cmd/cre_main.py b/application/cmd/cre_main.py index d0694ae0e..8f44d4740 100644 --- a/application/cmd/cre_main.py +++ b/application/cmd/cre_main.py @@ -467,7 +467,7 @@ def compare_datasets(db1: str, db2: str) -> List[Dict]: Print their differefnces. 
(make db load descriptions etc in memory) - ensure that both graphs have same number of nodes and edges + ensure that both graphs have same number of nodes and edges and both graphs have the same data """ database1 = db_connect(path=db1) @@ -540,7 +540,9 @@ def edge_differences(edges1, edges2, db2): ed1 = edge_differences(e1, e2, db2) ed2 = edge_differences(e2, e1, db1) - return [d1, d2, ed1, ed2] + if len(d1) or len(d2) or len(ed1) or len(ed2): + exit(1) + # return [d1, d2, ed1, ed2] # TODO uncomment when this becomes a library method def owasp_metadata_to_cre(meta_file: str): From 9623ffc7a33e6ca3c085dadd4380972002ace8cb Mon Sep 17 00:00:00 2001 From: Spyros Date: Wed, 13 Apr 2022 02:38:39 +0100 Subject: [PATCH 06/13] poc script to compare latest imports to heroku --- scripts/data-equivalency.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100755 scripts/data-equivalency.sh diff --git a/scripts/data-equivalency.sh b/scripts/data-equivalency.sh new file mode 100755 index 000000000..1d44c8bea --- /dev/null +++ b/scripts/data-equivalency.sh @@ -0,0 +1,21 @@ +#!/bin/bash +curr_dir=$(pwd) + +rm -rf import.dump latest.backup latest.dump latest.dump.1 + +docker run -d -e POSTGRES_HOST_AUTH_METHOD=trust --rm --network host postgres:13.6 +sleep 10 + +export PROD_DATABASE_URL=postgres://postgres@0.0.0.0:5432 +make migrate-upgrade +make import-all + +rm -rf /tmp/diff_data +mkdir -p /tmp/diff_data +cd /tmp/diff_data + +heroku login && heroku pg:backups:download -a opencreorg + +source $curr_dir/venv/bin/activate +python $curr_dir/cre.py --compare_datasets --dataset1=$PROD_DATABASE_URL --dataset2=sqlite://cres/db.sqlite +exit $? \ No newline at end of file From 9a1d617ee0d2c7109cc8072b529661fa720bd131 Mon Sep 17 00:00:00 2001 From: Spyros Date: Tue, 26 Apr 2022 18:17:06 +0100 Subject: [PATCH 07/13] progress --- application/cmd/cre_main.py | 69 ++++++++++++++----------- application/database/db.py | 17 +++++-- application/tests/cre_main_test.py | 82 +++++++++++++++++++++++++++--- 3 files changed, 127 insertions(+), 41 deletions(-) diff --git a/application/cmd/cre_main.py b/application/cmd/cre_main.py index 8f44d4740..1f878ec06 100644 --- a/application/cmd/cre_main.py +++ b/application/cmd/cre_main.py @@ -226,7 +226,7 @@ def add_from_spreadsheet(spreadsheet_url: str, cache_loc: str, cre_loc: str) -> import new mappings from export db to ../../cres/ """ - database = db_connect(path=cache_loc) + database, _, _ = db_connect(path=cache_loc) spreadsheet = sheet_utils.readSpreadsheet( url=spreadsheet_url, cres_loc=cre_loc, alias="new spreadsheet", validate=False ) @@ -246,7 +246,7 @@ def add_from_disk(cache_loc: str, cre_loc: str) -> None: import new mappings from export db to ../../cres/ """ - database = db_connect(path=cache_loc) + database, _, _ = db_connect(path=cache_loc) for file in get_cre_files_from_disk(cre_loc): with open(file, "rb") as standard: parse_file( @@ -265,7 +265,7 @@ def review_from_spreadsheet(cache: str, spreadsheet_url: str, share_with: str) - create new spreadsheet of the new CRE landscape for review """ loc, cache = prepare_for_review(cache) - database = db_connect(path=cache) + database, _, _ = db_connect(path=cache) spreadsheet = sheet_utils.readSpreadsheet( url=spreadsheet_url, cres_loc=loc, alias="new spreadsheet", validate=False ) @@ -294,7 +294,7 @@ def review_from_disk(cache: str, cre_file_loc: str, share_with: str) -> None: create new spreadsheet of the new CRE landscape for review """ loc, cache = prepare_for_review(cache) - database = 
db_connect(path=cache) + database, _, _ = db_connect(path=cache) for file in get_cre_files_from_disk(cre_file_loc): with open(file, "rb") as standard: parse_file( @@ -359,27 +359,25 @@ def run(args: argparse.Namespace) -> None: # pragma: no cover elif args.osib_out: export_to_osib(file_loc=args.osib_out, cache=args.cache_file) if args.zap_in: - zap_alerts_parser.parse_zap_alerts(db_connect(args.cache_file)) + cache, _, _ = db_connect(args.cache_file) + zap_alerts_parser.parse_zap_alerts(cache) if args.cheatsheets_in: - cheatsheets_parser.parse_cheatsheets(db_connect(args.cache_file)) + cache, _, _ = db_connect(args.cache_file) + cheatsheets_parser.parse_cheatsheets(cache) if args.github_tools_in: for url in misc_tools_parser.tool_urls: - misc_tools_parser.parse_tool( - cache=db_connect(args.cache_file), tool_repo=url - ) - if args.capec_in: - capec_parser.parse_capec(cache=db_connect(args.cache_file)) - if args.export: - cache = db_connect(args.cache_file) - cache.export(args.export) + cache, _, _ = db_connect(args.cache_file) + misc_tools_parser.parse_tool(cache=cache, tool_repo=url) if args.owasp_proj_meta: owasp_metadata_to_cre(args.owasp_proj_meta) if args.compare_datasets: - compare_datasets(args.dataset1, args.dataset2) + d1, d2, ed1, ed2 = compare_datasets(args.dataset1, args.dataset2) + if len(d1) or len(d2) or len(ed1) or len(ed2): + exit(1) -def db_connect(path: str) -> db.Node_collection: +def db_connect(path: str) -> Tuple[db.Node_collection, Any, Any]: global app conf = CMDConfig(db_uri=path) @@ -387,8 +385,7 @@ def db_connect(path: str) -> db.Node_collection: collection = db.Node_collection() app_context = app.app_context() app_context.push() - - return collection + return (collection, app, app_context) def create_spreadsheet( @@ -422,7 +419,7 @@ def review_osib_from_file(file_loc: str, cache: str, cre_loc: str) -> None: """Given the location of an osib.yaml, parse osib, convert to cres and add to db export db to yamls and spreadsheet for review""" loc, cache = prepare_for_review(cache) - database = db_connect(path=cache) + database, _, _ = db_connect(path=cache) ymls = odefs.read_osib_yaml(file_loc) osibs = odefs.try_from_file(ymls) for osib in osibs: @@ -443,7 +440,7 @@ def review_osib_from_file(file_loc: str, cache: str, cre_loc: str) -> None: def add_osib_from_file(file_loc: str, cache: str, cre_loc: str) -> None: - database = db_connect(path=cache) + database, _, _ = db_connect(path=cache) ymls = odefs.read_osib_yaml(file_loc) osibs = odefs.try_from_file(ymls) for osib in osibs: @@ -454,7 +451,8 @@ def add_osib_from_file(file_loc: str, cache: str, cre_loc: str) -> None: def export_to_osib(file_loc: str, cache: str) -> None: - docs = db_connect(path=cache).export(file_loc, dry_run=True) + cache, _, _ = db_connect(path=cache) + docs = cache.export(file_loc, dry_run=True) tree = odefs.cre2osib(docs) with open(file_loc, "x"): with open(file_loc, "w") as f: @@ -478,18 +476,18 @@ def make_hashtable(graph): edges = {} for node in graph.nodes(): if node.startswith("CRE"): - nodes[graph.nodes[node]["external_id"]] = node + nodes[graph.nodes[node].get("external_id")] = node elif node.startswith("Node"): - nodes[graph.nodes[node]["infosum"]] = node + nodes[graph.nodes[node].get("infosum")] = node else: logger.fatal("Graph seems corrupted") for edge in graph.edges(): - key = graph.nodes[edge[0]]["external_id"] + key = graph.nodes[edge[0]].get("external_id") if edge[1].startswith("CRE"): - key = key + "-" + graph.nodes[edge[1]]["external_id"] + key = 
f"{key}-{graph.nodes[edge[1]].get('external_id')}" else: - key = key + "-" + graph.nodes[edge[1]]["infosum"] + key = f"{key}-{graph.nodes[edge[1]].get('infosum')}" edges[key] = edge return nodes, edges @@ -530,9 +528,22 @@ def edge_differences(edges1, edges2, db2): } return differences + database1, _, _ = db_connect(path=db1) g1 = database1.graph.graph - g2 = database2.graph.graph n1, e1 = make_hashtable(g1) + + print("$" * 90) + database1.graph.print_graph() + print("$" * 90) + database1.graph._instance = None + database1.graph = None + + database2, _, _ = db_connect(path=db2) + g2 = database2.graph.graph + print("$" * 90) + database2.graph.print_graph() + print("$" * 90) + input() n2, e2 = make_hashtable(g2) d1 = node_differences(n1, n2, db2) @@ -540,9 +551,7 @@ def edge_differences(edges1, edges2, db2): ed1 = edge_differences(e1, e2, db2) ed2 = edge_differences(e2, e1, db1) - if len(d1) or len(d2) or len(ed1) or len(ed2): - exit(1) - # return [d1, d2, ed1, ed2] # TODO uncomment when this becomes a library method + return [d1, d2, ed1, ed2] # TODO uncomment when this becomes a library method def owasp_metadata_to_cre(meta_file: str): diff --git a/application/database/db.py b/application/database/db.py index 0fb93cc7e..41061566e 100644 --- a/application/database/db.py +++ b/application/database/db.py @@ -13,6 +13,7 @@ from sqlalchemy import func from sqlalchemy.sql.expression import desc # type: ignore import uuid +from matplotlib import pyplot from .. import sqla # type: ignore @@ -140,6 +141,15 @@ class CRE_Graph: graph: nx.Graph = None __instance = None + def print_graph(self, png_path: str = None): + """DEbug method to dump the graph, if png_path is provided it shows the graph in png format + if not, it returns the graph as dict of dicts""" + if png_path: + nx.draw(self.graph, with_labels=True) + pyplot.savefig(png_path) + pyplot.show() + return nx.to_dict_of_dicts(self.graph) + @classmethod def instance(cls, session): if cls.__instance is None: @@ -219,12 +229,11 @@ def load_cre_graph(cls, session) -> nx.Graph: class Node_collection: graph: nx.Graph = None - session = sqla.session + session = None - def __init__(self) -> None: + def __init__(self, session=sqla.session) -> None: self.graph = CRE_Graph.instance(sqla.session) - # self.graph = CRE_Graph.instance(session=sqla.session) - self.session = sqla.session + self.session = session def __get_external_links(self) -> List[Tuple[CRE, Node, str]]: external_links: List[Tuple[CRE, Node, str]] = [] diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py index 69bce5b20..a6cfe1fdc 100644 --- a/application/tests/cre_main_test.py +++ b/application/tests/cre_main_test.py @@ -1,3 +1,4 @@ +import copy import logging import os import shutil @@ -372,7 +373,7 @@ def test_add_from_spreadsheet( self.tmpdirs.append(dir) cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1] - mocked_db_connect.return_value = self.collection + mocked_db_connect.return_value = self.collection, self.app, self.app_context mocked_export.return_value = [ defs.CRE(name="c0"), defs.Standard(name="s0", section="s1"), @@ -415,7 +416,7 @@ def test_review_from_spreadsheet( loc = tempfile.mkstemp(dir=dir)[1] cache = tempfile.mkstemp(dir=dir)[1] mocked_prepare_for_review.return_value = (loc, cache) - mocked_db_connect.return_value = self.collection + mocked_db_connect.return_value = self.collection, self.app, self.app_context mocked_create_spreadsheet.return_value = "https://example.com/sheeet" mocked_export.return_value = [ @@ -467,7 +468,7 @@ def 
test_review_from_disk( loc = tempfile.mkstemp(dir=dir)[1] cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1] mocked_prepare_for_review.return_value = (loc, cache) - mocked_db_connect.return_value = self.collection + mocked_db_connect.return_value = self.collection, self.app, self.app_context mocked_get_standards_files_from_disk.return_value = [yml for i in range(0, 3)] mocked_export.return_value = [ defs.CRE(name="c0"), @@ -511,7 +512,7 @@ def test_add_from_disk( yml = tempfile.mkstemp(dir=dir, suffix=".yaml")[1] loc = tempfile.mkstemp(dir=dir)[1] cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1] - mocked_db_connect.return_value = self.collection + mocked_db_connect.return_value = self.collection, self.app, self.app_context mocked_get_standards_files_from_disk.return_value = [yml for i in range(0, 3)] mocked_export.return_value = [ defs.CRE(name="c0"), @@ -557,7 +558,7 @@ def test_review_osib_from_file( loc = tempfile.mkstemp(dir=dir)[1] cach = tempfile.mkstemp(dir=dir)[1] mocked_prepare_for_review.return_value = (loc, cach) - mocked_db_connect.return_value = self.collection + mocked_db_connect.return_value = self.collection, self.app, self.app_context mocked_read_osib_yaml.return_value = [{"osib": "osib"}] mocked_try_from_file.return_value = [ Osib_tree(aliases=[Osib_id("t1")]), @@ -619,7 +620,7 @@ def test_add_osib_from_file( osib_yaml = tempfile.mkstemp(dir=dir, suffix=".yaml")[1] loc = tempfile.mkstemp(dir=dir)[1] cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1] - mocked_db_connect.return_value = self.collection + mocked_db_connect.return_value = self.collection, self.app, self.app_context mocked_read_osib_yaml.return_value = [{"osib": "osib"}] mocked_try_from_file.return_value = [ odefs.Osib_tree(aliases=[Osib_id("t1")]), @@ -663,7 +664,7 @@ def test_export_to_osib( # osib_yaml = tempfile.mkstemp(dir=dir,suffix=".yaml")[1] loc = tempfile.mkstemp(dir=dir)[1] cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1] - mocked_db_connect.return_value = self.collection + mocked_db_connect.return_value = self.collection, self.app, self.app_context mocked_cre2osib.return_value = odefs.Osib_tree(aliases=[Osib_id("t1")]) mocked_export.return_value = [defs.CRE(name="c0")] @@ -671,6 +672,73 @@ def test_export_to_osib( mocked_db_connect.assert_called_with(path=cache) mocked_cre2osib.assert_called_with([defs.CRE(name="c0")]) + def test_compare_datasets(self): + _, t1 = tempfile.mkstemp() + _, t2 = tempfile.mkstemp() + _, tdiff = tempfile.mkstemp() + self.tmpdirs.extend([t1, t2, tdiff]) + + c0 = defs.CRE(id="111-000", description="CREdesc", name="CREname") + s456 = defs.Standard( + subsection="4.5.6", + section="FooStand", + name="BarStand", + hyperlink="https://example.com", + tags=["a", "b", "c"], + ) + c1 = defs.CRE( + id="111-001", + description="Groupdesc", + name="GroupName", + links=[defs.Link(document=s456)], + ) + s_unlinked = defs.Standard( + subsection="4.5.6", + section="Unlinked", + name="Unlinked", + hyperlink="https://example.com", + ) + connection_1, app1, context1 = main.db_connect(path=t1) + sqla.create_all(app=app1) + connection_1.graph.graph = db.CRE_Graph.load_cre_graph(connection_1.session) + connection_1.add_cre(c0) + connection_1.add_node(s_unlinked) + connection_1.add_link(connection_1.add_cre(c1), connection_1.add_node(s456)) + + pprint("%" * 90) + pprint(t1) + pprint(connection_1.graph.print_graph()) + input() + + # connection_2,app2,context2 = main.db_connect(path=t2) + # sqla.create_all(app=app2) + # connection_2.graph.graph = 
db.CRE_Graph.load_cre_graph(sqla.session) + # connection_2.add_cre(c0) + # connection_2.add_node(s_unlinked) + # connection_2.add_link(connection_2.add_cre(c1),connection_2.add_node(s456)) + + connection_diff, appdiff, contextdiff = main.db_connect(path=tdiff) + connection_diff.graph.graph = db.CRE_Graph.load_cre_graph( + connection_diff.session + ) + sqla.create_all(app=appdiff) + connection_diff.add_cre(c0) + connection_diff.add_cre(defs.CRE(id="000-111", name="asdfa232332sdf")) + + pprint("#" * 90) + pprint(tdiff) + pprint(connection_diff.graph.print_graph()) + input() + pprint("#" * 90) + + # self.assertEqual(main.compare_datasets("foo", "bar"), [{},{},{},{}]) + # self.assertEqual(main.compare_datasets(t1,t2), [{},{},{},{}]) + self.assertNotEqual(main.compare_datasets(t1, tdiff), [{}, {}, {}, {}]) + + contextdiff.pop() + # context2.pop() + context1.pop() + # def test_prepare_for_Review(self): # raise NotImplementedError From d2d13cccdc66600476f3a39197e5e686b76f9e76 Mon Sep 17 00:00:00 2001 From: Spyros Date: Wed, 1 Jun 2022 23:22:21 +0100 Subject: [PATCH 08/13] not sure if progress --- application/cmd/cre_main.py | 28 ++++++++++++++++++++++------ application/database/db.py | 7 +++++-- application/tests/cre_main_test.py | 7 +++++-- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/application/cmd/cre_main.py b/application/cmd/cre_main.py index 1f878ec06..c4b01de85 100644 --- a/application/cmd/cre_main.py +++ b/application/cmd/cre_main.py @@ -23,6 +23,10 @@ from dacite import from_dict from dacite.config import Config +from application import sqla +from sqlalchemy import create_engine +from sqlalchemy.orm import scoped_session, sessionmaker + logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -377,14 +381,17 @@ def run(args: argparse.Namespace) -> None: # pragma: no cover exit(1) -def db_connect(path: str) -> Tuple[db.Node_collection, Any, Any]: - +def db_connect( + path: str, session=None, mk_app=True +) -> Tuple[db.Node_collection, Any, Any]: global app + app_context = None conf = CMDConfig(db_uri=path) app = create_app(conf=conf) - collection = db.Node_collection() app_context = app.app_context() app_context.push() + collection = db.Node_collection() + return (collection, app, app_context) @@ -527,8 +534,13 @@ def edge_differences(edges1, edges2, db2): "attributes2": edges2[edge], } return differences - - database1, _, _ = db_connect(path=db1) + # sqla = create_engine(db1) + # session1 = scoped_session( + # sessionmaker(autocommit=False, autoflush=False, bind=sqla) + # ) + # database1 = db.Node_collection(session=session1) + database1, app1, context1 = db_connect(path=db1) + database1.graph.graph = db.CRE_Graph.load_cre_graph(session=database1.session) g1 = database1.graph.graph n1, e1 = make_hashtable(g1) @@ -538,7 +550,11 @@ def edge_differences(edges1, edges2, db2): database1.graph._instance = None database1.graph = None - database2, _, _ = db_connect(path=db2) + engine2 = create_engine(db2) + session2 = scoped_session( + sessionmaker(autocommit=False, autoflush=False, bind=engine2) + ) + database2 = db.Node_collection (session=session2) g2 = database2.graph.graph print("$" * 90) database2.graph.print_graph() diff --git a/application/database/db.py b/application/database/db.py index 41061566e..6aab740d9 100644 --- a/application/database/db.py +++ b/application/database/db.py @@ -231,8 +231,11 @@ class Node_collection: graph: nx.Graph = None session = None - def __init__(self, session=sqla.session) -> None: - self.graph = 
CRE_Graph.instance(sqla.session) + def __init__(self, session=sqla.session, graph:CRE_Graph=None) -> None: + if graph: + self.graph = graph + else: + self.graph = CRE_Graph.instance(sqla.session) self.session = session def __get_external_links(self) -> List[Tuple[CRE, Node, str]]: diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py index a6cfe1fdc..6ef9a4160 100644 --- a/application/tests/cre_main_test.py +++ b/application/tests/cre_main_test.py @@ -676,7 +676,7 @@ def test_compare_datasets(self): _, t1 = tempfile.mkstemp() _, t2 = tempfile.mkstemp() _, tdiff = tempfile.mkstemp() - self.tmpdirs.extend([t1, t2, tdiff]) + # self.tmpdirs.extend([t1, t2, tdiff]) c0 = defs.CRE(id="111-000", description="CREdesc", name="CREname") s456 = defs.Standard( @@ -698,6 +698,7 @@ def test_compare_datasets(self): name="Unlinked", hyperlink="https://example.com", ) + connection_1, app1, context1 = main.db_connect(path=t1) sqla.create_all(app=app1) connection_1.graph.graph = db.CRE_Graph.load_cre_graph(connection_1.session) @@ -733,7 +734,9 @@ def test_compare_datasets(self): # self.assertEqual(main.compare_datasets("foo", "bar"), [{},{},{},{}]) # self.assertEqual(main.compare_datasets(t1,t2), [{},{},{},{}]) - self.assertNotEqual(main.compare_datasets(t1, tdiff), [{}, {}, {}, {}]) + pprint("sqlite://"+t1) + pprint("sqlite://"+tdiff) + self.assertNotEqual(main.compare_datasets("sqlite://"+t1, "sqlite://"+tdiff), [{}, {}, {}, {}]) contextdiff.pop() # context2.pop() From e78acad2b02ea3160101e757ac83d307c0b5c41b Mon Sep 17 00:00:00 2001 From: Spyros Date: Tue, 7 Jun 2022 08:43:13 +0100 Subject: [PATCH 09/13] wip --- application/cmd/cre_main.py | 56 +++++++++++++++++------------- application/database/db.py | 24 +++++++------ application/tests/cre_main_test.py | 56 ++++++++++++++++-------------- 3 files changed, 74 insertions(+), 62 deletions(-) diff --git a/application/cmd/cre_main.py b/application/cmd/cre_main.py index c4b01de85..b18cb5341 100644 --- a/application/cmd/cre_main.py +++ b/application/cmd/cre_main.py @@ -1,3 +1,4 @@ +from pprint import pprint import argparse import json import logging @@ -385,7 +386,6 @@ def db_connect( path: str, session=None, mk_app=True ) -> Tuple[db.Node_collection, Any, Any]: global app - app_context = None conf = CMDConfig(db_uri=path) app = create_app(conf=conf) app_context = app.app_context() @@ -482,16 +482,16 @@ def make_hashtable(graph): nodes = {} edges = {} for node in graph.nodes(): - if node.startswith("CRE"): + if node.startswith("CRE-id"): nodes[graph.nodes[node].get("external_id")] = node - elif node.startswith("Node"): + elif node.startswith("Node-id"): nodes[graph.nodes[node].get("infosum")] = node else: logger.fatal("Graph seems corrupted") for edge in graph.edges(): key = graph.nodes[edge[0]].get("external_id") - if edge[1].startswith("CRE"): + if edge[1].startswith("CRE-id"): key = f"{key}-{graph.nodes[edge[1]].get('external_id')}" else: key = f"{key}-{graph.nodes[edge[1]].get('infosum')}" @@ -505,7 +505,8 @@ def node_differences(nodes1, nodes2, db2): if node not in nodes2: logger.error(f"{node} not present in {db2}") differences["not_present"] = (node, db2) - elif nodes2[node] != attrs: + elif not (attrs.startswith("CRE") nodes2[node] != attrs: + logger.error( f"Dataset 2 {db2} node:{node} has different data from dataset 1 equivalent, data1 is {attrs} data 2 is {nodes2[node]} " ) @@ -540,27 +541,32 @@ def edge_differences(edges1, edges2, db2): # ) # database1 = db.Node_collection(session=session1) database1, app1, context1 = 
db_connect(path=db1) + sqla.create_all(app=app1) database1.graph.graph = db.CRE_Graph.load_cre_graph(session=database1.session) - g1 = database1.graph.graph - n1, e1 = make_hashtable(g1) - - print("$" * 90) - database1.graph.print_graph() - print("$" * 90) - database1.graph._instance = None - database1.graph = None - - engine2 = create_engine(db2) - session2 = scoped_session( - sessionmaker(autocommit=False, autoflush=False, bind=engine2) - ) - database2 = db.Node_collection (session=session2) - g2 = database2.graph.graph - print("$" * 90) - database2.graph.print_graph() - print("$" * 90) - input() - n2, e2 = make_hashtable(g2) + n1, e1 = make_hashtable(database1.graph.graph) + database1.session.remove() + context1.pop() + + # print("$" * 90) + # pprint(database1.get_node_names()) + # pprint(database1.graph.print_graph()) + # print("$" * 90) + # # database1.graph.__instance = None + # database1.graph = None + + database2, app2, context2 = db_connect(path=db2) + sqla.create_all(app=app2) + database2.graph.graph = db.CRE_Graph.load_cre_graph(session=database2.session) + context2.pop() + # print("$" * 90) + # pprint(database2.get_node_names()) + # pprint(database2.graph.print_graph()) + # print("$" * 90) + # input() + + # database2.session.remove() + + n2, e2 = make_hashtable(database2.graph.graph) d1 = node_differences(n1, n2, db2) d2 = node_differences(n2, n1, db1) diff --git a/application/database/db.py b/application/database/db.py index 6aab740d9..9cdbef851 100644 --- a/application/database/db.py +++ b/application/database/db.py @@ -170,7 +170,9 @@ def add_node(self, *args, **kwargs): def add_cre(cls, dbcre: CRE, graph: nx.DiGraph) -> nx.DiGraph: if dbcre: graph.add_node( - f"CRE: {dbcre.id}", internal_id=dbcre.id, external_id=dbcre.external_id + f"CRE-id: {dbcre.id}", + internal_id=dbcre.id, + external_id=dbcre.external_id ) else: logger.error("Called with dbcre being none") @@ -183,7 +185,7 @@ def add_dbnode(cls, dbnode: Node, graph: nx.DiGraph) -> nx.DiGraph: dbnode.serialise() ) # using md5 would have been way more performant but then I'd have to triage every beg-hunter's SAST scanner results graph.add_node( - f"Node: {dbnode.id}", + f"Node-id: {dbnode.id}", internal_id=dbnode.id, name=dbnode.name, section=dbnode.section, @@ -212,7 +214,7 @@ def load_cre_graph(cls, session) -> nx.Graph: logger.error(f"CRE {il.cre} does not exist?") graph = cls.add_cre(dbcre=cre, graph=graph) - graph.add_edge(f"CRE: {il.group}", f"CRE: {il.cre}", ltype=il.type) + graph.add_edge(f"CRE-id: {il.group}", f"CRE-id: {il.cre}", ltype=il.type) for lnk in session.query(Links).all(): node = session.query(Node).filter(Node.id == lnk.node).first() @@ -223,7 +225,7 @@ def load_cre_graph(cls, session) -> nx.Graph: cre = session.query(CRE).filter(CRE.id == lnk.cre).first() graph = cls.add_cre(dbcre=cre, graph=graph) - graph.add_edge(f"CRE: {lnk.cre}", f"Node: {str(lnk.node)}", ltype=lnk.type) + graph.add_edge(f"CRE-id: {lnk.cre}", f"Node-id: {str(lnk.node)}", ltype=lnk.type) return graph @@ -926,14 +928,16 @@ def add_internal_link( f" {group.external_id}:{group.name}" f" == {cre.external_id}:{cre.name} ,adding" ) - cycle = self.__introduces_cycle(f"CRE: {group.id}", f"CRE: {cre.id}") + cycle = self.__introduces_cycle(f"CRE-id: {group.id}", f"CRE-id: {cre.id}") if not cycle: self.session.add( InternalLinks(type=type.value, cre=cre.id, group=group.id) ) self.session.commit() self.graph.add_edge( - f"CRE: {group.id}", f"CRE: {cre.id}", ltype=type.value + f"CRE-id: {group.id}", + f"CRE-id: {cre.id}", + ltype=type.value ) 
else: logger.warning( @@ -972,7 +976,7 @@ def add_link( return else: cycle = self.__introduces_cycle( - f"CRE: {cre.id}", f"Standard: {str(node.id)}" + f"CRE-id: {cre.id}", f"Node-id: {str(node.id)}" ) if not cycle: logger.debug( @@ -982,7 +986,7 @@ def add_link( ) self.session.add(Links(type=type.value, cre=cre.id, node=node.id)) self.graph.add_edge( - f"CRE: {cre.id}", f"Node: {str(node.id)}", ltype=type.value + f"CRE-id: {cre.id}", f"Node-id: {str(node.id)}", ltype=type.value ) else: logger.warning( @@ -1001,8 +1005,8 @@ def find_path_between_nodes( this starts getting complicated when we have more linktypes""" res: bool = nx.has_path( self.graph.graph.to_undirected(), - "Node: " + str(node_source_id), - "Node: " + str(node_destination_id), + "Node-id: " + str(node_source_id), + "Node-id: " + str(node_destination_id), ) return res diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py index 6ef9a4160..80a91fbbb 100644 --- a/application/tests/cre_main_test.py +++ b/application/tests/cre_main_test.py @@ -19,14 +19,15 @@ class TestMain(unittest.TestCase): def tearDown(self) -> None: - for tmpdir in self.tmpdirs: - shutil.rmtree(tmpdir) + [shutil.rmtree(tmpdir) for tmpdir in self.tmpdirs] + [os.remove(tmpfile) for tmpfile in self.tmpfiles] sqla.session.remove() sqla.drop_all(app=self.app) self.app_context.pop() def setUp(self) -> None: self.tmpdirs: List[str] = [] + self.tmpfiles: List[str] = [] self.app = create_app(mode="test") sqla.create_all(app=self.app) self.app_context = self.app.app_context() @@ -676,7 +677,8 @@ def test_compare_datasets(self): _, t1 = tempfile.mkstemp() _, t2 = tempfile.mkstemp() _, tdiff = tempfile.mkstemp() - # self.tmpdirs.extend([t1, t2, tdiff]) + self.tmpfiles.extend([t1, t2, tdiff]) + self.maxDiff = None c0 = defs.CRE(id="111-000", description="CREdesc", name="CREname") s456 = defs.Standard( @@ -705,18 +707,18 @@ def test_compare_datasets(self): connection_1.add_cre(c0) connection_1.add_node(s_unlinked) connection_1.add_link(connection_1.add_cre(c1), connection_1.add_node(s456)) + context1.pop() - pprint("%" * 90) - pprint(t1) - pprint(connection_1.graph.print_graph()) - input() + self.assertNotEqual(main.compare_datasets(t1, tdiff), [{}, {}, {}, {}]) + - # connection_2,app2,context2 = main.db_connect(path=t2) - # sqla.create_all(app=app2) - # connection_2.graph.graph = db.CRE_Graph.load_cre_graph(sqla.session) - # connection_2.add_cre(c0) - # connection_2.add_node(s_unlinked) - # connection_2.add_link(connection_2.add_cre(c1),connection_2.add_node(s456)) + connection_2,app2,context2 = main.db_connect(path=t2) + sqla.create_all(app=app2) + connection_2.graph.graph = db.CRE_Graph.load_cre_graph(sqla.session) + connection_2.add_cre(c0) + connection_2.add_node(s_unlinked) + connection_2.add_link(connection_2.add_cre(c1),connection_2.add_node(s456)) + context2.pop() connection_diff, appdiff, contextdiff = main.db_connect(path=tdiff) connection_diff.graph.graph = db.CRE_Graph.load_cre_graph( @@ -725,22 +727,22 @@ def test_compare_datasets(self): sqla.create_all(app=appdiff) connection_diff.add_cre(c0) connection_diff.add_cre(defs.CRE(id="000-111", name="asdfa232332sdf")) - - pprint("#" * 90) - pprint(tdiff) - pprint(connection_diff.graph.print_graph()) - input() - pprint("#" * 90) - - # self.assertEqual(main.compare_datasets("foo", "bar"), [{},{},{},{}]) - # self.assertEqual(main.compare_datasets(t1,t2), [{},{},{},{}]) - pprint("sqlite://"+t1) - pprint("sqlite://"+tdiff) - self.assertNotEqual(main.compare_datasets("sqlite://"+t1, 
"sqlite://"+tdiff), [{}, {}, {}, {}]) - contextdiff.pop() + + # pprint("#" * 90) + # pprint(tdiff) + # pprint(connection_diff.graph.print_graph()) + # input() + # pprint("#" * 90) + + self.assertEqual(main.compare_datasets("foo", "bar"), [{},{},{},{}]) + self.assertEqual(main.compare_datasets(t1,t2), [{},{},{},{}]) + # pprint("sqlite://"+t1) + # pprint("sqlite://"+tdiff) + self.assertNotEqual(main.compare_datasets(t1, tdiff), [{}, {}, {}, {}]) + + # contextdiff.pop() # context2.pop() - context1.pop() # def test_prepare_for_Review(self): # raise NotImplementedError From e1bc9e94cbb421561c6c68ec021387f5e5b0ea61 Mon Sep 17 00:00:00 2001 From: Spyros Date: Tue, 7 Jun 2022 16:23:35 +0100 Subject: [PATCH 10/13] wip --- application/cmd/cre_main.py | 41 ++++++++--------------- application/database/db.py | 12 +++---- application/tests/cre_main_test.py | 52 +++++++++++++++++------------- 3 files changed, 49 insertions(+), 56 deletions(-) diff --git a/application/cmd/cre_main.py b/application/cmd/cre_main.py index b18cb5341..276812a79 100644 --- a/application/cmd/cre_main.py +++ b/application/cmd/cre_main.py @@ -505,8 +505,10 @@ def node_differences(nodes1, nodes2, db2): if node not in nodes2: logger.error(f"{node} not present in {db2}") differences["not_present"] = (node, db2) - elif not (attrs.startswith("CRE") nodes2[node] != attrs: - + elif nodes2[node] != attrs and not ( + attrs.startswith("CRE-id") or attrs.startswith("Node-id") + ): + logger.error( f"Dataset 2 {db2} node:{node} has different data from dataset 1 equivalent, data1 is {attrs} data 2 is {nodes2[node]} " ) @@ -525,7 +527,11 @@ def edge_differences(edges1, edges2, db2): logger.error(f"{edge} not present in {db2}") differences["not_present"] = (edge, db2) else: - if edges2[edge] != attrs: + if edges2[edge] != attrs and [ + e + for e in attrs + if not (e.startswith("CRE-id") or e.startswith("Node-id")) + ]: logger.error( f"Dataset 2{db2} edge:{edge} has different data from dataset 1 equivalent, data1 is {attrs} data 2 is {edges2[edge]}" ) @@ -535,45 +541,26 @@ def edge_differences(edges1, edges2, db2): "attributes2": edges2[edge], } return differences - # sqla = create_engine(db1) - # session1 = scoped_session( - # sessionmaker(autocommit=False, autoflush=False, bind=sqla) - # ) - # database1 = db.Node_collection(session=session1) + database1, app1, context1 = db_connect(path=db1) sqla.create_all(app=app1) database1.graph.graph = db.CRE_Graph.load_cre_graph(session=database1.session) n1, e1 = make_hashtable(database1.graph.graph) database1.session.remove() context1.pop() - - # print("$" * 90) - # pprint(database1.get_node_names()) - # pprint(database1.graph.print_graph()) - # print("$" * 90) - # # database1.graph.__instance = None - # database1.graph = None database2, app2, context2 = db_connect(path=db2) - sqla.create_all(app=app2) + sqla.create_all(app=app2) database2.graph.graph = db.CRE_Graph.load_cre_graph(session=database2.session) - context2.pop() - # print("$" * 90) - # pprint(database2.get_node_names()) - # pprint(database2.graph.print_graph()) - # print("$" * 90) - # input() - - # database2.session.remove() - n2, e2 = make_hashtable(database2.graph.graph) + database2.session.remove() + context2.pop() d1 = node_differences(n1, n2, db2) d2 = node_differences(n2, n1, db1) - ed1 = edge_differences(e1, e2, db2) ed2 = edge_differences(e2, e1, db1) - return [d1, d2, ed1, ed2] # TODO uncomment when this becomes a library method + return [d1, d2, ed1, ed2] def owasp_metadata_to_cre(meta_file: str): diff --git 
a/application/database/db.py b/application/database/db.py index 9cdbef851..b8a8a8cd9 100644 --- a/application/database/db.py +++ b/application/database/db.py @@ -172,7 +172,7 @@ def add_cre(cls, dbcre: CRE, graph: nx.DiGraph) -> nx.DiGraph: graph.add_node( f"CRE-id: {dbcre.id}", internal_id=dbcre.id, - external_id=dbcre.external_id + external_id=dbcre.external_id, ) else: logger.error("Called with dbcre being none") @@ -225,7 +225,9 @@ def load_cre_graph(cls, session) -> nx.Graph: cre = session.query(CRE).filter(CRE.id == lnk.cre).first() graph = cls.add_cre(dbcre=cre, graph=graph) - graph.add_edge(f"CRE-id: {lnk.cre}", f"Node-id: {str(lnk.node)}", ltype=lnk.type) + graph.add_edge( + f"CRE-id: {lnk.cre}", f"Node-id: {str(lnk.node)}", ltype=lnk.type + ) return graph @@ -233,7 +235,7 @@ class Node_collection: graph: nx.Graph = None session = None - def __init__(self, session=sqla.session, graph:CRE_Graph=None) -> None: + def __init__(self, session=sqla.session, graph: CRE_Graph = None) -> None: if graph: self.graph = graph else: @@ -935,9 +937,7 @@ def add_internal_link( ) self.session.commit() self.graph.add_edge( - f"CRE-id: {group.id}", - f"CRE-id: {cre.id}", - ltype=type.value + f"CRE-id: {group.id}", f"CRE-id: {cre.id}", ltype=type.value ) else: logger.warning( diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py index 80a91fbbb..b33a8ecb6 100644 --- a/application/tests/cre_main_test.py +++ b/application/tests/cre_main_test.py @@ -674,9 +674,9 @@ def test_export_to_osib( mocked_cre2osib.assert_called_with([defs.CRE(name="c0")]) def test_compare_datasets(self): - _, t1 = tempfile.mkstemp() - _, t2 = tempfile.mkstemp() - _, tdiff = tempfile.mkstemp() + _, t1 = tempfile.mkstemp(suffix="dataset1") + _, t2 = tempfile.mkstemp(suffix="dataset2") + _, tdiff = tempfile.mkstemp(suffix="datasetdiff") self.tmpfiles.extend([t1, t2, tdiff]) self.maxDiff = None @@ -700,7 +700,7 @@ def test_compare_datasets(self): name="Unlinked", hyperlink="https://example.com", ) - + connection_1, app1, context1 = main.db_connect(path=t1) sqla.create_all(app=app1) connection_1.graph.graph = db.CRE_Graph.load_cre_graph(connection_1.session) @@ -709,15 +709,22 @@ def test_compare_datasets(self): connection_1.add_link(connection_1.add_cre(c1), connection_1.add_node(s456)) context1.pop() - self.assertNotEqual(main.compare_datasets(t1, tdiff), [{}, {}, {}, {}]) - + self.assertEqual( + main.compare_datasets(t1, tdiff), + [ + {"not_present": (c1, id, tdiff)}, + {}, + {"not_present": (f"{c1.id}-", tdiff)}, + {}, + ], + ) - connection_2,app2,context2 = main.db_connect(path=t2) + connection_2, app2, context2 = main.db_connect(path=t2) sqla.create_all(app=app2) connection_2.graph.graph = db.CRE_Graph.load_cre_graph(sqla.session) connection_2.add_cre(c0) connection_2.add_node(s_unlinked) - connection_2.add_link(connection_2.add_cre(c1),connection_2.add_node(s456)) + connection_2.add_link(connection_2.add_cre(c1), connection_2.add_node(s456)) context2.pop() connection_diff, appdiff, contextdiff = main.db_connect(path=tdiff) @@ -728,21 +735,20 @@ def test_compare_datasets(self): connection_diff.add_cre(c0) connection_diff.add_cre(defs.CRE(id="000-111", name="asdfa232332sdf")) contextdiff.pop() - - # pprint("#" * 90) - # pprint(tdiff) - # pprint(connection_diff.graph.print_graph()) - # input() - # pprint("#" * 90) - - self.assertEqual(main.compare_datasets("foo", "bar"), [{},{},{},{}]) - self.assertEqual(main.compare_datasets(t1,t2), [{},{},{},{}]) - # pprint("sqlite://"+t1) - # 
pprint("sqlite://"+tdiff) - self.assertNotEqual(main.compare_datasets(t1, tdiff), [{}, {}, {}, {}]) - - # contextdiff.pop() - # context2.pop() + + self.assertEqual(main.compare_datasets("foo", "bar"), [{}, {}, {}, {}]) + self.assertEqual(main.compare_datasets(t1, t2), [{}, {}, {}, {}]) + self.assertEqual( + main.compare_datasets(t1, tdiff), + [ + {"not_present": (c1.id, tdiff)}, + {}, + { + "not_present": (f"{c1.id}-", tdiff) + }, # here the make_hashtable method creates edges with the format of - so need to find the infosum of the node conencted to c1 + {}, + ], + ) # def test_prepare_for_Review(self): # raise NotImplementedError From 4e4a778feee3ac0807a65e9feb4ed23431cf89dd Mon Sep 17 00:00:00 2001 From: Spyros Date: Sun, 26 Jun 2022 17:25:00 +0100 Subject: [PATCH 11/13] fix tests --- application/tests/cre_main_test.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py index b33a8ecb6..51012f4db 100644 --- a/application/tests/cre_main_test.py +++ b/application/tests/cre_main_test.py @@ -706,18 +706,15 @@ def test_compare_datasets(self): connection_1.graph.graph = db.CRE_Graph.load_cre_graph(connection_1.session) connection_1.add_cre(c0) connection_1.add_node(s_unlinked) - connection_1.add_link(connection_1.add_cre(c1), connection_1.add_node(s456)) - context1.pop() + db_s456 = connection_1.add_node(s456) + connection_1.add_link(connection_1.add_cre(c1), db_s456) + infosum = [ + connection_1.graph.graph.nodes[x].get("infosum") + for x in connection_1.graph.graph.nodes + if db_s456.id in x + ][0] - self.assertEqual( - main.compare_datasets(t1, tdiff), - [ - {"not_present": (c1, id, tdiff)}, - {}, - {"not_present": (f"{c1.id}-", tdiff)}, - {}, - ], - ) + context1.pop() connection_2, app2, context2 = main.db_connect(path=t2) sqla.create_all(app=app2) @@ -743,9 +740,7 @@ def test_compare_datasets(self): [ {"not_present": (c1.id, tdiff)}, {}, - { - "not_present": (f"{c1.id}-", tdiff) - }, # here the make_hashtable method creates edges with the format of - so need to find the infosum of the node conencted to c1 + {"not_present": (f"{c1.id}-{infosum}", tdiff)}, {}, ], ) From 91cf851a2c92ada5fb4114f7aa98ff2d910ba5f8 Mon Sep 17 00:00:00 2001 From: Spyros Date: Sun, 26 Jun 2022 17:27:32 +0100 Subject: [PATCH 12/13] fix tests --- application/tests/cre_main_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py index 51012f4db..ca66fdca9 100644 --- a/application/tests/cre_main_test.py +++ b/application/tests/cre_main_test.py @@ -716,6 +716,16 @@ def test_compare_datasets(self): context1.pop() + self.assertEqual( + main.compare_datasets(t1, tdiff), + [ + {"not_present": (c1.id, tdiff)}, + {}, + {"not_present": (f"{c1.id}-{infosum}", tdiff)}, + {}, + ], + ) + connection_2, app2, context2 = main.db_connect(path=t2) sqla.create_all(app=app2) connection_2.graph.graph = db.CRE_Graph.load_cre_graph(sqla.session) From 41ee700e4d8b0712a4df673bd94c518e0d64a9f8 Mon Sep 17 00:00:00 2001 From: Spyros Date: Sun, 26 Jun 2022 17:35:08 +0100 Subject: [PATCH 13/13] new deps --- requirements.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/requirements.txt b/requirements.txt index e2de0dfee..7abfe5893 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ click-didyoumean==0.0.3 click-plugins==1.1.1 click-repl==0.2.0 coverage==5.5 +cycler==0.11.0 dacite==1.6.0 dataclasses-json==0.5.6 
decorator==4.4.2 @@ -24,6 +25,7 @@ Flask-Cors==3.0.10 Flask-Migrate==3.1.0 Flask-SQLAlchemy==2.5.1 flask-sqlalchemy-stubs==0.2 +fonttools==4.33.3 gitdb==4.0.5 github2==0.6.2 GitPython==3.1.9 @@ -40,19 +42,24 @@ isort==5.9.3 itsdangerous==1.1.0 Jinja2==2.11.3 jsonschema==3.2.0 +kiwisolver==1.4.3 lazy-object-proxy==1.6.0 Mako==1.1.5 MarkupSafe==1.1.1 marshmallow==3.14.1 marshmallow-enum==1.5.1 +matplotlib==3.5.2 mccabe==0.6.1 mypy==0.910 mypy-extensions==0.4.3 networkx==2.5.1 +numpy==1.23.0 oauthlib==3.1.0 +packaging==21.3 pathspec==0.9.0 pbr==5.8.0 pep517==0.8.2 +Pillow==9.1.1 pip-autoremove==0.9.1 platformdirs==2.2.0 prompt-toolkit==3.0.19
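
For reference, the comparison technique that patches 03-10 converge on can be exercised without the Flask/SQLAlchemy plumbing. The sketch below is a minimal, illustrative reconstruction, not the shipped code: it assumes bare networkx DiGraphs whose "CRE-id: ..." nodes carry an external_id attribute and whose "Node-id: ..." nodes carry an infosum attribute (the sha256 of the node's serialised fields, per Node.serialise in patch 03 — the infosum() helper here hashes only name/section/subsection for brevity). The sample ids and standard names echo the fixtures used in test_compare_datasets.

import hashlib
from typing import Dict, List, Tuple

import networkx as nx


def infosum(name: str, section: str = "", subsection: str = "") -> str:
    # Stable content hash for a standard node, independent of mutable DB ids.
    return hashlib.sha256("".join([name, section, subsection]).encode()).hexdigest()


def hashtables(graph: nx.DiGraph) -> Tuple[Dict, Dict]:
    # Key every node and edge by content (external_id / infosum), never by DB id.
    nodes: Dict[str, str] = {}
    edges: Dict[str, Tuple[str, str]] = {}
    for node, attrs in graph.nodes(data=True):
        if node.startswith("CRE-id"):
            nodes[attrs["external_id"]] = node
        else:
            nodes[attrs["infosum"]] = node
    for src, dst in graph.edges():
        dst_key = (
            graph.nodes[dst]["external_id"]
            if dst.startswith("CRE-id")
            else graph.nodes[dst]["infosum"]
        )
        # Edges originate from CREs in this dataset, so the source key is an external_id.
        edges[f"{graph.nodes[src]['external_id']}-{dst_key}"] = (src, dst)
    return nodes, edges


def diff(g1: nx.DiGraph, g2: nx.DiGraph) -> Dict[str, List[str]]:
    # Content keys present in g1 but missing from g2; run both ways for a full diff.
    n1, e1 = hashtables(g1)
    n2, e2 = hashtables(g2)
    return {
        "missing_nodes": [k for k in n1 if k not in n2],
        "missing_edges": [k for k in e1 if k not in e2],
    }


if __name__ == "__main__":
    g1, g2 = nx.DiGraph(), nx.DiGraph()
    for g in (g1, g2):
        g.add_node("CRE-id: 1", external_id="111-001")
        g.add_node("Node-id: 2", infosum=infosum("BarStand", "FooStand", "4.5.6"))
        g.add_edge("CRE-id: 1", "Node-id: 2")
    g1.add_node("CRE-id: 3", external_id="111-000")  # present only in dataset 1
    print(diff(g1, g2))  # {'missing_nodes': ['111-000'], 'missing_edges': []}
    print(diff(g2, g1))  # {'missing_nodes': [], 'missing_edges': []}

Keying the hashtables by content rather than by primary key is the point of the infosum: as the TODO in patch 02 notes, the database ids are mutable across imports, so only a hash of each node's own fields gives two independently imported datasets a common vocabulary for "the same node" and "the same edge".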