diff --git a/application/cmd/cre_main.py b/application/cmd/cre_main.py
index 462dc712d..276812a79 100644
--- a/application/cmd/cre_main.py
+++ b/application/cmd/cre_main.py
@@ -1,3 +1,4 @@
+from pprint import pprint
 import argparse
 import json
 import logging
@@ -23,6 +24,10 @@ from dacite import from_dict
 from dacite.config import Config
 
+from application import sqla
+from sqlalchemy import create_engine
+from sqlalchemy.orm import scoped_session, sessionmaker
+
 logging.basicConfig()
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -226,7 +231,7 @@ def add_from_spreadsheet(spreadsheet_url: str, cache_loc: str, cre_loc: str) ->
     import new mappings from export db to ../../cres/
     """
-    database = db_connect(path=cache_loc)
+    database, _, _ = db_connect(path=cache_loc)
     spreadsheet = sheet_utils.readSpreadsheet(
         url=spreadsheet_url, cres_loc=cre_loc, alias="new spreadsheet", validate=False
     )
@@ -246,7 +251,7 @@ def add_from_disk(cache_loc: str, cre_loc: str) -> None:
     import new mappings from export db to ../../cres/
     """
-    database = db_connect(path=cache_loc)
+    database, _, _ = db_connect(path=cache_loc)
     for file in get_cre_files_from_disk(cre_loc):
         with open(file, "rb") as standard:
             parse_file(
@@ -265,7 +270,7 @@ def review_from_spreadsheet(cache: str, spreadsheet_url: str, share_with: str) -
     create new spreadsheet of the new CRE landscape for review
     """
     loc, cache = prepare_for_review(cache)
-    database = db_connect(path=cache)
+    database, _, _ = db_connect(path=cache)
     spreadsheet = sheet_utils.readSpreadsheet(
         url=spreadsheet_url, cres_loc=loc, alias="new spreadsheet", validate=False
     )
@@ -294,7 +299,7 @@ def review_from_disk(cache: str, cre_file_loc: str, share_with: str) -> None:
     create new spreadsheet of the new CRE landscape for review
     """
     loc, cache = prepare_for_review(cache)
-    database = db_connect(path=cache)
+    database, _, _ = db_connect(path=cache)
     for file in get_cre_files_from_disk(cre_file_loc):
         with open(file, "rb") as standard:
             parse_file(
@@ -359,33 +364,35 @@ def run(args: argparse.Namespace) -> None:  # pragma: no cover
     elif args.osib_out:
         export_to_osib(file_loc=args.osib_out, cache=args.cache_file)
     if args.zap_in:
-        zap_alerts_parser.parse_zap_alerts(db_connect(args.cache_file))
+        cache, _, _ = db_connect(args.cache_file)
+        zap_alerts_parser.parse_zap_alerts(cache)
     if args.cheatsheets_in:
-        cheatsheets_parser.parse_cheatsheets(db_connect(args.cache_file))
+        cache, _, _ = db_connect(args.cache_file)
+        cheatsheets_parser.parse_cheatsheets(cache)
     if args.github_tools_in:
         for url in misc_tools_parser.tool_urls:
-            misc_tools_parser.parse_tool(
-                cache=db_connect(args.cache_file), tool_repo=url
-            )
+            cache, _, _ = db_connect(args.cache_file)
+            misc_tools_parser.parse_tool(cache=cache, tool_repo=url)
     if args.capec_in:
-        capec_parser.parse_capec(cache=db_connect(args.cache_file))
+        cache, _, _ = db_connect(args.cache_file)
+        capec_parser.parse_capec(cache=cache)
     if args.export:
-        cache = db_connect(args.cache_file)
+        cache, _, _ = db_connect(args.cache_file)
         cache.export(args.export)
     if args.owasp_proj_meta:
         owasp_metadata_to_cre(args.owasp_proj_meta)
+    if args.compare_datasets:
+        d1, d2, ed1, ed2 = compare_datasets(args.dataset1, args.dataset2)
+        if len(d1) or len(d2) or len(ed1) or len(ed2):
+            exit(1)
 
 
-def db_connect(path: str) -> db.Node_collection:
+def db_connect(
+    path: str, session=None, mk_app=True
+) -> Tuple[db.Node_collection, Any, Any]:
     global app
     conf = CMDConfig(db_uri=path)
     app = create_app(conf=conf)
-    collection = db.Node_collection()
     app_context = app.app_context()
     app_context.push()
+    collection = db.Node_collection()
 
-    return collection
+    return (collection, app, app_context)
 
 
 def create_spreadsheet(
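Note: db_connect now hands back the Flask app and the pushed app context along with the collection, so callers that need to tear down per-database state (compare_datasets, the tests) can pop the context themselves. A minimal sketch of the new calling pattern, with an illustrative path and directory:

    collection, app, app_context = db_connect(path="standards_cache.sqlite")
    try:
        docs = collection.export("export_dir", dry_run=True)  # use the collection as before
    finally:
        app_context.pop()  # release the app context that db_connect pushed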
@@ -419,7 +426,7 @@ def review_osib_from_file(file_loc: str, cache: str, cre_loc: str) -> None:
     """Given the location of an osib.yaml, parse osib, convert to cres and add to db
     export db to yamls and spreadsheet for review"""
     loc, cache = prepare_for_review(cache)
-    database = db_connect(path=cache)
+    database, _, _ = db_connect(path=cache)
     ymls = odefs.read_osib_yaml(file_loc)
     osibs = odefs.try_from_file(ymls)
     for osib in osibs:
@@ -440,7 +447,7 @@ def review_osib_from_file(file_loc: str, cache: str, cre_loc: str) -> None:
 
 
 def add_osib_from_file(file_loc: str, cache: str, cre_loc: str) -> None:
-    database = db_connect(path=cache)
+    database, _, _ = db_connect(path=cache)
     ymls = odefs.read_osib_yaml(file_loc)
     osibs = odefs.try_from_file(ymls)
     for osib in osibs:
@@ -451,13 +458,111 @@ def add_osib_from_file(file_loc: str, cache: str, cre_loc: str) -> None:
 
 
 def export_to_osib(file_loc: str, cache: str) -> None:
-    docs = db_connect(path=cache).export(file_loc, dry_run=True)
+    cache, _, _ = db_connect(path=cache)
+    docs = cache.export(file_loc, dry_run=True)
     tree = odefs.cre2osib(docs)
     with open(file_loc, "x"):
         with open(file_loc, "w") as f:
             f.write(json.dumps(tree.todict()))
 
 
+def compare_datasets(db1: str, db2: str) -> List[Dict]:
+    """
+    Given two CRE datasets in databases with connection strings db1 and db2,
+    print their differences.
+
+    (make db load descriptions etc in memory)
+    Ensure that both graphs have the same number of nodes and edges
+    and that both graphs hold the same data.
+    """
+
+    def make_hashtable(graph):
+        nodes = {}
+        edges = {}
+        for node in graph.nodes():
+            if node.startswith("CRE-id"):
+                nodes[graph.nodes[node].get("external_id")] = node
+            elif node.startswith("Node-id"):
+                nodes[graph.nodes[node].get("infosum")] = node
+            else:
+                logger.fatal("Graph seems corrupted")
+
+        for edge in graph.edges():
+            key = graph.nodes[edge[0]].get("external_id")
+            if edge[1].startswith("CRE-id"):
+                key = f"{key}-{graph.nodes[edge[1]].get('external_id')}"
+            else:
+                key = f"{key}-{graph.nodes[edge[1]].get('infosum')}"
+            edges[key] = edge
+        return nodes, edges
+
+    def node_differences(nodes1, nodes2, db2):
+        # get nodes1 entries not in nodes2 and nodes1 entries with different attrs than nodes2
+        differences = {}
+        for node, attrs in nodes1.items():
+            if node not in nodes2:
+                logger.error(f"{node} not present in {db2}")
+                differences["not_present"] = (node, db2)
+            elif nodes2[node] != attrs and not (
+                attrs.startswith("CRE-id") or attrs.startswith("Node-id")
+            ):
+                logger.error(
+                    f"Dataset 2 {db2} node:{node} has different data from its dataset 1"
+                    f" equivalent, data1 is {attrs}, data2 is {nodes2[node]}"
+                )
+                differences["different data"] = {
+                    "node": node,
+                    "attributes1": attrs,
+                    "attributes2": nodes2[node],
+                }
+        return differences
+
+    def edge_differences(edges1, edges2, db2):
+        # get edges1 entries not in edges2 and edges1 entries with different attrs than edges2
+        differences = {}
+        for edge, attrs in edges1.items():
+            if edge not in edges2:
+                logger.error(f"{edge} not present in {db2}")
+                differences["not_present"] = (edge, db2)
+            elif edges2[edge] != attrs and [
+                e
+                for e in attrs
+                if not (e.startswith("CRE-id") or e.startswith("Node-id"))
+            ]:
+                logger.error(
+                    f"Dataset 2 {db2} edge:{edge} has different data from its dataset 1"
+                    f" equivalent, data1 is {attrs}, data2 is {edges2[edge]}"
+                )
+                differences["different data"] = {
+                    "edge": edge,
+                    "attributes1": attrs,
+                    "attributes2": edges2[edge],
+                }
+        return differences
+
+    database1, app1, context1 = db_connect(path=db1)
+    sqla.create_all(app=app1)
+    database1.graph.graph = db.CRE_Graph.load_cre_graph(session=database1.session)
+    n1, e1 = make_hashtable(database1.graph.graph)
+    database1.session.remove()
+    context1.pop()
+
+    database2, app2, context2 = db_connect(path=db2)
+    sqla.create_all(app=app2)
+    database2.graph.graph = db.CRE_Graph.load_cre_graph(session=database2.session)
+    n2, e2 = make_hashtable(database2.graph.graph)
+    database2.session.remove()
+    context2.pop()
+
+    d1 = node_differences(n1, n2, db2)
+    d2 = node_differences(n2, n1, db1)
+    ed1 = edge_differences(e1, e2, db2)
+    ed2 = edge_differences(e2, e1, db1)
+    return [d1, d2, ed1, ed2]
+
+
 def owasp_metadata_to_cre(meta_file: str):
     """given a file with entries like below parse projects of type "tool" in file into "tool" data.
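For reference, compare_datasets returns four dicts: node differences in each direction, then edge differences in each direction, and run() treats any non-empty dict as inequality (exit code 1). A rough sketch of programmatic use, with placeholder connection strings:

    from application.cmd import cre_main

    d1, d2, ed1, ed2 = cre_main.compare_datasets(
        "sqlite:///dataset_a.sqlite", "sqlite:///dataset_b.sqlite"  # hypothetical datasets
    )
    if any([d1, d2, ed1, ed2]):
        print("datasets diverge:", d1, d2, ed1, ed2)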
diff --git a/application/database/db.py b/application/database/db.py
index 7915ce694..b8a8a8cd9 100644
--- a/application/database/db.py
+++ b/application/database/db.py
@@ -1,4 +1,5 @@
 import logging
+import hashlib
 import re
 from collections import Counter
 from itertools import permutations
@@ -12,6 +13,7 @@ from sqlalchemy import func
 from sqlalchemy.sql.expression import desc  # type: ignore
 import uuid
+from matplotlib import pyplot
 
 from .. import sqla  # type: ignore
 
@@ -28,6 +30,18 @@ def generate_uuid():
 
 
 class Node(BaseModel):  # type: ignore
+    def serialise(self):
+        return "".join(
+            [
+                self.name,
+                self.section or "",
+                self.subsection or "",
+                self.tags or "",
+                self.ntype,
+                self.description or "",
+                self.version or "",
+            ]
+        ).encode()
 
     __tablename__ = "node"
     id = sqla.Column(sqla.String, primary_key=True, default=generate_uuid)
@@ -58,6 +72,10 @@ class Node(BaseModel):  # type: ignore
 
 
 class CRE(BaseModel):  # type: ignore
+    def serialise(self):
+        return "".join(
+            [self.name, self.external_id or "", self.description or "", self.tags or ""]
+        ).encode()
 
     __tablename__ = "cre"
     id = sqla.Column(sqla.String, primary_key=True, default=generate_uuid)
@@ -123,6 +141,15 @@ class CRE_Graph:
     graph: nx.Graph = None
     __instance = None
 
+    def print_graph(self, png_path: str = None):
+        """Debug method to dump the graph: if png_path is provided it renders the
+        graph in png format there, if not, it returns the graph as a dict of dicts"""
+        if png_path:
+            nx.draw(self.graph, with_labels=True)
+            pyplot.savefig(png_path)
+            pyplot.show()
+        return nx.to_dict_of_dicts(self.graph)
+
    @classmethod
    def instance(cls, session):
        if cls.__instance is None:
@@ -143,7 +170,9 @@ def add_node(self, *args, **kwargs):
     def add_cre(cls, dbcre: CRE, graph: nx.DiGraph) -> nx.DiGraph:
         if dbcre:
             graph.add_node(
-                f"CRE: {dbcre.id}", internal_id=dbcre.id, external_id=dbcre.external_id
+                f"CRE-id: {dbcre.id}",
+                internal_id=dbcre.id,
+                external_id=dbcre.external_id,
             )
         else:
             logger.error("Called with dbcre being none")
@@ -152,11 +181,19 @@ def add_cre(cls, dbcre: CRE, graph: nx.DiGraph) -> nx.DiGraph:
     @classmethod
     def add_dbnode(cls, dbnode: Node, graph: nx.DiGraph) -> nx.DiGraph:
         if dbnode:
+            sum = hashlib.sha256(
+                dbnode.serialise()
+            )  # using md5 would have been way more performant but then I'd have to triage every beg-hunter's SAST scanner results
             graph.add_node(
-                "Node: " + str(dbnode.id),
+                f"Node-id: {dbnode.id}",
                 internal_id=dbnode.id,
                 name=dbnode.name,
                 section=dbnode.section,
+                subsection=dbnode.subsection,
+                type=dbnode.ntype,
+                description=dbnode.description,
+                version=dbnode.version,
+                infosum=sum.hexdigest(),
             )
         else:
             logger.error("Called with dbnode being none")
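The infosum attribute added here is what makes standards nodes comparable across databases: internal ids differ per database, but the sha256 over the serialised content fields is stable. Roughly, with an illustrative helper mirroring Node.serialise above:

    import hashlib

    def infosum_of(node):  # illustrative helper, not part of the patch
        payload = "".join(
            [node.name, node.section or "", node.subsection or "", node.tags or "",
             node.ntype, node.description or "", node.version or ""]
        ).encode()
        return hashlib.sha256(payload).hexdigest()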
@@ -177,7 +214,7 @@ def load_cre_graph(cls, session) -> nx.Graph:
                 logger.error(f"CRE {il.cre} does not exist?")
             graph = cls.add_cre(dbcre=cre, graph=graph)
 
-            graph.add_edge(f"CRE: {il.group}", f"CRE: {il.cre}", ltype=il.type)
+            graph.add_edge(f"CRE-id: {il.group}", f"CRE-id: {il.cre}", ltype=il.type)
 
         for lnk in session.query(Links).all():
             node = session.query(Node).filter(Node.id == lnk.node).first()
@@ -188,18 +225,22 @@ def load_cre_graph(cls, session) -> nx.Graph:
             cre = session.query(CRE).filter(CRE.id == lnk.cre).first()
             graph = cls.add_cre(dbcre=cre, graph=graph)
 
-            graph.add_edge(f"CRE: {lnk.cre}", f"Node: {str(lnk.node)}", ltype=lnk.type)
+            graph.add_edge(
+                f"CRE-id: {lnk.cre}", f"Node-id: {str(lnk.node)}", ltype=lnk.type
+            )
         return graph
 
 
 class Node_collection:
     graph: nx.Graph = None
-    session = sqla.session
+    session = None
 
-    def __init__(self) -> None:
-        self.graph = CRE_Graph.instance(sqla.session)
-        # self.graph = CRE_Graph.instance(session=sqla.session)
-        self.session = sqla.session
+    def __init__(self, session=sqla.session, graph: CRE_Graph = None) -> None:
+        if graph:
+            self.graph = graph
+        else:
+            self.graph = CRE_Graph.instance(sqla.session)
+        self.session = session
 
     def __get_external_links(self) -> List[Tuple[CRE, Node, str]]:
         external_links: List[Tuple[CRE, Node, str]] = []
@@ -889,14 +930,14 @@ def add_internal_link(
                 f" {group.external_id}:{group.name}"
                 f" == {cre.external_id}:{cre.name} ,adding"
             )
-            cycle = self.__introduces_cycle(f"CRE: {group.id}", f"CRE: {cre.id}")
+            cycle = self.__introduces_cycle(f"CRE-id: {group.id}", f"CRE-id: {cre.id}")
             if not cycle:
                 self.session.add(
                     InternalLinks(type=type.value, cre=cre.id, group=group.id)
                 )
                 self.session.commit()
                 self.graph.add_edge(
-                    f"CRE: {group.id}", f"CRE: {cre.id}", ltype=type.value
+                    f"CRE-id: {group.id}", f"CRE-id: {cre.id}", ltype=type.value
                 )
             else:
                 logger.warning(
@@ -935,7 +976,7 @@ def add_link(
                 return
         else:
             cycle = self.__introduces_cycle(
-                f"CRE: {cre.id}", f"Standard: {str(node.id)}"
+                f"CRE-id: {cre.id}", f"Node-id: {str(node.id)}"
             )
             if not cycle:
                 logger.debug(
@@ -945,7 +986,7 @@ def add_link(
                 )
                 self.session.add(Links(type=type.value, cre=cre.id, node=node.id))
                 self.graph.add_edge(
-                    f"CRE: {cre.id}", f"Node: {str(node.id)}", ltype=type.value
+                    f"CRE-id: {cre.id}", f"Node-id: {str(node.id)}", ltype=type.value
                 )
             else:
                 logger.warning(
@@ -964,8 +1005,8 @@ def find_path_between_nodes(
         this starts getting complicated when we have more linktypes"""
         res: bool = nx.has_path(
             self.graph.graph.to_undirected(),
-            "Node: " + str(node_source_id),
-            "Node: " + str(node_destination_id),
+            "Node-id: " + str(node_source_id),
+            "Node-id: " + str(node_destination_id),
         )
         return res
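Node_collection accepting an injected session (and optionally a pre-built graph) is what lets two collections point at two different databases instead of the global sqla.session; presumably this is also why cre_main.py now imports create_engine and scoped_session. A sketch of the injection pattern, where the session construction is an assumption shown with plain SQLAlchemy:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import scoped_session, sessionmaker

    engine = create_engine("sqlite:///second_dataset.sqlite")  # hypothetical second dataset
    session = scoped_session(sessionmaker(bind=engine))
    collection = Node_collection(session=session)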
diff --git a/application/tests/cre_main_test.py b/application/tests/cre_main_test.py
index f0447c7ac..ca66fdca9 100644
--- a/application/tests/cre_main_test.py
+++ b/application/tests/cre_main_test.py
@@ -1,10 +1,11 @@
+import copy
 import logging
 import os
 import shutil
 import tempfile
 import unittest
 from pprint import pprint
-from typing import Any, Dict, List
+from typing import Any, Dict, List, NamedTuple
 from unittest import mock
 from unittest.mock import Mock, patch
 
@@ -18,14 +19,15 @@ class TestMain(unittest.TestCase):
     def tearDown(self) -> None:
         for tmpdir in self.tmpdirs:
             shutil.rmtree(tmpdir)
+        for tmpfile in self.tmpfiles:
+            os.remove(tmpfile)
         sqla.session.remove()
         sqla.drop_all(app=self.app)
         self.app_context.pop()
 
     def setUp(self) -> None:
         self.tmpdirs: List[str] = []
+        self.tmpfiles: List[str] = []
         self.app = create_app(mode="test")
         sqla.create_all(app=self.app)
         self.app_context = self.app.app_context()
@@ -372,7 +374,7 @@ def test_add_from_spreadsheet(
         self.tmpdirs.append(dir)
         cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1]
 
-        mocked_db_connect.return_value = self.collection
+        mocked_db_connect.return_value = self.collection, self.app, self.app_context
         mocked_export.return_value = [
             defs.CRE(name="c0"),
             defs.Standard(name="s0", section="s1"),
@@ -415,7 +417,7 @@ def test_review_from_spreadsheet(
         loc = tempfile.mkstemp(dir=dir)[1]
         cache = tempfile.mkstemp(dir=dir)[1]
         mocked_prepare_for_review.return_value = (loc, cache)
-        mocked_db_connect.return_value = self.collection
+        mocked_db_connect.return_value = self.collection, self.app, self.app_context
         mocked_create_spreadsheet.return_value = "https://example.com/sheeet"
 
         mocked_export.return_value = [
@@ -467,7 +469,7 @@ def test_review_from_disk(
         loc = tempfile.mkstemp(dir=dir)[1]
         cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1]
         mocked_prepare_for_review.return_value = (loc, cache)
-        mocked_db_connect.return_value = self.collection
+        mocked_db_connect.return_value = self.collection, self.app, self.app_context
         mocked_get_standards_files_from_disk.return_value = [yml for i in range(0, 3)]
         mocked_export.return_value = [
             defs.CRE(name="c0"),
@@ -511,7 +513,7 @@ def test_add_from_disk(
         yml = tempfile.mkstemp(dir=dir, suffix=".yaml")[1]
         loc = tempfile.mkstemp(dir=dir)[1]
         cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1]
-        mocked_db_connect.return_value = self.collection
+        mocked_db_connect.return_value = self.collection, self.app, self.app_context
         mocked_get_standards_files_from_disk.return_value = [yml for i in range(0, 3)]
         mocked_export.return_value = [
             defs.CRE(name="c0"),
@@ -557,7 +559,7 @@ def test_review_osib_from_file(
         loc = tempfile.mkstemp(dir=dir)[1]
         cach = tempfile.mkstemp(dir=dir)[1]
         mocked_prepare_for_review.return_value = (loc, cach)
-        mocked_db_connect.return_value = self.collection
+        mocked_db_connect.return_value = self.collection, self.app, self.app_context
         mocked_read_osib_yaml.return_value = [{"osib": "osib"}]
         mocked_try_from_file.return_value = [
             Osib_tree(aliases=[Osib_id("t1")]),
@@ -619,7 +621,7 @@ def test_add_osib_from_file(
         osib_yaml = tempfile.mkstemp(dir=dir, suffix=".yaml")[1]
         loc = tempfile.mkstemp(dir=dir)[1]
         cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1]
-        mocked_db_connect.return_value = self.collection
+        mocked_db_connect.return_value = self.collection, self.app, self.app_context
         mocked_read_osib_yaml.return_value = [{"osib": "osib"}]
         mocked_try_from_file.return_value = [
             odefs.Osib_tree(aliases=[Osib_id("t1")]),
@@ -663,7 +665,7 @@ def test_export_to_osib(
         # osib_yaml = tempfile.mkstemp(dir=dir,suffix=".yaml")[1]
         loc = tempfile.mkstemp(dir=dir)[1]
         cache = tempfile.mkstemp(dir=dir, suffix=".sqlite")[1]
-        mocked_db_connect.return_value = self.collection
+        mocked_db_connect.return_value = self.collection, self.app, self.app_context
         mocked_cre2osib.return_value = odefs.Osib_tree(aliases=[Osib_id("t1")])
 
         mocked_export.return_value = [defs.CRE(name="c0")]
@@ -671,6 +673,88 @@ def test_export_to_osib(
         mocked_db_connect.assert_called_with(path=cache)
         mocked_cre2osib.assert_called_with([defs.CRE(name="c0")])
 
+    def test_compare_datasets(self):
+        _, t1 = tempfile.mkstemp(suffix="dataset1")
+        _, t2 = tempfile.mkstemp(suffix="dataset2")
+        _, tdiff = tempfile.mkstemp(suffix="datasetdiff")
+        self.tmpfiles.extend([t1, t2, tdiff])
+        self.maxDiff = None
+
+        c0 = defs.CRE(id="111-000", description="CREdesc", name="CREname")
+        s456 = defs.Standard(
+            subsection="4.5.6",
+            section="FooStand",
+            name="BarStand",
+            hyperlink="https://example.com",
+            tags=["a", "b", "c"],
+        )
+        c1 = defs.CRE(
+            id="111-001",
+            description="Groupdesc",
+            name="GroupName",
+            links=[defs.Link(document=s456)],
+        )
+        s_unlinked = defs.Standard(
+            subsection="4.5.6",
+            section="Unlinked",
+            name="Unlinked",
+            hyperlink="https://example.com",
+        )
+
+        connection_1, app1, context1 = main.db_connect(path=t1)
+        sqla.create_all(app=app1)
+        connection_1.graph.graph = db.CRE_Graph.load_cre_graph(connection_1.session)
+        connection_1.add_cre(c0)
+        connection_1.add_node(s_unlinked)
+        db_s456 = connection_1.add_node(s456)
+        connection_1.add_link(connection_1.add_cre(c1), db_s456)
+        infosum = [
+            connection_1.graph.graph.nodes[x].get("infosum")
+            for x in connection_1.graph.graph.nodes
+            if db_s456.id in x
+        ][0]
+
+        context1.pop()
+
+        self.assertEqual(
+            main.compare_datasets(t1, tdiff),
+            [
+                {"not_present": (c1.id, tdiff)},
+                {},
+                {"not_present": (f"{c1.id}-{infosum}", tdiff)},
+                {},
+            ],
+        )
+
+        connection_2, app2, context2 = main.db_connect(path=t2)
+        sqla.create_all(app=app2)
+        connection_2.graph.graph = db.CRE_Graph.load_cre_graph(sqla.session)
+        connection_2.add_cre(c0)
+        connection_2.add_node(s_unlinked)
+        connection_2.add_link(connection_2.add_cre(c1), connection_2.add_node(s456))
+        context2.pop()
+
+        connection_diff, appdiff, contextdiff = main.db_connect(path=tdiff)
+        sqla.create_all(app=appdiff)
+        connection_diff.graph.graph = db.CRE_Graph.load_cre_graph(
+            connection_diff.session
+        )
+        connection_diff.add_cre(c0)
+        connection_diff.add_cre(defs.CRE(id="000-111", name="asdfa232332sdf"))
+        contextdiff.pop()
+
+        self.assertEqual(main.compare_datasets("foo", "bar"), [{}, {}, {}, {}])
+        self.assertEqual(main.compare_datasets(t1, t2), [{}, {}, {}, {}])
+        self.assertEqual(
+            main.compare_datasets(t1, tdiff),
+            [
+                {"not_present": (c1.id, tdiff)},
+                {},
+                {"not_present": (f"{c1.id}-{infosum}", tdiff)},
+                {},
+            ],
+        )
+
     # def test_prepare_for_Review(self):
     #     raise NotImplementedError
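The expectations above pin down the diff key shapes: a missing CRE is keyed by its external id, while a missing link is keyed as "{cre_id}-{infosum}", i.e. the edge key make_hashtable builds from the CRE's external id and the linked node's content hash. For the c1 -> s456 link above that is, illustratively:

    missing_edge_key = f"{c1.id}-{infosum}"  # "111-001-<sha256 hexdigest of s456's serialised fields>"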
diff --git a/cre.py b/cre.py
index 31794f9b1..6fac238e6 100644
--- a/cre.py
+++ b/cre.py
@@ -142,7 +142,21 @@ def main() -> None:
         default=None,
         help="export all data into yaml files under the directory pointed to by this argument",
     )
-
+    parser.add_argument(
+        "--compare_datasets",
+        action="store_true",
+        help="compare the CRE datasets pointed to by --dataset1 and --dataset2",
+    )
+    parser.add_argument(
+        "--dataset1",
+        default=None,
+        help="used with --compare_datasets, dataset1",
+    )
+    parser.add_argument(
+        "--dataset2",
+        default=None,
+        help="used with --compare_datasets, dataset2",
+    )
     args = parser.parse_args()
 
     from application.cmd import cre_main
diff --git a/requirements.txt b/requirements.txt
index e2de0dfee..7abfe5893 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ click-didyoumean==0.0.3
 click-plugins==1.1.1
 click-repl==0.2.0
 coverage==5.5
+cycler==0.11.0
 dacite==1.6.0
 dataclasses-json==0.5.6
 decorator==4.4.2
@@ -24,6 +25,7 @@ Flask-Cors==3.0.10
 Flask-Migrate==3.1.0
 Flask-SQLAlchemy==2.5.1
 flask-sqlalchemy-stubs==0.2
+fonttools==4.33.3
 gitdb==4.0.5
 github2==0.6.2
 GitPython==3.1.9
@@ -40,19 +42,24 @@ isort==5.9.3
 itsdangerous==1.1.0
 Jinja2==2.11.3
 jsonschema==3.2.0
+kiwisolver==1.4.3
 lazy-object-proxy==1.6.0
 Mako==1.1.5
 MarkupSafe==1.1.1
 marshmallow==3.14.1
 marshmallow-enum==1.5.1
+matplotlib==3.5.2
 mccabe==0.6.1
 mypy==0.910
 mypy-extensions==0.4.3
 networkx==2.5.1
+numpy==1.23.0
 oauthlib==3.1.0
+packaging==21.3
 pathspec==0.9.0
 pbr==5.8.0
 pep517==0.8.2
+Pillow==9.1.1
 pip-autoremove==0.9.1
 platformdirs==2.2.0
 prompt-toolkit==3.0.19
diff --git a/scripts/data-equivalency.sh b/scripts/data-equivalency.sh
new file mode 100755
index 000000000..1d44c8bea
--- /dev/null
+++ b/scripts/data-equivalency.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+curr_dir=$(pwd)
+
+rm -rf import.dump latest.backup latest.dump latest.dump.1
+
+docker run -d -e POSTGRES_HOST_AUTH_METHOD=trust --rm --network host postgres:13.6
+sleep 10
+
+export PROD_DATABASE_URL=postgres://postgres@0.0.0.0:5432
+make migrate-upgrade
+make import-all
+
+rm -rf /tmp/diff_data
+mkdir -p /tmp/diff_data
+cd /tmp/diff_data
+
+heroku login && heroku pg:backups:download -a opencreorg
+
+source $curr_dir/venv/bin/activate
+python $curr_dir/cre.py --compare_datasets --dataset1=$PROD_DATABASE_URL --dataset2=sqlite://cres/db.sqlite
+exit $?
\ No newline at end of file
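Assuming a checkout where make import-all has populated both sides, the equivalency check can also be run by hand; an illustrative invocation mirroring the script's last step:

    python cre.py --compare_datasets \
        --dataset1="postgres://postgres@0.0.0.0:5432" \
        --dataset2="sqlite://cres/db.sqlite"
    echo $?  # 0 when the datasets match, 1 when they diverge

Since the script exits with the status of that python command, it can gate CI on the production and local datasets staying equivalent.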