[MRG] fix identifier munging for local databases (#145)

ctb · web-flow · commit f1b54685b4df · 2022-01-30T08:31:43.000-08:00
* fix identifier split

* fix identifier split x 2

* fix identifier munging in notebooks
diff --git a/genome_grist/copy_local_genomes.py b/genome_grist/copy_local_genomes.py
@@ -41,8 +41,11 @@ def main():
             record_name = record.name
             break
 
-        record_name = record_name.split(' ', 1)
-        ident, remainder = record_name
+        ident, *remainder = record_name.split(' ', 1)
+        if remainder:           # is list, needs to be string
+            remainder = remainder[0]
+        else:
+            remainder = ident
 
         print(f"read identifer '{ident}' and name '{remainder}'")
 
diff --git a/genome_grist/notebooks/report-gather.ipynb b/genome_grist/notebooks/report-gather.ipynb
@@ -82,7 +82,14 @@
     "\n",
     "# connect gather_df to all_df and left_df using 'genome_id'\n",
     "def fix_name(x):\n",
-    "    return \"_\".join(x.split('_')[:2]).split('.')[0]\n",
+    "    # pick off first space-delimited name as identifier\n",
+    "    x = x.split(' ')[0]\n",
+    "    \n",
+    "    # eliminate stuff after the period, too.\n",
+    "    x = x.split('.')[0]\n",
+    "    \n",
+    "    return x\n",
+    "    #return \"_\".join(x.split('_')[:2]).split('.')[0]\n",
     "\n",
     "gather_df['genome_id'] = gather_df['name'].apply(fix_name)\n",
     "names_df['genome_id'] = names_df['ident'].apply(fix_name)"
diff --git a/genome_grist/notebooks/report-mapping.ipynb b/genome_grist/notebooks/report-mapping.ipynb
@@ -78,7 +78,15 @@
     "\n",
     "# connect gather_df to all_df and left_df using 'genome_id'\n",
     "def fix_name(x):\n",
-    "    return \"_\".join(x.split('_')[:2]).split('.')[0]\n",
+    "    # pick off first space-delimited name as identifier\n",
+    "    x = x.split(' ')[0]\n",
+    "    \n",
+    "    # eliminate stuff after the period, too.\n",
+    "    x = x.split('.')[0]\n",
+    "    \n",
+    "    return x\n",
+    "    #return \"_\".join(x.split('_')[:2]).split('.')[0]\n",
+    "\n",
     "\n",
     "gather_df['genome_id'] = gather_df['name'].apply(fix_name)\n",
     "names_df['genome_id'] = names_df['ident'].apply(fix_name)"