Skip to content

Commit 3e8ea4d

Browse files
authored
GH-731: Update output format of model sonar_core_1 (#815)
1 parent 31470ac commit 3e8ea4d

File tree

3 files changed

+22
-35
lines changed

3 files changed

+22
-35
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,11 +14,11 @@ dependencies = [
1414
"flake8>=7.3.0",
1515
'Click>=6.0',
1616
'python-crfsuite>=0.9.6',
17-
'nltk==3.8',
17+
'nltk>=3.8',
1818
'tqdm',
1919
'requests',
2020
'joblib',
21-
'scikit-learn==1.6.1',
21+
'scikit-learn>=1.6.1',
2222
'PyYAML',
2323
'underthesea_core==1.0.5'
2424
]

tests/pipeline/classification/test_sonar_core_1.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -12,30 +12,30 @@ def test_classify_null_cases(self):
1212

1313
def test_classify_simple_case(self):
1414
text = u"HLV ngoại đòi gần tỷ mỗi tháng dẫn dắt tuyển Việt Nam"
15-
actual = classify(text)[0]
15+
actual = classify(text)
1616
expected = "the_thao"
1717
self.assertEqual(actual, expected)
1818

1919
def test_classify_sports(self):
2020
text = u"Việt Nam giành chiến thắng 3-0 trước Thái Lan trong trận bán kết"
21-
actual = classify(text)[0]
21+
actual = classify(text)
2222
expected = "the_thao"
2323
self.assertEqual(actual, expected)
2424

2525
def test_classify_technology(self):
2626
text = u"Apple ra mắt iPhone mới với nhiều tính năng đột phá"
27-
actual = classify(text)[0]
27+
actual = classify(text)
2828
expected = "vi_tinh"
2929
self.assertEqual(actual, expected)
3030

3131
def test_classify_health(self):
3232
text = u"Phát hiện vaccine mới chống lại virus corona"
33-
actual = classify(text)[0]
33+
actual = classify(text)
3434
expected = "suc_khoe"
3535
self.assertEqual(actual, expected)
3636

3737
def test_classify_business(self):
3838
text = u"Thị trường chứng khoán tăng điểm mạnh trong phiên sáng nay"
39-
actual = classify(text)[0]
39+
actual = classify(text)
4040
expected = "kinh_doanh"
4141
self.assertEqual(actual, expected)

underthesea/pipeline/classification/sonar_core_1/__init__.py

Lines changed: 15 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,21 @@
11
import os
22
import sys
33
import urllib.request
4+
import warnings
45
import zipfile
56
from os.path import dirname
67

78
import joblib
89

10+
# Suppress scikit-learn version warnings
11+
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
12+
913
sys.path.insert(0, dirname(dirname(__file__)))
1014
classifier = None
1115

1216

1317
def _ensure_model_exists():
14-
"""Get model from latest run or download from release"""
18+
"""Download and extract sonar_core_1 model if not exists"""
1519
model_dir = os.path.expanduser("~/.underthesea/models")
1620
model_file = os.path.join(model_dir, "sonar_core_1.pkl")
1721
labels_file = os.path.join(model_dir, "sonar_core_1_labels.txt")
@@ -20,26 +24,6 @@ def _ensure_model_exists():
2024
if os.path.exists(model_file) and os.path.exists(labels_file):
2125
return model_file, labels_file
2226

23-
# Try to get from latest local run first
24-
runs_dir = os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "extensions", "labs", "classify_ml", "sonar_core_1", "runs")
25-
if os.path.exists(runs_dir):
26-
import glob
27-
run_dirs = glob.glob(os.path.join(runs_dir, "[0-9]*_[0-9]*"))
28-
if run_dirs:
29-
latest_run = sorted(run_dirs)[-1]
30-
latest_model = os.path.join(latest_run, "models", "model.pkl")
31-
latest_labels = os.path.join(latest_run, "models", "labels.txt")
32-
33-
if os.path.exists(latest_model) and os.path.exists(latest_labels):
34-
print(f"Using model from latest local run: {latest_run}")
35-
os.makedirs(model_dir, exist_ok=True)
36-
37-
# Copy from latest run
38-
import shutil
39-
shutil.copy2(latest_model, model_file)
40-
shutil.copy2(latest_labels, labels_file)
41-
return model_file, labels_file
42-
4327
print("Downloading Sonar Core 1 model...")
4428

4529
# Create directories
@@ -75,7 +59,7 @@ def _ensure_model_exists():
7559

7660
def _load_labels(labels_file):
7761
"""Load label mapping from file"""
78-
with open(labels_file, 'r', encoding='utf-8') as f:
62+
with open(labels_file, encoding='utf-8') as f:
7963
labels = [line.strip() for line in f.readlines()]
8064
return labels
8165

@@ -87,7 +71,7 @@ def classify(text):
8771
text (str): Vietnamese text to classify
8872
8973
Returns:
90-
list: List containing the predicted category (for compatibility with underthesea API)
74+
str: Predicted category
9175
"""
9276
global classifier
9377

@@ -96,9 +80,9 @@ def classify(text):
9680
classifier = joblib.load(model_file)
9781
classifier.labels = _load_labels(labels_file)
9882

99-
# Make prediction
83+
# Make prediction and convert to plain string
10084
prediction = classifier.predict([text])[0]
101-
return [prediction]
85+
return str(prediction)
10286

10387

10488
def classify_with_confidence(text):
@@ -121,13 +105,16 @@ def classify_with_confidence(text):
121105
prediction = classifier.predict([text])[0]
122106
probabilities = classifier.predict_proba([text])[0]
123107

124-
# Get top 3 predictions with probabilities
108+
# Get top 3 predictions with probabilities, convert to plain strings
125109
classes = classifier.classes_
126110
prob_dict = dict(zip(classes, probabilities))
127111
top_predictions = sorted(prob_dict.items(), key=lambda x: x[1], reverse=True)[:3]
128112

113+
# Convert numpy strings to plain strings
114+
top_predictions = [(str(label), float(prob)) for label, prob in top_predictions]
115+
129116
return {
130-
'prediction': prediction,
131-
'confidence': top_predictions[0][1],
117+
'prediction': str(prediction),
118+
'confidence': float(top_predictions[0][1]),
132119
'top_3': top_predictions
133120
}

0 commit comments

Comments (0)