updating code

visual-layer · Sep 6, 2023 · 7d919bf · 7d919bf
1 parent bf302fa
commit 7d919bf
Show file tree

Hide file tree

Showing 11 changed files with 1,404 additions and 340 deletions.
diff --git a/fastdup/__init__.py b/fastdup/__init__.py
diff --git a/fastdup/definitions.py b/fastdup/definitions.py
@@ -45,9 +45,11 @@
 DEFAULT_MODEL_FEATURE_WIDTH = 576
 HIGH_ACCURACY_MODEL_FEATURE_WIDTH = 960
 
+PRINTOUT_BAR_WIDTH = 88
+
 DEFUALT_METRIC_ZERO = 0
 DEFAULT_METRIC_MINUS_ONE = -1
-VERSION__ = "0.927"
+VERSION__ = "1.38"
 
 GITHUB_URL = "https://github.com/visual-layer/fastdup/issues"
 
@@ -56,7 +58,7 @@
 "and special care needs to select the right backend for your OS/Hardware combination. You can install matplot lib using "
 "python3.8 -m pip install matplotlib matplotlib-inline. (change the python3.8 to your python version). "
 
-SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".heic", ".heif"]
+SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".tiff", ".heic", ".heif", ".bmp", ".webp"]
 SUPPORTED_VID_FORMATS = ["mp4", ".avi"]
 
 RUN_ALL = 0
@@ -76,6 +78,17 @@
 DINOV2S_MODEL_DIM = 384
 DINOV2B_MODEL = "https://vl-company-website.s3.us-east-2.amazonaws.com/model_artifacts/dinov2/dinov2_vitb14.onnx"
 DINOV2B_MODEL_DIM = 768
+CLIP_MODEL = "https://clip-as-service.s3.us-east-2.amazonaws.com/models-436c69702d61732d53657276696365/onnx/ViT-B-32/visual.onnx"
+CLIP_MODEL_DIM = 512
+CLIP_MODEL2 = "https://clip-as-service.s3.us-east-2.amazonaws.com/models-436c69702d61732d53657276696365/onnx/ViT-L-14@336px/visual.onnx"
+CLIP_MODEL2_DIM = 768
+CLIP_MODEL14 = "https://clip-as-service.s3.us-east-2.amazonaws.com/models-436c69702d61732d53657276696365/onnx/ViT-L-14/visual.onnx"
+CLIP_MODEL14_DIM = 768
+
+EFFICIENTNET_MODEL = "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx"
+EFFICIENTNET_MODEL_DIM = 1000
+RESNET50_MODEL = "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx"
+RESNET50_MODEL_DIM = 1000
 
 CAPTION_MODEL1_NAME = 'automatic'
 CAPTION_MODEL2_NAME = 'blip'

diff --git a/fastdup/engine.py b/fastdup/engine.py
@@ -29,8 +29,8 @@ class Fastdup(FastdupController):
  df_cc, cc_info = fd.connected_components()
  """
 
- def __init__(self, work_dir: Union[str, Path], input_dir: Union[str, Path] = None):
- super().__init__(work_dir, input_dir=input_dir)
+ def __init__(self, work_dir: Union[str, Path]=None, input_dir: Union[str, Path] = None):
+ super().__init__(work_dir=work_dir, input_dir=input_dir)
  self.vis = FastdupVisualizer(self)
 
  def run(self,
@@ -149,10 +149,10 @@ def run(self,
  license='' if license is None else license,
  high_accuracy=high_accuracy)
  if (model_path is not None):
- if 'dinov2s' not in model_path and 'dinov2b' not in model_path:
+ if 'dinov2s' not in model_path and 'dinov2b' not in model_path and 'resnet50' not in model_path and 'efficientnet' not in model_path and 'clip' not in model_path and 'clip336' not in model_path and 'clip14' not in model_path:
  assert 'd' in kwargs, 'Please provide d parameter to indicate the model output dimension'
  fastdup_func_params['model_path'] = model_path
  fastdup_func_params.update(kwargs)
 
- super().run(annotations=annotations, input_dir=input_dir, subset=subset, data_type=data_type,
+ return super().run(annotations=annotations, input_dir=input_dir, subset=subset, data_type=data_type,
  overwrite=overwrite, embeddings=embeddings, **fastdup_func_params)
diff --git a/fastdup/fastdup_controller.py b/fastdup/fastdup_controller.py
diff --git a/fastdup/fastdup_create.py b/fastdup/fastdup_create.py
@@ -6,7 +6,7 @@
 
 
 @v1_sentry_handler
-def create(work_dir: Union[str, Path], input_dir: Union[str, Path] = None) -> Fastdup:
+def create(work_dir: Union[str, Path] = None, input_dir: Union[str, Path, list] = None) -> Fastdup:
  """
  Create fastdup analyzer instance.
  Usage example

diff --git a/fastdup/fastdup_visualizer.py b/fastdup/fastdup_visualizer.py
diff --git a/fastdup/galleries.py b/fastdup/galleries.py
diff --git a/fastdup/image.py b/fastdup/image.py
@@ -5,13 +5,23 @@
 
 import os
 import cv2
+import fastdup.definitions
 import numpy as np
 import base64
 import io
+
+import pandas as pd
 from fastdup.definitions import *
 from fastdup.sentry import fastdup_capture_exception
 import tarfile
 import platform
+import pathlib
+from PIL import Image
+from pillow_heif import register_heif_opener
+
+register_heif_opener()
+
+
 
 def safe_replace(path):
  return path.replace('/','_').replace('\\','_').replace(":",'_')
@@ -98,6 +108,32 @@ def truncate_folder_name(path):
  return None
 
 
+
+def inner_read(img1_path):
+ if img1_path.lower().endswith('.heic') or img1_path.lower().endswith('.heif'):
+ img = Image.open(img1_path)
+ assert img is not None, f"Failed to open image from {img1_path}"
+ img = np.array(img)
+ channels = img.shape[-1] if img.ndim == 3 else 1
+ if channels == 1:
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+ elif channels == 4:
+ img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+ else:
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+ else:
+ img = cv2.imread(img1_path, cv2.IMREAD_UNCHANGED)
+ assert img is not None, f"Failed to open image from {img1_path}"
+ if img.dtype == 'uint16':
+ img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
+ channels = img.shape[-1] if img.ndim == 3 else 1
+ if channels == 1:
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+ elif channels == 4:
+ img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
+ return img
+
 def fastdup_imread(img1_path, input_dir, kwargs):
  """
  Read an image from local file, or from a tar file, or from s3/minio path using minio client mc
@@ -108,22 +144,22 @@ def fastdup_imread(img1_path, input_dir, kwargs):
  Returns:
  img1 (np.array): the image
  """
- assert img1_path is not None, f"img1_path should not be None {input_dir}, {kwargs}"
-
+ assert not pd.isnull(img1_path), f"img1_path should not be None {img1_path} {input_dir}, {kwargs}"
  is_minio_or_s3 = False
- if input_dir is not None:
+ if input_dir is not None and (isinstance(input_dir, str) or isinstance(input_dir, pathlib.Path)):
+ if input_dir.startswith('~/'):
+ input_dir = os.path.expanduser(input_dir)
  if not input_dir.startswith("s3://") and not input_dir.startswith("minio://"):
  assert os.path.exists(input_dir), "Failed to find input_dir: " + input_dir
  else:
  is_minio_or_s3 = True
 
-
+ if img1_path.startswith('~/'):
+ img1_path = os.path.expanduser(img1_path)
  if os.path.exists(img1_path):
- img = cv2.imread(img1_path, cv2.IMREAD_UNCHANGED)
- if img is not None:
- if img.dtype == 'uint16':
- img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
- img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+ img = inner_read(img1_path)
+
+
  return img
  elif ('/' +S3_TEMP_FOLDER + '/' in img1_path or '/' + S3_TEST_TEMP_FOLDER + '/' in img1_path) and \
  '.tar/' in img1_path:
@@ -150,38 +186,68 @@ def fastdup_imread(img1_path, input_dir, kwargs):
  minio_prefix = "/".join(input_dir.replace("minio://", "").split('/')[:2])
  #print('minio_prefix', minio_prefix)
  download_minio(minio_prefix + '/' + local_dir_no_temp + '/' + os.path.basename(img1_path), S3_TEMP_FOLDER)
- ret =  cv2.imread(os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path)))
+ ret = inner_read(os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path)))
  assert ret is not None, f"Failed to read image {os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path))}"
  return ret
  elif input_dir.startswith("s3://"):
  local_dir_no_temp = truncate_folder_name(os.path.dirname(img1_path))
  s3_prefix = 's3://' + "/".join(input_dir.replace("s3://", "").split('/')[:1])
  #print('s3_prefix', s3_prefix)
  download_s3(s3_prefix + '/' + local_dir_no_temp + '/' + os.path.basename(img1_path), S3_TEMP_FOLDER)
- ret = cv2.imread(os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path)))
- assert ret is not None, f"Failed to read image {os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path))}"
+ ret = inner_read(os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path)))
  return ret
  #Failed to read image1 ..\milvus_vector_db\data\images\..\milvus_vector_db\data\images\Egyptian_Mau_210.jpg
  elif img1_path.startswith(input_dir) and len(img1_path) >= len(input_dir) +2:
  suffix = img1_path[len(input_dir):]
  if input_dir in suffix and os.path.exists(suffix):
- img = cv2.imread(suffix, cv2.IMREAD_UNCHANGED)
- if img is not None:
- if img.dtype == 'uint16':
- img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
- img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+ img = inner_read(suffix)
  return img
  elif "''" in img1_path: # try to handle french and other languages where c side doubles the '' otherwise pandas can't read it
  new_img1_path = img1_path.replace("''","")
  if os.path.exists(new_img1_path):
- img = cv2.imread(new_img1_path, cv2.IMREAD_UNCHANGED)
+ img = inner_read(new_img1_path)
  return img
 
 
  print('Failed to read image from img_path', img1_path)
  return None
 
 
+def check_valid_image_extension(filename):
+ # Check whether a file name ends with an image extension
+ # Required by OpenCV imwrite
+ return any([filename.lower().endswith(ext) for ext in SUPPORTED_IMG_FORMATS])
+
+
+def fastdup_imwrite(local_file, im):
+ has_extension = check_valid_image_extension(local_file)
+ if has_extension:
+ ret = cv2.imwrite(local_file, im)
+ else:
+ local_file_wext = local_file + '.jpg'
+ ret = cv2.imwrite(local_file_wext, im)
+ assert ret, f"Failed to save img to {local_file} most likely filename is too long for the OS"
+
+ # Rename back if extension was added
+ os.rename(local_file_wext, local_file)
+ assert os.path.isfile(local_file), "Failed to save img to " + local_file
+
+ if ret == False and len(local_file) >= 254:
+ try:
+ import uuid
+ import shutil
+ file, ext = os.path.splitext(local_file)
+ tmp_filename = str(uuid.uuid4()) + ext
+ ret = cv2.imwrite(tmp_filename, im)
+ if os.path.exists(local_file):
+ os.unlink(local_file)
+ shutil.move(tmp_filename, local_file)
+ finally:
+ assert ret, f"Failed to save img to {local_file} most likely filename is too long for the OS"
+ elif ret == False:
+ assert ret, f"Failed to save img to {local_file}"
+ assert os.path.isfile(local_file), "Failed to save img to " + local_file
+
 def get_type(str):
  if 'train' in str:
  return 'train'
@@ -282,17 +348,7 @@ def draw_text(img, text,
 
  return text_size, img
 
-def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding_box_func=None, input_dir=None, kwargs=None):
- #v1 = 'id_to_filename_func' in kwargs
- id_from, id_to = row['from'], row['to']
- #if v1:
- # assert not isinstance(id_from, str), f"Wrong type {row}"
-
- #suffix_from, suffix_to = (f'_{id_from}', f'_{id_to}') if v1 else ('', '')
- #if v1:
- # id_to_filename_func = kwargs['id_to_filename_func']
- # row[['from','to']] = [id_to_filename_func(row['from']), id_to_filename_func(row['to'])]
-
+def create_triplet_img(index, row, work_dir, save_path, extract_filenames, get_bounding_box_func=None, input_dir=None, kwargs=None):
  img1_path, img2_path, distance, ptype = extract_filenames(row, work_dir, save_path, kwargs)
 
  img1 = fastdup_imread(img1_path, input_dir, kwargs)
@@ -301,6 +357,10 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
  assert img1 is not None, f"Failed to read image1 {img1_path} {str(input_dir)}"
  assert img2 is not None, f"Failed to read image2 {img2_path} {str(input_dir)}"
 
+ if 'crop_filename_from' in row and 'crop_filename_to' in row:
+ id_from, id_to = row['crop_filename_from'], row['crop_filename_to']
+ else:
+ id_from, id_to = row['from'], row['to']
  img1 = plot_bounding_box(img1, get_bounding_box_func, id_from)
  img2 = plot_bounding_box(img2, get_bounding_box_func, id_to)
 
@@ -317,9 +377,20 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
  if rimg1.shape != rimg2.shape: # combination of grayscale and color
  if len(rimg1.shape) == 2:
  rimg1 = cv2.cvtColor(rimg1, cv2.COLOR_GRAY2RGB)
+ elif len(rimg1.shape) ==3 and rimg1.shape[2] == 4:
+ rimg1 = cv2.cvtColor(rimg1, cv2.COLOR_RGBA2RGB)
  if len(rimg2.shape) == 2:
  rimg2 = cv2.cvtColor(rimg2, cv2.COLOR_GRAY2RGB)
- cimage = cv2.addWeighted(rimg1,alpha,rimg2,1-alpha,0)
+ elif len(rimg1.shape) ==3 and rimg2.shape[2] == 4:
+ rimg2 = cv2.cvtColor(rimg2, cv2.COLOR_RGBA2RGB)
+
+ error_weighted = False
+ try:
+ cimage = cv2.addWeighted(rimg1,alpha,rimg2,1-alpha,0)
+ except Exception as ex:
+ error_weighted = True
+ fastdup_capture_exception("create_triplet_image", ex, True, f"Dimes are {rimg1.shape} {rimg2.shape}")
+
 
  hierarchical_run = kwargs is not None and 'hierarchical_run' in kwargs and kwargs['hierarchical_run']
  text1 = os.path.splitext(os.path.basename(img1_path))[0]
@@ -330,11 +401,11 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
 
  (w, h),nimg1 = draw_text(rimg1, text1, font_scale=1, pos=(10, 10))
  (w, h),nimg2 = draw_text(rimg2, text2, font_scale=1, pos=(10, 10))
- (w, h),cimage = draw_text(cimage, 'blended image', font_scale=1, pos=(10, 10))
+ if not error_weighted:
+ (w, h),cimage = draw_text(cimage, 'blended image', font_scale=1, pos=(10, 10))
+ assert cimage.shape[0] > 0 and cimage.shape[1] > 0
 
- assert cimage.shape[0] > 0 and cimage.shape[1] > 0
-
- if hierarchical_run:
+ if hierarchical_run or error_weighted:
  hcon_img = hconcat_resize_min([nimg1, nimg2])
  else:
  hcon_img = hconcat_resize_min([nimg1, nimg2, cimage])
@@ -355,11 +426,9 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
  lazy_load = 'lazy_load' in kwargs and kwargs['lazy_load']
  if lazy_load:
  os.makedirs(os.path.join(save_path, 'images'), exist_ok=True)
- hcon_img_path = f'{save_path}/images/{pid}.jpg'
+ hcon_img_path = f'{save_path}/images/{pid}_{index}.jpg'
  else:
- hcon_img_path = f'{save_path}/{pid}.jpg'
- cv2.imwrite(hcon_img_path, hcon_img)
- assert os.path.exists(hcon_img_path), f"Failed to write image to {hcon_img_path}"
-
+ hcon_img_path = f'{save_path}/{pid}_{index}.jpg'
+ fastdup_imwrite(hcon_img_path, hcon_img)
  return hcon_img, hcon_img_path
 
diff --git a/fastdup/sentry.py b/fastdup/sentry.py
@@ -74,7 +74,7 @@ def init_sentry():
  except:
  pass
 
-def fastdup_capture_exception(section, e, warn_only=False):
+def fastdup_capture_exception(section, e, warn_only=False, extra=""):
  if not warn_only:
  traceback.print_exc()
  if 'SENTRY_OPT_OUT' not in os.environ:
@@ -84,7 +84,10 @@ def fastdup_capture_exception(section, e, warn_only=False):
  scope.set_tag("token", token)
  scope.set_tag("platform", platform.platform())
  scope.set_tag("platform.version", platform.version())
- scope.set_tag("python", sys.version)
+ scope.set_tag("python", sys.version.strip().replace("\n", " "))
+ scope.set_tag("production", "FASTDUP_PRODUCTION" in os.environ)
+ if extra != "":
+ scope.set_tag("extra", extra)
  capture_exception(e, scope=scope)
 
 
@@ -106,6 +109,7 @@ def fastdup_performance_capture(section, start_time):
  scope.set_tag("platform", platform.platform())
  scope.set_tag("platform.version", platform.version())
  scope.set_tag("python", sys.version.strip().replace("\n", " "))
+ scope.set_tag("production", "FASTDUP_PRODUCTION" in os.environ)
  sentry_sdk.capture_message("Performance", scope=scope)
  finally:
  sentry_sdk.flush(timeout=5)

diff --git a/fastdup/tensorboard_projector.py b/fastdup/tensorboard_projector.py
@@ -30,7 +30,7 @@ def save_labels_tsv(labels, filepath, log_dir):
  for label in labels:
  f.write('{}\n'.format(label))
 
-def generate_sprite_image(img_path, sample_size, log_dir, get_label_func = None, h = 0, w = 0, alternative_filename = None, alternative_width=None, max_width=None):
+def generate_sprite_image(img_path, sample_size, log_dir, get_label_func = None, h = 0, w = 0, alternative_filename = None, alternative_width=None, max_width=None, kwargs={}):
  # Generate sprite image
  images_pil = []
 
@@ -54,6 +54,13 @@ def generate_sprite_image(img_path, sample_size, log_dir, get_label_func = None,
  if (alternative_width < sample_size):
  sample_size = alternative_width
  height = 1
+ elif kwargs and 'force_width' in kwargs and 'force_height' in kwargs:
+ assert isinstance(kwargs['force_width'], int), "force_width must be an integer"
+ assert isinstance(kwargs['force_height'], int), "force_height must be an integer"
+ if kwargs['force_width'] * kwargs['force_height'] > len(img_path):
+ print(f"Warning: missing images for a full grid, requested {kwargs['force_width'] * kwargs['force_height']} got {len(img_path)}")
+ NUM_IMAGES_WIDTH = kwargs['force_width']
+ height = kwargs['force_width']
  else:
  NUM_IMAGES_WIDTH = int(1.4*np.ceil(np.sqrt(min(sample_size, len(img_path)))))
  divs = int(np.ceil(min(sample_size,len(img_path)) / NUM_IMAGES_WIDTH))