diff --git a/migrations/007_add_logo_annotation_server_type_index.py b/migrations/007_add_logo_annotation_server_type_index.py new file mode 100644 index 0000000000..cfab78d813 --- /dev/null +++ b/migrations/007_add_logo_annotation_server_type_index.py @@ -0,0 +1,12 @@ +import peewee as pw +from peewee_migrate import Migrator + + +def migrate(migrator: Migrator, database: pw.Database, *, fake=False): # Registers SQL creating an index on logo_annotation (server_type). NOTE(review): PostgreSQL rejects CREATE INDEX CONCURRENTLY inside a transaction block - confirm the migration runner does not wrap this in one. + migrator.sql( + "CREATE INDEX CONCURRENTLY IF NOT EXISTS logo_annotation_server_type ON logo_annotation (server_type)" + ) + + +def rollback(migrator: Migrator, database: pw.Database, *, fake=False): # Registers SQL dropping the logo_annotation_server_type index if it exists. + migrator.sql("DROP INDEX IF EXISTS logo_annotation_server_type") diff --git a/robotoff/app/api.py b/robotoff/app/api.py index 28c9a62ca2..32103dad8e 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -1217,38 +1217,39 @@ def on_get( es_client = get_es_client() if logo_id is None: - logo_embeddings = list( - LogoEmbedding.select() - .join(LogoAnnotation) - .join(ImagePrediction) - .join(ImageModel) - .where( - ImageModel.server_type == server_type.name, - # Don't include logos from deleted images - ImageModel.deleted == False, # noqa - ) - .order_by(peewee.fn.Random()) - .limit(1) - ) - - if not logo_embeddings: + # Pick a random logo that has an embedding using TABLESAMPLE SYSTEM (20): + # PostgreSQL selects each block (page) of logo_embedding with a 20% + # chance and returns all rows of the selected blocks, i.e. roughly 20% + # of the rows. NOTE(review): the sample can be empty on a small table, + # and deleted images are not filtered out here - confirm this is intended. + # See https://www.postgresql.org/docs/current/sql-select.html
+ result = db.execute_sql( + """ + SELECT logo_id, embedding + FROM embedding.logo_embedding as t1 TABLESAMPLE SYSTEM (20) + JOIN logo_annotation AS t2 ON t1.logo_id = t2.id + WHERE t2.server_type = %s + LIMIT 1; + """, + (server_type.name,), + ).fetchone() + + if not result: resp.media = {"results": [], "count": 0, "query_logo_id": None} return - logo_embedding = logo_embeddings[0] - logo_id = logo_embedding.logo_id + logo_id, embedding = result else: logo_embedding = LogoEmbedding.get_or_none(logo_id=logo_id) if logo_embedding is None: resp.status = falcon.HTTP_404 return + embedding = logo_embedding.embedding raw_results = [ item - for item in knn_search( - es_client, logo_embedding.embedding, count, server_type=server_type - ) + for item in knn_search(es_client, embedding, count, server_type=server_type) if item[0] != logo_id ][:count] results = [{"logo_id": item[0], "distance": item[1]} for item in raw_results] diff --git a/robotoff/cli/main.py b/robotoff/cli/main.py index 0d0323025e..4d69a00cc0 100644 --- a/robotoff/cli/main.py +++ b/robotoff/cli/main.py @@ -654,6 +654,7 @@ def add_logo_to_ann( seen = set(int(x) for x in text_file_iter(existing_ids_path)) else: seen = get_stored_logo_ids(es_client) + logger.info("Number of existing logos: %d", len(seen)) added = 0