Deprecate tokenize parameter in field definition (#602)
aprudhomme authored Oct 25, 2023
1 parent 26bbde0 commit 3027a0c
Showing 75 changed files with 1,223 additions and 1,316 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -112,7 +112,7 @@ cat registerFields.json
"field":
[
{ "name": "doc_id", "type": "ATOM", "storeDocValues": true},
{ "name": "vendor_name", "type": "TEXT" , "search": true, "store": true, "tokenize": true},
{ "name": "vendor_name", "type": "TEXT" , "search": true, "store": true},
{ "name": "license_no", "type": "INT", "multiValued": true, "storeDocValues": true}
]
}
2 changes: 1 addition & 1 deletion clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto
@@ -549,7 +549,7 @@ message Field {
bool store = 4; // True if the value should be stored.
bool storeDocValues = 5; // Whether to index the value into doc values.
bool sort = 6; // True if the value should be indexed into doc values for sorting.
bool tokenize = 7; // True if the value should be tokenized.
bool tokenize = 7 [deprecated = true]; // This is now determined from the field type
bool group = 8; // True if the value should be indexed into doc values for grouping.
bool multiValued = 9; // True if this field may sometimes have more than one value.
bool highlight = 10 [deprecated = true]; // This parameter would be ignored
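With tokenize deprecated in the Field message, clients simply omit it; whether values are tokenized now follows from the field type (this change makes TEXT fields always tokenized and ATOM fields never tokenized). Below is a minimal sketch of building such a field with the Java gRPC stubs; the wrapper class name is illustrative, while the setters all appear in the tests later in this diff.

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;

public class VendorNameFieldSketch {
  // Sketch: a TEXT field defined without the deprecated tokenize flag.
  // Tokenization is implied by FieldType.TEXT after this change.
  static Field vendorNameField() {
    return Field.newBuilder()
        .setName("vendor_name")
        .setType(FieldType.TEXT)
        .setSearch(true) // index the field so it can be queried
        .setStore(true)  // keep the original value for retrieval
        .build();
  }
}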
4 changes: 2 additions & 2 deletions docker-compose-config/registerFields.json
@@ -5,13 +5,13 @@
{
"name": "doc_id",
"type": "ATOM",
"search": true,
"storeDocValues": true},
{
"name": "vendor_name",
"type": "TEXT" ,
"search": true,
"store": true,
"tokenize": true
"store": true
},
{
"name": "license_no",
1 change: 0 additions & 1 deletion docs/highlighting.rst
@@ -20,7 +20,6 @@ Fast-vector-highlighter:

* The field must be stored, i.e. have "store: true"
* The field must have term vectors with positions and offsets, i.e. have "termVectors: TERMS_POSITIONS_OFFSETS"
* While not mandatory, the field must be tokenized for the highlights to be useful, i.e. have "tokenize: true"

Query Syntax
------------
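Given the fast-vector-highlighter requirements above (a stored field with term vectors carrying positions and offsets), a field set up for highlighting might look like the following sketch. The wrapper class name is illustrative; the Field setters and the TermVectors enum come from this repository's gRPC definitions, and the generated setTermVectors setter is assumed from the termVectors field read elsewhere in this diff.

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;
import com.yelp.nrtsearch.server.grpc.TermVectors;

public class HighlightableFieldSketch {
  // Sketch: a TEXT field usable with the fast-vector-highlighter.
  static Field highlightableField(String name) {
    return Field.newBuilder()
        .setName(name)
        .setType(FieldType.TEXT) // TEXT fields are tokenized, so highlights stay useful
        .setSearch(true)
        .setStore(true) // requirement: the field must be stored
        .setTermVectors(TermVectors.TERMS_POSITIONS_OFFSETS) // requirement: positions and offsets
        .build();
  }
}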
9 changes: 3 additions & 6 deletions docs/mapping.rst
@@ -31,7 +31,7 @@ Text search types
^^^^^^^^^^^^^^^^^^^^^^^^^^^

* TEXT
The traditional field for full-text content. Text fields are best suited for unstructured but human-readable content. Content of this field can be tokenized and analyzed.
The traditional field for full-text content. Text fields are best suited for unstructured but human-readable content. Content of this field will be tokenized and analyzed.

Object types
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -60,11 +60,8 @@ This parameter indicates whether this field is indexed so that certain types of
* storeDocValues
This parameter indicates whether this field is stored as doc value so that we can retrieve this field in the NRTSearch response.

* tokenize
This parameter is only for text. This parameter indicates whether this text field should be tokenized.

* indexAnalyzer
This parameter is only for text usually together with tokenize. This parameter specifies the analyzer to use for this field during indexing.
This parameter is only for text. This parameter specifies the analyzer to use for this field during indexing.

* searchAnalyzer
This parameter is only for text usually together with tokenize. This parameter specifies the analyzer to use for this field during searching.
This parameter is only for text. This parameter specifies the analyzer to use for this field during searching.
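To make the analyzer parameters above concrete, here is a hedged sketch of a TEXT field with distinct indexing and search analyzers. The setIndexAnalyzer/setSearchAnalyzer setters, the Analyzer message, and the "standard" predefined name are assumptions inferred from the proto and the "analyzer": {"predefined": ...} JSON used elsewhere in this change, not lines taken from this diff.

import com.yelp.nrtsearch.server.grpc.Analyzer;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;

public class AnalyzerFieldSketch {
  // Sketch only: the analyzer name and setter names are assumptions.
  static Field reviewTextField() {
    Analyzer predefined = Analyzer.newBuilder().setPredefined("standard").build();
    return Field.newBuilder()
        .setName("review_text")
        .setType(FieldType.TEXT)
        .setSearch(true)
        .setIndexAnalyzer(predefined)  // applied when documents are indexed
        .setSearchAnalyzer(predefined) // applied when queries are analyzed
        .build();
  }
}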
2 changes: 1 addition & 1 deletion docs/starter.rst
@@ -46,7 +46,7 @@ Register Fields
"field":
[
{ "name": "doc_id", "type": "ATOM", "storeDocValues": true},
{ "name": "vendor_name", "type": "TEXT" , "search": true, "store": true, "tokenize": true},
{ "name": "vendor_name", "type": "TEXT" , "search": true, "store": true},
{ "name": "license_no", "type": "INT", "multiValued": true, "storeDocValues": true}
]
}
1 change: 0 additions & 1 deletion example-plugin/src/test/resources/register_fields.json
@@ -5,7 +5,6 @@
"name": "field1",
"type": "TEXT",
"search": true,
"tokenize": true,
"analyzer": {
"predefined": "plugin_analyzer"
}
2,209 changes: 1,106 additions & 1,103 deletions grpc-gateway/luceneserver.pb.go

Large diffs are not rendered by default.

@@ -86,27 +86,15 @@ protected DocValuesType parseDocValuesType(Field requestField) {

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
// This is how things are currently expected, if the field is searchable it
// gets the same properties set as a text field. Even if it is not set
// searchable, it is expected to at least have the DOCS index option so
// it can be deleted by Term. This doesn't seem right and we should think
// about changing it.
// TODO: make this configurable and default to true, this is hard to do with the
// current grpc field type
fieldType.setOmitNorms(true);
fieldType.setTokenized(false);
if (requestField.getSearch()) {
super.setSearchProperties(fieldType, requestField);
} else {
fieldType.setOmitNorms(true);
fieldType.setTokenized(false);
fieldType.setIndexOptions(IndexOptions.DOCS);
setIndexOptions(requestField.getIndexOptions(), fieldType, IndexOptions.DOCS);
}
}

@Override
public boolean isSearchable() {
// even if the search property is not set this field may have index options,
// see setSearchProperties.
return super.isSearchable() || fieldType.indexOptions() != IndexOptions.NONE;
}

@Override
protected Analyzer parseIndexAnalyzer(Field requestField) {
return keywordAnalyzer;
@@ -47,10 +47,10 @@ protected IdFieldDef(String name, Field requestField) {
* @param requestField field properties to validate
*/
protected void validateRequest(Field requestField) {
if (requestField.getMultiValued() || requestField.getTokenize()) {
if (requestField.getMultiValued()) {
throw new IllegalArgumentException(
String.format(
"field: %s cannot have multivalued fields or tokenization as it's an _ID field",
"field: %s cannot have multivalued fields as it's an _ID field",
requestField.getName()));
}
if (!requestField.getStore() && !requestField.getStoreDocValues()) {
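After this change an _ID field is rejected only if it is multivalued or has neither store nor storeDocValues set; the tokenize check is gone. A sketch mirroring the test helper later in this diff (the wrapper class name is illustrative):

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;

public class IdFieldSketch {
  // Sketch: a valid _ID field, single valued, retrievable through doc values.
  static Field docIdField() {
    return Field.newBuilder()
        .setName("doc_id")
        .setType(FieldType._ID)
        .setStoreDocValues(true) // at least one of store / storeDocValues is required
        .build();
  }
}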
@@ -17,6 +17,7 @@

import com.yelp.nrtsearch.server.grpc.FacetType;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.IndexOptions;
import com.yelp.nrtsearch.server.grpc.TermVectors;
import com.yelp.nrtsearch.server.luceneserver.Constants;
import com.yelp.nrtsearch.server.luceneserver.analysis.AnalyzerCreator;
@@ -45,7 +46,6 @@
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
@@ -187,42 +187,6 @@ protected Analyzer parseSearchAnalyzer(Field requestField) {
}
}

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
if (requestField.getSearch()) {
switch (requestField.getIndexOptions()) {
case DOCS:
fieldType.setIndexOptions(IndexOptions.DOCS);
break;
case DOCS_FREQS:
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
break;
case DOCS_FREQS_POSITIONS:
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
break;
case DOCS_FREQS_POSITIONS_OFFSETS:
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
break;
default:
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
break;
}

switch (requestField.getTermVectors()) {
case TERMS_POSITIONS_OFFSETS_PAYLOADS:
fieldType.setStoreTermVectorPayloads(true);
case TERMS_POSITIONS_OFFSETS:
fieldType.setStoreTermVectorOffsets(true);
case TERMS_POSITIONS:
fieldType.setStoreTermVectorPositions(true);
case TERMS:
fieldType.setStoreTermVectors(true);
}
}
fieldType.setTokenized(requestField.getTokenize());
fieldType.setOmitNorms(requestField.getOmitNorms());
}

/**
* Get an optional analyzer to use during indexing. This option may be empty if analysis is not
* possible for this field. For example, if it is not searchable.
@@ -395,4 +359,29 @@ public GlobalOrdinalLookup getOrdinalLookup(IndexReader reader) throws IOException

return ordinalLookup;
}

static void setIndexOptions(
IndexOptions grpcIndexOptions,
FieldType fieldType,
org.apache.lucene.index.IndexOptions defaultOptions) {
switch (grpcIndexOptions) {
case DOCS:
fieldType.setIndexOptions(org.apache.lucene.index.IndexOptions.DOCS);
break;
case DOCS_FREQS:
fieldType.setIndexOptions(org.apache.lucene.index.IndexOptions.DOCS_AND_FREQS);
break;
case DOCS_FREQS_POSITIONS:
fieldType.setIndexOptions(
org.apache.lucene.index.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
break;
case DOCS_FREQS_POSITIONS_OFFSETS:
fieldType.setIndexOptions(
org.apache.lucene.index.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
break;
default:
fieldType.setIndexOptions(defaultOptions);
break;
}
}
}
@@ -16,6 +16,8 @@
package com.yelp.nrtsearch.server.luceneserver.field;

import com.yelp.nrtsearch.server.grpc.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

/** Field class for 'TEXT' field type. */
public class TextFieldDef extends TextBaseFieldDef {
@@ -27,4 +29,25 @@ public TextFieldDef(String name, Field requestField) {
public String getType() {
return "TEXT";
}

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
if (requestField.getSearch()) {
setIndexOptions(
requestField.getIndexOptions(), fieldType, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

switch (requestField.getTermVectors()) {
case TERMS_POSITIONS_OFFSETS_PAYLOADS:
fieldType.setStoreTermVectorPayloads(true);
case TERMS_POSITIONS_OFFSETS:
fieldType.setStoreTermVectorOffsets(true);
case TERMS_POSITIONS:
fieldType.setStoreTermVectorPositions(true);
case TERMS:
fieldType.setStoreTermVectors(true);
}
}
fieldType.setTokenized(true);
fieldType.setOmitNorms(requestField.getOmitNorms());
}
}
@@ -49,7 +49,6 @@ public class AddFieldsIndexingTest {
.setName("field2")
.setStoreDocValues(true)
.setSearch(true)
.setTokenize(true)
.setType(FieldType.TEXT)
.build());

@@ -64,7 +63,6 @@
.setName("field4")
.setStoreDocValues(true)
.setSearch(true)
.setTokenize(true)
.setType(FieldType.TEXT)
.build());

@@ -45,7 +45,6 @@ public class AddFieldsSimilarityTest {
.setName("field1")
.setStoreDocValues(true)
.setSearch(true)
.setTokenize(true)
.setType(FieldType.TEXT)
.setSimilarity("classic")
.build());
@@ -56,7 +55,6 @@
.setName("field2")
.setStoreDocValues(true)
.setSearch(true)
.setTokenize(true)
.setType(FieldType.TEXT)
.setSimilarity("classic")
.build());
@@ -128,7 +128,6 @@ public void testAddUpdateAndSearch() throws IOException, InterruptedException {
.setStoreDocValues(true)
.setSearch(true)
.setMultiValued(true)
.setTokenize(true)
.build())
.build());
// 2 docs addDocuments
@@ -274,24 +273,11 @@ public void testDuplicateIdFieldInIndexState() throws IOException, InterruptedException
@Test(expected = StatusRuntimeException.class)
public void testMultiValued() throws Exception {
try {
registerFields(List.of(getFieldBuilder("doc_id", true, true, true, false)));
registerFields(List.of(getFieldBuilder("doc_id", true, true, true)));
} catch (RuntimeException e) {
String message =
"INVALID_ARGUMENT: error while trying to RegisterFields for index: test_index\n"
+ "field: doc_id cannot have multivalued fields or tokenization as it's an _ID field";
assertEquals(message, e.getMessage());
throw e;
}
}

@Test(expected = StatusRuntimeException.class)
public void testTokenized() throws Exception {
try {
registerFields(List.of(getFieldBuilder("doc_id", true, true, false, true)));
} catch (RuntimeException e) {
String message =
"INVALID_ARGUMENT: error while trying to RegisterFields for index: test_index\n"
+ "field: doc_id cannot have multivalued fields or tokenization as it's an _ID field";
+ "field: doc_id cannot have multivalued fields as it's an _ID field";
assertEquals(message, e.getMessage());
throw e;
}
@@ -300,7 +286,7 @@ public void testTokenized() throws Exception {
@Test(expected = StatusRuntimeException.class)
public void testStoreAndDocValuesFalse() throws IOException {
try {
registerFields(List.of(getFieldBuilder("doc_id", false, false, false, false)));
registerFields(List.of(getFieldBuilder("doc_id", false, false, false)));
} catch (RuntimeException e) {
String message =
"INVALID_ARGUMENT: error while trying to RegisterFields for index: test_index\n"
@@ -312,21 +298,21 @@ public void testStoreAndDocValuesFalse() throws IOException {

@Test
public void testStoreTrueAndDocValuesFalse() throws IOException {
registerFields(List.of(getFieldBuilder("doc_id", false, true, false, false)));
registerFields(List.of(getFieldBuilder("doc_id", false, true, false)));
}

@Test
public void testStoreFalseAndDocValuesTrue() throws IOException {
registerFields(List.of(getFieldBuilder("doc_id", true, false, false, false)));
registerFields(List.of(getFieldBuilder("doc_id", true, false, false)));
}

@Test(expected = StatusRuntimeException.class)
public void testMultipleDocIds() throws Exception {
try {
registerFields(
List.of(
getFieldBuilder("doc_id", true, true, false, false),
getFieldBuilder("doc_id_2", true, true, false, false)));
getFieldBuilder("doc_id", true, true, false),
getFieldBuilder("doc_id_2", true, true, false)));
} catch (RuntimeException e) {
String message =
"INVALID_ARGUMENT: error while trying to RegisterFields for index: test_index\n"
@@ -337,18 +323,13 @@ public void testMultipleDocIds() throws Exception {
}

private Field getFieldBuilder(
String fieldName,
boolean storeDocValues,
boolean store,
boolean multiValued,
boolean tokenized) {
String fieldName, boolean storeDocValues, boolean store, boolean multiValued) {
return Field.newBuilder()
.setName(fieldName)
.setStoreDocValues(storeDocValues)
.setStore(store)
.setType(FieldType._ID)
.setMultiValued(multiValued)
.setTokenize(tokenized)
.build();
}

@@ -327,7 +327,6 @@ public void testUpdateFieldsBasic() throws Exception {
.setStoreDocValues(true)
.setSearch(true)
.setMultiValued(true)
.setTokenize(true)
.build())
.build());
assertTrue(reply.getResponse().contains("new_text_field"));
@@ -392,7 +391,6 @@ public void testRegisterVirtualAndNonVirtualFields() throws Exception {
.setStoreDocValues(true)
.setSearch(true)
.setMultiValued(true)
.setTokenize(true)
.build())
.build());
assertTrue(reply.getResponse().contains("new_virtual_field"));
@@ -456,7 +454,6 @@ public void testSearchPostUpdate() throws IOException, InterruptedException {
.setStoreDocValues(true)
.setSearch(true)
.setMultiValued(true)
.setTokenize(true)
.build())
.build());
