Deprecate tokenize parameter in field definition (#602)
aprudhomme authored Oct 25, 2023
1 parent 26bbde0 commit 3027a0c
Showing 75 changed files with 1,223 additions and 1,316 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -112,7 +112,7 @@ cat registerFields.json
"field":
[
{ "name": "doc_id", "type": "ATOM", "storeDocValues": true},
{ "name": "vendor_name", "type": "TEXT" , "search": true, "store": true, "tokenize": true},
{ "name": "vendor_name", "type": "TEXT" , "search": true, "store": true},
{ "name": "license_no", "type": "INT", "multiValued": true, "storeDocValues": true}
]
}
2 changes: 1 addition & 1 deletion clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto
@@ -549,7 +549,7 @@ message Field {
bool store = 4; // True if the value should be stored.
bool storeDocValues = 5; // Whether to index the value into doc values.
bool sort = 6; // True if the value should be indexed into doc values for sorting.
bool tokenize = 7; // True if the value should be tokenized.
bool tokenize = 7 [deprecated = true]; // This is now determined from the field type
bool group = 8; // True if the value should be indexed into doc values for grouping.
bool multiValued = 9; // True if this field may sometimes have more than one value.
bool highlight = 10 [deprecated = true]; // This parameter would be ignored
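With tokenize deprecated in the Field message, clients simply omit it; whether values are tokenized now follows from the field type (this change makes TEXT fields always tokenized and ATOM fields never tokenized). Below is a minimal sketch of building such a field with the Java gRPC stubs; the wrapper class name is illustrative, while the setters all appear in the tests later in this diff.

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;

public class VendorNameFieldSketch {
  // Sketch: a TEXT field defined without the deprecated tokenize flag.
  // Tokenization is implied by FieldType.TEXT after this change.
  static Field vendorNameField() {
    return Field.newBuilder()
        .setName("vendor_name")
        .setType(FieldType.TEXT)
        .setSearch(true) // index the field so it can be queried
        .setStore(true)  // keep the original value for retrieval
        .build();
  }
}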
4 changes: 2 additions & 2 deletions docker-compose-config/registerFields.json
@@ -5,13 +5,13 @@
{
"name": "doc_id",
"type": "ATOM",
"search": true,
"storeDocValues": true},
{
"name": "vendor_name",
"type": "TEXT" ,
"search": true,
"store": true,
"tokenize": true
"store": true
},
{
"name": "license_no",
1 change: 0 additions & 1 deletion docs/highlighting.rst
@@ -20,7 +20,6 @@ Fast-vector-highlighter:

* The field must be stored, i.e. have "store: true"
* The field must have term vectors with positions and offsets, i.e. have "termVectors: TERMS_POSITIONS_OFFSETS"
* While not mandatory, the field must be tokenized for the highlights to be useful, i.e. have "tokenize: true"

Query Syntax
------------
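Given the fast-vector-highlighter requirements above (a stored field with term vectors carrying positions and offsets), a field set up for highlighting might look like the following sketch. The wrapper class name is illustrative; the Field setters and the TermVectors enum come from this repository's gRPC definitions, and the generated setTermVectors setter is assumed from the termVectors field read elsewhere in this diff.

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;
import com.yelp.nrtsearch.server.grpc.TermVectors;

public class HighlightableFieldSketch {
  // Sketch: a TEXT field usable with the fast-vector-highlighter.
  static Field highlightableField(String name) {
    return Field.newBuilder()
        .setName(name)
        .setType(FieldType.TEXT) // TEXT fields are tokenized, so highlights stay useful
        .setSearch(true)
        .setStore(true) // requirement: the field must be stored
        .setTermVectors(TermVectors.TERMS_POSITIONS_OFFSETS) // requirement: positions and offsets
        .build();
  }
}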
9 changes: 3 additions & 6 deletions docs/mapping.rst
@@ -31,7 +31,7 @@ Text search types
^^^^^^^^^^^^^^^^^^^^^^^^^^^

* TEXT
The traditional field for full-text content. Text fields are best suited for unstructured but human-readable content. Content of this field can be tokenized and analyzed.
The traditional field for full-text content. Text fields are best suited for unstructured but human-readable content. Content of this field will be tokenized and analyzed.

Object types
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -60,11 +60,8 @@ This parameter indicates whether this field is indexed so that certain types of
* storeDocValues
This parameter indicates whether this field is stored as doc value so that we can retrieve this field in the NRTSearch response.

* tokenize
This parameter is only for text. This parameter indicates whether this text field should be tokenized.

* indexAnalyzer
This parameter is only for text usually together with tokenize. This parameter specifies the analyzer to use for this field during indexing.
This parameter is only for text. This parameter specifies the analyzer to use for this field during indexing.

* searchAnalyzer
This parameter is only for text usually together with tokenize. This parameter specifies the analyzer to use for this field during searching.
This parameter is only for text. This parameter specifies the analyzer to use for this field during searching.
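To make the analyzer parameters above concrete, here is a hedged sketch of a TEXT field with distinct indexing and search analyzers. The setIndexAnalyzer/setSearchAnalyzer setters, the Analyzer message, and the "standard" predefined name are assumptions inferred from the proto and the "analyzer": {"predefined": ...} JSON used elsewhere in this change, not lines taken from this diff.

import com.yelp.nrtsearch.server.grpc.Analyzer;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;

public class AnalyzerFieldSketch {
  // Sketch only: the analyzer name and setter names are assumptions.
  static Field reviewTextField() {
    Analyzer predefined = Analyzer.newBuilder().setPredefined("standard").build();
    return Field.newBuilder()
        .setName("review_text")
        .setType(FieldType.TEXT)
        .setSearch(true)
        .setIndexAnalyzer(predefined)  // applied when documents are indexed
        .setSearchAnalyzer(predefined) // applied when queries are analyzed
        .build();
  }
}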
2 changes: 1 addition & 1 deletion docs/starter.rst
@@ -46,7 +46,7 @@ Register Fields
"field":
[
{ "name": "doc_id", "type": "ATOM", "storeDocValues": true},
{ "name": "vendor_name", "type": "TEXT" , "search": true, "store": true, "tokenize": true},
{ "name": "vendor_name", "type": "TEXT" , "search": true, "store": true},
{ "name": "license_no", "type": "INT", "multiValued": true, "storeDocValues": true}
]
}
1 change: 0 additions & 1 deletion example-plugin/src/test/resources/register_fields.json
@@ -5,7 +5,6 @@
"name": "field1",
"type": "TEXT",
"search": true,
"tokenize": true,
"analyzer": {
"predefined": "plugin_analyzer"
}
2,209 changes: 1,106 additions & 1,103 deletions grpc-gateway/luceneserver.pb.go

Large diffs are not rendered by default.

@@ -86,27 +86,15 @@ protected DocValuesType parseDocValuesType(Field requestField) {

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
// This is how things are currently expected, if the field is searchable it
// gets the same properties set as a text field. Even if it is not set
// searchable, it is expected to at least have the DOCS index option so
// it can be deleted by Term. This doesn't seem right and we should think
// about changing it.
// TODO: make this configurable and default to true, this is hard to do with the
// current grpc field type
fieldType.setOmitNorms(true);
fieldType.setTokenized(false);
if (requestField.getSearch()) {
super.setSearchProperties(fieldType, requestField);
} else {
fieldType.setOmitNorms(true);
fieldType.setTokenized(false);
fieldType.setIndexOptions(IndexOptions.DOCS);
setIndexOptions(requestField.getIndexOptions(), fieldType, IndexOptions.DOCS);
}
}

@Override
public boolean isSearchable() {
// even if the search property is not set this field may have index options,
// see setSearchProperties.
return super.isSearchable() || fieldType.indexOptions() != IndexOptions.NONE;
}

@Override
protected Analyzer parseIndexAnalyzer(Field requestField) {
return keywordAnalyzer;
@@ -47,10 +47,10 @@ protected IdFieldDef(String name, Field requestField) {
* @param requestField field properties to validate
*/
protected void validateRequest(Field requestField) {
if (requestField.getMultiValued() || requestField.getTokenize()) {
if (requestField.getMultiValued()) {
throw new IllegalArgumentException(
String.format(
"field: %s cannot have multivalued fields or tokenization as it's an _ID field",
"field: %s cannot have multivalued fields as it's an _ID field",
requestField.getName()));
}
if (!requestField.getStore() && !requestField.getStoreDocValues()) {
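After this change an _ID field is rejected only if it is multivalued or has neither store nor storeDocValues set; the tokenize check is gone. A sketch mirroring the test helper later in this diff (the wrapper class name is illustrative):

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;

public class IdFieldSketch {
  // Sketch: a valid _ID field, single valued, retrievable through doc values.
  static Field docIdField() {
    return Field.newBuilder()
        .setName("doc_id")
        .setType(FieldType._ID)
        .setStoreDocValues(true) // at least one of store / storeDocValues is required
        .build();
  }
}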
@@ -17,6 +17,7 @@

import com.yelp.nrtsearch.server.grpc.FacetType;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.IndexOptions;
import com.yelp.nrtsearch.server.grpc.TermVectors;
import com.yelp.nrtsearch.server.luceneserver.Constants;
import com.yelp.nrtsearch.server.luceneserver.analysis.AnalyzerCreator;
@@ -45,7 +46,6 @@
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
@@ -187,42 +187,6 @@ protected Analyzer parseSearchAnalyzer(Field requestField) {
}
}

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
if (requestField.getSearch()) {
switch (requestField.getIndexOptions()) {
case DOCS:
fieldType.setIndexOptions(IndexOptions.DOCS);
break;
case DOCS_FREQS:
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
break;
case DOCS_FREQS_POSITIONS:
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
break;
case DOCS_FREQS_POSITIONS_OFFSETS:
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
break;
default:
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
break;
}

switch (requestField.getTermVectors()) {
case TERMS_POSITIONS_OFFSETS_PAYLOADS:
fieldType.setStoreTermVectorPayloads(true);
case TERMS_POSITIONS_OFFSETS:
fieldType.setStoreTermVectorOffsets(true);
case TERMS_POSITIONS:
fieldType.setStoreTermVectorPositions(true);
case TERMS:
fieldType.setStoreTermVectors(true);
}
}
fieldType.setTokenized(requestField.getTokenize());
fieldType.setOmitNorms(requestField.getOmitNorms());
}

/**
* Get an optional analyzer to use during indexing. This option may be empty if analysis is not
* possible for this field. For example, if it is not searchable.
@@ -395,4 +359,29 @@ public GlobalOrdinalLookup getOrdinalLookup(IndexReader reader) throws IOException

return ordinalLookup;
}

static void setIndexOptions(
IndexOptions grpcIndexOptions,
FieldType fieldType,
org.apache.lucene.index.IndexOptions defaultOptions) {
switch (grpcIndexOptions) {
case DOCS:
fieldType.setIndexOptions(org.apache.lucene.index.IndexOptions.DOCS);
break;
case DOCS_FREQS:
fieldType.setIndexOptions(org.apache.lucene.index.IndexOptions.DOCS_AND_FREQS);
break;
case DOCS_FREQS_POSITIONS:
fieldType.setIndexOptions(
org.apache.lucene.index.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
break;
case DOCS_FREQS_POSITIONS_OFFSETS:
fieldType.setIndexOptions(
org.apache.lucene.index.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
break;
default:
fieldType.setIndexOptions(defaultOptions);
break;
}
}
}
@@ -16,6 +16,8 @@
package com.yelp.nrtsearch.server.luceneserver.field;

import com.yelp.nrtsearch.server.grpc.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

/** Field class for 'TEXT' field type. */
public class TextFieldDef extends TextBaseFieldDef {
@@ -27,4 +29,25 @@ public TextFieldDef(String name, Field requestField) {
public String getType() {
return "TEXT";
}

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
if (requestField.getSearch()) {
setIndexOptions(
requestField.getIndexOptions(), fieldType, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);

switch (requestField.getTermVectors()) {
case TERMS_POSITIONS_OFFSETS_PAYLOADS:
fieldType.setStoreTermVectorPayloads(true);
case TERMS_POSITIONS_OFFSETS:
fieldType.setStoreTermVectorOffsets(true);
case TERMS_POSITIONS:
fieldType.setStoreTermVectorPositions(true);
case TERMS:
fieldType.setStoreTermVectors(true);
}
}
fieldType.setTokenized(true);
fieldType.setOmitNorms(requestField.getOmitNorms());
}
}
@@ -49,7 +49,6 @@ public class AddFieldsIndexingTest {
.setName("field2")
.setStoreDocValues(true)
.setSearch(true)
.setTokenize(true)
.setType(FieldType.TEXT)
.build());

@@ -64,7 +63,6 @@
.setName("field4")
.setStoreDocValues(true)
.setSearch(true)
.setTokenize(true)
.setType(FieldType.TEXT)
.build());

@@ -45,7 +45,6 @@ public class AddFieldsSimilarityTest {
.setName("field1")
.setStoreDocValues(true)
.setSearch(true)
.setTokenize(true)
.setType(FieldType.TEXT)
.setSimilarity("classic")
.build());
@@ -56,7 +55,6 @@
.setName("field2")
.setStoreDocValues(true)
.setSearch(true)
.setTokenize(true)
.setType(FieldType.TEXT)
.setSimilarity("classic")
.build());
@@ -128,7 +128,6 @@ public void testAddUpdateAndSearch() throws IOException, InterruptedException {
.setStoreDocValues(true)
.setSearch(true)
.setMultiValued(true)
.setTokenize(true)
.build())
.build());
// 2 docs addDocuments
@@ -274,24 +273,11 @@ public void testDuplicateIdFieldInIndexState() throws IOException, InterruptedException
@Test(expected = StatusRuntimeException.class)
public void testMultiValued() throws Exception {
try {
registerFields(List.of(getFieldBuilder("doc_id", true, true, true, false)));
registerFields(List.of(getFieldBuilder("doc_id", true, true, true)));
} catch (RuntimeException e) {
String message =
"INVALID_ARGUMENT: error while trying to RegisterFields for index: test_index\n"
+ "field: doc_id cannot have multivalued fields or tokenization as it's an _ID field";
assertEquals(message, e.getMessage());
throw e;
}
}

@Test(expected = StatusRuntimeException.class)
public void testTokenized() throws Exception {
try {
registerFields(List.of(getFieldBuilder("doc_id", true, true, false, true)));
} catch (RuntimeException e) {
String message =
"INVALID_ARGUMENT: error while trying to RegisterFields for index: test_index\n"
+ "field: doc_id cannot have multivalued fields or tokenization as it's an _ID field";
+ "field: doc_id cannot have multivalued fields as it's an _ID field";
assertEquals(message, e.getMessage());
throw e;
}
@@ -300,7 +286,7 @@ public void testTokenized() throws Exception {
@Test(expected = StatusRuntimeException.class)
public void testStoreAndDocValuesFalse() throws IOException {
try {
registerFields(List.of(getFieldBuilder("doc_id", false, false, false, false)));
registerFields(List.of(getFieldBuilder("doc_id", false, false, false)));
} catch (RuntimeException e) {
String message =
"INVALID_ARGUMENT: error while trying to RegisterFields for index: test_index\n"
@@ -312,21 +298,21 @@ public void testStoreAndDocValuesFalse() throws IOException {

@Test
public void testStoreTrueAndDocValuesFalse() throws IOException {
registerFields(List.of(getFieldBuilder("doc_id", false, true, false, false)));
registerFields(List.of(getFieldBuilder("doc_id", false, true, false)));
}

@Test
public void testStoreFalseAndDocValuesTrue() throws IOException {
registerFields(List.of(getFieldBuilder("doc_id", true, false, false, false)));
registerFields(List.of(getFieldBuilder("doc_id", true, false, false)));
}

@Test(expected = StatusRuntimeException.class)
public void testMultipleDocIds() throws Exception {
try {
registerFields(
List.of(
getFieldBuilder("doc_id", true, true, false, false),
getFieldBuilder("doc_id_2", true, true, false, false)));
getFieldBuilder("doc_id", true, true, false),
getFieldBuilder("doc_id_2", true, true, false)));
} catch (RuntimeException e) {
String message =
"INVALID_ARGUMENT: error while trying to RegisterFields for index: test_index\n"
@@ -337,18 +323,13 @@ public void testMultipleDocIds() throws Exception {
}

private Field getFieldBuilder(
String fieldName,
boolean storeDocValues,
boolean store,
boolean multiValued,
boolean tokenized) {
String fieldName, boolean storeDocValues, boolean store, boolean multiValued) {
return Field.newBuilder()
.setName(fieldName)
.setStoreDocValues(storeDocValues)
.setStore(store)
.setType(FieldType._ID)
.setMultiValued(multiValued)
.setTokenize(tokenized)
.build();
}

@@ -327,7 +327,6 @@ public void testUpdateFieldsBasic() throws Exception {
.setStoreDocValues(true)
.setSearch(true)
.setMultiValued(true)
.setTokenize(true)
.build())
.build());
assertTrue(reply.getResponse().contains("new_text_field"));
@@ -392,7 +391,6 @@ public void testRegisterVirtualAndNonVirtualFields() throws Exception {
.setStoreDocValues(true)
.setSearch(true)
.setMultiValued(true)
.setTokenize(true)
.build())
.build());
assertTrue(reply.getResponse().contains("new_virtual_field"));
@@ -456,7 +454,6 @@ public void testSearchPostUpdate() throws IOException, InterruptedException {
.setStoreDocValues(true)
.setSearch(true)
.setMultiValued(true)
.setTokenize(true)
.build())
.build());
