diff --git a/automod/rules/keyword.go b/automod/rules/keyword.go index 8d5caa395..ce616206d 100644 --- a/automod/rules/keyword.go +++ b/automod/rules/keyword.go @@ -12,16 +12,10 @@ import ( ) func BadWordPostRule(c *automod.RecordContext, post *appbsky.FeedPost) error { - isJapanese := false - for _, lang := range post.Langs { - if lang == "ja" || strings.HasPrefix(lang, "ja-") { - isJapanese = true - } - } for _, tok := range helpers.ExtractTextTokensPost(post) { word := keyword.SlugIsExplicitSlur(tok) // used very frequently in a reclaimed context - if word != "" && word != "faggot" && word != "tranny" && word != "coon" && !(word == "kike" && isJapanese) { + if word != "" && word != "faggot" && word != "tranny" && word != "coon" { c.AddRecordFlag("bad-word-text") c.ReportRecord(automod.ReportReasonRude, fmt.Sprintf("possible bad word in post text or alttext: %s", word)) //c.Notify("slack") @@ -30,11 +24,6 @@ func BadWordPostRule(c *automod.RecordContext, post *appbsky.FeedPost) error { // de-pluralize tok = strings.TrimSuffix(tok, "s") if c.InSet("worst-words", tok) { - // skip this specific term, if used in a Japanese language post - if isJapanese && tok == "kike" { - continue - } - c.AddRecordFlag("bad-word-text") c.ReportRecord(automod.ReportReasonRude, fmt.Sprintf("possible bad word in post text or alttext: %s", tok)) //c.Notify("slack") diff --git a/cmd/palomar/Dockerfile.opensearch b/cmd/palomar/Dockerfile.opensearch index 079d4db29..61a0d9eb3 100644 --- a/cmd/palomar/Dockerfile.opensearch +++ b/cmd/palomar/Dockerfile.opensearch @@ -1,3 +1,2 @@ FROM opensearchproject/opensearch:2.13.0 RUN /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-icu -RUN /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-kuromoji diff --git a/cmd/palomar/README.md b/cmd/palomar/README.md index d830c2c91..300af9433 100644 --- a/cmd/palomar/README.md +++ b/cmd/palomar/README.md @@ -64,7 +64,7 @@ Response: ## Development Quickstart -Run an 
ephemeral opensearch instance on local port 9200, with SSL disabled, and the `analysis-icu` and `analysis-kuromoji` plugins installed, using docker: +Run an ephemeral opensearch instance on local port 9200, with SSL disabled, and the `analysis-icu` plugin installed, using docker: docker build -f Dockerfile.opensearch . -t opensearch-palomar diff --git a/cmd/palomar/README.opensearch.md b/cmd/palomar/README.opensearch.md index 553c7b310..4d34662e4 100644 --- a/cmd/palomar/README.opensearch.md +++ b/cmd/palomar/README.opensearch.md @@ -1,16 +1,14 @@ # Basic OpenSearch Operations -We use OpenSearch version 2.13+, with the `analysis-icu` and `analysis-kuromoji` plugins. These are included automatically on the AWS hosted version of Opensearch, otherwise you need to install: +We use OpenSearch version 2.13+, with the `analysis-icu` plugin. It is included automatically on the AWS hosted version of Opensearch, otherwise you need to install: sudo /usr/share/opensearch/bin/opensearch-plugin install analysis-icu - sudo /usr/share/opensearch/bin/opensearch-plugin install analysis-kuromoji sudo service opensearch restart If you are trying to use Elasticsearch 7.10 instead of OpenSearch, you can install the plugin with: sudo /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-icu - sudo /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-kuromoji sudo service elasticsearch restart ## Local Development diff --git a/search/japanese.go b/search/japanese.go deleted file mode 100644 index 9784bf100..000000000 --- a/search/japanese.go +++ /dev/null @@ -1,14 +0,0 @@ -package search - -import ( - "regexp" -) - -// U+3040 - U+30FF: hiragana and katakana (Japanese only) -// U+FF66 - U+FF9F: half-width katakana (Japanese only) -var japaneseRegex = regexp.MustCompile(`[\x{3040}-\x{30ff}\x{ff66}-\x{ff9f}]`) - -// helper to check if an input string contains any Japanese-specific characters (hiragana or katakana). 
will not trigger on CJK characters which are not specific to Japanese -func containsJapanese(text string) bool { - return japaneseRegex.MatchString(text) -} diff --git a/search/japanese_test.go b/search/japanese_test.go deleted file mode 100644 index df7606ad2..000000000 --- a/search/japanese_test.go +++ /dev/null @@ -1,23 +0,0 @@ -package search - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestJapaneseDetection(t *testing.T) { - assert := assert.New(t) - - assert.False(containsJapanese("")) - assert.False(containsJapanese("basic english")) - assert.False(containsJapanese("basic english")) - - assert.True(containsJapanese("学校から帰って熱いお風呂に入ったら力一杯がんばる")) - assert.True(containsJapanese("パリ")) - assert.True(containsJapanese("ハリー・ポッター")) - assert.True(containsJapanese("some japanese パリ and some english")) - - // CJK, but not japanese-specific - assert.False(containsJapanese("熱力学")) -} diff --git a/search/post_schema.json b/search/post_schema.json index e2bee31c1..36cc4e18a 100644 --- a/search/post_schema.json +++ b/search/post_schema.json @@ -15,38 +15,20 @@ "type": "custom", "tokenizer": "icu_tokenizer", "char_filter": [ "icu_normalizer" ], - "filter": [ "icu_folding" ] - }, - "textIcuSearch": { - "type": "custom", - "tokenizer": "icu_tokenizer", - "char_filter": [ "icu_normalizer" ], - "filter": [ "icu_folding" ] - }, - "textJapanese": { - "type": "custom", - "tokenizer": "kuromoji_tokenizer", - "char_filter": [ "icu_normalizer" ], "filter": [ - "kuromoji_baseform", - "kuromoji_part_of_speech", + "icu_folding", "cjk_width", - "ja_stop", - "kuromoji_stemmer", - "lowercase" + "cjk_bigram" ] }, - "textJapaneseSearch": { + "textIcuSearch": { "type": "custom", - "tokenizer": "kuromoji_tokenizer", + "tokenizer": "icu_tokenizer", "char_filter": [ "icu_normalizer" ], "filter": [ - "kuromoji_baseform", - "kuromoji_part_of_speech", + "icu_folding", "cjk_width", - "ja_stop", - "kuromoji_stemmer", - "lowercase" + "cjk_bigram" ] } }, @@ -75,7 +57,6 @@ 
"created_at": { "type": "date" }, "text": { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch", "copy_to": "everything" }, - "text_ja": { "type": "text", "analyzer": "textJapanese", "search_analyzer": "textJapaneseSearch", "copy_to": "everything_ja" }, "lang_code": { "type": "keyword", "normalizer": "default" }, "lang_code_iso2": { "type": "keyword", "normalizer": "default" }, "mention_did": { "type": "keyword", "normalizer": "default" }, @@ -83,7 +64,6 @@ "reply_root_aturi": { "type": "keyword", "normalizer": "default" }, "embed_img_count": { "type": "integer" }, "embed_img_alt_text": { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch", "copy_to": "everything" }, - "embed_img_alt_text_ja": { "type": "text", "analyzer": "textJapanese", "search_analyzer": "textJapaneseSearch", "copy_to": "everything_ja" }, "self_label": { "type": "keyword", "normalizer": "default" }, "url": { "type": "keyword", "normalizer": "default" }, @@ -94,7 +74,6 @@ "likesFuzzy": { "type": "integer" }, "everything": { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch" }, - "everything_ja": { "type": "text", "analyzer": "textJapanese", "search_analyzer": "textJapaneseSearch" }, "lang": { "type": "alias", "path": "lang_code_iso2" } } diff --git a/search/query.go b/search/query.go index 14d6aef79..94be93dd5 100644 --- a/search/query.go +++ b/search/query.go @@ -224,9 +224,6 @@ func DoSearchPosts(ctx context.Context, dir identity.Directory, escli *es.Client queryStringParams := ParsePostQuery(ctx, dir, params.Query, params.Viewer) params.Update(&queryStringParams) idx := "everything" - if containsJapanese(params.Query) { - idx = "everything_ja" - } basic := map[string]interface{}{ "simple_query_string": map[string]interface{}{ "query": params.Query, diff --git a/search/query_test.go b/search/query_test.go index 501687802..2565e76dc 100644 --- a/search/query_test.go +++ b/search/query_test.go @@ -87,120 +87,6 @@ func 
testServer(ctx context.Context, t *testing.T, escli *es.Client, dir identit return srv } -func TestJapaneseRegressions(t *testing.T) { - assert := assert.New(t) - ctx := context.Background() - escli := testEsClient(t) - dir := identity.NewMockDirectory() - srv := testServer(ctx, t, escli, &dir) - ident := identity.Identity{ - DID: syntax.DID("did:plc:abc111"), - Handle: syntax.Handle("handle.example.com"), - } - - res, err := DoSearchPosts(ctx, &dir, escli, testPostIndex, "english", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(0, len(res.Hits.Hits)) - - p1 := appbsky.FeedPost{Text: "basic english post", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, &ident, &p1, "app.bsky.feed.post/3kpnillluoh2y", cid.Undef)) - - // https://github.com/bluesky-social/indigo/issues/302 - p2 := appbsky.FeedPost{Text: "学校から帰って熱いお風呂に入ったら力一杯がんばる", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, &ident, &p2, "app.bsky.feed.post/3kpnillluo222", cid.Undef)) - p3 := appbsky.FeedPost{Text: "熱力学", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, &ident, &p3, "app.bsky.feed.post/3kpnillluo333", cid.Undef)) - p4 := appbsky.FeedPost{Text: "東京都", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, &ident, &p4, "app.bsky.feed.post/3kpnillluo444", cid.Undef)) - p5 := appbsky.FeedPost{Text: "京都", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, &ident, &p5, "app.bsky.feed.post/3kpnillluo555", cid.Undef)) - p6 := appbsky.FeedPost{Text: "パリ", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, &ident, &p6, "app.bsky.feed.post/3kpnillluo666", cid.Undef)) - p7 := appbsky.FeedPost{Text: "ハリー・ポッター", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, &ident, &p7, "app.bsky.feed.post/3kpnillluo777", cid.Undef)) - p8 := appbsky.FeedPost{Text: "ハリ", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, 
&ident, &p8, "app.bsky.feed.post/3kpnillluo223", cid.Undef)) - p9 := appbsky.FeedPost{Text: "multilingual 多言語", CreatedAt: "2024-01-02T03:04:05.006Z"} - assert.NoError(srv.indexPost(ctx, &ident, &p9, "app.bsky.feed.post/3kpnillluo224", cid.Undef)) - - _, err = srv.escli.Indices.Refresh() - assert.NoError(err) - - // expect all to be indexed - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "*", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(9, len(res.Hits.Hits)) - - // check that english matches (single post) - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "english", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, len(res.Hits.Hits)) - - // "thermodynamics"; should return only one match - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "熱力学", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, len(res.Hits.Hits)) - - // "Kyoto"; should return only one match - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "京都", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, len(res.Hits.Hits)) - - // "Paris"; should return only one match - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "パリ", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, len(res.Hits.Hits)) - - // should return only one match - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "ハリー", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, len(res.Hits.Hits)) - - // part of a word; should match none - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "ハ", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(0, len(res.Hits.Hits)) - - // should match both ways, and together - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "multilingual", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, len(res.Hits.Hits)) - - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "多言語", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, 
len(res.Hits.Hits)) - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "multilingual 多言語", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, len(res.Hits.Hits)) - res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "\"multilingual 多言語\"", 0, 20) - if err != nil { - t.Fatal(err) - } - assert.Equal(1, len(res.Hits.Hits)) -} - func TestParsedQuery(t *testing.T) { assert := assert.New(t) ctx := context.Background() diff --git a/search/testdata/transform-post-fixtures.json b/search/testdata/transform-post-fixtures.json index 780a9705c..140860c4d 100644 --- a/search/testdata/transform-post-fixtures.json +++ b/search/testdata/transform-post-fixtures.json @@ -266,14 +266,10 @@ "record_cid": "bafyreibjifzpqj6o6wcq3hejh7y4z4z2vmiklkvykc57tw3pcbx3kxifpm", "created_at": "2023-08-07T05:46:14.423045Z", "text": "学校から帰って熱いお風呂に入ったら力一杯がんばる", - "text_ja": "学校から帰って熱いお風呂に入ったら力一杯がんばる", "embed_img_alt_text": [ "brief alt text description of the first image ハリー・ポッター", "brief alt text description of the second image" ], - "embed_img_alt_text_ja": [ - "brief alt text description of the first image ハリー・ポッター" - ], "embed_img_count": 2 } }, diff --git a/search/transform.go b/search/transform.go index 917b622d8..33ae0fb1e 100644 --- a/search/transform.go +++ b/search/transform.go @@ -31,26 +31,24 @@ type ProfileDoc struct { } type PostDoc struct { - DocIndexTs string `json:"doc_index_ts"` - DID string `json:"did"` - RecordRkey string `json:"record_rkey"` - RecordCID string `json:"record_cid"` - CreatedAt *string `json:"created_at,omitempty"` - Text string `json:"text"` - TextJA *string `json:"text_ja,omitempty"` - LangCode []string `json:"lang_code,omitempty"` - LangCodeIso2 []string `json:"lang_code_iso2,omitempty"` - MentionDID []string `json:"mention_did,omitempty"` - EmbedATURI *string `json:"embed_aturi,omitempty"` - ReplyRootATURI *string `json:"reply_root_aturi,omitempty"` - EmbedImgCount int `json:"embed_img_count"` - EmbedImgAltText []string 
`json:"embed_img_alt_text,omitempty"` - EmbedImgAltTextJA []string `json:"embed_img_alt_text_ja,omitempty"` - SelfLabel []string `json:"self_label,omitempty"` - URL []string `json:"url,omitempty"` - Domain []string `json:"domain,omitempty"` - Tag []string `json:"tag,omitempty"` - Emoji []string `json:"emoji,omitempty"` + DocIndexTs string `json:"doc_index_ts"` + DID string `json:"did"` + RecordRkey string `json:"record_rkey"` + RecordCID string `json:"record_cid"` + CreatedAt *string `json:"created_at,omitempty"` + Text string `json:"text"` + LangCode []string `json:"lang_code,omitempty"` + LangCodeIso2 []string `json:"lang_code_iso2,omitempty"` + MentionDID []string `json:"mention_did,omitempty"` + EmbedATURI *string `json:"embed_aturi,omitempty"` + ReplyRootATURI *string `json:"reply_root_aturi,omitempty"` + EmbedImgCount int `json:"embed_img_count"` + EmbedImgAltText []string `json:"embed_img_alt_text,omitempty"` + SelfLabel []string `json:"self_label,omitempty"` + URL []string `json:"url,omitempty"` + Domain []string `json:"domain,omitempty"` + Tag []string `json:"tag,omitempty"` + Emoji []string `json:"emoji,omitempty"` } // Returns the search index document ID (`_id`) for this document. 
@@ -147,15 +145,11 @@ func TransformPost(post *appbsky.FeedPost, did syntax.DID, rkey, cid string) Pos } var embedImgCount int var embedImgAltText []string - var embedImgAltTextJA []string if post.Embed != nil && post.Embed.EmbedImages != nil { embedImgCount = len(post.Embed.EmbedImages.Images) for _, img := range post.Embed.EmbedImages.Images { if img.Alt != "" { embedImgAltText = append(embedImgAltText, img.Alt) - if containsJapanese(img.Alt) { - embedImgAltTextJA = append(embedImgAltTextJA, img.Alt) - } } } } @@ -169,9 +163,6 @@ func TransformPost(post *appbsky.FeedPost, did syntax.DID, rkey, cid string) Pos for _, img := range post.Embed.EmbedRecordWithMedia.Media.EmbedImages.Images { if img.Alt != "" { embedImgAltText = append(embedImgAltText, img.Alt) - if containsJapanese(img.Alt) { - embedImgAltTextJA = append(embedImgAltTextJA, img.Alt) - } } } } @@ -194,28 +185,23 @@ func TransformPost(post *appbsky.FeedPost, did syntax.DID, rkey, cid string) Pos } doc := PostDoc{ - DocIndexTs: syntax.DatetimeNow().String(), - DID: did.String(), - RecordRkey: rkey, - RecordCID: cid, - Text: post.Text, - LangCode: post.Langs, - LangCodeIso2: langCodeIso2, - MentionDID: mentionDIDs, - EmbedATURI: embedATURI, - ReplyRootATURI: replyRootATURI, - EmbedImgCount: embedImgCount, - EmbedImgAltText: embedImgAltText, - EmbedImgAltTextJA: embedImgAltTextJA, - SelfLabel: selfLabels, - URL: urls, - Domain: domains, - Tag: parsePostTags(post), - Emoji: parseEmojis(post.Text), - } - - if containsJapanese(post.Text) { - doc.TextJA = &post.Text + DocIndexTs: syntax.DatetimeNow().String(), + DID: did.String(), + RecordRkey: rkey, + RecordCID: cid, + Text: post.Text, + LangCode: post.Langs, + LangCodeIso2: langCodeIso2, + MentionDID: mentionDIDs, + EmbedATURI: embedATURI, + ReplyRootATURI: replyRootATURI, + EmbedImgCount: embedImgCount, + EmbedImgAltText: embedImgAltText, + SelfLabel: selfLabels, + URL: urls, + Domain: domains, + Tag: parsePostTags(post), + Emoji: parseEmojis(post.Text), } 
if post.CreatedAt != "" {