Handle unicode characters in autotag (#2336)

This commit is contained in:
WithoutPants
2022-02-28 13:12:43 +11:00
committed by GitHub
parent c5c94e783e
commit 1ab5be162e
6 changed files with 41 additions and 11 deletions

View File

@@ -22,6 +22,13 @@ func getPathQueryRegex(name string) string {
const separator = `[` + separatorChars + `]` const separator = `[` + separatorChars + `]`
ret := strings.ReplaceAll(name, " ", separator+"*") ret := strings.ReplaceAll(name, " ", separator+"*")
// \p{L} is specifically omitted here because of the performance hit when
// including it. It does mean that paths where the name is bounded by
// unicode letters will be returned. However, the results should be tested
// by nameMatchesPath which does include \p{L}. The improvement in query
// performance should outweigh the performance hit of testing any extra
// results.
ret = `(?:^|_|[^\w\d])` + ret + `(?:$|_|[^\w\d])` ret = `(?:^|_|[^\w\d])` + ret + `(?:$|_|[^\w\d])`
return ret return ret
} }
@@ -36,7 +43,7 @@ func getPathWords(path string) []string {
} }
// handle path separators // handle path separators
const separator = `(?:_|[^\w\d])+` const separator = `(?:_|[^\p{L}\w\d])+`
re := regexp.MustCompile(separator) re := regexp.MustCompile(separator)
retStr = re.ReplaceAllString(retStr, " ") retStr = re.ReplaceAllString(retStr, " ")
@@ -52,7 +59,9 @@ func getPathWords(path string) []string {
// we post-match afterwards, so we can afford to be a little loose // we post-match afterwards, so we can afford to be a little loose
// with the query // with the query
// just use the first two characters // just use the first two characters
ret = append(ret, w[0:2]) // #2293 - need to convert to unicode runes for the substring, otherwise
// the resulting string is corrupted.
ret = append(ret, string([]rune(w)[0:2]))
} }
} }
@@ -72,7 +81,7 @@ func nameMatchesPath(name, path string) int {
const separator = `[` + separatorChars + `]` const separator = `[` + separatorChars + `]`
reStr := strings.ReplaceAll(name, " ", separator+"*") reStr := strings.ReplaceAll(name, " ", separator+"*")
reStr = `(?:^|_|[^\w\d])` + reStr + `(?:$|_|[^\w\d])` reStr = `(?:^|_|[^\p{L}\w\d])` + reStr + `(?:$|_|[^\p{L}\w\d])`
re := regexp.MustCompile(reStr) re := regexp.MustCompile(reStr)
found := re.FindAllStringIndex(path, -1) found := re.FindAllStringIndex(path, -1)

View File

@@ -4,71 +4,90 @@ import "testing"
func Test_nameMatchesPath(t *testing.T) { func Test_nameMatchesPath(t *testing.T) {
const name = "first last" const name = "first last"
const unicodeName = "伏字"
tests := []struct { tests := []struct {
name string testName string
path string name string
want int path string
want int
}{ }{
{ {
"exact", "exact",
name, name,
name,
0, 0,
}, },
{ {
"partial", "partial",
name,
"first", "first",
-1, -1,
}, },
{ {
"separator", "separator",
name,
"first.last", "first.last",
0, 0,
}, },
{ {
"separator", "separator",
name,
"first-last", "first-last",
0, 0,
}, },
{ {
"separator", "separator",
name,
"first_last", "first_last",
0, 0,
}, },
{ {
"separators", "separators",
name,
"first.-_ last", "first.-_ last",
0, 0,
}, },
{ {
"within string", "within string",
name,
"before_first last/after", "before_first last/after",
6, 6,
}, },
{ {
"not within string", "not within string",
name,
"beforefirst last/after", "beforefirst last/after",
-1, -1,
}, },
{ {
"not within string", "not within string",
name,
"before/first lastafter", "before/first lastafter",
-1, -1,
}, },
{ {
"not within string", "not within string",
name,
"first last1", "first last1",
-1, -1,
}, },
{ {
"not within string", "not within string",
name,
"1first last", "1first last",
-1, -1,
}, },
{
"unicode",
unicodeName,
unicodeName,
0,
},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.testName, func(t *testing.T) {
if got := nameMatchesPath(name, tt.path); got != tt.want { if got := nameMatchesPath(tt.name, tt.path); got != tt.want {
t.Errorf("nameMatchesPath() = %v, want %v", got, tt.want) t.Errorf("nameMatchesPath() = %v, want %v", got, tt.want)
} }
}) })

View File

@@ -21,6 +21,8 @@ WHERE performers_tags.tag_id = ?
GROUP BY performers_tags.performer_id GROUP BY performers_tags.performer_id
` `
const singleFirstCharacterRegex = `^[\w\p{L}][.\-_ ]`
type performerQueryBuilder struct { type performerQueryBuilder struct {
repository repository
} }
@@ -184,7 +186,7 @@ func (qb *performerQueryBuilder) QueryForAutoTag(words []string) ([]*models.Perf
var args []interface{} var args []interface{}
whereClauses = append(whereClauses, "name regexp ?") whereClauses = append(whereClauses, "name regexp ?")
args = append(args, "^[\\w][.\\-_ ]") args = append(args, singleFirstCharacterRegex)
for _, w := range words { for _, w := range words {
whereClauses = append(whereClauses, "name like ?") whereClauses = append(whereClauses, "name like ?")

View File

@@ -171,6 +171,8 @@ func (r *repository) runSumQuery(query string, args []interface{}) (float64, err
} }
func (r *repository) queryFunc(query string, args []interface{}, single bool, f func(rows *sqlx.Rows) error) error { func (r *repository) queryFunc(query string, args []interface{}, single bool, f func(rows *sqlx.Rows) error) error {
logger.Tracef("SQL: %s, args: %v", query, args)
rows, err := r.tx.Queryx(query, args...) rows, err := r.tx.Queryx(query, args...)
if err != nil && !errors.Is(err, sql.ErrNoRows) { if err != nil && !errors.Is(err, sql.ErrNoRows) {

View File

@@ -145,7 +145,6 @@ func (qb *studioQueryBuilder) QueryForAutoTag(words []string) ([]*models.Studio,
var args []interface{} var args []interface{}
// always include names that begin with a single character // always include names that begin with a single character
singleFirstCharacterRegex := "^[\\w][.\\-_ ]"
whereClauses = append(whereClauses, "studios.name regexp ? OR COALESCE(studio_aliases.alias, '') regexp ?") whereClauses = append(whereClauses, "studios.name regexp ? OR COALESCE(studio_aliases.alias, '') regexp ?")
args = append(args, singleFirstCharacterRegex, singleFirstCharacterRegex) args = append(args, singleFirstCharacterRegex, singleFirstCharacterRegex)

View File

@@ -236,7 +236,6 @@ func (qb *tagQueryBuilder) QueryForAutoTag(words []string) ([]*models.Tag, error
var args []interface{} var args []interface{}
// always include names that begin with a single character // always include names that begin with a single character
singleFirstCharacterRegex := "^[\\w][.\\-_ ]"
whereClauses = append(whereClauses, "tags.name regexp ? OR COALESCE(tag_aliases.alias, '') regexp ?") whereClauses = append(whereClauses, "tags.name regexp ? OR COALESCE(tag_aliases.alias, '') regexp ?")
args = append(args, singleFirstCharacterRegex, singleFirstCharacterRegex) args = append(args, singleFirstCharacterRegex, singleFirstCharacterRegex)