Remove single unicode character from autotag query (#2363)

* Remove single unicode character from autotag query * Compile regex once where possible * Fix CPU profiling * Only match unicode characters if in path
2025-12-17 04:14:39 +03:00 · 2022-03-07 13:26:24 +11:00
parent 0737ca953d
commit 18665863d6
3 changed files with 79 additions and 20 deletions
--- a/main.go
+++ b/main.go
@@ -35,10 +35,11 @@ func main() {
 	manager.Initialize()
 	api.Start(uiBox, loginUIBox)
 	// stop any profiling at exit
 	defer pprof.StopCPUProfile()
 	blockForever()
 	// stop any profiling at exit
 	pprof.StopCPUProfile()
 	manager.GetInstance().Shutdown(0)
 }
--- a/pkg/match/path.go
+++ b/pkg/match/path.go
@@ -5,6 +5,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"strings"
 	"unicode"
 	"github.com/stashapp/stash/pkg/gallery"
 	"github.com/stashapp/stash/pkg/image"
@@ -12,7 +13,12 @@ import (
 	"github.com/stashapp/stash/pkg/scene"
 )
-const separatorChars = `.\-_ `
+const (
 	separatorChars = `.\-_ `
 	reNotLetterWordUnicode = `[^\p{L}\w\d]`
 	reNotLetterWord        = `[^\w\d]`
 )
 func getPathQueryRegex(name string) string {
 	// escape specific regex characters
@@ -68,22 +74,22 @@ func getPathWords(path string) []string {
 	return ret
 }
 // allASCII reports whether s consists solely of ASCII bytes, i.e. no byte
 // in s is greater than unicode.MaxASCII. An empty string is considered ASCII.
 // Adapted from https://stackoverflow.com/a/53069799
 func allASCII(s string) bool {
 	// Inspect raw bytes rather than decoded runes: any non-ASCII character
 	// is encoded using at least one byte above MaxASCII.
 	for _, b := range []byte(s) {
 		if b > unicode.MaxASCII {
 			return false
 		}
 	}
 	return true
 }
 // nameMatchesPath returns the index in the path for the right-most match.
 // Returns -1 if not found.
 func nameMatchesPath(name, path string) int {
-	// escape specific regex characters
+	// #2363 - optimisation: only use unicode character regexp if path contains
-	name = regexp.QuoteMeta(name)
+	// unicode characters
-
+	re := nameToRegexp(name, !allASCII(path))
 	name = strings.ToLower(name)
 	path = strings.ToLower(path)
 	// handle path separators
 	const separator = `[` + separatorChars + `]`
 	reStr := strings.ReplaceAll(name, " ", separator+"*")
 	reStr = `(?:^|_|[^\p{L}\w\d])` + reStr + `(?:$|_|[^\p{L}\w\d])`
 	re := regexp.MustCompile(reStr)
 	found := re.FindAllStringIndex(path, -1)
 	if found == nil {
@@ -93,6 +99,39 @@ func nameMatchesPath(name, path string) int {
 	return found[len(found)-1][0]
 }
 // nameToRegexp builds and compiles the regular expression used to locate
 // name inside a path. The name is regex-escaped and lower-cased, each space
 // in it is allowed to match any run (including none) of separator characters,
 // and the whole name must be delimited on both sides by the start/end of the
 // string, an underscore, or a non-word character.
 // Set useUnicode to true if the regexp will be applied to strings that may
 // contain unicode characters.
 func nameToRegexp(name string, useUnicode bool) *regexp.Regexp {
 	const separator = `[` + separatorChars + `]`
 	// performance optimisation: only use \p{L} if useUnicode is true
 	boundary := reNotLetterWord
 	if useUnicode {
 		boundary = reNotLetterWordUnicode
 	}
 	// escape specific regex characters, then normalise case
 	pattern := strings.ToLower(regexp.QuoteMeta(name))
 	// handle path separators
 	pattern = strings.ReplaceAll(pattern, " ", separator+"*")
 	return regexp.MustCompile(`(?:^|_|` + boundary + `)` + pattern + `(?:$|_|` + boundary + `)`)
 }
 // regexpMatchesPath lower-cases path and searches it with r, returning the
 // start index of the right-most match, or -1 when r does not match at all.
 // The index is an offset into the lower-cased form of path.
 func regexpMatchesPath(r *regexp.Regexp, path string) int {
 	matches := r.FindAllStringIndex(strings.ToLower(path), -1)
 	if n := len(matches); n > 0 {
 		// each entry is a [start, end] pair; take the start of the last one
 		return matches[n-1][0]
 	}
 	return -1
 }
 func PathToPerformers(path string, performerReader models.PerformerReader) ([]*models.Performer, error) {
 	words := getPathWords(path)
 	performers, err := performerReader.QueryForAutoTag(words)
@@ -208,8 +247,13 @@ func PathToScenes(name string, paths []string, sceneReader models.SceneReader) (
 	}
 	var ret []*models.Scene
 	// paths may have unicode characters
 	const useUnicode = true
 	r := nameToRegexp(name, useUnicode)
 	for _, p := range scenes {
-		if nameMatchesPath(name, p.Path) != -1 {
+		if regexpMatchesPath(r, p.Path) != -1 {
 			ret = append(ret, p)
 		}
 	}
@@ -240,8 +284,13 @@ func PathToImages(name string, paths []string, imageReader models.ImageReader) (
 	}
 	var ret []*models.Image
 	// paths may have unicode characters
 	const useUnicode = true
 	r := nameToRegexp(name, useUnicode)
 	for _, p := range images {
-		if nameMatchesPath(name, p.Path) != -1 {
+		if regexpMatchesPath(r, p.Path) != -1 {
 			ret = append(ret, p)
 		}
 	}
@@ -272,8 +321,13 @@ func PathToGalleries(name string, paths []string, galleryReader models.GalleryRe
 	}
 	var ret []*models.Gallery
 	// paths may have unicode characters
 	const useUnicode = true
 	r := nameToRegexp(name, useUnicode)
 	for _, p := range gallerys {
-		if nameMatchesPath(name, p.Path.String) != -1 {
+		if regexpMatchesPath(r, p.Path.String) != -1 {
 			ret = append(ret, p)
 		}
 	}
--- a/pkg/sqlite/performer.go
+++ b/pkg/sqlite/performer.go
@@ -21,7 +21,11 @@ WHERE performers_tags.tag_id = ?
 GROUP BY performers_tags.performer_id
 `
-const singleFirstCharacterRegex = `^[\w\p{L}][.\-_ ]`
+// KNOWN ISSUE: using \p{L} to find single unicode character names results in
 // very slow queries.
 // A suggested solution is to cache single-character names and exclude them
 // from the autotag query.
 const singleFirstCharacterRegex = `^[\w][.\-_ ]`
 type performerQueryBuilder struct {
 	repository