Remove single unicode character from autotag query (#2363)

* Remove single unicode character from autotag query
* Compile regex once where possible
* Fix CPU profiling
* Only match unicode characters if in path
This commit is contained in:
WithoutPants
2022-03-07 13:26:24 +11:00
committed by GitHub
parent 0737ca953d
commit 18665863d6
3 changed files with 79 additions and 20 deletions

View File

@@ -35,10 +35,11 @@ func main() {
manager.Initialize() manager.Initialize()
api.Start(uiBox, loginUIBox) api.Start(uiBox, loginUIBox)
// stop any profiling at exit
defer pprof.StopCPUProfile()
blockForever() blockForever()
// stop any profiling at exit
pprof.StopCPUProfile()
manager.GetInstance().Shutdown(0) manager.GetInstance().Shutdown(0)
} }

View File

@@ -5,6 +5,7 @@ import (
"path/filepath" "path/filepath"
"regexp" "regexp"
"strings" "strings"
"unicode"
"github.com/stashapp/stash/pkg/gallery" "github.com/stashapp/stash/pkg/gallery"
"github.com/stashapp/stash/pkg/image" "github.com/stashapp/stash/pkg/image"
@@ -12,7 +13,12 @@ import (
"github.com/stashapp/stash/pkg/scene" "github.com/stashapp/stash/pkg/scene"
) )
const separatorChars = `.\-_ ` const (
separatorChars = `.\-_ `
reNotLetterWordUnicode = `[^\p{L}\w\d]`
reNotLetterWord = `[^\w\d]`
)
func getPathQueryRegex(name string) string { func getPathQueryRegex(name string) string {
// escape specific regex characters // escape specific regex characters
@@ -68,22 +74,22 @@ func getPathWords(path string) []string {
return ret return ret
} }
// allASCII reports whether every byte in s is at or below unicode.MaxASCII.
// An empty string is reported as all-ASCII.
// Adapted from https://stackoverflow.com/a/53069799
func allASCII(s string) bool {
	for _, b := range []byte(s) {
		if b > unicode.MaxASCII {
			return false
		}
	}

	return true
}
// nameMatchesPath returns the index in the path for the right-most match. // nameMatchesPath returns the index in the path for the right-most match.
// Returns -1 if not found. // Returns -1 if not found.
func nameMatchesPath(name, path string) int { func nameMatchesPath(name, path string) int {
// escape specific regex characters // #2363 - optimisation: only use unicode character regexp if path contains
name = regexp.QuoteMeta(name) // unicode characters
re := nameToRegexp(name, !allASCII(path))
name = strings.ToLower(name)
path = strings.ToLower(path)
// handle path separators
const separator = `[` + separatorChars + `]`
reStr := strings.ReplaceAll(name, " ", separator+"*")
reStr = `(?:^|_|[^\p{L}\w\d])` + reStr + `(?:$|_|[^\p{L}\w\d])`
re := regexp.MustCompile(reStr)
found := re.FindAllStringIndex(path, -1) found := re.FindAllStringIndex(path, -1)
if found == nil { if found == nil {
@@ -93,6 +99,39 @@ func nameMatchesPath(name, path string) int {
return found[len(found)-1][0] return found[len(found)-1][0]
} }
// nameToRegexp builds and compiles the regular expression used to find name
// within a path.
//
// The name is regex-escaped and lower-cased, so the subject string should be
// lower-cased before matching. Each space in the name is replaced with
// "zero or more separator characters". The resulting pattern must be
// delimited on each side by the start/end of the string, an underscore, or a
// character that is not a word character.
//
// Set useUnicode to true if the regexp will be used on any string containing
// unicode characters; unicode letters are then also treated as word
// characters when checking the delimiters.
func nameToRegexp(name string, useUnicode bool) *regexp.Regexp {
	// character class standing in for a space in the name
	const separator = `[` + separatorChars + `]`

	// performance optimisation: only use \p{L} if useUnicode is true
	boundary := reNotLetterWordUnicode
	if !useUnicode {
		boundary = reNotLetterWord
	}

	// escape regex metacharacters before introducing our own
	pattern := strings.ToLower(regexp.QuoteMeta(name))

	// handle path separators
	pattern = strings.ReplaceAll(pattern, " ", separator+"*")

	return regexp.MustCompile(`(?:^|_|` + boundary + `)` + pattern + `(?:$|_|` + boundary + `)`)
}
// regexpMatchesPath lower-cases path and returns the start index of the
// right-most match of r within it.
// Returns -1 if r does not match.
func regexpMatchesPath(r *regexp.Regexp, path string) int {
	matches := r.FindAllStringIndex(strings.ToLower(path), -1)

	if n := len(matches); n > 0 {
		// each entry is a [start, end] pair; report the start of the last one
		return matches[n-1][0]
	}

	return -1
}
func PathToPerformers(path string, performerReader models.PerformerReader) ([]*models.Performer, error) { func PathToPerformers(path string, performerReader models.PerformerReader) ([]*models.Performer, error) {
words := getPathWords(path) words := getPathWords(path)
performers, err := performerReader.QueryForAutoTag(words) performers, err := performerReader.QueryForAutoTag(words)
@@ -208,8 +247,13 @@ func PathToScenes(name string, paths []string, sceneReader models.SceneReader) (
} }
var ret []*models.Scene var ret []*models.Scene
// paths may have unicode characters
const useUnicode = true
r := nameToRegexp(name, useUnicode)
for _, p := range scenes { for _, p := range scenes {
if nameMatchesPath(name, p.Path) != -1 { if regexpMatchesPath(r, p.Path) != -1 {
ret = append(ret, p) ret = append(ret, p)
} }
} }
@@ -240,8 +284,13 @@ func PathToImages(name string, paths []string, imageReader models.ImageReader) (
} }
var ret []*models.Image var ret []*models.Image
// paths may have unicode characters
const useUnicode = true
r := nameToRegexp(name, useUnicode)
for _, p := range images { for _, p := range images {
if nameMatchesPath(name, p.Path) != -1 { if regexpMatchesPath(r, p.Path) != -1 {
ret = append(ret, p) ret = append(ret, p)
} }
} }
@@ -272,8 +321,13 @@ func PathToGalleries(name string, paths []string, galleryReader models.GalleryRe
} }
var ret []*models.Gallery var ret []*models.Gallery
// paths may have unicode characters
const useUnicode = true
r := nameToRegexp(name, useUnicode)
for _, p := range gallerys { for _, p := range gallerys {
if nameMatchesPath(name, p.Path.String) != -1 { if regexpMatchesPath(r, p.Path.String) != -1 {
ret = append(ret, p) ret = append(ret, p)
} }
} }

View File

@@ -21,7 +21,11 @@ WHERE performers_tags.tag_id = ?
GROUP BY performers_tags.performer_id GROUP BY performers_tags.performer_id
` `
const singleFirstCharacterRegex = `^[\w\p{L}][.\-_ ]` // KNOWN ISSUE: using \p{L} to find single unicode character names results in
// very slow queries.
// Suggested solution is to cache single-character names and exclude them
// from the autotag query.
const singleFirstCharacterRegex = `^[\w][.\-_ ]`
type performerQueryBuilder struct { type performerQueryBuilder struct {
repository repository