From 899d1b9395ab8a5bbd75ab34bdd22a5a3aae20a2 Mon Sep 17 00:00:00 2001 From: puc9 <51006296+puc9@users.noreply.github.com> Date: Tue, 2 May 2023 22:01:59 -0700 Subject: [PATCH] Limit duplicate matching to files that have ~ same duration (#3663) * Limit duplicate matching to files that have ~ same duration * Add UI for duration diff --------- Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com> --- graphql/documents/queries/scene.graphql | 4 +- graphql/schema/schema.graphql | 16 ++- internal/api/resolver_query_find_scene.go | 8 +- pkg/models/mocks/SceneReaderWriter.go | 2 +- pkg/models/scene.go | 2 +- pkg/sqlite/scene.go | 48 ++++--- pkg/sqlite/scene_test.go | 6 +- pkg/utils/phash.go | 22 ++- .../SceneDuplicateChecker.tsx | 134 ++++++++++++------ .../SceneDuplicateChecker/styles.scss | 4 + ui/v2.5/src/locales/en-GB.json | 6 + 11 files changed, 177 insertions(+), 75 deletions(-) diff --git a/graphql/documents/queries/scene.graphql b/graphql/documents/queries/scene.graphql index 1f762855a..e62303dc7 100644 --- a/graphql/documents/queries/scene.graphql +++ b/graphql/documents/queries/scene.graphql @@ -20,8 +20,8 @@ query FindScenesByPathRegex($filter: FindFilterType) { } } -query FindDuplicateScenes($distance: Int) { - findDuplicateScenes(distance: $distance) { +query FindDuplicateScenes($distance: Int, $duration_diff: Float) { + findDuplicateScenes(distance: $distance, duration_diff: $duration_diff) { ...SlimSceneData } } diff --git a/graphql/schema/schema.graphql b/graphql/schema/schema.graphql index 112f8aba9..3a4f6e738 100644 --- a/graphql/schema/schema.graphql +++ b/graphql/schema/schema.graphql @@ -14,8 +14,16 @@ type Query { findScenesByPathRegex(filter: FindFilterType): FindScenesResultType! - """ Returns any groups of scenes that are perceptual duplicates within the queried distance """ - findDuplicateScenes(distance: Int): [[Scene!]!]! 
+ """ + Returns any groups of scenes that are perceptual duplicates within the queried distance + and the difference between their duration is smaller than durationDiff + """ + findDuplicateScenes( + distance: Int, + """Max difference in seconds between files in order to be considered for similarity matching. + Fractional seconds are ok: 0.5 will mean only files that have durations within 0.5 seconds between them will be matched based on PHash distance.""" + duration_diff: Float + ): [[Scene!]!]! """Return valid stream paths""" sceneStreams(id: ID): [SceneStreamEndpoint!]! @@ -295,14 +303,14 @@ type Mutation { metadataClean(input: CleanMetadataInput!): ID! """Identifies scenes using scrapers. Returns the job ID""" metadataIdentify(input: IdentifyMetadataInput!): ID! - + """Migrate generated files for the current hash naming""" migrateHashNaming: ID! """Migrates legacy scene screenshot files into the blob storage""" migrateSceneScreenshots(input: MigrateSceneScreenshotsInput!): ID! """Migrates blobs from the old storage system to the current one""" migrateBlobs(input: MigrateBlobsInput!): ID! - + """Anonymise the database in a separate file. Optionally returns a link to download the database file""" anonymiseDatabase(input: AnonymiseDatabaseInput!): String diff --git a/internal/api/resolver_query_find_scene.go b/internal/api/resolver_query_find_scene.go index 1eaa2dc03..c60cf88c2 100644 --- a/internal/api/resolver_query_find_scene.go +++ b/internal/api/resolver_query_find_scene.go @@ -220,13 +220,17 @@ func (r *queryResolver) ParseSceneFilenames(ctx context.Context, filter *models. return ret, nil } -func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int) (ret [][]*models.Scene, err error) { +func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64) (ret [][]*models.Scene, err error) { dist := 0 + durDiff := -1. 
if distance != nil { dist = *distance } + if durationDiff != nil { + durDiff = *durationDiff + } if err := r.withReadTxn(ctx, func(ctx context.Context) error { - ret, err = r.repository.Scene.FindDuplicates(ctx, dist) + ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff) return err }); err != nil { return nil, err diff --git a/pkg/models/mocks/SceneReaderWriter.go b/pkg/models/mocks/SceneReaderWriter.go index f67a909b4..7ee47e906 100644 --- a/pkg/models/mocks/SceneReaderWriter.go +++ b/pkg/models/mocks/SceneReaderWriter.go @@ -439,7 +439,7 @@ func (_m *SceneReaderWriter) FindByPerformerID(ctx context.Context, performerID } -// FindDuplicates provides a mock function with given fields: ctx, distance +// FindDuplicates provides a mock function with given fields: ctx, distance, durationDiff -func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) { +func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) { - ret := _m.Called(ctx, distance) + ret := _m.Called(ctx, distance, durationDiff) var r0 [][]*models.Scene diff --git a/pkg/models/scene.go b/pkg/models/scene.go index ac9cd93c8..90655ff5e 100644 --- a/pkg/models/scene.go +++ b/pkg/models/scene.go @@ -153,7 +153,7 @@ type SceneReader interface { FindByPath(ctx context.Context, path string) ([]*Scene, error) FindByPerformerID(ctx context.Context, performerID int) ([]*Scene, error) FindByGalleryID(ctx context.Context, performerID int) ([]*Scene, error) - FindDuplicates(ctx context.Context, distance int) ([][]*Scene, error) + FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*Scene, error) GalleryIDLoader PerformerIDLoader diff --git a/pkg/sqlite/scene.go b/pkg/sqlite/scene.go index a049557da..721a4d456 100644 --- a/pkg/sqlite/scene.go +++ b/pkg/sqlite/scene.go @@ -36,23 +36,38 @@ const ( ) var findExactDuplicateQuery = ` -SELECT GROUP_CONCAT(scenes.id) as ids -FROM scenes -INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) -INNER JOIN files ON (scenes_files.file_id = files.id) -INNER
JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') -GROUP BY files_fingerprints.fingerprint -HAVING COUNT(files_fingerprints.fingerprint) > 1 AND COUNT(DISTINCT scenes.id) > 1 -ORDER BY SUM(files.size) DESC; +SELECT GROUP_CONCAT(DISTINCT scene_id) as ids +FROM ( + SELECT scenes.id as scene_id + , video_files.duration as file_duration + , files.size as file_size + , files_fingerprints.fingerprint as phash + , abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff + FROM scenes + INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) + INNER JOIN files ON (scenes_files.file_id = files.id) + INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') + INNER JOIN video_files ON (files.id == video_files.file_id) +) +WHERE durationDiff <= ?1 + OR ?1 < 0 -- Always TRUE if the parameter is negative. + -- That will disable the durationDiff checking. 
+GROUP BY phash +HAVING COUNT(phash) > 1 + AND COUNT(DISTINCT scene_id) > 1 +ORDER BY SUM(file_size) DESC; ` var findAllPhashesQuery = ` -SELECT scenes.id as id, files_fingerprints.fingerprint as phash +SELECT scenes.id as id + , files_fingerprints.fingerprint as phash + , video_files.duration as duration FROM scenes -INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) -INNER JOIN files ON (scenes_files.file_id = files.id) +INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) +INNER JOIN files ON (scenes_files.file_id = files.id) INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') -ORDER BY files.size DESC +INNER JOIN video_files ON (files.id == video_files.file_id) +ORDER BY files.size DESC; ` type sceneRow struct { @@ -1729,11 +1744,11 @@ func (qb *SceneStore) GetStashIDs(ctx context.Context, sceneID int) ([]models.St return qb.stashIDRepository().get(ctx, sceneID) } -func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) { +func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) { var dupeIds [][]int if distance == 0 { var ids []string - if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery); err != nil { + if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery, durationDiff); err != nil { return nil, err } @@ -1755,7 +1770,8 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo if err := qb.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error { phash := utils.Phash{ - Bucket: -1, + Bucket: -1, + Duration: -1, } if err := rows.StructScan(&phash); err != nil { return err @@ -1767,7 +1783,7 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo return nil, err } - dupeIds = utils.FindDuplicates(hashes, distance) + dupeIds = utils.FindDuplicates(hashes, distance, durationDiff) } 
var duplicates [][]*models.Scene diff --git a/pkg/sqlite/scene_test.go b/pkg/sqlite/scene_test.go index 560d3fcfc..137319c31 100644 --- a/pkg/sqlite/scene_test.go +++ b/pkg/sqlite/scene_test.go @@ -4237,7 +4237,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) { withRollbackTxn(func(ctx context.Context) error { distance := 0 - got, err := qb.FindDuplicates(ctx, distance) + durationDiff := -1. + got, err := qb.FindDuplicates(ctx, distance, durationDiff) if err != nil { t.Errorf("SceneStore.FindDuplicates() error = %v", err) return nil @@ -4246,7 +4247,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) { assert.Len(t, got, dupeScenePhashes) distance = 1 - got, err = qb.FindDuplicates(ctx, distance) + durationDiff = -1. + got, err = qb.FindDuplicates(ctx, distance, durationDiff) if err != nil { t.Errorf("SceneStore.FindDuplicates() error = %v", err) return nil diff --git a/pkg/utils/phash.go b/pkg/utils/phash.go index 7b15ec5e0..395d86f93 100644 --- a/pkg/utils/phash.go +++ b/pkg/utils/phash.go @@ -1,6 +1,7 @@ package utils import ( + "math" "strconv" "github.com/corona10/goimagehash" @@ -8,21 +9,28 @@ import ( ) type Phash struct { - SceneID int `db:"id"` - Hash int64 `db:"phash"` + SceneID int `db:"id"` + Hash int64 `db:"phash"` + Duration float64 `db:"duration"` Neighbors []int Bucket int } -func FindDuplicates(hashes []*Phash, distance int) [][]int { +func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int { for i, scene := range hashes { sceneHash := goimagehash.NewImageHash(uint64(scene.Hash), goimagehash.PHash) for j, neighbor := range hashes { if i != j && scene.SceneID != neighbor.SceneID { - neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash) - neighborDistance, _ := sceneHash.Distance(neighborHash) - if neighborDistance <= distance { - scene.Neighbors = append(scene.Neighbors, j) + neighbourDurationDistance := 0. 
+ if scene.Duration > 0 && neighbor.Duration > 0 { + neighbourDurationDistance = math.Abs(scene.Duration - neighbor.Duration) + } + if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) { + neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash) + neighborDistance, _ := sceneHash.Distance(neighborHash) + if neighborDistance <= distance { + scene.Neighbors = append(scene.Neighbors, j) + } } } } diff --git a/ui/v2.5/src/components/SceneDuplicateChecker/SceneDuplicateChecker.tsx b/ui/v2.5/src/components/SceneDuplicateChecker/SceneDuplicateChecker.tsx index 882664d26..c45d1b293 100644 --- a/ui/v2.5/src/components/SceneDuplicateChecker/SceneDuplicateChecker.tsx +++ b/ui/v2.5/src/components/SceneDuplicateChecker/SceneDuplicateChecker.tsx @@ -41,6 +41,8 @@ import { objectTitle } from "src/core/files"; const CLASSNAME = "duplicate-checker"; +const defaultDurationDiff = "1"; + export const SceneDuplicateChecker: React.FC = () => { const intl = useIntl(); const history = useHistory(); @@ -49,6 +51,9 @@ export const SceneDuplicateChecker: React.FC = () => { const currentPage = Number.parseInt(query.get("page") ?? "1", 10); const pageSize = Number.parseInt(query.get("size") ?? "20", 10); const hashDistance = Number.parseInt(query.get("distance") ?? "0", 10); + const durationDiff = Number.parseFloat( + query.get("durationDiff") ?? defaultDurationDiff + ); const [currentPageSize, setCurrentPageSize] = useState(pageSize); const [isMultiDelete, setIsMultiDelete] = useState(false); @@ -59,7 +64,10 @@ export const SceneDuplicateChecker: React.FC = () => { ); const { data, loading, refetch } = GQL.useFindDuplicateScenesQuery({ fetchPolicy: "no-cache", - variables: { distance: hashDistance }, + variables: { + distance: hashDistance, + duration_diff: durationDiff, + }, }); const { data: missingPhash } = GQL.useFindScenesQuery({ variables: { @@ -480,45 +488,91 @@ export const SceneDuplicateChecker: React.FC = () => {

- - - - - - - - setQuery({ - distance: - e.currentTarget.value === "0" - ? undefined - : e.currentTarget.value, - page: undefined, - }) - } - defaultValue={hashDistance} - className="input-control ml-4" - > - - - - - - - - - - - +
+ + + + + + + + setQuery({ + distance: + e.currentTarget.value === "0" + ? undefined + : e.currentTarget.value, + page: undefined, + }) + } + defaultValue={hashDistance} + className="input-control ml-4" + > + + + + + + + + + + + + + + + + + + + + setQuery({ + durationDiff: + e.currentTarget.value === defaultDurationDiff + ? undefined + : e.currentTarget.value, + page: undefined, + }) + } + defaultValue={durationDiff} + className="input-control ml-4" + > + + + + + + + + + +
{maybeRenderMissingPhashWarning()} {renderPagination()} diff --git a/ui/v2.5/src/components/SceneDuplicateChecker/styles.scss b/ui/v2.5/src/components/SceneDuplicateChecker/styles.scss index 24084527a..9177a9367 100644 --- a/ui/v2.5/src/components/SceneDuplicateChecker/styles.scss +++ b/ui/v2.5/src/components/SceneDuplicateChecker/styles.scss @@ -10,4 +10,8 @@ .separator { height: 50px; } + + .form-group .row { + align-items: center; + } } diff --git a/ui/v2.5/src/locales/en-GB.json b/ui/v2.5/src/locales/en-GB.json index 73f9a73e7..e049b1792 100644 --- a/ui/v2.5/src/locales/en-GB.json +++ b/ui/v2.5/src/locales/en-GB.json @@ -855,6 +855,11 @@ "donate": "Donate", "dupe_check": { "description": "Levels below 'Exact' can take longer to calculate. False positives might also be returned on lower accuracy levels.", + "duration_diff": "Maximum Duration Difference", + "duration_options": { + "any": "Any", + "equal": "Equal" + }, "found_sets": "{setCount, plural, one{# set of duplicates found.} other {# sets of duplicates found.}}", "options": { "exact": "Exact", @@ -1077,6 +1082,7 @@ "saved_filters": "Saved filters", "update_filter": "Update Filter" }, + "second": "Second", "seconds": "Seconds", "settings": "Settings", "setup": {