Limit duplicate matching to files that have ~ same duration (#3663)

* Limit duplicate matching to files that have ~ same duration
* Add UI for duration diff
---------
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
This commit is contained in:
puc9
2023-05-02 22:01:59 -07:00
committed by GitHub
parent 002b71bd67
commit 899d1b9395
11 changed files with 177 additions and 75 deletions

View File

@@ -20,8 +20,8 @@ query FindScenesByPathRegex($filter: FindFilterType) {
} }
} }
query FindDuplicateScenes($distance: Int) { query FindDuplicateScenes($distance: Int, $duration_diff: Float) {
findDuplicateScenes(distance: $distance) { findDuplicateScenes(distance: $distance, duration_diff: $duration_diff) {
...SlimSceneData ...SlimSceneData
} }
} }

View File

@@ -14,8 +14,16 @@ type Query {
findScenesByPathRegex(filter: FindFilterType): FindScenesResultType! findScenesByPathRegex(filter: FindFilterType): FindScenesResultType!
""" Returns any groups of scenes that are perceptual duplicates within the queried distance """ """
findDuplicateScenes(distance: Int): [[Scene!]!]! Returns any groups of scenes that are perceptual duplicates within the queried distance
and whose durations differ by no more than duration_diff seconds
(a negative or omitted duration_diff disables the duration check)
"""
findDuplicateScenes(
distance: Int,
"""Maximum difference in duration, in seconds, between two files for them to be considered for similarity matching.
Fractional seconds are allowed: 0.5 means that only files whose durations are within 0.5 seconds of each other will be matched based on PHash distance."""
duration_diff: Float
): [[Scene!]!]!
"""Return valid stream paths""" """Return valid stream paths"""
sceneStreams(id: ID): [SceneStreamEndpoint!]! sceneStreams(id: ID): [SceneStreamEndpoint!]!
@@ -295,14 +303,14 @@ type Mutation {
metadataClean(input: CleanMetadataInput!): ID! metadataClean(input: CleanMetadataInput!): ID!
"""Identifies scenes using scrapers. Returns the job ID""" """Identifies scenes using scrapers. Returns the job ID"""
metadataIdentify(input: IdentifyMetadataInput!): ID! metadataIdentify(input: IdentifyMetadataInput!): ID!
"""Migrate generated files for the current hash naming""" """Migrate generated files for the current hash naming"""
migrateHashNaming: ID! migrateHashNaming: ID!
"""Migrates legacy scene screenshot files into the blob storage""" """Migrates legacy scene screenshot files into the blob storage"""
migrateSceneScreenshots(input: MigrateSceneScreenshotsInput!): ID! migrateSceneScreenshots(input: MigrateSceneScreenshotsInput!): ID!
"""Migrates blobs from the old storage system to the current one""" """Migrates blobs from the old storage system to the current one"""
migrateBlobs(input: MigrateBlobsInput!): ID! migrateBlobs(input: MigrateBlobsInput!): ID!
"""Anonymise the database in a separate file. Optionally returns a link to download the database file""" """Anonymise the database in a separate file. Optionally returns a link to download the database file"""
anonymiseDatabase(input: AnonymiseDatabaseInput!): String anonymiseDatabase(input: AnonymiseDatabaseInput!): String

View File

@@ -220,13 +220,17 @@ func (r *queryResolver) ParseSceneFilenames(ctx context.Context, filter *models.
return ret, nil return ret, nil
} }
func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int) (ret [][]*models.Scene, err error) { func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64) (ret [][]*models.Scene, err error) {
dist := 0 dist := 0
durDiff := -1.
if distance != nil { if distance != nil {
dist = *distance dist = *distance
} }
if durationDiff != nil {
durDiff = *durationDiff
}
if err := r.withReadTxn(ctx, func(ctx context.Context) error { if err := r.withReadTxn(ctx, func(ctx context.Context) error {
ret, err = r.repository.Scene.FindDuplicates(ctx, dist) ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff)
return err return err
}); err != nil { }); err != nil {
return nil, err return nil, err

View File

@@ -439,7 +439,7 @@ func (_m *SceneReaderWriter) FindByPerformerID(ctx context.Context, performerID
} }
// FindDuplicates provides a mock function with given fields: ctx, distance // FindDuplicates provides a mock function with given fields: ctx, distance
func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) { func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
ret := _m.Called(ctx, distance) ret := _m.Called(ctx, distance)
var r0 [][]*models.Scene var r0 [][]*models.Scene

View File

@@ -153,7 +153,7 @@ type SceneReader interface {
FindByPath(ctx context.Context, path string) ([]*Scene, error) FindByPath(ctx context.Context, path string) ([]*Scene, error)
FindByPerformerID(ctx context.Context, performerID int) ([]*Scene, error) FindByPerformerID(ctx context.Context, performerID int) ([]*Scene, error)
FindByGalleryID(ctx context.Context, performerID int) ([]*Scene, error) FindByGalleryID(ctx context.Context, performerID int) ([]*Scene, error)
FindDuplicates(ctx context.Context, distance int) ([][]*Scene, error) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*Scene, error)
GalleryIDLoader GalleryIDLoader
PerformerIDLoader PerformerIDLoader

View File

@@ -36,23 +36,38 @@ const (
) )
// findExactDuplicateQuery returns one row per phash fingerprint that is shared
// by more than one distinct scene. Each row has a single "ids" column holding
// the matching scene ids concatenated by GROUP_CONCAT; rows are ordered by the
// summed file size, largest first.
//
// In the revised (right-hand) version the query takes one positional
// parameter, ?1: the maximum allowed duration difference. A file's difference
// is measured against the longest duration among the files sharing its
// fingerprint, and a negative ?1 disables the duration filter entirely.
var findExactDuplicateQuery = ` var findExactDuplicateQuery = `
SELECT GROUP_CONCAT(scenes.id) as ids SELECT GROUP_CONCAT(DISTINCT scene_id) as ids
FROM scenes FROM (
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) SELECT scenes.id as scene_id
INNER JOIN files ON (scenes_files.file_id = files.id) , video_files.duration as file_duration
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') , files.size as file_size
GROUP BY files_fingerprints.fingerprint , files_fingerprints.fingerprint as phash
HAVING COUNT(files_fingerprints.fingerprint) > 1 AND COUNT(DISTINCT scenes.id) > 1 , abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff
ORDER BY SUM(files.size) DESC; FROM scenes
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
INNER JOIN files ON (scenes_files.file_id = files.id)
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
INNER JOIN video_files ON (files.id == video_files.file_id)
)
WHERE durationDiff <= ?1
OR ?1 < 0 -- Always TRUE if the parameter is negative.
-- That will disable the durationDiff checking.
GROUP BY phash
HAVING COUNT(phash) > 1
AND COUNT(DISTINCT scene_id) > 1
ORDER BY SUM(file_size) DESC;
` `
// findAllPhashesQuery selects one row per scene file that has a phash
// fingerprint, ordered by file size, largest first. Each row carries the scene
// id ("id") and the fingerprint ("phash"); the revised (right-hand) version
// also joins video_files and returns the file's duration ("duration").
var findAllPhashesQuery = ` var findAllPhashesQuery = `
SELECT scenes.id as id, files_fingerprints.fingerprint as phash SELECT scenes.id as id
, files_fingerprints.fingerprint as phash
, video_files.duration as duration
FROM scenes FROM scenes
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id) INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
INNER JOIN files ON (scenes_files.file_id = files.id) INNER JOIN files ON (scenes_files.file_id = files.id)
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash') INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
ORDER BY files.size DESC INNER JOIN video_files ON (files.id == video_files.file_id)
ORDER BY files.size DESC;
` `
type sceneRow struct { type sceneRow struct {
@@ -1729,11 +1744,11 @@ func (qb *SceneStore) GetStashIDs(ctx context.Context, sceneID int) ([]models.St
return qb.stashIDRepository().get(ctx, sceneID) return qb.stashIDRepository().get(ctx, sceneID)
} }
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) { func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
var dupeIds [][]int var dupeIds [][]int
if distance == 0 { if distance == 0 {
var ids []string var ids []string
if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery); err != nil { if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery, durationDiff); err != nil {
return nil, err return nil, err
} }
@@ -1755,7 +1770,8 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
if err := qb.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error { if err := qb.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error {
phash := utils.Phash{ phash := utils.Phash{
Bucket: -1, Bucket: -1,
Duration: -1,
} }
if err := rows.StructScan(&phash); err != nil { if err := rows.StructScan(&phash); err != nil {
return err return err
@@ -1767,7 +1783,7 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
return nil, err return nil, err
} }
dupeIds = utils.FindDuplicates(hashes, distance) dupeIds = utils.FindDuplicates(hashes, distance, durationDiff)
} }
var duplicates [][]*models.Scene var duplicates [][]*models.Scene

View File

@@ -4237,7 +4237,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
withRollbackTxn(func(ctx context.Context) error { withRollbackTxn(func(ctx context.Context) error {
distance := 0 distance := 0
got, err := qb.FindDuplicates(ctx, distance) durationDiff := -1.
got, err := qb.FindDuplicates(ctx, distance, durationDiff)
if err != nil { if err != nil {
t.Errorf("SceneStore.FindDuplicates() error = %v", err) t.Errorf("SceneStore.FindDuplicates() error = %v", err)
return nil return nil
@@ -4246,7 +4247,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
assert.Len(t, got, dupeScenePhashes) assert.Len(t, got, dupeScenePhashes)
distance = 1 distance = 1
got, err = qb.FindDuplicates(ctx, distance) durationDiff = -1.
got, err = qb.FindDuplicates(ctx, distance, durationDiff)
if err != nil { if err != nil {
t.Errorf("SceneStore.FindDuplicates() error = %v", err) t.Errorf("SceneStore.FindDuplicates() error = %v", err)
return nil return nil

View File

@@ -1,6 +1,7 @@
package utils package utils
import ( import (
"math"
"strconv" "strconv"
"github.com/corona10/goimagehash" "github.com/corona10/goimagehash"
@@ -8,21 +9,28 @@ import (
) )
// Phash is one scene/fingerprint row used for perceptual-duplicate matching.
// SceneID, Hash and Duration are scanned from the "id", "phash" and
// "duration" result columns; Neighbors and Bucket carry no db tag.
// FindDuplicates only compares durations when both values are greater than
// zero, and it appends indexes into its hashes slice to Neighbors.
// NOTE(review): the visible caller seeds Bucket and Duration with -1 before
// scanning; the exact meaning of Bucket is not shown here - confirm.
type Phash struct { type Phash struct {
SceneID int `db:"id"` SceneID int `db:"id"`
Hash int64 `db:"phash"` Hash int64 `db:"phash"`
Duration float64 `db:"duration"`
Neighbors []int Neighbors []int
Bucket int Bucket int
} }
func FindDuplicates(hashes []*Phash, distance int) [][]int { func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
for i, scene := range hashes { for i, scene := range hashes {
sceneHash := goimagehash.NewImageHash(uint64(scene.Hash), goimagehash.PHash) sceneHash := goimagehash.NewImageHash(uint64(scene.Hash), goimagehash.PHash)
for j, neighbor := range hashes { for j, neighbor := range hashes {
if i != j && scene.SceneID != neighbor.SceneID { if i != j && scene.SceneID != neighbor.SceneID {
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash) neighbourDurationDistance := 0.
neighborDistance, _ := sceneHash.Distance(neighborHash) if scene.Duration > 0 && neighbor.Duration > 0 {
if neighborDistance <= distance { neighbourDurationDistance = math.Abs(scene.Duration - neighbor.Duration)
scene.Neighbors = append(scene.Neighbors, j) }
if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) {
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
neighborDistance, _ := sceneHash.Distance(neighborHash)
if neighborDistance <= distance {
scene.Neighbors = append(scene.Neighbors, j)
}
} }
} }
} }

View File

@@ -41,6 +41,8 @@ import { objectTitle } from "src/core/files";
const CLASSNAME = "duplicate-checker"; const CLASSNAME = "duplicate-checker";
// Default maximum duration difference. Kept as a string because it is the
// fallback for the "durationDiff" URL query parameter and is compared against
// the select control's string value.
const defaultDurationDiff = "1";
export const SceneDuplicateChecker: React.FC = () => { export const SceneDuplicateChecker: React.FC = () => {
const intl = useIntl(); const intl = useIntl();
const history = useHistory(); const history = useHistory();
@@ -49,6 +51,9 @@ export const SceneDuplicateChecker: React.FC = () => {
const currentPage = Number.parseInt(query.get("page") ?? "1", 10); const currentPage = Number.parseInt(query.get("page") ?? "1", 10);
const pageSize = Number.parseInt(query.get("size") ?? "20", 10); const pageSize = Number.parseInt(query.get("size") ?? "20", 10);
const hashDistance = Number.parseInt(query.get("distance") ?? "0", 10); const hashDistance = Number.parseInt(query.get("distance") ?? "0", 10);
const durationDiff = Number.parseFloat(
query.get("durationDiff") ?? defaultDurationDiff
);
const [currentPageSize, setCurrentPageSize] = useState(pageSize); const [currentPageSize, setCurrentPageSize] = useState(pageSize);
const [isMultiDelete, setIsMultiDelete] = useState(false); const [isMultiDelete, setIsMultiDelete] = useState(false);
@@ -59,7 +64,10 @@ export const SceneDuplicateChecker: React.FC = () => {
); );
const { data, loading, refetch } = GQL.useFindDuplicateScenesQuery({ const { data, loading, refetch } = GQL.useFindDuplicateScenesQuery({
fetchPolicy: "no-cache", fetchPolicy: "no-cache",
variables: { distance: hashDistance }, variables: {
distance: hashDistance,
duration_diff: durationDiff,
},
}); });
const { data: missingPhash } = GQL.useFindScenesQuery({ const { data: missingPhash } = GQL.useFindScenesQuery({
variables: { variables: {
@@ -480,45 +488,91 @@ export const SceneDuplicateChecker: React.FC = () => {
<h4> <h4>
<FormattedMessage id="dupe_check.title" /> <FormattedMessage id="dupe_check.title" />
</h4> </h4>
<Form.Group> <Form>
<Row noGutters> <Form.Group>
<Form.Label> <Row noGutters>
<FormattedMessage id="dupe_check.search_accuracy_label" /> <Form.Label>
</Form.Label> <FormattedMessage id="dupe_check.search_accuracy_label" />
<Col xs={2}> </Form.Label>
<Form.Control <Col xs="auto">
as="select" <Form.Control
onChange={(e) => as="select"
setQuery({ onChange={(e) =>
distance: setQuery({
e.currentTarget.value === "0" distance:
? undefined e.currentTarget.value === "0"
: e.currentTarget.value, ? undefined
page: undefined, : e.currentTarget.value,
}) page: undefined,
} })
defaultValue={hashDistance} }
className="input-control ml-4" defaultValue={hashDistance}
> className="input-control ml-4"
<option value={0}> >
{intl.formatMessage({ id: "dupe_check.options.exact" })} <option value={0}>
</option> {intl.formatMessage({ id: "dupe_check.options.exact" })}
<option value={4}> </option>
{intl.formatMessage({ id: "dupe_check.options.high" })} <option value={4}>
</option> {intl.formatMessage({ id: "dupe_check.options.high" })}
<option value={8}> </option>
{intl.formatMessage({ id: "dupe_check.options.medium" })} <option value={8}>
</option> {intl.formatMessage({ id: "dupe_check.options.medium" })}
<option value={10}> </option>
{intl.formatMessage({ id: "dupe_check.options.low" })} <option value={10}>
</option> {intl.formatMessage({ id: "dupe_check.options.low" })}
</Form.Control> </option>
</Col> </Form.Control>
</Row> </Col>
<Form.Text> </Row>
<FormattedMessage id="dupe_check.description" /> <Form.Text>
</Form.Text> <FormattedMessage id="dupe_check.description" />
</Form.Group> </Form.Text>
</Form.Group>
<Form.Group>
<Row noGutters>
<Form.Label>
<FormattedMessage id="dupe_check.duration_diff" />
</Form.Label>
<Col xs="auto">
<Form.Control
as="select"
onChange={(e) =>
setQuery({
durationDiff:
e.currentTarget.value === defaultDurationDiff
? undefined
: e.currentTarget.value,
page: undefined,
})
}
defaultValue={durationDiff}
className="input-control ml-4"
>
<option value={-1}>
{intl.formatMessage({
id: "dupe_check.duration_options.any",
})}
</option>
<option value={0}>
{intl.formatMessage({
id: "dupe_check.duration_options.equal",
})}
</option>
<option value={1}>
1 {intl.formatMessage({ id: "second" })}
</option>
<option value={5}>
5 {intl.formatMessage({ id: "seconds" })}
</option>
<option value={10}>
10 {intl.formatMessage({ id: "seconds" })}
</option>
</Form.Control>
</Col>
</Row>
</Form.Group>
</Form>
{maybeRenderMissingPhashWarning()} {maybeRenderMissingPhashWarning()}
{renderPagination()} {renderPagination()}

View File

@@ -10,4 +10,8 @@
.separator { .separator {
height: 50px; height: 50px;
} }
.form-group .row {
align-items: center;
}
} }

View File

@@ -855,6 +855,11 @@
"donate": "Donate", "donate": "Donate",
"dupe_check": { "dupe_check": {
"description": "Levels below 'Exact' can take longer to calculate. False positives might also be returned on lower accuracy levels.", "description": "Levels below 'Exact' can take longer to calculate. False positives might also be returned on lower accuracy levels.",
"duration_diff": "Maximum Duration Difference",
"duration_options": {
"any": "Any",
"equal": "Equal"
},
"found_sets": "{setCount, plural, one{# set of duplicates found.} other {# sets of duplicates found.}}", "found_sets": "{setCount, plural, one{# set of duplicates found.} other {# sets of duplicates found.}}",
"options": { "options": {
"exact": "Exact", "exact": "Exact",
@@ -1077,6 +1082,7 @@
"saved_filters": "Saved filters", "saved_filters": "Saved filters",
"update_filter": "Update Filter" "update_filter": "Update Filter"
}, },
"second": "Second",
"seconds": "Seconds", "seconds": "Seconds",
"settings": "Settings", "settings": "Settings",
"setup": { "setup": {