Limit duplicate matching to files that have ~ same duration (#3663)

* Limit duplicate matching to files that have ~ same duration
* Add UI for duration diff
---------
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
Author: puc9
Date: 2023-05-02 22:01:59 -07:00
Committed by: GitHub
Parent: 002b71bd67
Commit: 899d1b9395
11 changed files with 177 additions and 75 deletions


@@ -20,8 +20,8 @@ query FindScenesByPathRegex($filter: FindFilterType) {
}
}
query FindDuplicateScenes($distance: Int) {
findDuplicateScenes(distance: $distance) {
query FindDuplicateScenes($distance: Int, $duration_diff: Float) {
findDuplicateScenes(distance: $distance, duration_diff: $duration_diff) {
...SlimSceneData
}
}
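For context, the updated document can be exercised with a plain GraphQL POST. Below is a minimal Go sketch (not part of the commit: the endpoint URL and port, the trimmed field selection used in place of ...SlimSceneData, and the omission of authentication are all assumptions):

package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

func main() {
    // Build the same query as above, with a reduced selection set for brevity.
    payload := map[string]interface{}{
        "query": `query FindDuplicateScenes($distance: Int, $duration_diff: Float) {
  findDuplicateScenes(distance: $distance, duration_diff: $duration_diff) {
    id
    title
  }
}`,
        "variables": map[string]interface{}{
            "distance":      0,   // exact phash match
            "duration_diff": 0.5, // only pair files whose durations differ by at most 0.5s
        },
    }

    body, _ := json.Marshal(payload)
    resp, err := http.Post("http://localhost:9999/graphql", "application/json", bytes.NewReader(body))
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    var out map[string]interface{}
    _ = json.NewDecoder(resp.Body).Decode(&out)
    fmt.Println(out["data"])
}

Omitting duration_diff (or passing a negative value) preserves the previous behaviour of matching on PHash distance alone.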


@@ -14,8 +14,16 @@ type Query {
findScenesByPathRegex(filter: FindFilterType): FindScenesResultType!
""" Returns any groups of scenes that are perceptual duplicates within the queried distance """
findDuplicateScenes(distance: Int): [[Scene!]!]!
"""
Returns any groups of scenes that are perceptual duplicates within the queried distance
and whose durations differ by no more than duration_diff
"""
findDuplicateScenes(
distance: Int,
"""Max difference in seconds between files in order to be considered for similarity matching.
Fractional seconds are ok: 0.5 will mean only files that have durations within 0.5 seconds between them will be matched based on PHash distance."""
duration_diff: Float
): [[Scene!]!]!
"""Return valid stream paths"""
sceneStreams(id: ID): [SceneStreamEndpoint!]!
@@ -295,14 +303,14 @@ type Mutation {
metadataClean(input: CleanMetadataInput!): ID!
"""Identifies scenes using scrapers. Returns the job ID"""
metadataIdentify(input: IdentifyMetadataInput!): ID!
"""Migrate generated files for the current hash naming"""
migrateHashNaming: ID!
"""Migrates legacy scene screenshot files into the blob storage"""
migrateSceneScreenshots(input: MigrateSceneScreenshotsInput!): ID!
"""Migrates blobs from the old storage system to the current one"""
migrateBlobs(input: MigrateBlobsInput!): ID!
"""Anonymise the database in a separate file. Optionally returns a link to download the database file"""
anonymiseDatabase(input: AnonymiseDatabaseInput!): String


@@ -220,13 +220,17 @@ func (r *queryResolver) ParseSceneFilenames(ctx context.Context, filter *models.
return ret, nil
}
func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int) (ret [][]*models.Scene, err error) {
func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64) (ret [][]*models.Scene, err error) {
dist := 0
durDiff := -1.
if distance != nil {
dist = *distance
}
if durationDiff != nil {
durDiff = *durationDiff
}
if err := r.withReadTxn(ctx, func(ctx context.Context) error {
ret, err = r.repository.Scene.FindDuplicates(ctx, dist)
ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff)
return err
}); err != nil {
return nil, err
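The resolver keeps the GraphQL argument optional and maps an omitted value onto a negative sentinel; a minimal sketch of that convention (hypothetical helper name, not part of the commit):

// An omitted duration_diff argument becomes -1; any negative value means
// "no duration constraint" in the store query and in utils.FindDuplicates.
func durationDiffOrDisabled(durationDiff *float64) float64 {
    if durationDiff == nil {
        return -1
    }
    return *durationDiff
}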


@@ -439,7 +439,7 @@ func (_m *SceneReaderWriter) FindByPerformerID(ctx context.Context, performerID
}
// FindDuplicates provides a mock function with given fields: ctx, distance
func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) {
func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
ret := _m.Called(ctx, distance)
var r0 [][]*models.Scene


@@ -153,7 +153,7 @@ type SceneReader interface {
FindByPath(ctx context.Context, path string) ([]*Scene, error)
FindByPerformerID(ctx context.Context, performerID int) ([]*Scene, error)
FindByGalleryID(ctx context.Context, performerID int) ([]*Scene, error)
FindDuplicates(ctx context.Context, distance int) ([][]*Scene, error)
FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*Scene, error)
GalleryIDLoader
PerformerIDLoader


@@ -36,23 +36,38 @@ const (
)
var findExactDuplicateQuery = `
SELECT GROUP_CONCAT(scenes.id) as ids
FROM scenes
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
INNER JOIN files ON (scenes_files.file_id = files.id)
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
GROUP BY files_fingerprints.fingerprint
HAVING COUNT(files_fingerprints.fingerprint) > 1 AND COUNT(DISTINCT scenes.id) > 1
ORDER BY SUM(files.size) DESC;
SELECT GROUP_CONCAT(DISTINCT scene_id) as ids
FROM (
SELECT scenes.id as scene_id
, video_files.duration as file_duration
, files.size as file_size
, files_fingerprints.fingerprint as phash
, abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff
FROM scenes
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
INNER JOIN files ON (scenes_files.file_id = files.id)
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
INNER JOIN video_files ON (files.id == video_files.file_id)
)
WHERE durationDiff <= ?1
OR ?1 < 0 -- always TRUE when the parameter is negative,
          -- which disables the durationDiff check entirely.
GROUP BY phash
HAVING COUNT(phash) > 1
AND COUNT(DISTINCT scene_id) > 1
ORDER BY SUM(file_size) DESC;
`
var findAllPhashesQuery = `
SELECT scenes.id as id, files_fingerprints.fingerprint as phash
SELECT scenes.id as id
, files_fingerprints.fingerprint as phash
, video_files.duration as duration
FROM scenes
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
INNER JOIN files ON (scenes_files.file_id = files.id)
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
INNER JOIN files ON (scenes_files.file_id = files.id)
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
ORDER BY files.size DESC
INNER JOIN video_files ON (files.id == video_files.file_id)
ORDER BY files.size DESC;
`
type sceneRow struct {
@@ -1729,11 +1744,11 @@ func (qb *SceneStore) GetStashIDs(ctx context.Context, sceneID int) ([]models.St
return qb.stashIDRepository().get(ctx, sceneID)
}
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) {
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
var dupeIds [][]int
if distance == 0 {
var ids []string
if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery); err != nil {
if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery, durationDiff); err != nil {
return nil, err
}
@@ -1755,7 +1770,8 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
if err := qb.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error {
phash := utils.Phash{
Bucket: -1,
Bucket: -1,
Duration: -1,
}
if err := rows.StructScan(&phash); err != nil {
return err
@@ -1767,7 +1783,7 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
return nil, err
}
dupeIds = utils.FindDuplicates(hashes, distance)
dupeIds = utils.FindDuplicates(hashes, distance, durationDiff)
}
var duplicates [][]*models.Scene
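Since the window-function query above is dense, here is a minimal in-memory Go sketch of the same selection logic (illustrative only; the package, type and function names are not part of the commit):

package dupes

import "math"

// phashRow mirrors the columns selected by the inner query: one row per
// scene file that has a phash fingerprint.
type phashRow struct {
    SceneID  int
    Phash    int64
    Duration float64
}

// exactDuplicateGroups reproduces the SQL in Go: group rows by phash, keep a
// row only if its duration is within durationDiff seconds of the longest file
// in its group (a negative durationDiff disables the check), and report a
// group only if more than one distinct scene remains.
func exactDuplicateGroups(rows []phashRow, durationDiff float64) [][]int {
    byPhash := make(map[int64][]phashRow)
    for _, r := range rows {
        byPhash[r.Phash] = append(byPhash[r.Phash], r)
    }

    var groups [][]int
    for _, group := range byPhash {
        // mirrors max(video_files.duration) OVER (PARTITION BY fingerprint)
        maxDur := 0.0
        for _, r := range group {
            if r.Duration > maxDur {
                maxDur = r.Duration
            }
        }

        seen := make(map[int]struct{})
        for _, r := range group {
            if durationDiff < 0 || math.Abs(maxDur-r.Duration) <= durationDiff {
                seen[r.SceneID] = struct{}{}
            }
        }
        if len(seen) > 1 {
            ids := make([]int, 0, len(seen))
            for id := range seen {
                ids = append(ids, id)
            }
            groups = append(groups, ids)
        }
    }
    return groups
}

Passing a negative durationDiff keeps every row in its group, which is why the resolver's -1 default reproduces the old behaviour.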


@@ -4237,7 +4237,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
withRollbackTxn(func(ctx context.Context) error {
distance := 0
got, err := qb.FindDuplicates(ctx, distance)
durationDiff := -1.
got, err := qb.FindDuplicates(ctx, distance, durationDiff)
if err != nil {
t.Errorf("SceneStore.FindDuplicates() error = %v", err)
return nil
@@ -4246,7 +4247,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
assert.Len(t, got, dupeScenePhashes)
distance = 1
got, err = qb.FindDuplicates(ctx, distance)
durationDiff = -1.
got, err = qb.FindDuplicates(ctx, distance, durationDiff)
if err != nil {
t.Errorf("SceneStore.FindDuplicates() error = %v", err)
return nil


@@ -1,6 +1,7 @@
package utils
import (
"math"
"strconv"
"github.com/corona10/goimagehash"
@@ -8,21 +9,28 @@ import (
)
type Phash struct {
SceneID int `db:"id"`
Hash int64 `db:"phash"`
SceneID int `db:"id"`
Hash int64 `db:"phash"`
Duration float64 `db:"duration"`
Neighbors []int
Bucket int
}
func FindDuplicates(hashes []*Phash, distance int) [][]int {
func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
for i, scene := range hashes {
sceneHash := goimagehash.NewImageHash(uint64(scene.Hash), goimagehash.PHash)
for j, neighbor := range hashes {
if i != j && scene.SceneID != neighbor.SceneID {
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
neighborDistance, _ := sceneHash.Distance(neighborHash)
if neighborDistance <= distance {
scene.Neighbors = append(scene.Neighbors, j)
neighbourDurationDistance := 0.
if scene.Duration > 0 && neighbor.Duration > 0 {
neighbourDurationDistance = math.Abs(scene.Duration - neighbor.Duration)
}
if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) {
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
neighborDistance, _ := sceneHash.Distance(neighborHash)
if neighborDistance <= distance {
scene.Neighbors = append(scene.Neighbors, j)
}
}
}
}
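A minimal sketch of the pairwise gate added above (hypothetical helper name; assumes the math import introduced in this file):

// withinDurationDiff reports whether two files are close enough in duration to
// be compared by phash distance. A negative durationDiff disables the check,
// and a file with an unknown duration (<= 0) is never excluded by it.
func withinDurationDiff(a, b, durationDiff float64) bool {
    if durationDiff < 0 {
        return true
    }
    if a <= 0 || b <= 0 {
        return true
    }
    return math.Abs(a-b) <= durationDiff
}

Because the gate runs first, the neighbor hash construction and distance computation are skipped for pairs whose durations are too far apart.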


@@ -41,6 +41,8 @@ import { objectTitle } from "src/core/files";
const CLASSNAME = "duplicate-checker";
const defaultDurationDiff = "1";
export const SceneDuplicateChecker: React.FC = () => {
const intl = useIntl();
const history = useHistory();
@@ -49,6 +51,9 @@ export const SceneDuplicateChecker: React.FC = () => {
const currentPage = Number.parseInt(query.get("page") ?? "1", 10);
const pageSize = Number.parseInt(query.get("size") ?? "20", 10);
const hashDistance = Number.parseInt(query.get("distance") ?? "0", 10);
const durationDiff = Number.parseFloat(
query.get("durationDiff") ?? defaultDurationDiff
);
const [currentPageSize, setCurrentPageSize] = useState(pageSize);
const [isMultiDelete, setIsMultiDelete] = useState(false);
@@ -59,7 +64,10 @@ export const SceneDuplicateChecker: React.FC = () => {
);
const { data, loading, refetch } = GQL.useFindDuplicateScenesQuery({
fetchPolicy: "no-cache",
variables: { distance: hashDistance },
variables: {
distance: hashDistance,
duration_diff: durationDiff,
},
});
const { data: missingPhash } = GQL.useFindScenesQuery({
variables: {
@@ -480,45 +488,91 @@ export const SceneDuplicateChecker: React.FC = () => {
<h4>
<FormattedMessage id="dupe_check.title" />
</h4>
<Form.Group>
<Row noGutters>
<Form.Label>
<FormattedMessage id="dupe_check.search_accuracy_label" />
</Form.Label>
<Col xs={2}>
<Form.Control
as="select"
onChange={(e) =>
setQuery({
distance:
e.currentTarget.value === "0"
? undefined
: e.currentTarget.value,
page: undefined,
})
}
defaultValue={hashDistance}
className="input-control ml-4"
>
<option value={0}>
{intl.formatMessage({ id: "dupe_check.options.exact" })}
</option>
<option value={4}>
{intl.formatMessage({ id: "dupe_check.options.high" })}
</option>
<option value={8}>
{intl.formatMessage({ id: "dupe_check.options.medium" })}
</option>
<option value={10}>
{intl.formatMessage({ id: "dupe_check.options.low" })}
</option>
</Form.Control>
</Col>
</Row>
<Form.Text>
<FormattedMessage id="dupe_check.description" />
</Form.Text>
</Form.Group>
<Form>
<Form.Group>
<Row noGutters>
<Form.Label>
<FormattedMessage id="dupe_check.search_accuracy_label" />
</Form.Label>
<Col xs="auto">
<Form.Control
as="select"
onChange={(e) =>
setQuery({
distance:
e.currentTarget.value === "0"
? undefined
: e.currentTarget.value,
page: undefined,
})
}
defaultValue={hashDistance}
className="input-control ml-4"
>
<option value={0}>
{intl.formatMessage({ id: "dupe_check.options.exact" })}
</option>
<option value={4}>
{intl.formatMessage({ id: "dupe_check.options.high" })}
</option>
<option value={8}>
{intl.formatMessage({ id: "dupe_check.options.medium" })}
</option>
<option value={10}>
{intl.formatMessage({ id: "dupe_check.options.low" })}
</option>
</Form.Control>
</Col>
</Row>
<Form.Text>
<FormattedMessage id="dupe_check.description" />
</Form.Text>
</Form.Group>
<Form.Group>
<Row noGutters>
<Form.Label>
<FormattedMessage id="dupe_check.duration_diff" />
</Form.Label>
<Col xs="auto">
<Form.Control
as="select"
onChange={(e) =>
setQuery({
durationDiff:
e.currentTarget.value === defaultDurationDiff
? undefined
: e.currentTarget.value,
page: undefined,
})
}
defaultValue={durationDiff}
className="input-control ml-4"
>
<option value={-1}>
{intl.formatMessage({
id: "dupe_check.duration_options.any",
})}
</option>
<option value={0}>
{intl.formatMessage({
id: "dupe_check.duration_options.equal",
})}
</option>
<option value={1}>
1 {intl.formatMessage({ id: "second" })}
</option>
<option value={5}>
5 {intl.formatMessage({ id: "seconds" })}
</option>
<option value={10}>
10 {intl.formatMessage({ id: "seconds" })}
</option>
</Form.Control>
</Col>
</Row>
</Form.Group>
</Form>
{maybeRenderMissingPhashWarning()}
{renderPagination()}


@@ -10,4 +10,8 @@
.separator {
height: 50px;
}
.form-group .row {
align-items: center;
}
}


@@ -855,6 +855,11 @@
"donate": "Donate",
"dupe_check": {
"description": "Levels below 'Exact' can take longer to calculate. False positives might also be returned on lower accuracy levels.",
"duration_diff": "Maximum Duration Difference",
"duration_options": {
"any": "Any",
"equal": "Equal"
},
"found_sets": "{setCount, plural, one{# set of duplicates found.} other {# sets of duplicates found.}}",
"options": {
"exact": "Exact",
@@ -1077,6 +1082,7 @@
"saved_filters": "Saved filters",
"update_filter": "Update Filter"
},
"second": "Second",
"seconds": "Seconds",
"settings": "Settings",
"setup": {