mirror of
https://github.com/stashapp/stash.git
synced 2025-12-17 20:34:37 +03:00
Limit duplicate matching to files that have ~ same duration (#3663)
* Limit duplicate matching to files that have approximately the same duration
* Add UI for duration diff
---------
Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
This commit is contained in:
@@ -20,8 +20,8 @@ query FindScenesByPathRegex($filter: FindFilterType) {
|
||||
}
|
||||
}
|
||||
|
||||
query FindDuplicateScenes($distance: Int) {
|
||||
findDuplicateScenes(distance: $distance) {
|
||||
query FindDuplicateScenes($distance: Int, $duration_diff: Float) {
|
||||
findDuplicateScenes(distance: $distance, duration_diff: $duration_diff) {
|
||||
...SlimSceneData
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,8 +14,16 @@ type Query {
|
||||
|
||||
findScenesByPathRegex(filter: FindFilterType): FindScenesResultType!
|
||||
|
||||
""" Returns any groups of scenes that are perceptual duplicates within the queried distance """
|
||||
findDuplicateScenes(distance: Int): [[Scene!]!]!
|
||||
"""
|
||||
Returns any groups of scenes that are perceptual duplicates within the queried distance
|
||||
and whose durations differ by at most duration_diff seconds (a negative or omitted duration_diff disables the duration check)
|
||||
"""
|
||||
findDuplicateScenes(
|
||||
distance: Int,
|
||||
"""Max difference in seconds between files in order to be considered for similarity matching.
|
||||
Fractional seconds are ok: 0.5 will mean only files that have durations within 0.5 seconds between them will be matched based on PHash distance."""
|
||||
duration_diff: Float
|
||||
): [[Scene!]!]!
|
||||
|
||||
"""Return valid stream paths"""
|
||||
sceneStreams(id: ID): [SceneStreamEndpoint!]!
|
||||
|
||||
@@ -220,13 +220,17 @@ func (r *queryResolver) ParseSceneFilenames(ctx context.Context, filter *models.
|
||||
return ret, nil
|
||||
}
|
||||
|
||||
func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int) (ret [][]*models.Scene, err error) {
|
||||
func (r *queryResolver) FindDuplicateScenes(ctx context.Context, distance *int, durationDiff *float64) (ret [][]*models.Scene, err error) {
|
||||
dist := 0
|
||||
durDiff := -1.
|
||||
if distance != nil {
|
||||
dist = *distance
|
||||
}
|
||||
if durationDiff != nil {
|
||||
durDiff = *durationDiff
|
||||
}
|
||||
if err := r.withReadTxn(ctx, func(ctx context.Context) error {
|
||||
ret, err = r.repository.Scene.FindDuplicates(ctx, dist)
|
||||
ret, err = r.repository.Scene.FindDuplicates(ctx, dist, durDiff)
|
||||
return err
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
|
||||
@@ -439,7 +439,7 @@ func (_m *SceneReaderWriter) FindByPerformerID(ctx context.Context, performerID
|
||||
}
|
||||
|
||||
// FindDuplicates provides a mock function with given fields: ctx, distance
|
||||
func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) {
|
||||
func (_m *SceneReaderWriter) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
|
||||
ret := _m.Called(ctx, distance)
|
||||
|
||||
var r0 [][]*models.Scene
|
||||
|
||||
@@ -153,7 +153,7 @@ type SceneReader interface {
|
||||
FindByPath(ctx context.Context, path string) ([]*Scene, error)
|
||||
FindByPerformerID(ctx context.Context, performerID int) ([]*Scene, error)
|
||||
FindByGalleryID(ctx context.Context, performerID int) ([]*Scene, error)
|
||||
FindDuplicates(ctx context.Context, distance int) ([][]*Scene, error)
|
||||
FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*Scene, error)
|
||||
|
||||
GalleryIDLoader
|
||||
PerformerIDLoader
|
||||
|
||||
@@ -36,23 +36,38 @@ const (
|
||||
)
|
||||
|
||||
var findExactDuplicateQuery = `
|
||||
SELECT GROUP_CONCAT(scenes.id) as ids
|
||||
SELECT GROUP_CONCAT(DISTINCT scene_id) as ids
|
||||
FROM (
|
||||
SELECT scenes.id as scene_id
|
||||
, video_files.duration as file_duration
|
||||
, files.size as file_size
|
||||
, files_fingerprints.fingerprint as phash
|
||||
, abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff
|
||||
FROM scenes
|
||||
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
|
||||
INNER JOIN files ON (scenes_files.file_id = files.id)
|
||||
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
|
||||
GROUP BY files_fingerprints.fingerprint
|
||||
HAVING COUNT(files_fingerprints.fingerprint) > 1 AND COUNT(DISTINCT scenes.id) > 1
|
||||
ORDER BY SUM(files.size) DESC;
|
||||
INNER JOIN video_files ON (files.id == video_files.file_id)
|
||||
)
|
||||
WHERE durationDiff <= ?1
|
||||
OR ?1 < 0 -- Always TRUE if the parameter is negative.
|
||||
-- That will disable the durationDiff checking.
|
||||
GROUP BY phash
|
||||
HAVING COUNT(phash) > 1
|
||||
AND COUNT(DISTINCT scene_id) > 1
|
||||
ORDER BY SUM(file_size) DESC;
|
||||
`
|
||||
|
||||
var findAllPhashesQuery = `
|
||||
SELECT scenes.id as id, files_fingerprints.fingerprint as phash
|
||||
SELECT scenes.id as id
|
||||
, files_fingerprints.fingerprint as phash
|
||||
, video_files.duration as duration
|
||||
FROM scenes
|
||||
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
|
||||
INNER JOIN files ON (scenes_files.file_id = files.id)
|
||||
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
|
||||
ORDER BY files.size DESC
|
||||
INNER JOIN video_files ON (files.id == video_files.file_id)
|
||||
ORDER BY files.size DESC;
|
||||
`
|
||||
|
||||
type sceneRow struct {
|
||||
@@ -1729,11 +1744,11 @@ func (qb *SceneStore) GetStashIDs(ctx context.Context, sceneID int) ([]models.St
|
||||
return qb.stashIDRepository().get(ctx, sceneID)
|
||||
}
|
||||
|
||||
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) {
|
||||
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
|
||||
var dupeIds [][]int
|
||||
if distance == 0 {
|
||||
var ids []string
|
||||
if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery); err != nil {
|
||||
if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery, durationDiff); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -1756,6 +1771,7 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
|
||||
if err := qb.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error {
|
||||
phash := utils.Phash{
|
||||
Bucket: -1,
|
||||
Duration: -1,
|
||||
}
|
||||
if err := rows.StructScan(&phash); err != nil {
|
||||
return err
|
||||
@@ -1767,7 +1783,7 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dupeIds = utils.FindDuplicates(hashes, distance)
|
||||
dupeIds = utils.FindDuplicates(hashes, distance, durationDiff)
|
||||
}
|
||||
|
||||
var duplicates [][]*models.Scene
|
||||
|
||||
@@ -4237,7 +4237,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
|
||||
|
||||
withRollbackTxn(func(ctx context.Context) error {
|
||||
distance := 0
|
||||
got, err := qb.FindDuplicates(ctx, distance)
|
||||
durationDiff := -1.
|
||||
got, err := qb.FindDuplicates(ctx, distance, durationDiff)
|
||||
if err != nil {
|
||||
t.Errorf("SceneStore.FindDuplicates() error = %v", err)
|
||||
return nil
|
||||
@@ -4246,7 +4247,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
|
||||
assert.Len(t, got, dupeScenePhashes)
|
||||
|
||||
distance = 1
|
||||
got, err = qb.FindDuplicates(ctx, distance)
|
||||
durationDiff = -1.
|
||||
got, err = qb.FindDuplicates(ctx, distance, durationDiff)
|
||||
if err != nil {
|
||||
t.Errorf("SceneStore.FindDuplicates() error = %v", err)
|
||||
return nil
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"math"
|
||||
"strconv"
|
||||
|
||||
"github.com/corona10/goimagehash"
|
||||
@@ -10,15 +11,21 @@ import (
|
||||
type Phash struct {
|
||||
SceneID int `db:"id"`
|
||||
Hash int64 `db:"phash"`
|
||||
Duration float64 `db:"duration"`
|
||||
Neighbors []int
|
||||
Bucket int
|
||||
}
|
||||
|
||||
func FindDuplicates(hashes []*Phash, distance int) [][]int {
|
||||
func FindDuplicates(hashes []*Phash, distance int, durationDiff float64) [][]int {
|
||||
for i, scene := range hashes {
|
||||
sceneHash := goimagehash.NewImageHash(uint64(scene.Hash), goimagehash.PHash)
|
||||
for j, neighbor := range hashes {
|
||||
if i != j && scene.SceneID != neighbor.SceneID {
|
||||
neighbourDurationDistance := 0.
|
||||
if scene.Duration > 0 && neighbor.Duration > 0 {
|
||||
neighbourDurationDistance = math.Abs(scene.Duration - neighbor.Duration)
|
||||
}
|
||||
if (neighbourDurationDistance <= durationDiff) || (durationDiff < 0) {
|
||||
neighborHash := goimagehash.NewImageHash(uint64(neighbor.Hash), goimagehash.PHash)
|
||||
neighborDistance, _ := sceneHash.Distance(neighborHash)
|
||||
if neighborDistance <= distance {
|
||||
@@ -27,6 +34,7 @@ func FindDuplicates(hashes []*Phash, distance int) [][]int {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var buckets [][]int
|
||||
for _, scene := range hashes {
|
||||
|
||||
@@ -41,6 +41,8 @@ import { objectTitle } from "src/core/files";
|
||||
|
||||
const CLASSNAME = "duplicate-checker";
|
||||
|
||||
const defaultDurationDiff = "1";
|
||||
|
||||
export const SceneDuplicateChecker: React.FC = () => {
|
||||
const intl = useIntl();
|
||||
const history = useHistory();
|
||||
@@ -49,6 +51,9 @@ export const SceneDuplicateChecker: React.FC = () => {
|
||||
const currentPage = Number.parseInt(query.get("page") ?? "1", 10);
|
||||
const pageSize = Number.parseInt(query.get("size") ?? "20", 10);
|
||||
const hashDistance = Number.parseInt(query.get("distance") ?? "0", 10);
|
||||
const durationDiff = Number.parseFloat(
|
||||
query.get("durationDiff") ?? defaultDurationDiff
|
||||
);
|
||||
|
||||
const [currentPageSize, setCurrentPageSize] = useState(pageSize);
|
||||
const [isMultiDelete, setIsMultiDelete] = useState(false);
|
||||
@@ -59,7 +64,10 @@ export const SceneDuplicateChecker: React.FC = () => {
|
||||
);
|
||||
const { data, loading, refetch } = GQL.useFindDuplicateScenesQuery({
|
||||
fetchPolicy: "no-cache",
|
||||
variables: { distance: hashDistance },
|
||||
variables: {
|
||||
distance: hashDistance,
|
||||
duration_diff: durationDiff,
|
||||
},
|
||||
});
|
||||
const { data: missingPhash } = GQL.useFindScenesQuery({
|
||||
variables: {
|
||||
@@ -480,12 +488,13 @@ export const SceneDuplicateChecker: React.FC = () => {
|
||||
<h4>
|
||||
<FormattedMessage id="dupe_check.title" />
|
||||
</h4>
|
||||
<Form>
|
||||
<Form.Group>
|
||||
<Row noGutters>
|
||||
<Form.Label>
|
||||
<FormattedMessage id="dupe_check.search_accuracy_label" />
|
||||
</Form.Label>
|
||||
<Col xs={2}>
|
||||
<Col xs="auto">
|
||||
<Form.Control
|
||||
as="select"
|
||||
onChange={(e) =>
|
||||
@@ -520,6 +529,51 @@ export const SceneDuplicateChecker: React.FC = () => {
|
||||
</Form.Text>
|
||||
</Form.Group>
|
||||
|
||||
<Form.Group>
|
||||
<Row noGutters>
|
||||
<Form.Label>
|
||||
<FormattedMessage id="dupe_check.duration_diff" />
|
||||
</Form.Label>
|
||||
<Col xs="auto">
|
||||
<Form.Control
|
||||
as="select"
|
||||
onChange={(e) =>
|
||||
setQuery({
|
||||
durationDiff:
|
||||
e.currentTarget.value === defaultDurationDiff
|
||||
? undefined
|
||||
: e.currentTarget.value,
|
||||
page: undefined,
|
||||
})
|
||||
}
|
||||
defaultValue={durationDiff}
|
||||
className="input-control ml-4"
|
||||
>
|
||||
<option value={-1}>
|
||||
{intl.formatMessage({
|
||||
id: "dupe_check.duration_options.any",
|
||||
})}
|
||||
</option>
|
||||
<option value={0}>
|
||||
{intl.formatMessage({
|
||||
id: "dupe_check.duration_options.equal",
|
||||
})}
|
||||
</option>
|
||||
<option value={1}>
|
||||
1 {intl.formatMessage({ id: "second" })}
|
||||
</option>
|
||||
<option value={5}>
|
||||
5 {intl.formatMessage({ id: "seconds" })}
|
||||
</option>
|
||||
<option value={10}>
|
||||
10 {intl.formatMessage({ id: "seconds" })}
|
||||
</option>
|
||||
</Form.Control>
|
||||
</Col>
|
||||
</Row>
|
||||
</Form.Group>
|
||||
</Form>
|
||||
|
||||
{maybeRenderMissingPhashWarning()}
|
||||
{renderPagination()}
|
||||
|
||||
|
||||
@@ -10,4 +10,8 @@
|
||||
.separator {
|
||||
height: 50px;
|
||||
}
|
||||
|
||||
.form-group .row {
|
||||
align-items: center;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -855,6 +855,11 @@
|
||||
"donate": "Donate",
|
||||
"dupe_check": {
|
||||
"description": "Levels below 'Exact' can take longer to calculate. False positives might also be returned on lower accuracy levels.",
|
||||
"duration_diff": "Maximum Duration Difference",
|
||||
"duration_options": {
|
||||
"any": "Any",
|
||||
"equal": "Equal"
|
||||
},
|
||||
"found_sets": "{setCount, plural, one{# set of duplicates found.} other {# sets of duplicates found.}}",
|
||||
"options": {
|
||||
"exact": "Exact",
|
||||
@@ -1077,6 +1082,7 @@
|
||||
"saved_filters": "Saved filters",
|
||||
"update_filter": "Update Filter"
|
||||
},
|
||||
"second": "Second",
|
||||
"seconds": "Seconds",
|
||||
"settings": "Settings",
|
||||
"setup": {
|
||||
|
||||
Reference in New Issue
Block a user