mirror of
https://github.com/stashapp/stash.git
synced 2025-12-18 04:44:37 +03:00
Limit duplicate matching to files that have ~ same duration (#3663)
* Limit duplicate matching to files that have ~ same duration * Add UI for duration diff --------- Co-authored-by: WithoutPants <53250216+WithoutPants@users.noreply.github.com>
This commit is contained in:
@@ -36,23 +36,38 @@ const (
|
||||
)
|
||||
|
||||
var findExactDuplicateQuery = `
|
||||
SELECT GROUP_CONCAT(scenes.id) as ids
|
||||
FROM scenes
|
||||
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
|
||||
INNER JOIN files ON (scenes_files.file_id = files.id)
|
||||
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
|
||||
GROUP BY files_fingerprints.fingerprint
|
||||
HAVING COUNT(files_fingerprints.fingerprint) > 1 AND COUNT(DISTINCT scenes.id) > 1
|
||||
ORDER BY SUM(files.size) DESC;
|
||||
SELECT GROUP_CONCAT(DISTINCT scene_id) as ids
|
||||
FROM (
|
||||
SELECT scenes.id as scene_id
|
||||
, video_files.duration as file_duration
|
||||
, files.size as file_size
|
||||
, files_fingerprints.fingerprint as phash
|
||||
, abs(max(video_files.duration) OVER (PARTITION by files_fingerprints.fingerprint) - video_files.duration) as durationDiff
|
||||
FROM scenes
|
||||
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
|
||||
INNER JOIN files ON (scenes_files.file_id = files.id)
|
||||
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
|
||||
INNER JOIN video_files ON (files.id == video_files.file_id)
|
||||
)
|
||||
WHERE durationDiff <= ?1
|
||||
OR ?1 < 0 -- Always TRUE if the parameter is negative.
|
||||
-- That will disable the durationDiff checking.
|
||||
GROUP BY phash
|
||||
HAVING COUNT(phash) > 1
|
||||
AND COUNT(DISTINCT scene_id) > 1
|
||||
ORDER BY SUM(file_size) DESC;
|
||||
`
|
||||
|
||||
var findAllPhashesQuery = `
|
||||
SELECT scenes.id as id, files_fingerprints.fingerprint as phash
|
||||
SELECT scenes.id as id
|
||||
, files_fingerprints.fingerprint as phash
|
||||
, video_files.duration as duration
|
||||
FROM scenes
|
||||
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
|
||||
INNER JOIN files ON (scenes_files.file_id = files.id)
|
||||
INNER JOIN scenes_files ON (scenes.id = scenes_files.scene_id)
|
||||
INNER JOIN files ON (scenes_files.file_id = files.id)
|
||||
INNER JOIN files_fingerprints ON (scenes_files.file_id = files_fingerprints.file_id AND files_fingerprints.type = 'phash')
|
||||
ORDER BY files.size DESC
|
||||
INNER JOIN video_files ON (files.id == video_files.file_id)
|
||||
ORDER BY files.size DESC;
|
||||
`
|
||||
|
||||
type sceneRow struct {
|
||||
@@ -1729,11 +1744,11 @@ func (qb *SceneStore) GetStashIDs(ctx context.Context, sceneID int) ([]models.St
|
||||
return qb.stashIDRepository().get(ctx, sceneID)
|
||||
}
|
||||
|
||||
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*models.Scene, error) {
|
||||
func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int, durationDiff float64) ([][]*models.Scene, error) {
|
||||
var dupeIds [][]int
|
||||
if distance == 0 {
|
||||
var ids []string
|
||||
if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery); err != nil {
|
||||
if err := qb.tx.Select(ctx, &ids, findExactDuplicateQuery, durationDiff); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -1755,7 +1770,8 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
|
||||
|
||||
if err := qb.queryFunc(ctx, findAllPhashesQuery, nil, false, func(rows *sqlx.Rows) error {
|
||||
phash := utils.Phash{
|
||||
Bucket: -1,
|
||||
Bucket: -1,
|
||||
Duration: -1,
|
||||
}
|
||||
if err := rows.StructScan(&phash); err != nil {
|
||||
return err
|
||||
@@ -1767,7 +1783,7 @@ func (qb *SceneStore) FindDuplicates(ctx context.Context, distance int) ([][]*mo
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dupeIds = utils.FindDuplicates(hashes, distance)
|
||||
dupeIds = utils.FindDuplicates(hashes, distance, durationDiff)
|
||||
}
|
||||
|
||||
var duplicates [][]*models.Scene
|
||||
|
||||
@@ -4237,7 +4237,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
|
||||
|
||||
withRollbackTxn(func(ctx context.Context) error {
|
||||
distance := 0
|
||||
got, err := qb.FindDuplicates(ctx, distance)
|
||||
durationDiff := -1.
|
||||
got, err := qb.FindDuplicates(ctx, distance, durationDiff)
|
||||
if err != nil {
|
||||
t.Errorf("SceneStore.FindDuplicates() error = %v", err)
|
||||
return nil
|
||||
@@ -4246,7 +4247,8 @@ func TestSceneStore_FindDuplicates(t *testing.T) {
|
||||
assert.Len(t, got, dupeScenePhashes)
|
||||
|
||||
distance = 1
|
||||
got, err = qb.FindDuplicates(ctx, distance)
|
||||
durationDiff = -1.
|
||||
got, err = qb.FindDuplicates(ctx, distance, durationDiff)
|
||||
if err != nil {
|
||||
t.Errorf("SceneStore.FindDuplicates() error = %v", err)
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user