// chunkSize is the number of bytes taken from each end of a file when
// hashing: 64 KiB.
const chunkSize int64 = 64 * 1024

// oshash computes the hash value used by OSHashFromFilePath from data that
// has already been read.
//
// size is the total file size in bytes, head is the leading chunk of the
// file and tail is the trailing chunk. The two chunks are interpreted as a
// sequence of little-endian uint64 words which are summed together with
// size; the sum wraps around on overflow. Any trailing bytes that do not
// fill a whole 8-byte word are ignored.
//
// The result is the sum formatted as a 16-digit, zero-padded, lower-case
// hexadecimal string. A non-nil error is returned only if decoding the
// chunks fails. Neither head nor tail is modified.
func oshash(size int64, head []byte, tail []byte) (string, error) {
	// Concatenate into a freshly allocated buffer. A bare
	// append(head, tail...) would write tail into head's backing array
	// whenever head has spare capacity, clobbering the caller's data.
	buf := make([]byte, 0, len(head)+len(tail))
	buf = append(buf, head...)
	buf = append(buf, tail...)

	// Decode the bytes as little-endian 64-bit words.
	words := make([]uint64, len(buf)/8)
	if err := binary.Read(bytes.NewReader(buf), binary.LittleEndian, &words); err != nil {
		return "", fmt.Errorf("decoding oshash chunks: %w", err)
	}

	// Seed the sum with the file size, then add every word. Unsigned
	// arithmetic wraps modulo 2^64, which is the intended behaviour.
	sum := uint64(size)
	for _, w := range words {
		sum += w
	}

	return fmt.Sprintf("%016x", sum), nil
}
- - // convert bytes into uint64 - ints := make([]uint64, len(buf)/8) - reader := bytes.NewReader(buf) - err = binary.Read(reader, binary.LittleEndian, &ints) - if err != nil { - return "", err - } - - // sum the integers - var sum uint64 - for _, v := range ints { - sum += v - } - - // add the filesize - sum += uint64(fileSize) - - // output as hex - return fmt.Sprintf("%016x", sum), nil + return oshash(fileSize, head, tail) } diff --git a/pkg/utils/oshash_internal_test.go b/pkg/utils/oshash_internal_test.go new file mode 100644 index 000000000..592d4314c --- /dev/null +++ b/pkg/utils/oshash_internal_test.go @@ -0,0 +1,46 @@ +package utils + +import ( + "testing" +) + +// Note that the public API returns "" instead. +func TestOshashEmpty(t *testing.T) { + var size int64 = 0 + head := make([]byte, chunkSize) + tail := make([]byte, chunkSize) + want := "0000000000000000" + got, err := oshash(size, head, tail) + if err != nil { + t.Errorf("TestOshashEmpty: Error from oshash: %w", err) + } + if got != want { + t.Errorf("TestOshashEmpty: oshash(0, 0, 0) = %q; want %q", got, want) + } +} + +// As oshash sums byte values, causing collisions is trivial. +func TestOshashCollisions(t *testing.T) { + buf1 := []byte("this is dumb") + buf2 := []byte("dumb is this") + var size int64 = int64(len(buf1)) + head := make([]byte, chunkSize) + + tail1 := make([]byte, chunkSize) + copy(tail1[len(tail1)-len(buf1):], buf1) + hash1, err := oshash(size, head, tail1) + if err != nil { + t.Errorf("TestOshashCollisions: Error from oshash: %v", err) + } + + tail2 := make([]byte, chunkSize) + copy(tail2[len(tail2)-len(buf2):], buf2) + hash2, err := oshash(size, head, tail2) + if err != nil { + t.Errorf("TestOshashCollisions: Error from oshash: %v", err) + } + + if hash1 != hash2 { + t.Errorf("TestOshashCollisions: oshash(n, k, ... %v) =! oshash(n, k, ... %v)", buf1, buf2) + } +}