Browse Source

add precision tests against dataset

main
Rigel Kent 2 weeks ago
parent
commit
0a5be0ff1d
No known key found for this signature in database GPG Key ID: 5E53E96A494E452F
7 changed files with 139 additions and 72 deletions
  1. +7
    -0
      LICENSE
  2. +8
    -0
      README.md
  3. +1
    -1
      download_dataset.sh
  4. +72
    -0
      pyvideohash/__init__.py
  5. +10
    -59
      pyvideohash/cli.py
  6. +11
    -12
      tests/test_cli.py
  7. +30
    -0
      tests/test_precision.py

+ 7
- 0
LICENSE View File

@@ -0,0 +1,7 @@
Copyright © 2020 <copyright holders>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 8
- 0
README.md View File

@@ -2,3 +2,11 @@

Perceptual hashing for videos

## Precision

Precision is verified against the first query (a small subset) of the [CC_WEB_VIDEO](http://vireo.cs.cityu.edu.hk/webvideo/) dataset. Download that subset first with the `download_dataset.sh` script in the repository root, provided you agree to the dataset's terms and respect its license.

## License

The code is MIT-licensed (see the `LICENSE` file); the files related to the verification dataset remain under their own copyright. Copyright of any videos in the dataset belongs entirely to their owners.


+ 1
- 1
download_dataset.sh View File

@@ -1 +1 @@
# Build one download URL per row from tab-separated fields 2 and 4 of
# tests/dataset/Video_List.txt, keep only the first N URLs (`head -n`), and hand
# them to wget on stdin (`-i -`) to be saved under tests/dataset/videos.
# NOTE(review): the two lines below differ only in that limit (24 vs 50); they
# look like the before/after versions of the same command shown together.
cut -d$'\t' -f2,4 tests/dataset/Video_List.txt | awk '{$1=$1;printf "http://vireo.cs.cityu.edu.hk/webvideo/videos/%s/%s\n", $1, $2}' | head -n 24 | wget -N --progress=bar --random-wait -P tests/dataset/videos -i -
cut -d$'\t' -f2,4 tests/dataset/Video_List.txt | awk '{$1=$1;printf "http://vireo.cs.cityu.edu.hk/webvideo/videos/%s/%s\n", $1, $2}' | head -n 50 | wget -N --progress=bar --random-wait -P tests/dataset/videos -i -

+ 72
- 0
pyvideohash/__init__.py View File

@@ -0,0 +1,72 @@
import sys
import hashlib
import imagehash
import numpy as np
from moviepy.editor import VideoFileClip
from PIL import Image


def hash_file_crypto(filename, blocksize=2**20):
    """Return the hexadecimal SHA-256 digest of the file at *filename*.

    The file is opened in binary mode and consumed in chunks of *blocksize*
    bytes, so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(filename, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()


def hash_frame(frame):
    """Return the wavelet perceptual hash of one video frame.

    The frame is cast to unsigned 8-bit values, wrapped in a PIL image
    converted to RGB, and hashed with ``imagehash.whash`` using
    ``hash_size=16``.
    """
    pixels = np.uint8(frame)
    rgb_image = Image.fromarray(pixels).convert('RGB')
    return imagehash.whash(rgb_image, hash_size=16)

def compare_files(
    original_file,
    new_file,
    use_stdout=False,
    fps=1.0,  # take one frame per second
    up_to=5 / 100  # up to 5% of hash can be different btw two similar images / anything higher will prove videos to be different
):
    """Perceptually compare two videos, which may differ in resolution.

    Frames are sampled from both files at *fps* frames per second, each frame
    is hashed with ``hash_frame``, and the hashes are compared pairwise in
    order. Each pairwise difference is divided by the squared length of the
    hash array. The reported ratio is the smallest of the six largest
    normalised differences, or 0.0 when there is no frame pair to compare.

    Args:
        original_file: Path of the reference video.
        new_file: Path of the video compared against the reference.
        use_stdout: When true, print the ratio to standard output.
        fps: Sampling rate handed to ``VideoFileClip.iter_frames``.
        up_to: Currently unused; kept so existing callers keep working.

    Returns:
        A dict of booleans keyed by one-letter label: ``'E'`` (ratio == 0.0),
        ``'S'`` (ratio <= 0.3), ``'V'`` (0.3 < ratio <= 0.4),
        ``'M'`` (0.4 < ratio <= 0.5), ``'X'`` (ratio > 0.5) and ``'L'``
        (the two videos yielded a different number of sampled frames).
        NOTE(review): the letters presumably mirror the dataset's
        ground-truth codes -- confirm against the dataset documentation.
    """
    def hash_values_of(filepath):
        # Honour the caller's sampling rate instead of a hard-coded 1.
        with VideoFileClip(filepath) as clip:
            return [
                hash_frame(frame)
                for frame in clip.iter_frames(fps=fps, dtype=float)
            ]

    def classify(ratio, longer):
        # Map a difference ratio (and the length flag) onto the label dict.
        return {
            'E': ratio == 0.0,
            'S': ratio <= 0.3,
            'V': 0.3 < ratio <= 0.4,
            'M': 0.4 < ratio <= 0.5,
            'L': longer,
            'X': ratio > 0.5
        }

    # Digests are strings and must be compared by value; `is` on two freshly
    # built hex strings is never true, so this shortcut used to be dead code.
    if hash_file_crypto(new_file) == hash_file_crypto(original_file):
        # Byte-identical files: return the zero-difference result directly
        # rather than terminating the interpreter from library code.
        if use_stdout:
            print(0.0)
        return classify(0.0, False)

    original_file_hashes = hash_values_of(original_file)
    new_file_hashes = hash_values_of(new_file)  # even if longer, will have same similarity

    # Compare frame counts by value; identity comparison of ints only happens
    # to work for small numbers.
    # video is not of same length, video is probably radically different
    longer = len(original_file_hashes) != len(new_file_hashes)

    differences = []
    if original_file_hashes and new_file_hashes:
        # Normalisation factor is loop-invariant, so compute it once.
        hash_bits = len(new_file_hashes[0].hash) ** 2
        differences = [
            (original_file_hash - new_file_hash) / hash_bits
            for original_file_hash, new_file_hash in zip(original_file_hashes, new_file_hashes)
        ]

    # Keep the int(64/10) == 6 largest differences and report the smallest of them.
    # NOTE(review): the origin of the 64/10 constant is not visible here -- confirm.
    max_differences = sorted(differences)[-int(64 / 10):]

    ratio = min(max_differences) if max_differences else 0.0
    if use_stdout:
        print(ratio)

    return classify(ratio, longer)

+ 10
- 59
pyvideohash/cli.py View File

@@ -1,71 +1,22 @@
import sys
import click
import hashlib
import imagehash
import numpy as np
from moviepy.editor import VideoFileClip
from PIL import Image
def generate_file_sha256(filename, blocksize=2**20):
    """Return the hexadecimal SHA-256 digest of the file at *filename*.

    The file is read in binary mode, *blocksize* bytes at a time, until a
    read returns no data.
    """
    hasher = hashlib.sha256()
    with open(filename, "rb") as handle:
        chunk = handle.read(blocksize)
        while chunk:
            hasher.update(chunk)
            chunk = handle.read(blocksize)
    return hasher.hexdigest()
def hash_frame(frame):
    """Return the wavelet perceptual hash of one video frame.

    The frame is cast to unsigned 8-bit values, wrapped in a PIL image
    converted to RGB, and hashed with ``imagehash.whash`` at its default
    hash size.
    """
    as_uint8 = np.uint8(frame)
    rgb = Image.fromarray(as_uint8).convert('RGB')
    return imagehash.whash(rgb)
from . import compare_files
# NOTE(review): this span is a rendered diff of cli.py in which removed and added
# lines appear together without +/- markers (e.g. both the `original_files` and the
# `original_file` argument, and two calls under the __main__ guard), so it is not
# runnable exactly as shown. The comments below describe only what each visible
# line does.
@click.command()
@click.argument('original_files', nargs=-1, type=click.Path(exists=True, readable=True, file_okay=True, resolve_path=True))
@click.argument('original_file', nargs=1, type=click.Path(exists=True, readable=True, file_okay=True, resolve_path=True))
@click.argument('new_file', nargs=1, type=click.Path(exists=True, readable=True, file_okay=True, resolve_path=True))
@click.option('use_stdout', '--silent', is_flag=True, default=True)
@click.option('--no-progressbar', is_flag=True, default=False)
def main(
original_files,
original_file,
new_file,
use_stdout=False,
no_progressbar=False,
fps=1.0, # take one frame per second
up_to=5 / 100 # up to 5% of hash can be different btw two similar images / anything higher will prove videos to be different
use_stdout
):
"""Perceptual comparison of two videos, with potential resolutions"""
# Show moviepy's progress bar only when printing is enabled and it was not disabled.
logger = "bar" if use_stdout and not no_progressbar else None
# Hash every frame that the clip yields at the requested sampling rate.
def hash_values_of(filepath):
clip = VideoFileClip(filepath)
return [hash_frame(f) for f in clip.iter_frames(fps, dtype=float, logger=logger)]
# Exit successfully when the new file has the same SHA-256 as any of the originals.
if generate_file_sha256(new_file) in [generate_file_sha256(f) for f in original_files]:
sys.exit(0)
new_file_hashes = hash_values_of(new_file)
# For each original, record the largest per-frame hash difference against the new file.
differences = []
for file_hashes in [hash_values_of(f) for f in original_files]:
# NOTE(review): `is not` compares int identity rather than value; the branch is a no-op anyway.
if len(file_hashes) is not len(new_file_hashes):
# video is not of same length, video is probably radically different
pass
#sys.exit(1)
max_diff_local = 0
for file_hash, new_file_hash in zip(file_hashes, new_file_hashes):
max_diff_local = max(file_hash - new_file_hash, max_diff_local)
differences.append(max_diff_local)
# Smallest of those maxima, divided by the squared length of the first frame hash's array.
ratio = min(differences)/len(new_file_hashes[0].hash)**2
if use_stdout:
print(ratio)
sys.exit(0)
# Delegate the comparison to compare_files (imported from the package); its return value is not used here.
compare_files(
original_file,
new_file,
use_stdout
)
# NOTE(review): the names passed to main() below are not defined at module scope in
# the visible lines -- confirm how the command is meant to be invoked as a script.
if __name__ == '__main__':
main(original_files, new_file, use_stdout=True)
main(original_file, new_file, use_stdout=True)

+ 11
- 12
tests/test_cli.py View File

@@ -7,6 +7,7 @@ from expecter import expect
from click.testing import CliRunner
import pandas as pd

from pyvideohash import compare_files
from pyvideohash.cli import main


@@ -16,7 +17,7 @@ def runner():

# Fixture reading tests/dataset/GT/GT_1.rst as a tab-separated table without a header row.
# NOTE(review): the two `return` lines look like the removed and the added version of the
# same statement shown together by the diff view; the second one additionally uses
# column 0 as the index and selects column 1.
@pytest.fixture
def db():
return pd.read_csv(Path('tests', 'dataset', 'GT', 'GT_1.rst'), sep='\t', header=None)
return pd.read_csv(Path('tests', 'dataset', 'GT', 'GT_1.rst'), sep='\t', index_col=0, header=None)[1]

class VideoIsClassifiedAs(Enum):
EXACT_DUPLICATE = 'E'
@@ -32,17 +33,15 @@ def file(number):

# Tests that invoke the click command `main` through the `runner` fixture.
# NOTE(review): removed and added diff lines are interleaved below, so each test
# appears twice in slightly different forms; comments describe the visible lines only.
def describe_cli():

def describe_query_1():
def when_seed(runner):
result = runner.invoke(main, [file(1), file(1)])

# Comparing video 1 with itself: exit code 0 and a printed ratio of 0.0.
def when_seed(runner):
result = runner.invoke(main, [file(1), file(1)])
expect(result.exit_code) == 0
expect(float(result.output)) == 0.0

expect(result.exit_code) == 0
expect(result.output) == ""
# Comparing video 1 with video 5: printed ratio of at least 0.5, and the ground truth marks it DISSIMILAR.
def when_dissimilar(runner, db):
result = runner.invoke(main, [file(1), file(5)])

def when_dissimilar(runner, db):
result = runner.invoke(main, [file(1), file(5), '--no-progressbar'])

expect(result.exit_code) == 0
expect(float(result.output)) >= 0.5
expect(db.iat[4, 1]) == VideoIsClassifiedAs.DISSIMILAR.value
expect(result.exit_code) == 0
expect(float(result.output)) >= 0.5
expect(db[5]) == VideoIsClassifiedAs.DISSIMILAR.value

+ 30
- 0
tests/test_precision.py View File

@@ -0,0 +1,30 @@
import pytest
from pathlib import Path
from expecter import expect
import pandas as pd

from pyvideohash import compare_files

@pytest.fixture
def db():
    """Return column 1 of the ground-truth table for query 1.

    ``tests/dataset/GT/GT_1.rst`` is parsed as tab-separated text without a
    header row, using column 0 as the index.
    """
    ground_truth_path = Path('tests', 'dataset', 'GT', 'GT_1.rst')
    table = pd.read_csv(ground_truth_path, sep='\t', index_col=0, header=None)
    return table[1]

def file(number):
    """Return, as a string, the path of video ``1_<number>_Y.flv`` under ``tests/dataset/videos``."""
    video_name = '1_{}_Y.flv'.format(number)
    videos_dir = Path('tests') / 'dataset' / 'videos'
    return str(videos_dir / video_name)


def describe_precision():
    """Check ``compare_files`` against the ground-truth label of each dataset video."""

    @pytest.mark.parametrize(
        "i",
        list(range(1, 50))
    )
    def when_video_test(i, db):
        """Compare video 1 with video *i* and require the ground-truth label to be set."""
        label = db[i]
        if label == '-1':
            # Entries labelled '-1' are not compared.
            # NOTE(review): presumably these videos are unavailable -- confirm.
            return

        # NOTE(review): entries 13 and 39 are compared against the next video
        # file; the reason is not visible here -- confirm against the dataset.
        j = i + 1 if i in (13, 39) else i

        # expecter asserts through overloaded comparison operators. `is` cannot
        # be overloaded, so `expect(...) is True` evaluated to a discarded
        # False and never failed; `==` performs the actual assertion.
        expect(compare_files(file(1), file(j))[label]) == True  # noqa: E712

Loading…
Cancel
Save