Use CutyCapt inside ArchiveBox to capture a screenshot of the complete website.
Install the CutyCapt utility and the X virtual framebuffer (Xvfb).
$ sudo apt install cutycapt xvfb
Define the configuration variables [archivebox/config.py file].
diff --git a/archivebox/config.py b/archivebox/config.py index 47f1776..3c41340 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -49,6 +49,13 @@ CHROME_BINARY = os.getenv('CHROME_BINARY', None) URL_BLACKLIST = os.getenv('URL_BLACKLIST', None) + +CUTYCAPT_BINARY = os.getenv('CUTYCAPT_BINARY', 'cutycapt') +CUTYCAPT_USER_AGENT = os.getenv('CUTYCAPT_USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36') +CUTYCAPT_MIN_WIDTH = os.getenv('CUTYCAPT_MIN_WIDTH', '1280') +CUTYCAPT_SMOOTH = os.getenv('CUTYCAPT_SMOOTH', 'True').lower() == 'true' +CUTYCAPT_DELAY = os.getenv('CUTYCAPT_DELAY', '2000') + try: OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR')) except Exception: @@ -159,6 +166,15 @@ try: 'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR, } + CUTYCAPT_OPTIONS = { + 'TIMEOUT': TIMEOUT, + 'CHECK_SSL_VALIDITY': CHECK_SSL_VALIDITY, + 'CUTYCAPT_BINARY': CUTYCAPT_BINARY, + 'CUTYCAPT_USER_AGENT': CUTYCAPT_USER_AGENT, + 'CUTYCAPT_MIN_WIDTH': CUTYCAPT_MIN_WIDTH, + 'CUTYCAPT_SMOOTH': CUTYCAPT_SMOOTH, + 'CUTYCAPT_DELAY': CUTYCAPT_DELAY, + } ### Check Python environment python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
Define a helper function to build the cutycapt shell command [archivebox/util.py file].
diff --git a/archivebox/util.py b/archivebox/util.py index cec2303..763365c 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -30,6 +30,7 @@ from config import ( CHECK_SSL_VALIDITY, WGET_USER_AGENT, CHROME_OPTIONS, + CUTYCAPT_OPTIONS, ) from logs import pretty_path @@ -568,3 +569,36 @@ def chrome_args(**options): cmd_args.append('--user-data-dir={}'.format(options['CHROME_USER_DATA_DIR'])) return cmd_args + +def cutycapt_args(**options): + """helper to build up a cutycapt shell command with arguments""" + + options = {**CUTYCAPT_OPTIONS, **options} + + cmd_args = ['xvfb-run', options['CUTYCAPT_BINARY']] + + if not options['CHECK_SSL_VALIDITY']: + cmd_args += ('--insecure',) + + if options['CUTYCAPT_USER_AGENT']: + cmd_args += ('--user-agent={}'.format(options['CUTYCAPT_USER_AGENT']),) + + if options['CUTYCAPT_MIN_WIDTH']: + cmd_args += ('--min-width={}'.format(options['CUTYCAPT_MIN_WIDTH']),) + + if options['TIMEOUT']: + cmd_args += ('--max-wait={}'.format((options['TIMEOUT']) * 1000),) + + if options['CUTYCAPT_SMOOTH']: + cmd_args += ('--smooth',) + + if options['CUTYCAPT_DELAY']: + cmd_args.append('--delay={}'.format(options['CUTYCAPT_DELAY']),) + + if options['URL']: + cmd_args += ('--url={}'.format(options['URL']),) + + if options['OUT']: + cmd_args += ('--out={}'.format(options['OUT']),) + + return cmd_args
Modify the fetch_screenshot function to use cutycapt [archivebox/archive_methods.py file].
diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index b2f04f3..299b9e6 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -32,9 +32,13 @@ from config import ( GIT_SHA, WGET_USER_AGENT, CHECK_SSL_VALIDITY, COOKIES_FILE, - WGET_AUTO_COMPRESSION + WGET_AUTO_COMPRESSION, + CUTYCAPT_USER_AGENT, + CUTYCAPT_MIN_WIDTH, + CUTYCAPT_SMOOTH, + CUTYCAPT_DELAY ) from util import ( domain, extension, @@ -45,8 +49,9 @@ from util import ( TimedProgress, chmod_file, wget_output_path, chrome_args, + cutycapt_args, check_link_structure, run, PIPE, DEVNULL ) from logs import ( @@ -338,13 +343,9 @@ def should_fetch_screenshot(link_dir, link): def fetch_screenshot(link_dir, link, timeout=TIMEOUT): """take screenshot of site using chrome --headless""" output = 'screenshot.png' - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--screenshot', - link['url'], - ] + cmd = cutycapt_args(TIMEOUT=50000,URL=link['url'],OUT=output) status = 'succeeded' timer = TimedProgress(timeout, prefix=' ') try: result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
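This will generate the following command. Note that cutycapt_args multiplies TIMEOUT by 1000 (CutyCapt's --max-wait is given in milliseconds), so the hard-coded TIMEOUT=50000 in fetch_screenshot yields --max-wait=50000000, roughly 13.9 hours; pass the timeout in seconds instead (for example TIMEOUT=timeout) to get a sensible limit.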
xvfb-run cutycapt "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36" --min-width=1280 --max-wait=50000000 --smooth --delay=2000 --url=https://digiday.com/media/wtf-link-rot/ --out=screenshot.png
I love this!