ldapsaisie/doc/clean-all-in-one-html-file.py

41 lines
1.1 KiB
Python
Raw Permalink Normal View History

2023-10-08 23:19:07 +02:00
import sys
import os.path
from bs4 import BeautifulSoup
# Check & handle arguments: the input path is mandatory; the output path is
# optional and defaults to the input path (the file is rewritten in place).
if len(sys.argv) < 2:
    # Diagnostics go to stderr so they never mix with regular output.
    print(
        f'Usage: {sys.argv[0]} /path/to/input.html [/path/to/output.html]',
        file=sys.stderr,
    )
    sys.exit(1)

input_path = sys.argv[1]
# isfile() rather than exists(): a directory would pass exists() and then
# make open() fail below with an uncaught exception.
if not os.path.isfile(input_path):
    print(f'{input_path} not found or not a regular file', file=sys.stderr)
    sys.exit(1)

output_path = sys.argv[2] if len(sys.argv) > 2 else input_path

# Open & parse HTML input file.
# The encoding is explicit so the result does not depend on the locale of the
# machine running the script.
# NOTE(review): assumes the input document is UTF-8 encoded - confirm.
with open(input_path, 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# Delete some useless elements.
# Each entry is (positional args, keyword filters) forwarded to
# soup.find_all(); every matching element is removed from the tree.
to_delete = [
    (['div'], {'class': 'md-sidebar'}),
    (['div'], {'class': 'md-search'}),
    (['label'], {'for': '__search'}),
    (['div'], {'id': 'print-site-banner'}),
    (['div'], {'class': 'md-header__source'}),
]
for args, kwargs in to_delete:
    for el in soup.find_all(*args, **kwargs):
        el.decompose()

# Change LdapSaisie logo header link to JS scroll top action.
# find() returns None when nothing matches, so guard the assignment instead
# of crashing with a TypeError on a document without the logo link.
logo_link = soup.find('a', attrs={'class': 'md-logo'})
if logo_link is None:
    print(
        f'Warning: no <a class="md-logo"> element found in {input_path}, '
        'logo link left unchanged',
        file=sys.stderr,
    )
else:
    logo_link['href'] = 'javascript:window.scrollTo(0,0)'

# Store resulting HTML document in output file (same explicit encoding as
# the one used for reading).
with open(output_path, 'w', encoding='utf-8') as fp:
    fp.write(str(soup))