import os
import sys
import warnings
import argparse
import traceback
from dorie.parscraper import readers, writers, match_parameters
from dorie.parscraper.warnings import OutputWarning
def scrape(xml_file, source_folder, out, css=None, debug=False):
    """Match XML parameters against scraped source parameters and write them.

    The XML file is parsed with ``readers.xml.parse``, every ``.cc`` and
    ``.hh`` file below ``source_folder`` is parsed with
    ``readers.source.parse``, both results are combined by
    ``match_parameters.match`` and the outcome is handed to one writer per
    entry of ``out``. The writer is the attribute of ``writers`` whose name
    equals the file extension of the output path.

    :param xml_file: Path to the XML parameter file.
    :param source_folder: Directory that is searched recursively for source
        files. A trailing ``/`` is appended if it is missing.
    :param out: Iterable of output file paths.
    :param css: Forwarded to every writer as the ``css`` keyword argument.
    :param debug: If true, the traceback of a failed output is printed in
        addition to the warning.
    :raises IOError: If ``xml_file`` is not an existing file or
        ``source_folder`` is not an existing directory.

    A failure while writing an ``ini`` output is re-raised. Failures of any
    other output, and outputs with an unknown extension, only emit an
    ``OutputWarning`` and the remaining outputs are still processed.
    """
    # PARSE XML FILE
    if not os.path.isfile(xml_file):
        raise IOError("XML file {0} does not exist".format(xml_file))
    xml_parameters = readers.xml.parse(xml_file)

    # CHECK SOURCE FOLDER
    if not source_folder.endswith("/"):
        source_folder += "/"
    if not os.path.isdir(source_folder):
        raise IOError("Source folder {0} does not exist".format(source_folder))

    # ITERATE OVER ALL .cc AND .hh FILES IN SOURCE FOLDER, AND CALL SOURCE SCRAPER
    source_parameters = []
    for subdir, _dirs, files in os.walk(source_folder):
        for f in files:
            if f.endswith((".cc", ".hh")):
                full_path = os.path.join(subdir, f)
                # each parse call yields several parameters; collect them flat
                source_parameters.extend(readers.source.parse(full_path))

    # MATCH XML AND SCRAPED PARAMETERS
    matched_parameters = match_parameters.match(xml_parameters, source_parameters, source_folder)

    # CALL OUTPUT SCRIPTS
    for o in out:
        # Take the extension from the file name only, so that dots in
        # directory names and extension-less names do not select a writer.
        _stem, dot, file_suffix = os.path.basename(o).rpartition(".")
        if not dot:
            file_suffix = ""

        if not (file_suffix and hasattr(writers, file_suffix)):
            warnings.warn("Unknown output format: .{}. Skipping output".format(file_suffix), OutputWarning)
            continue

        writer = getattr(writers, file_suffix)
        try:
            writer.write(matched_parameters, o, source_folder, css=css)
        except Exception as e:
            # Exception instead of BaseException: KeyboardInterrupt and
            # SystemExit must propagate instead of becoming a warning.
            if file_suffix == "ini":
                raise
            warnings.warn("Output failed for file {0} with error:\n{1}".format(o, repr(e)), OutputWarning)
            if debug:
                traceback.print_exc()