Source code for dorie.parscraper.wrapper.scrape_folder

import os
import sys
import warnings
import argparse
import traceback

from dorie.parscraper import readers, writers, match_parameters
from dorie.parscraper.warnings import OutputWarning

def scrape(xml_file, source_folder, out, css=None, debug=False):
    """Match XML parameters against scraped source parameters and write them.

    The XML file is parsed with ``readers.xml.parse``. Every ``.cc`` and
    ``.hh`` file found below ``source_folder`` is parsed with
    ``readers.source.parse`` and the results are concatenated into one list.
    Both sets are passed to ``match_parameters.match`` and the result is
    handed to one writer per entry of ``out``.

    The writer is looked up as the attribute of ``writers`` whose name equals
    the text after the last ``.`` of the output path.

    :param xml_file: path of the XML parameter file.
    :param source_folder: root folder that is searched recursively for
        ``.cc`` / ``.hh`` files. A trailing ``/`` is appended if missing.
    :param out: iterable of output file paths.
    :param css: forwarded unchanged to every writer as keyword ``css``.
    :param debug: if true, print the traceback of a failed (non-ini) writer
        in addition to the warning.
    :raises IOError: if ``xml_file`` is not a file or ``source_folder`` is
        not a directory.

    A failure while writing an ``ini`` output is re-raised. A failure of any
    other writer, or an unknown suffix, only emits an ``OutputWarning`` and
    processing continues with the next output.
    """
    # PARSE XML FILE
    if not os.path.isfile(xml_file):
        raise IOError("XML file {0} does not exist".format(xml_file))
    xml_parameters = readers.xml.parse(xml_file)

    # CHECK SOURCE FOLDER
    # The normalised folder (with trailing slash) is what the matcher and the
    # writers receive further down, so normalise before validating.
    if not source_folder.endswith("/"):
        source_folder += "/"
    if not os.path.isdir(source_folder):
        raise IOError("Source folder {0} does not exist".format(source_folder))

    # ITERATE OVER ALL .cc AND .hh FILES IN SOURCE FOLDER, AND CALL SOURCE SCRAPER
    source_parameters = []
    for subdir, _dirs, files in os.walk(source_folder):
        for file_name in files:
            if file_name.endswith((".cc", ".hh")):
                full_path = os.path.join(subdir, file_name)
                # each parse call yields several parameters; keep one flat list
                source_parameters.extend(readers.source.parse(full_path))

    # MATCH XML AND SCRAPED PARAMETERS
    matched_parameters = match_parameters.match(
        xml_parameters, source_parameters, source_folder)

    # CALL OUTPUT SCRIPTS
    for o in out:
        file_suffix = o.split(".")[-1]
        if not hasattr(writers, file_suffix):
            warnings.warn(
                "Unknown output format: .{}. Skipping output".format(file_suffix),
                OutputWarning)
            continue

        writer = getattr(writers, file_suffix)
        try:
            writer.write(matched_parameters, o, source_folder, css=css)
        except Exception as e:
            # Only ordinary errors are downgraded to a warning.
            # KeyboardInterrupt / SystemExit derive from BaseException and
            # must propagate so the user can still abort the run.
            if file_suffix == "ini":
                raise
            warnings.warn(
                "Output failed for file {0} with error:\n{1}".format(o, repr(e)),
                OutputWarning)
            if debug:
                traceback.print_exc()