eliasdabbas · November 2, 2023 13:33 · eliasdabbas · Nov 2, 2023
diff --git a/running_crawls.py b/running_crawls.py
 from functools import partial
 from subprocess import run

 import pandas as pd

 # Rebind ``run`` so every call below captures stdout/stderr and returns them as text.
 run = partial(run, capture_output=True, text=True)

 def _count_lines(path):
     """Return the number of newline characters in the file at ``path``.

     Returns ``None`` when ``path`` is not a string (no output file was found
     in the command) or when the file cannot be opened or read.
     """
     if not isinstance(path, str):
         return None
     try:
         with open(path, 'rb') as file:
             # Read in chunks so large crawl files are never fully loaded into memory.
             return sum(chunk.count(b'\n') for chunk in iter(lambda: file.read(1 << 20), b''))
     except OSError:
         return None


 def running_crawls():
     """Get details of currently running spiders.

     Runs ``ps xo pid,start,etime,%mem,%cpu,args``, keeps the processes whose
     command contains ``scrapy runspider``, and returns a DataFrame with one
     row per such process and the following columns:

     * pid: Process ID. Use this to identify (or stop) the spider that you want.
     * started: The time when this spider has started.
     * elapsed: The elapsed time since the spider started.
     * %mem: The percentage of memory that this spider is consuming.
     * %cpu: The percentage of CPU that this spider is consuming.
     * args: The full command that was used to start this spider. Use this to identify
       the spider(s) that you want to know about.
     * output_file: The path to the output file for each running crawl job, taken
       from the text between ``-o `` and ``.jl`` in ``args``; missing when the
       command has no such option.
     * crawled_urls: The current number of lines in ``output_file``; ``<NA>`` when
       the file is missing or cannot be read.

     Returns an empty DataFrame (no columns) when no spider is running.
     """
     ps = run(['ps', 'xo', 'pid,start,etime,%mem,%cpu,args'])
     ps_lines = ps.stdout.splitlines()
     # Name the columns ourselves instead of trusting the header row: the header
     # text differs between ps implementations (e.g. ``args`` may be labelled
     # COMMAND), which would otherwise make the 'ARGS' lookups below fail.
     columns = ['PID', 'STARTED', 'ELAPSED', '%MEM', '%CPU', 'ARGS']
     # NOTE(review): some ps implementations print ``start`` with an embedded space
     # for processes older than 24 hours (e.g. "Nov 02"), which would shift the
     # fields of that row -- confirm on the target platform.
     split_lines = (line.split(maxsplit=len(columns) - 1) for line in ps_lines[1:])
     rows = [fields for fields in split_lines if len(fields) == len(columns)]
     if not rows:
         return pd.DataFrame()
     df = pd.DataFrame(rows, columns=columns)
     is_spider = df['ARGS'].str.contains('scrapy runspider', regex=False, na=False)
     df_subset = df[is_spider].reset_index(drop=True)
     if df_subset.empty:
         return pd.DataFrame()
     df_subset['output_file'] = df_subset['ARGS'].str.extract(r'-o (.*?\.jl)', expand=False)
     # Count each spider's own file so the counts stay aligned with the rows even
     # when a file is missing, a path contains spaces, or no ``-o`` was given.
     # Relative paths are resolved against this process's working directory.
     line_counts = [_count_lines(path) for path in df_subset['output_file']]
     df_subset['crawled_urls'] = pd.array(line_counts, dtype='Int64')
     df_subset.columns = df_subset.columns.str.lower()
     return df_subset
	from subprocess import run
	from functools import partial

	# Rebind ``run`` so every call below captures stdout/stderr and returns them as text.
	run = partial(run, capture_output=True, text=True)

	def _count_lines(path):
	    """Return the number of newline characters in the file at ``path``.

	    Returns ``None`` when ``path`` is not a string (no output file was found
	    in the command) or when the file cannot be opened or read.
	    """
	    if not isinstance(path, str):
	        return None
	    try:
	        with open(path, 'rb') as file:
	            # Read in chunks so large crawl files are never fully loaded into memory.
	            return sum(chunk.count(b'\n') for chunk in iter(lambda: file.read(1 << 20), b''))
	    except OSError:
	        return None


	def running_crawls():
	    """Get details of currently running spiders.

	    Runs ``ps xo pid,start,etime,%mem,%cpu,args``, keeps the processes whose
	    command contains ``scrapy runspider``, and returns a DataFrame with one
	    row per such process and the following columns:

	    * pid: Process ID. Use this to identify (or stop) the spider that you want.
	    * started: The time when this spider has started.
	    * elapsed: The elapsed time since the spider started.
	    * %mem: The percentage of memory that this spider is consuming.
	    * %cpu: The percentage of CPU that this spider is consuming.
	    * args: The full command that was used to start this spider. Use this to identify
	      the spider(s) that you want to know about.
	    * output_file: The path to the output file for each running crawl job, taken
	      from the text between ``-o `` and ``.jl`` in ``args``; missing when the
	      command has no such option.
	    * crawled_urls: The current number of lines in ``output_file``; ``<NA>`` when
	      the file is missing or cannot be read.

	    Returns an empty DataFrame (no columns) when no spider is running.
	    """
	    ps = run(['ps', 'xo', 'pid,start,etime,%mem,%cpu,args'])
	    ps_lines = ps.stdout.splitlines()
	    # Name the columns ourselves instead of trusting the header row: the header
	    # text differs between ps implementations (e.g. ``args`` may be labelled
	    # COMMAND), which would otherwise make the 'ARGS' lookups below fail.
	    columns = ['PID', 'STARTED', 'ELAPSED', '%MEM', '%CPU', 'ARGS']
	    # NOTE(review): some ps implementations print ``start`` with an embedded space
	    # for processes older than 24 hours (e.g. "Nov 02"), which would shift the
	    # fields of that row -- confirm on the target platform.
	    split_lines = (line.split(maxsplit=len(columns) - 1) for line in ps_lines[1:])
	    rows = [fields for fields in split_lines if len(fields) == len(columns)]
	    if not rows:
	        return pd.DataFrame()
	    df = pd.DataFrame(rows, columns=columns)
	    is_spider = df['ARGS'].str.contains('scrapy runspider', regex=False, na=False)
	    df_subset = df[is_spider].reset_index(drop=True)
	    if df_subset.empty:
	        return pd.DataFrame()
	    df_subset['output_file'] = df_subset['ARGS'].str.extract(r'-o (.*?\.jl)', expand=False)
	    # Count each spider's own file so the counts stay aligned with the rows even
	    # when a file is missing, a path contains spaces, or no ``-o`` was given.
	    # Relative paths are resolved against this process's working directory.
	    line_counts = [_count_lines(path) for path in df_subset['output_file']]
	    df_subset['crawled_urls'] = pd.array(line_counts, dtype='Int64')
	    df_subset.columns = df_subset.columns.str.lower()
	    return df_subset