sametz · January 16, 2020 14:59
diff --git a/audita_parse.py b/audita_parse.py
 """Walk a directory tree, find audita.txt metadata files for NMR experiments
 of different peptides, parse their content and their location in the directory,
 and record the data as a csv.

 *** parent_folder needs to be changed to whatever parent folder you want to
 search for NMR data.***

 The tree structure resembles:
    <parent directory>/<experiment>/<peptide>/<NMR experiment>/<audita.txt>
 and it is assumed that files are stored in this structure, in order to parse
 the location for <experiment>, <peptide>, and <NMR experiment>

 audita.txt includes a line describing the location of the folder on the NMR
 server, e.g.:
    $$ /opt/data/<user>/nmr/<experiment folder>/3/audita.txt

 It also includes a line for the date and time the NMR experiment was started,
 e.g.:
    '	started at 2019-04-06 08:01:22.652 -0400,'

 The script records a csv of:
 * Category (the experiment, e.g. 'phosphorylated_pH7')
 * Peptide (e.g. 'Ac_ApS_NH2')
 * Experiment (the NMR experiment, e.g. '1H' or 'TOCSY')
 * User (username used for NMR account)
 * Server Folder (name of file on server all the NMR experiments are saved in)
 * Date (YYYY-MM-DD)
 * Started At (hh:mm:ss, 24-h clock with fractional seconds)
 """

 import csv
 import os
 from pathlib import Path
 import re


 HOME = Path.home()
 # Default search root: the 'phosphorylated_pH7' experiment folder.
 parent_folder = HOME.joinpath(
     'Dropbox', 'shared_folders', 'peptide_data', 'data', 'raw', 'nmr',
     'phosphorylated_pH7',
 )
 # Stop right away if the folder to search is not there.
 assert parent_folder.exists()
 print('Searching ', parent_folder)

 # test_audita used while testing functions that parse audita.txt.
 # If you want to run tests on an audita.txt file, uncomment the code below
 # and provide a path to one of the audita.txt files
 # test_audita = parent_folder.joinpath('Ac_ApT_NH2', '1H', 'audita.txt')
 # assert test_audita.is_file()


 def record_metadata(folder, filename):
     """Save the metadata of every NMR experiment found under 'folder' to a
     csv file called 'filename'.

     A header row is written first, followed by one row for each record
     yielded by find_metadata(folder).
     """
     header = ['Category', 'Peptide', 'Experiment', 'User',
               'Server Folder', 'Date', 'Started At']
     # newline='' leaves the line endings to the csv module.
     with open(filename, 'w', newline='') as csv_file:
         out = csv.writer(csv_file)
         out.writerow(header)
         out.writerows(find_metadata(folder))


 def find_metadata(folder):
     """Walk 'folder' and yield one metadata record per audita.txt file found.

     Each record is the tuple:
         category, peptide, experiment, user, server folder, date, time
     The first three values come from parse_audita_location() and the last
     four from audita_metadata().
     """
     for current_dir, _subdirs, files in os.walk(folder):
         if 'audita.txt' not in files:
             continue
         audita_path = Path(current_dir, 'audita.txt')
         category, peptide, experiment = parse_audita_location(audita_path)
         # 'server_folder' is the folder name on the NMR server, not the
         # 'folder' being walked.
         user, server_folder, date, time = audita_metadata(audita_path)
         yield category, peptide, experiment, user, server_folder, date, time


 def parse_audita_location(audita_file):
     """Read the category, peptide and NMR experiment off the path of an
     audita.txt file.

     The file is expected to be stored as
         .../<category>/<peptide>/<NMR experiment>/audita.txt
     so the three values are the last three components of its parent folder.
     A message is printed when the experiment folder is not '1H' or 'TOCSY';
     the values are returned regardless.

     Args:
         audita_file: (pathlib.Path)
             The path to the audita.txt file

     Returns:
         category, peptide, experiment (str, str, str)
             category: e.g. 'phosphorylated_pH7'
             peptide: e.g. Ac_ApS_NH2
             experiment: name of the folder holding audita.txt,
                 normally '1H' or 'TOCSY'

     Raises:
         IndexError: if the parent folder has fewer than three components.
     """
     folders = audita_file.parent.parts
     experiment, peptide, category = folders[-1], folders[-2], folders[-3]
     known_experiments = ('1H', 'TOCSY')
     if experiment not in known_experiments:
         print(f'FILE STRUCTURE ERROR: audita.txt found in: {audita_file.parent}')
     return category, peptide, experiment


 def audita_metadata(audita):
     """
     Extract metadata for NMR experiment from the audita.txt file.

     The server location is looked for on the 5th line of the file only. The
     start date and time are taken from the first line in the file that
     matches the "started at" pattern. Any value that cannot be found is
     returned as None, and a FAILURE message is printed when the server
     location cannot be parsed.

     Args:
         audita: (pathlib.Path)
             The path to the audita.txt file

     Returns:
         user, folder, date, time (str, str, str, str)
         user: nmr account the experiment was run on
         folder: the name of the experiment's folder on the NMR server
         date: "started at" date
         time: "started at" time
     """
     regex_time = datetime_regex()
     regex_origin = origin_regex()
     user = folder = date = time = None
     with open(audita, 'r') as file:
         content = file.readlines()
     # An empty or truncated file has no 5th line. Treat that like any other
     # failed origin match instead of raising IndexError and aborting the
     # whole directory walk.
     origin_line = content[4] if len(content) > 4 else ''
     origin_data = regex_origin.search(origin_line)
     if origin_data:
         user = origin_data.group(1)
         folder = origin_data.group(2)
     else:
         print('FAILURE: origin regex on ', audita)
         print('content: ', origin_line)
     for line in content:
         r = regex_time.search(line)
         if r:
             # Only the first "started at" line is used.
             date = r.group(1)
             time = r.group(2)
             break
     return user, folder, date, time


 def datetime_regex():
     """Return a compiled regex for the 'started at' line of audita.txt.

     group(1) is the date (YYYY-MM-DD) and group(2) is the time
     (hh:mm:ss with fractional seconds).
     """
     pattern = r'\s*started at (\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}\.\d+)'
     return re.compile(pattern)


 def origin_regex():
     """Return a compiled regex for the server-location line of audita.txt.

     group(1) is the user and group(2) is the folder name, e.g. for
         $$ /opt/data/himal/nmr/nvc-03040F-ApT-68-lowT/3/audita.txt
     group(1) is 'himal' and group(2) is 'nvc-03040F-ApT-68-lowT'.
     """
     # The dot of the file name is escaped so that it only matches a literal
     # '.' rather than any character.
     return re.compile(r'\$\$ /\S+/data/(\S+)/nmr/(\S+)/\d+/audita\.txt')


 # Code below was used to explore tree-walking and file/path parsing.
 # Not necessary for script to run.
 def test_walk(folder):
     """Used to explore how os.walk works.

     Prints every folder visited, followed by each of its subfolders and each
     of its files (with a blank line after every file).
     """
     for current, subdirs, files in os.walk(folder):
         print('The current folder is ' + current)
         subfolder_label = 'SUBFOLDER OF ' + current + ': '
         file_label = 'FILE INSIDE ' + current + ': '
         for name in subdirs:
             print(subfolder_label + name)
         for name in files:
             print(file_label + name)
             print('')


 # strategy: extract line 5 of audita.txt, split, extract user/nmr/filename
 def file_origin(audita):
     """Used to explore parsing of audita file.

     Prints the 5th line of the file (the origin line), the path that follows
     its '$$ ' prefix, the parts of that path, and every "started at" match
     found in the file.

     Returns:
         The 4th and 6th parts of the origin path. For a POSIX origin such as
         /opt/data/<user>/nmr/<folder>/3/audita.txt these are the user and
         the folder.
     """
     started_at = datetime_regex()
     with open(audita, 'r') as file:
         content = file.readlines()
     # Everything below works on the lines already read into memory.
     origin_line = content[4]
     assert origin_line.startswith('$$ ')
     print(f'Origin line: {origin_line}')
     origin_path = Path(origin_line[3:])
     print(f'Origin path: {origin_path}')
     origin_parts = origin_path.parts
     print(origin_parts)
     for match in map(started_at.search, content):
         if match:
             print('regex: ', match.group())
     return origin_parts[3], origin_parts[5]


 def test_regex():
     """test that the regexes are working.

     Runs each regex on a sample line and prints the full match followed by
     its two groups.
     """
     testdate = '\tstarted at 2019-04-06 08:01:22.652 -0400,'
     result_date = datetime_regex().search(testdate)
     date_match, date_part, time_part = result_date.group(0, 1, 2)
     print('date regex: ', date_match, ' | ', date_part, time_part)

     testorigin = '$$ /opt/data/himal/nmr/nvc-03040F-ApT-68-lowT/3/audita.txt'
     result_origin = origin_regex().search(testorigin)
     origin_match, user_part, folder_part = result_origin.group(0, 1, 2)
     print('origin regex: ', origin_match, ' | ', user_part, folder_part)


 # Runs whenever this module is executed or imported.
 # parent_folder only covers the 'phosphorylated_pH7' experiment. Its parent
 # directory held multiple experiment folders, so parent_folder.parent is
 # searched for a complete record. The csv path is relative, so the file is
 # written to the current working directory.
 record_metadata(parent_folder.parent, 'nmr_metadata.csv')
	"""Walk a directory tree, find audita.txt metadata files for NMR experiments
	of different peptides, parse their content and their location in the directory,
	and record the data as a csv.

	*** parent_folder needs to be changed to whatever parent folder you want to
	search for NMR data.***

	The tree structure resembles:
	<parent directory>/<experiment>/<peptide>/<NMR experiment>/<audita.txt>
	and it is assumed that files are stored in this structure, in order to parse
	the location for <experiment>, <peptide>, and <NMR experiment>

	audita.txt includes a line describing the location of the folder on the NMR
	server, e.g.:
	$$ /opt/data/<user>/nmr/<experiment folder>/3/audita.txt

	It also includes a line for the date and time the NMR experiment was started,
	e.g.:
	' started at 2019-04-06 08:01:22.652 -0400,'

	The script records a csv of:
	* Category (the experiment, e.g. 'phosphorylated_pH7')
	* Peptide (e.g. 'Ac_ApS_NH2')
	* Experiment (the NMR experiment, e.g. '1H' or 'TOCSY')
	* User (username used for NMR account)
	* Server Folder (name of file on server all the NMR experiments are saved in)
	* Date (YYYY-MM-DD)
	* Started At (hh:mm:ss, 24-h clock with fractional seconds)
	"""

	import csv
	import os
	from pathlib import Path
	import re


	HOME = Path.home()
	# Default search root: the 'phosphorylated_pH7' experiment folder.
	parent_folder = HOME.joinpath(
	    'Dropbox', 'shared_folders', 'peptide_data', 'data', 'raw', 'nmr',
	    'phosphorylated_pH7',
	)
	# Stop right away if the folder to search is not there.
	assert parent_folder.exists()
	print('Searching ', parent_folder)

	# test_audita used while testing functions that parse audita.txt.
	# If you want to run tests on an audita.txt file, uncomment the code below
	# and provide a path to one of the audita.txt files
	# test_audita = parent_folder.joinpath('Ac_ApT_NH2', '1H', 'audita.txt')
	# assert test_audita.is_file()


	def record_metadata(folder, filename):
	    """Save the metadata of every NMR experiment found under 'folder' to a
	    csv file called 'filename'.

	    A header row is written first, followed by one row for each record
	    yielded by find_metadata(folder).
	    """
	    header = ['Category', 'Peptide', 'Experiment', 'User',
	              'Server Folder', 'Date', 'Started At']
	    # newline='' leaves the line endings to the csv module.
	    with open(filename, 'w', newline='') as csv_file:
	        out = csv.writer(csv_file)
	        out.writerow(header)
	        out.writerows(find_metadata(folder))


	def find_metadata(folder):
	    """Walk 'folder' and yield one metadata record per audita.txt file found.

	    Each record is the tuple:
	        category, peptide, experiment, user, server folder, date, time
	    The first three values come from parse_audita_location() and the last
	    four from audita_metadata().
	    """
	    for current_dir, _subdirs, files in os.walk(folder):
	        if 'audita.txt' not in files:
	            continue
	        audita_path = Path(current_dir, 'audita.txt')
	        category, peptide, experiment = parse_audita_location(audita_path)
	        # 'server_folder' is the folder name on the NMR server, not the
	        # 'folder' being walked.
	        user, server_folder, date, time = audita_metadata(audita_path)
	        yield category, peptide, experiment, user, server_folder, date, time


	def parse_audita_location(audita_file):
	    """Read the category, peptide and NMR experiment off the path of an
	    audita.txt file.

	    The file is expected to be stored as
	        .../<category>/<peptide>/<NMR experiment>/audita.txt
	    so the three values are the last three components of its parent folder.
	    A message is printed when the experiment folder is not '1H' or 'TOCSY';
	    the values are returned regardless.

	    Args:
	        audita_file: (pathlib.Path)
	            The path to the audita.txt file

	    Returns:
	        category, peptide, experiment (str, str, str)
	            category: e.g. 'phosphorylated_pH7'
	            peptide: e.g. Ac_ApS_NH2
	            experiment: name of the folder holding audita.txt,
	                normally '1H' or 'TOCSY'

	    Raises:
	        IndexError: if the parent folder has fewer than three components.
	    """
	    folders = audita_file.parent.parts
	    experiment, peptide, category = folders[-1], folders[-2], folders[-3]
	    known_experiments = ('1H', 'TOCSY')
	    if experiment not in known_experiments:
	        print(f'FILE STRUCTURE ERROR: audita.txt found in: {audita_file.parent}')
	    return category, peptide, experiment


	def audita_metadata(audita):
	    """
	    Extract metadata for NMR experiment from the audita.txt file.

	    The server location is looked for on the 5th line of the file only. The
	    start date and time are taken from the first line in the file that
	    matches the "started at" pattern. Any value that cannot be found is
	    returned as None, and a FAILURE message is printed when the server
	    location cannot be parsed.

	    Args:
	        audita: (pathlib.Path)
	            The path to the audita.txt file

	    Returns:
	        user, folder, date, time (str, str, str, str)
	        user: nmr account the experiment was run on
	        folder: the name of the experiment's folder on the NMR server
	        date: "started at" date
	        time: "started at" time
	    """
	    regex_time = datetime_regex()
	    regex_origin = origin_regex()
	    user = folder = date = time = None
	    with open(audita, 'r') as file:
	        content = file.readlines()
	    # An empty or truncated file has no 5th line. Treat that like any other
	    # failed origin match instead of raising IndexError and aborting the
	    # whole directory walk.
	    origin_line = content[4] if len(content) > 4 else ''
	    origin_data = regex_origin.search(origin_line)
	    if origin_data:
	        user = origin_data.group(1)
	        folder = origin_data.group(2)
	    else:
	        print('FAILURE: origin regex on ', audita)
	        print('content: ', origin_line)
	    for line in content:
	        r = regex_time.search(line)
	        if r:
	            # Only the first "started at" line is used.
	            date = r.group(1)
	            time = r.group(2)
	            break
	    return user, folder, date, time


	def datetime_regex():
	    """Return a compiled regex for the 'started at' line of audita.txt.

	    group(1) is the date (YYYY-MM-DD) and group(2) is the time
	    (hh:mm:ss with fractional seconds).
	    """
	    pattern = r'\s*started at (\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}\.\d+)'
	    return re.compile(pattern)


	def origin_regex():
	    """Return a compiled regex for the server-location line of audita.txt.

	    group(1) is the user and group(2) is the folder name, e.g. for
	        $$ /opt/data/himal/nmr/nvc-03040F-ApT-68-lowT/3/audita.txt
	    group(1) is 'himal' and group(2) is 'nvc-03040F-ApT-68-lowT'.
	    """
	    # The dot of the file name is escaped so that it only matches a literal
	    # '.' rather than any character.
	    return re.compile(r'\$\$ /\S+/data/(\S+)/nmr/(\S+)/\d+/audita\.txt')


	# Code below was used to explore tree-walking and file/path parsing.
	# Not necessary for script to run.
	def test_walk(folder):
	    """Used to explore how os.walk works.

	    Prints every folder visited, followed by each of its subfolders and each
	    of its files (with a blank line after every file).
	    """
	    for current, subdirs, files in os.walk(folder):
	        print('The current folder is ' + current)
	        subfolder_label = 'SUBFOLDER OF ' + current + ': '
	        file_label = 'FILE INSIDE ' + current + ': '
	        for name in subdirs:
	            print(subfolder_label + name)
	        for name in files:
	            print(file_label + name)
	            print('')


	# strategy: extract line 5 of audita.txt, split, extract user/nmr/filename
	def file_origin(audita):
	    """Used to explore parsing of audita file.

	    Prints the 5th line of the file (the origin line), the path that follows
	    its '$$ ' prefix, the parts of that path, and every "started at" match
	    found in the file.

	    Returns:
	        The 4th and 6th parts of the origin path. For a POSIX origin such as
	        /opt/data/<user>/nmr/<folder>/3/audita.txt these are the user and
	        the folder.
	    """
	    started_at = datetime_regex()
	    with open(audita, 'r') as file:
	        content = file.readlines()
	    # Everything below works on the lines already read into memory.
	    origin_line = content[4]
	    assert origin_line.startswith('$$ ')
	    print(f'Origin line: {origin_line}')
	    origin_path = Path(origin_line[3:])
	    print(f'Origin path: {origin_path}')
	    origin_parts = origin_path.parts
	    print(origin_parts)
	    for match in map(started_at.search, content):
	        if match:
	            print('regex: ', match.group())
	    return origin_parts[3], origin_parts[5]


	def test_regex():
	    """test that the regexes are working.

	    Runs each regex on a sample line and prints the full match followed by
	    its two groups.
	    """
	    testdate = ' started at 2019-04-06 08:01:22.652 -0400,'
	    result_date = datetime_regex().search(testdate)
	    date_match, date_part, time_part = result_date.group(0, 1, 2)
	    # The separator is a plain ' | '; a backslash before the pipe would be
	    # an invalid escape sequence in a normal string literal.
	    print('date regex: ', date_match, ' | ', date_part, time_part)

	    testorigin = '$$ /opt/data/himal/nmr/nvc-03040F-ApT-68-lowT/3/audita.txt'
	    result_origin = origin_regex().search(testorigin)
	    origin_match, user_part, folder_part = result_origin.group(0, 1, 2)
	    print('origin regex: ', origin_match, ' | ', user_part, folder_part)


	# Runs whenever this module is executed or imported.
	# parent_folder only covers the 'phosphorylated_pH7' experiment. Its parent
	# directory held multiple experiment folders, so parent_folder.parent is
	# searched for a complete record. The csv path is relative, so the file is
	# written to the current working directory.
	record_metadata(parent_folder.parent, 'nmr_metadata.csv')