Created
January 16, 2020 14:59
-
-
Save sametz/268c652e3e7d0a9b82adadb1c9f8c859 to your computer and use it in GitHub Desktop.
walks a directory looking for audita.txt NMR metadata and saves info as a csv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Walk a directory tree, find audita.txt metadata files for NMR experiments | |
of different peptides, parse their content and their location in the directory, | |
and record the data as a csv. | |
*** parent_folder needs to be changed to whatever parent folder you want to | |
search for NMR data.*** | |
The tree structure resembles: | |
<parent directory>/<experiment>/<peptide>/<NMR experiment>/<audita.txt> | |
and it is assumed that files are stored in this structure, in order to parse | |
the location for <experiment>, <peptide>, and <NMR experiment> | |
audita.txt includes a line describing the location of the folder on the NMR | |
server, e.g.: | |
$$ /opt/data/<user>/nmr/<experiment folder>/3/audita.txt | |
It also includes a line for the date and time the NMR experiment was started, | |
e.g.: | |
' started at 2019-04-06 08:01:22.652 -0400,' | |
The script records a csv of: | |
* Category (the experiment, e.g. 'phosphorylated_pH7')
* Peptide (e.g. 'Ac_ApS_NH2') | |
* Experiment (the NMR experiment, e.g. '1H' or 'TOCSY') | |
* User (username used for NMR account) | |
* Server Folder (name of the folder on the server that all the NMR experiments are saved in)
* Date (YYYY-MM-DD) | |
* Started At (hh:mm:ss.fff, 24-h clock with fractional seconds)
""" | |
import csv | |
import os | |
from pathlib import Path | |
import re | |
# The current user's home directory.
HOME = Path.home()

# *** Edit this path so it points at the folder you want to search. ***
parent_folder = (HOME / 'Dropbox' / 'shared_folders' / 'peptide_data'
                 / 'data' / 'raw' / 'nmr' / 'phosphorylated_pH7')

# Stop immediately if the configured folder is not present on this machine.
assert parent_folder.exists()
print('Searching ', parent_folder)

# test_audita was used while developing the functions that parse audita.txt.
# To try them on a single audita.txt file, uncomment the two lines below and
# point the path at one of your own audita.txt files.
# test_audita = parent_folder.joinpath('Ac_ApT_NH2', '1H', 'audita.txt')
# assert test_audita.is_file()
def record_metadata(folder, filename):
    """Save the metadata of every NMR experiment found under a folder as a csv.

    Args:
        folder: directory to search; handed unchanged to find_metadata.
        filename: path of the csv file to write. It is opened in 'w' mode,
            so an existing file of that name is overwritten.
    """
    header = ['Category', 'Peptide', 'Experiment', 'User',
              'Server Folder', 'Date', 'Started At']
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        # One row per record yielded by find_metadata, in the order found.
        out.writerows(find_metadata(folder))
def find_metadata(folder):
    """Yield a metadata record for each audita.txt file found beneath a folder.

    Args:
        folder: the directory at which the os.walk search starts.

    Yields:
        A 7-tuple (category, peptide, experiment, user, server folder, date,
        time). The first three values come from parse_audita_location and
        the last four from audita_metadata.
    """
    for current_dir, _subdirs, names in os.walk(folder):
        if 'audita.txt' not in names:
            continue
        audita_path = Path(current_dir, 'audita.txt')
        category, peptide, experiment = parse_audita_location(audita_path)
        user, server_folder, date, time = audita_metadata(audita_path)
        yield category, peptide, experiment, user, server_folder, date, time
def parse_audita_location(audita_file):
    """Read category, peptide and NMR experiment from an audita.txt location.

    The three directories directly above the file are interpreted as
    <category>/<peptide>/<experiment>/audita.txt.

    Args:
        audita_file: (pathlib.Path)
            The path to the audita.txt file

    Returns:
        category, peptide, experiment (str, str, str)
        category: name of the folder three levels above the file,
            e.g. 'phosphorylated_pH7'
        peptide: name of the folder two levels above, e.g. Ac_ApS_NH2
        experiment: name of the folder holding the file; '1H' or 'TOCSY'
            is expected

    An unexpected experiment name is reported with a printed message, but
    the values are still returned.
    """
    containing_dir = audita_file.parent
    folders = containing_dir.parts
    experiment = folders[-1]
    peptide = folders[-2]
    category = folders[-3]
    if experiment not in ('1H', 'TOCSY'):
        print(f'FILE STRUCTURE ERROR: audita.txt found in: {containing_dir}')
    return category, peptide, experiment
def audita_metadata(audita):
    """Extract metadata for an NMR experiment from its audita.txt file.

    The server path is looked for on the fifth line of the file only. The
    start date and time are taken from the first line anywhere in the file
    that matches the "started at" pattern.

    Args:
        audita: (pathlib.Path)
            The path to the audita.txt file

    Returns:
        user, folder, date, time (str, str, str, str)
        user: nmr account the experiment was run on
        folder: the name of the experiment's folder on the NMR server
        date: "started at" date
        time: "started at" time
        Any value that cannot be found is returned as None. A failure to
        find the user/folder is also reported with a printed message.
    """
    regex_time = datetime_regex()
    regex_origin = origin_regex()
    user = folder = date = time = None

    with open(audita, 'r') as file:
        content = file.readlines()

    # An empty or truncated file has no fifth line. Treat that like any other
    # unparseable origin line instead of raising IndexError, so that a single
    # bad file does not abort the whole directory walk.
    origin_line = content[4] if len(content) > 4 else ''
    origin_data = regex_origin.search(origin_line)
    if origin_data:
        user = origin_data.group(1)
        folder = origin_data.group(2)
    else:
        print('FAILURE: origin regex on ', audita)
        print('content: ', origin_line)

    # Only the first "started at" line is used.
    for line in content:
        started = regex_time.search(line)
        if started:
            date = started.group(1)
            time = started.group(2)
            break
    return user, folder, date, time
def datetime_regex():
    """Return a compiled regex for the 'started at' line.

    On a match, group(1) is the date (YYYY-MM-DD) and group(2) is the time
    (hh:mm:ss followed by fractional seconds).
    """
    pattern = r'\s*started at (\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}\.\d+)'
    return re.compile(pattern)
def origin_regex():
    """Return a compiled regex for the server-path line of audita.txt.

    On a match, group(1) is the user and group(2) is the server folder name.
    Example of a matching line:
        $$ /opt/data/himal/nmr/nvc-03040F-ApT-68-lowT/3/audita.txt
    """
    # The dot in the filename is escaped so that it matches only a literal
    # '.', not any character.
    return re.compile(r'\$\$ /\S+/data/(\S+)/nmr/(\S+)/\d+/audita\.txt')
# Code below was used to explore tree-walking and file/path parsing. | |
# Not necessary for script to run. | |
def test_walk(folder):
    """Print what os.walk visits under a folder (exploration helper).

    For each directory visited, prints its name, then its subfolders, then
    its files, followed by an empty line.
    """
    for current, child_dirs, child_files in os.walk(folder):
        print('The current folder is ' + current)
        # Subfolders are listed before files, each with its own label.
        listings = (('SUBFOLDER OF ', child_dirs), ('FILE INSIDE ', child_files))
        for label, entries in listings:
            for entry in entries:
                print(label + current + ': ' + entry)
        print('')
# strategy: extract line 5 of audita.txt, split, extract user/nmr/filename | |
def file_origin(audita):
    """Print how an audita.txt file parses (exploration helper).

    Takes the fifth line of the file, which must start with '$$ ', treats
    the rest of that line as a path and prints its parts. Every line that
    matches the "started at" regex is printed as well.

    Args:
        audita: path to the audita.txt file.

    Returns:
        The path components at index 3 and index 5 of the origin path.
    """
    started_at = datetime_regex()
    with open(audita, 'r') as handle:
        lines = handle.readlines()

    fifth_line = lines[4]
    assert fifth_line.startswith('$$ ')
    print(f'Origin line: {fifth_line}')

    # Drop the leading '$$ ' marker before treating the text as a path.
    server_path = Path(fifth_line[3:])
    print(f'Origin path: {server_path}')
    pieces = server_path.parts
    print(pieces)

    # filter(None, ...) discards the lines on which the regex found nothing.
    for hit in filter(None, map(started_at.search, lines)):
        print('regex: ', hit.group())
    return pieces[3], pieces[5]
def test_regex():
    """Print what both regexes capture from hard-coded sample lines."""
    sample_started = ' started at 2019-04-06 08:01:22.652 -0400,'
    started_match = datetime_regex().search(sample_started)
    print('date regex: ', started_match.group(), ' | ', started_match.group(1), started_match.group(2))

    sample_origin = '$$ /opt/data/himal/nmr/nvc-03040F-ApT-68-lowT/3/audita.txt'
    origin_match = origin_regex().search(sample_origin)
    print('origin regex: ', origin_match.group(), ' | ', origin_match.group(1), origin_match.group(2))
# parent_folder itself covers only the 'phosphorylated_pH7' experiment. The
# directory above it held several experiment folders, so the search starts
# from parent_folder.parent in order to cover all of them.
record_metadata(folder=parent_folder.parent, filename='nmr_metadata.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment