Skip to content

Instantly share code, notes, and snippets.

@sametz
Created January 16, 2020 14:59
Show Gist options
  • Save sametz/268c652e3e7d0a9b82adadb1c9f8c859 to your computer and use it in GitHub Desktop.
Save sametz/268c652e3e7d0a9b82adadb1c9f8c859 to your computer and use it in GitHub Desktop.
walks a directory looking for audita.txt NMR metadata and saves info as a csv
"""Walk a directory tree, find audita.txt metadata files for NMR experiments
of different peptides, parse their content and their location in the directory,
and record the data as a csv.
*** parent_folder needs to be changed to whatever parent folder you want to
search for NMR data.***
The tree structure resembles:
<parent directory>/<experiment>/<peptide>/<NMR experiment>/<audita.txt>
and it is assumed that files are stored in this structure, in order to parse
the location for <experiment>, <peptide>, and <NMR experiment>
audita.txt includes a line describing the location of the folder on the NMR
server, e.g.:
$$ /opt/data/<user>/nmr/<experiment folder>/3/audita.txt
It also includes a line for the date and time the NMR experiment was started,
e.g.:
' started at 2019-04-06 08:01:22.652 -0400,'
The script records a csv of:
* Category (the experiment, e.g. 'phosphorylated_pH7')
* Peptide (e.g. 'Ac_ApS_NH2')
* Experiment (the NMR experiment, e.g. '1H' or 'TOCSY')
* User (username used for NMR account)
* Server Folder (name of the folder on the server in which all the NMR
  experiments are saved)
* Date (YYYY-MM-DD)
* Started At (hh:mm:ss.fff, 24-h clock with fractional seconds)
"""
import csv
import os
from pathlib import Path
import re
# Home directory of the user running the script.
HOME = Path.home()

# Root of the tree that holds the NMR data for one experiment category.
# *** Change this to whatever parent folder you want to search. ***
parent_folder = Path(
    HOME,
    'Dropbox', 'shared_folders', 'peptide_data', 'data', 'raw', 'nmr',
    'phosphorylated_pH7',
)

# Stop immediately if the configured folder is missing.
assert parent_folder.exists()
print('Searching ', parent_folder)
# test_audita used while testing functions that parse audita.txt.
# If you want to run tests on an audita.txt file, uncomment the code below
# and provide a path to one of the audita.txt files
# test_audita = parent_folder.joinpath('Ac_ApT_NH2', '1H', 'audita.txt')
# assert test_audita.is_file()
def record_metadata(folder, filename):
    """Save the metadata of every NMR experiment found under a folder as a csv.

    Args:
        folder: the directory tree that is searched for audita.txt files.
        filename: path of the csv file to write. The file is opened in 'w'
            mode, so an existing file of that name is overwritten.
    """
    header = ['Category', 'Peptide', 'Experiment', 'User',
              'Server Folder', 'Date', 'Started At']
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='') as csv_file:
        rows = csv.writer(csv_file)
        rows.writerow(header)
        # One row per audita.txt file, written as find_metadata yields them.
        rows.writerows(find_metadata(folder))
def find_metadata(folder):
    """Walk a folder and yield the metadata of each audita.txt file found.

    Args:
        folder: the directory tree to search.

    Yields:
        (category, peptide, experiment, user, server folder, date, time)
        tuples, one per audita.txt file. The first three items come from the
        file's location (parse_audita_location), the last four from the
        file's content (audita_metadata).
    """
    for current_dir, _subdirs, names in os.walk(folder):
        if 'audita.txt' not in names:
            continue
        audita_path = Path(current_dir, 'audita.txt')
        category, peptide, experiment = parse_audita_location(audita_path)
        user, server_folder, date, started_at = audita_metadata(audita_path)
        yield (category, peptide, experiment,
               user, server_folder, date, started_at)
def parse_audita_location(audita_file):
    """Infer metadata from where an audita.txt file sits in the directory tree.

    The file is expected at
    <...>/<category>/<peptide>/<NMR experiment>/audita.txt.

    Args:
        audita_file: (pathlib.Path or str)
            The path to the audita.txt file.

    Returns:
        category, peptide, experiment (str, str, str)
        category: indicates phosphorylated pH 7 or 4, or dephosphorylated
        peptide: e.g. Ac_ApS_NH2
        experiment: '1H' or 'TOCSY'

    Raises:
        IndexError: if the file has fewer than three parent directories.
    """
    # Path() leaves a Path argument unchanged and lets callers pass a plain
    # string path as well.
    experiment_dir = Path(audita_file).parent
    parts = experiment_dir.parts
    experiment = parts[-1]
    peptide = parts[-2]
    category = parts[-3]
    # An unexpected experiment name is only reported; the values are still
    # returned so the caller can record the row and inspect it later.
    if experiment not in ('1H', 'TOCSY'):
        print(f'FILE STRUCTURE ERROR: audita.txt found in: {experiment_dir}')
    return category, peptide, experiment
def audita_metadata(audita):
    """Extract metadata for an NMR experiment from its audita.txt file.

    The origin line (user and server folder) is looked for on line 5 of the
    file; the "started at" line is looked for on every line, and the first
    match is used.

    Args:
        audita: (pathlib.Path)
            The path to the audita.txt file

    Returns:
        user, folder, date, time (str, str, str, str)
        user: nmr account the experiment was run on
        folder: the name of the experiment's folder on the NMR server
        date: "started at" date
        time: "started at" time
        Any value that could not be found in the file is None.
    """
    regex_time = datetime_regex()
    regex_origin = origin_regex()
    user = folder = date = time = None
    with open(audita, 'r') as file:
        content = file.readlines()

    # A file with fewer than 5 lines has no origin line. Treat it like a
    # non-matching line (reported below) instead of raising IndexError and
    # aborting the whole directory walk.
    origin_line = content[4] if len(content) > 4 else ''
    origin_data = regex_origin.search(origin_line)
    if origin_data:
        user, folder = origin_data.group(1, 2)
    else:
        print('FAILURE: origin regex on ', audita)
        print('content: ', origin_line)

    for line in content:
        started = regex_time.search(line)
        if started:
            date, time = started.group(1, 2)
            break
    return user, folder, date, time
def datetime_regex():
    """Return a compiled regex for the 'started at' line of audita.txt.

    group(1) is the date (YYYY-MM-DD) and group(2) is the time
    (hh:mm:ss followed by fractional seconds).
    """
    return re.compile(
        r'\s*started at (\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}\.\d+)')
def origin_regex():
    """Return a compiled regex for the origin line of audita.txt.

    group(1) is the user and group(2) is the folder name on the NMR server.
    Example of a matching line:
        $$ /opt/data/himal/nmr/nvc-03040F-ApT-68-lowT/3/audita.txt
    """
    # The dot in 'audita.txt' is escaped so that it only matches a literal
    # '.', not any character.
    return re.compile(r'\$\$ /\S+/data/(\S+)/nmr/(\S+)/\d+/audita\.txt')
# Code below was used to explore tree-walking and file/path parsing.
# Not necessary for script to run.
def test_walk(folder):
    """Print every folder, subfolder and file that os.walk visits.

    Exploration helper only; it shows how os.walk traverses `folder`.
    """
    for current, child_dirs, child_files in os.walk(folder):
        print('The current folder is ' + current)
        for child_dir in child_dirs:
            print('SUBFOLDER OF ' + current + ': ' + child_dir)
        for child_file in child_files:
            print('FILE INSIDE ' + current + ': ' + child_file)
        # Blank line between folders.
        print('')
# strategy: extract line 5 of audita.txt, split, extract user/nmr/filename
def file_origin(audita):
    """Print what can be parsed from an audita.txt file (exploration helper).

    Splits line 5 of the file (the '$$ <path>' origin line) into path parts,
    prints them, and prints every match of the 'started at' regex.

    Args:
        audita: path to the audita.txt file.

    Returns:
        The parts at index 3 and index 5 of the origin path, which for
        '/opt/data/<user>/nmr/<folder>/...' are the user and the folder.
    """
    started_at = datetime_regex()
    with open(audita, 'r') as handle:
        lines = handle.readlines()

    fifth_line = lines[4]
    assert fifth_line.startswith('$$ ')
    print(f'Origin line: {fifth_line}')

    # Drop the leading '$$ ' marker; the remainder is the server-side path.
    server_path = Path(fifth_line[3:])
    print(f'Origin path: {server_path}')
    parts = server_path.parts
    print(parts)

    for match in filter(None, map(started_at.search, lines)):
        print('regex: ', match.group())
    return parts[3], parts[5]
def test_regex():
    """Print the matches of both regexes against known sample lines."""
    sample_started = ' started at 2019-04-06 08:01:22.652 -0400,'
    started_match = datetime_regex().search(sample_started)
    print('date regex: ', started_match.group(), ' | ',
          started_match.group(1), started_match.group(2))

    sample_origin = '$$ /opt/data/himal/nmr/nvc-03040F-ApT-68-lowT/3/audita.txt'
    origin_match = origin_regex().search(sample_origin)
    print('origin regex: ', origin_match.group(), ' | ',
          origin_match.group(1), origin_match.group(2))
# Script entry point: runs the search and writes the csv.
# parent_folder only covers the folder for the 'phosphorylated_pH7'
# experiment. Its parent directory held multiple experiment folders, so for a
# complete search parent_folder.parent is used below. The output name
# 'nmr_metadata.csv' is a relative path, so the csv is created in the current
# working directory.
record_metadata(parent_folder.parent, 'nmr_metadata.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment