Source code for snakedeploy.collect_files
import sys
from glob import glob
import re
from collections import namedtuple
import pandas as pd
from snakedeploy.exceptions import UserError
[docs]
def collect_files(config_sheet_path: str):
"""Given a configuration sheet path with input patterns, print matches
of samples to STDOUT
"""
config_sheet = pd.read_csv(config_sheet_path, sep="\t")
config_sheet["input_re"] = config_sheet["input_pattern"].apply(re.compile)
re_groups = sorted(
{group for regex in config_sheet["input_re"] for group in regex.groupindex}
)
for item in sys.stdin:
item = item[:-1] # remove newline
matches = list(
filter(
lambda match: match.match is not None, get_matches(item, config_sheet)
)
)
if not matches:
raise UserError(f"No input pattern in config sheet matches {item}.")
elif len(matches) > 1:
raise UserError(f"Item {item} matches multiple input patterns.")
match = matches[0]
pattern = match.rule.glob_pattern.format(
**{
key: autoconvert(value)
for key, value in match.match.groupdict().items()
}
)
files = sorted(glob(pattern))
if not files:
raise UserError(f"No files were found for {item} with pattern {pattern}.")
match_groupdict = match.match.groupdict(default="")
print(
*[match_groupdict.get(group, "") for group in re_groups], *files, sep="\t"
)
Match = namedtuple("Match", "rule match")
def get_matches(item, config_sheet: pd.DataFrame):
return (
Match(rule, rule.input_re.match(item)) for rule in config_sheet.itertuples()
)
def autoconvert(value):
try:
return int(value)
except ValueError:
return value