#!/usr/bin/env python
"""Code supporting the information discovery and assimilation of data/file assets."""
# Imports
from logzero import logger as log
import os
from pathlib import Path
from collections import defaultdict, namedtuple
import pandas as pd
import numpy as np
from munch import Munch, munchify
import veoibd_synapse.data.extract_subids as extract_subids
import veoibd_synapse.errors as e
# Metadata
__author__ = "Gus Dunn"
__email__ = "w.gus.dunn@gmail.com"
# Constants
# Registry of file-name parsers, keyed by batch code; each value is the
# callable that the ``extract_subids`` package provides for that batch.
PARSE_FILE_NAME = Munch(
    REGENERON1=extract_subids.bch.subject_from_regeneron1_fname,
)
# Classes
# One record of the asset table; the field order here defines the column
# order of the ``pd.DataFrame`` built from a list of ``Row`` objects.
# NOTE: the ``verbose`` keyword of ``namedtuple`` was removed in Python 3.7,
# so it must not be passed (its old default was ``False`` anyway).
Row = namedtuple(
    "Row",
    [
        "path_hash",
        "file_name",
        "directory",
        "batch_code",
        "file_type",
        "assay_type",
        "bytes",
        "subject_id",
    ],
    rename=False,
)
# Functions
def pathify_assets(FILE_TYPE):
    """Convert the path glob patterns in ``FILE_TYPE`` to lists of ``Path`` objects.

    In place conversion: each value of ``FILE_TYPE`` is replaced by the list of
    paths matched by its patterns. Only the final component of a pattern is
    treated as a glob; its parent directory is used literally.

    Matches are kept in the order of the patterns and sorted within each
    pattern, so the result does not depend on directory listing order.

    Args:
        FILE_TYPE (``dict``-like): key=file type, val=list of path glob patterns.
            A single pattern given as a bare ``str``/``Path`` is treated as a
            one-element list.

    Returns:
        ``None``
    """
    # Snapshot the items so that re-assigning values cannot disturb iteration.
    for key, patterns in list(FILE_TYPE.items()):
        if isinstance(patterns, (str, Path)):
            # Iterating a bare string would glob each character separately.
            patterns = [patterns]

        paths = []
        for pattern in patterns:
            p = Path(pattern)
            # ``Path.glob`` yields in arbitrary order; sort for determinism.
            paths.extend(sorted(p.parent.glob(p.name)))

        FILE_TYPE[key] = paths
def build_asset_table(asset_conf, pathify=True):
    """Return asset table as ``pd.DataFrame`` built from ``asset_conf`` info.

    Column Descriptions:

    - path_hash (`int`)
        - built-in ``hash()`` of the full path string
    - file_name (`str`)
    - directory (`str`)
    - batch_code (`Category`)
        - Regeneron1, Merck1, Merck2, etc
    - file_type (`Category`)
        - BAM, VCF, GVCF, FASTQ, etc
    - assay_type (`Category`)
        - WES, WGS, RNAseq, etc
    - bytes (`int`)
        - file size as reported by ``Path.stat()``
    - subject_id (`str`)
        - currently the file name without its final suffix (``Path.stem``)

    Args:
        asset_conf (``dict``-like): configuration tree built from asset_intake configuration file.
        pathify (``bool``): whether or not to run ``pathify_assets()`` on the paths in ``asset_conf``

    Returns:
        ``pd.DataFrame``: one row per asset file; has the columns above (and
        no rows) when no files are found.
    """
    dtypes = {
        "path_hash": np.int64,
        "file_name": str,
        "directory": str,
        "batch_code": "category",
        "file_type": "category",
        "assay_type": "category",
        "bytes": np.int64,
        "subject_id": str,
    }

    if pathify:
        # Expands the configured glob patterns in place into ``Path`` lists.
        for batch in asset_conf.BATCHES.values():
            pathify_assets(FILE_TYPE=batch.FILE_TYPE)

    rows = []
    for batch_name, batch in asset_conf.BATCHES.items():
        for ftype, paths in batch.FILE_TYPE.items():
            for path in paths:
                # Accept plain strings too (e.g. when ``pathify`` is False).
                path = Path(path)
                rows.append(
                    Row(
                        # NOTE(review): ``hash()`` of a ``str`` is salted per
                        # interpreter process unless PYTHONHASHSEED is fixed,
                        # so these values are presumably only comparable
                        # within a single run -- confirm intended use.
                        path_hash=hash(str(path)),
                        file_name=path.name,
                        directory=str(path.parent),
                        batch_code=batch_name,
                        file_type=ftype,
                        assay_type=batch.ASSAY_TYPE,
                        bytes=path.stat().st_size,
                        subject_id=path.stem,
                    )
                )

    # Name the columns explicitly so that an empty ``rows`` list still yields
    # a frame that ``astype`` can cast (otherwise the dtype keys are missing).
    assets = pd.DataFrame(data=rows, columns=list(Row._fields))

    # ``errors="raise"`` replaces the removed ``raise_on_error=True`` keyword.
    return assets.astype(dtype=dtypes, errors="raise")