# Lib
import logging
import re
from pathlib import PurePath, Path
from urllib.parse import urlparse, urlunparse
LOGGER = logging.getLogger(__name__)
REQUIRED = ['Sentrix_ID', 'Sentrix_Position', 'SentrixBarcode_A', 'SentrixPosition_A', 'Control',
'Sample_Group', 'Sample_Name', 'Sample_Plate', 'Pool_ID', 'Sample_Well', 'GSM_ID',
'Sample_Type', 'Sub_Type']
class Sample():
"""Object representing a row in a SampleSheet file
Arguments:
data_dir {string or path-like} -- Base directory of the sample sheet and associated IDAT files.
sentrix_id {string} -- The slide number of the processed array.
sentrix_position {string} -- The position on the processed slide.
Keyword Arguments:
        addl_fields {dict} -- Additional metadata describing the sample,
            including experiment/subject metadata:
                name (sample name, a unique id)
                Sample_Type
                Control
                GSM_ID (same as sample name if using GEO public data)
            and array metadata:
                group
                plate
                pool
                well
"""
def __init__(self, data_dir, sentrix_id, sentrix_position, **addl_fields):
self.data_dir = data_dir
self.sentrix_id = sentrix_id
self.sentrix_position = sentrix_position
self.renamed_fields = {}
        # Any OTHER sample_sheet columns are passed through as they appear, where possible.
        # These flow into the meta_data pickle created later, and any renamed fields are
        # tracked in a lookup (self.renamed_fields) so the original column names aren't lost.
        for field in addl_fields:
            if field not in REQUIRED:
                if len(field) == 0:
                    continue
                new_field_name = field.replace(' ', '_')
                if field[0].isdigit():
                    new_field_name = new_field_name[1:]
                if not field.isalnum():
                    # keep only letters, digits, and underscores; no spaces, punctuation, or other unicode
                    new_field_name = re.sub(r'\W+', '', new_field_name)
                setattr(self, new_field_name, addl_fields[field])
                self.renamed_fields[field] = new_field_name
self.group = addl_fields.get('Sample_Group')
self.name = addl_fields.get('Sample_Name')
self.plate = addl_fields.get('Sample_Plate')
self.pool = addl_fields.get('Pool_ID')
self.well = addl_fields.get('Sample_Well')
        self.GSM_ID = addl_fields.get('GSM_ID') # for compatibility with GEO published samples
        self.type = addl_fields.get('Sample_Type', 'Unknown') # from GEO MINiML metadata
        self.sub_type = addl_fields.get('Sub_Type') # from GEO
        self.is_control = addl_fields.get('Control') in (1, '1', True, 'True', 'true', 'TRUE')
self.fields = {}
self.fields.update(self.renamed_fields)
self.fields.update({
'Sentrix_ID': 'Sentrix_ID',
'Sentrix_Position': 'Sentrix_Position', # these will be standardized here, regardless of sample_sheet variation names
'Sample_Group': 'Sample_Group',
'Sample_Name': 'Sample_Name',
'Sample_Plate': 'Sample_Plate',
'Sample_Type': 'Sample_Type',
'Sub_Type': 'Sub_Type',
'Sample_Well': 'Sample_Well',
'Pool_ID': 'Pool_ID',
'GSM_ID': 'GSM_ID',
'Control': 'Control',
})
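    # A minimal sketch of the column-renaming behavior above (column names and values are hypothetical):
    #   s = Sample('/data', '9247377085', 'R04C02', **{'Tissue Type': 'blood', '2nd_Scan': 'yes'})
    #   # creates attributes s.Tissue_Type == 'blood' and s.nd_Scan == 'yes', and records
    #   # {'Tissue Type': 'Tissue_Type', '2nd_Scan': 'nd_Scan'} in s.renamed_fields.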
def __str__(self):
return f'{self.sentrix_id}_{self.sentrix_position}'
@property
def base_filename(self):
return f'{self.sentrix_id}_{self.sentrix_position}'
@property
def alternate_base_filename(self):
""" GEO data sets using this file name convention."""
if hasattr(self,'GSM_ID') and self.GSM_ID != None:
return f'{self.GSM_ID}_{self.sentrix_id}_{self.sentrix_position}'
else:
return f'{self.sentrix_id}_{self.sentrix_position}'
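    # For example, with sentrix_id '9247377085', sentrix_position 'R04C02', and GSM_ID 'GSM1234567'
    # (all hypothetical values):
    #   base_filename           -> '9247377085_R04C02'
    #   alternate_base_filename -> 'GSM1234567_9247377085_R04C02'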
    def get_filepath(self, extension, suffix=None, verify=True):
"""builds the filepath based on custom file extensions and suffixes during processing.
Params (verify):
tests whether file exists, either in data_dir or somewhere in recursive search path of data_dir.
Export:
uses this later to fetch the place where a file ought to be created -- but doesn't exist yet, so use verify=False.
Notes:
_suffix -- used to create the `<file>_processed` files.
"""
_suffix = ''
if suffix is not None:
_suffix = f'_{suffix}'
filename = f'{self.base_filename}{_suffix}.{extension}'
alt_filename = f'{self.alternate_base_filename}{_suffix}.{extension}'
path = PurePath(self.data_dir, str(self.sentrix_id), filename)
if verify:
# confirm this sample IDAT file exists, and update its filepath if different.
# if filename fails, it will check alt_filename too.
path = self._build_and_verify_path(filename, alt_filename, allow_compressed=True)
return str(path)
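    # A minimal sketch of get_filepath with verify=False (hypothetical POSIX paths and values):
    #   sample = Sample('/data', '9247377085', 'R04C02')
    #   sample.get_filepath('idat', suffix='Grn', verify=False)
    #   # -> '/data/9247377085/9247377085_R04C02_Grn.idat'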
    def get_file_s3(self, zip_reader, extension, suffix=None):
""" replaces get_filepath, but for `s3` context. Since these files
are compressed within a single zipfile in the bucket, they don't
resolve to PurePaths."""
_suffix = ''
if suffix is not None:
_suffix = f'_{suffix}'
filename_to_match = f'{self.base_filename}{_suffix}.{extension}'
for zip_filename in zip_reader.file_names:
if not zip_filename.endswith('.idat'):
continue
if filename_to_match in zip_filename:
# this is packed within the zipfile still, but zip_reader can fetch it.
LOGGER.info(zip_reader.get_file_info(zip_filename))
return zip_reader.get_file(zip_filename, match_partial=False)
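    # Sketch of the s3 flow; zip_reader is assumed to expose `file_names`, `get_file_info`,
    # and `get_file`, exactly as used above (values are illustrative):
    #   green_idat = sample.get_file_s3(zip_reader, 'idat', suffix='Grn')
    #   # returns the matching '<base_filename>_Grn.idat' entry from inside the zipfile,
    #   # or None if no .idat entry matches.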
def _build_and_verify_path(self, filename, alt_filename=None, allow_compressed=False):
"""
Added to Sample as class_method:
if the matching filename for idat file is not in the same folder.
check if exists:
then look recursively for that filename and update the data_dir for that Sample.
return the complete filepath.
alt_filename:
because public data sets on GEO have samplesheets with a different pattern, if the primary file pattern
fails to match, it will try the alt_filename pattern before returning a FileNotFoundError.
replaces _build_path
"""
same_dir_path = PurePath(self.data_dir, str(self.sentrix_id), filename)
if Path(same_dir_path).is_file():
# this idat file is in the same folder, no more searching needed.
return same_dir_path
        gzip_path = Path(f'{same_dir_path}.gz')
        if allow_compressed and gzip_path.is_file():
            # a gzipped copy of this idat file is in the same folder; return that path.
            return gzip_path
# otherwise, do a recursive search for this file and return the first path found.
#file_pattern = f'{self.data_dir}/**/{filename}'
#file_matches = glob(file_pattern, recursive=True)
file_matches = list(Path(self.data_dir).rglob(filename))
if (not file_matches) and allow_compressed:
file_matches = list(Path(self.data_dir).rglob(filename + '.gz'))
        if not file_matches:
            if alt_filename is not None and alt_filename != filename:
                # Note: both patterns will be identical if GSM_ID is missing from the sample sheet.
                alt_file_matches = list(Path(self.data_dir).rglob(alt_filename))
                if (not alt_file_matches) and allow_compressed:
                    alt_file_matches = list(Path(self.data_dir).rglob(alt_filename + '.gz'))
                if len(alt_file_matches) > 1:
                    LOGGER.warning(f'Multiple ({len(alt_file_matches)}) files matched {alt_filename} -- saved path to first one: {alt_file_matches[0]}')
                if alt_file_matches:
                    return alt_file_matches[0]
            raise FileNotFoundError(f'No files in {self.data_dir} (or sub-folders) match this sample id: {filename} OR {alt_filename}')
        elif len(file_matches) > 1:
            LOGGER.warning(f'Multiple ({len(file_matches)}) files matched {filename} -- saved path to first one: {file_matches[0]}')
        return file_matches[0]
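    # Search order illustrated (hypothetical layout), for data_dir='/data' and
    # filename='9247377085_R04C02_Grn.idat':
    #   1. /data/9247377085/9247377085_R04C02_Grn.idat (plus '.gz' if allow_compressed)
    #   2. recursive search under /data for the filename (then filename + '.gz')
    #   3. the same recursive search with alt_filename, before raising FileNotFoundError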
    def get_export_filepath(self, extension='csv'):
""" Called by run_pipeline to find the folder/filename to export data as CSV, but CSV file doesn't exist yet."""
return self.get_filepath(extension, 'processed', verify=False)
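    # Hypothetical usage during export (verify=False, so the CSV does not need to exist yet):
    #   sample.get_export_filepath()
    #   # -> '/data/9247377085/9247377085_R04C02_processed.csv'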