# Source code for osl_ephys.utils.study

# Authors: 
# Andrew Quinn <a.quinn@bham.ac.uk>
# Mats van Es <mats.vanes@psych.ox.ac.uk>

import re
import glob
import parse
from string import Formatter

class Study:
    """Class for simple file finding and looping.
    
    Parameters
    ----------
    studydir : str
        The study directory with wildcards.
    
    Attributes
    ----------
    studydir : str
        The study directory with wildcards.
    fieldnames : list
        The wildcards in the study directory, i.e., the field names in between {braces}.
    globdir : str
        The study directory with wildcards replaced with *.
    match_files : list
        The files that match the globdir.
    match_values : list
        The values of the field names (i.e., wildcards) for each file.
    fields : dict
        The field names and values for each file.
    
    Notes
    -----
    This class is a simple wrapper around glob and parse. It works something like this:
    
    >>> studydir = '/path/to/study/{subject}/{session}/{subject}_{task}.fif'
    >>> study = Study(studydir)
    
    Get all files in the study directory:
    
    >>> study.get()
    
    Get all files for a particular subject:
    
    >>> study.get(subject='sub-01')
    
    Get all files for a particular subject and session:
    
    >>> study.get(subject='sub-01', session='ses-01')
    
    The fieldnames that are not specified in ``get`` are replaced with wildcards (``*``).
    """
    
    def __init__(self, studydir):
        """Scan the file system for files matching the ``studydir`` pattern.

        Parameters
        ----------
        studydir : str
            The study directory with wildcards, i.e., field names in
            between {braces}.

        Notes
        -----
        The number of files found by glob, and the number kept after
        checking them against the pattern with ``parse``, are printed.
        If no file matches, ``match_files`` and ``match_values`` are empty
        lists and ``fields`` is an empty dict.
        """
        self.studydir = studydir

        # Extract field names in between {braces}
        self.fieldnames = [fname for _, fname, _, _ in Formatter().parse(self.studydir) if fname]

        # Replace braces with wildcards
        self.globdir = re.sub(r"\{.*?\}", "*", studydir)

        candidates = sorted(glob.glob(self.globdir))
        print('found {} files'.format(len(candidates)))

        # Keep only files that parse against the pattern: glob is more
        # permissive than parse (e.g. a field that appears twice in the
        # pattern must have the same value in both places).
        self.match_files = []
        self.match_values = []
        for fname in candidates:
            parsed = parse.parse(self.studydir, fname)
            if parsed is not None:
                self.match_files.append(fname)
                self.match_values.append(parsed.named)
        print('keeping {} consistent files'.format(len(self.match_files)))

        self.fields = {}
        # Use first file as a reference for keywords; with no matching
        # files there is no reference and ``fields`` stays empty.
        if self.match_values:
            for key in self.match_values[0]:
                self.fields[key] = [values[key] for values in self.match_values]

    
    def refresh(self):
        """Refresh the study directory.

        Re-runs ``__init__`` with the stored ``studydir``, so all attributes
        are rebuilt from the current state of the file system.
        """
        self.__init__(self.studydir)
    
    
    def get(self, check_exist=True, **kwargs):
        """Get files from the study directory that match the fieldnames.

        Parameters
        ----------
        check_exist : bool
            Whether to restrict the result to the files in ``match_files``,
            i.e., the files that were found and validated when the study
            was created or last refreshed.
        **kwargs : dict
            The field names and values to match. Keys that are not in
            ``fieldnames`` are ignored.

        Returns
        -------
        out : list
            The files that match the field names and values.

        Notes
        -----
        Example using ``Study`` and ``Study.get()``:
        
        >>> studydir = '/path/to/study/{subject}/{session}/{subject}_{task}.fif'
        >>> study = Study(studydir)
        
        Get all files in the study directory:
        
        >>> study.get()
        
        Get all files for a particular subject:
        
        >>> study.get(subject='sub-01')
        
        Get all files for a particular subject and session:
        
        >>> study.get(subject='sub-01', session='ses-01')
        
        The fieldnames that are not specified in ``get`` are replaced with wildcards (``*``).
        """
        # Unspecified fields become glob wildcards
        keywords = {key: kwargs.get(key, '*') for key in self.fieldnames}
        found = glob.glob(self.studydir.format(**keywords))

        if not check_exist:
            return found

        # we only want the valid files; compare whole paths (not substrings)
        valid = set(self.match_files)
        return [ff for ff in found if ff in valid]