Source code for illumina_fastq.utils

# -*- coding: utf-8 -*-

###
# © 2018 The Board of Trustees of the Leland Stanford Junior University
# Nathaniel Watson
# nathankw@stanford.edu
# nathan.watson86@gmail.com
###

# Created Jan, 2017

import sys
import os
import datetime
import gzip
#from memory_profiler import profile

# @profile #used for memory_profiler


[docs]def parseIlluminaFastqAttLine(attLine):
    """
    Given the title line of a FASTQ record, tonizes the line and stores the tokens in a dict.
    The Illumina FASTQ Att line format (as of CASAVA 1.8 at least) is:

      @<instrument-name>:<run ID>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read number>:<is filtered>:<control number>:<barcode sequence>

    Args:
        attLine: `str`. The title line of a FASTQ record, minus any trailing whitespace.
    Returns:
        `dict`. The keys are:

        1. instrument,
        2. runId,
        3. flowcellId,
        4. lane,
        5. tile,
        6. xpos,
        7. ypos,
        8. readNumber,
        9. isFiltered,
       10. control,
       11. barcode

    """
    uid = attLine.strip()
    header = uid.lstrip("@").split(":")
    dico = {}
    dico["instrument"] = header[0]
    dico["runId"] = header[1]
    dico["flowcellId"] = header[2]
    dico["lane"] = header[3]
    dico["tile"] = header[4]
    dico["xpos"] = header[5]
    ypos, readNumber = header[6].split()
    dico["ypos"] = ypos
    dico["readNumber"] = readNumber
    dico["isFiltered"] = header[7]
    dico["control"] = header[8]
    dico["barcode"] = header[9]
    return dico


[docs]def getPairedendReadId(read_id):
    """
    Given either a forward read or reverse read identifier, returns the corresponding paired-end
    read identifier.

    Args:
        read_id: `str`. The forward read or reverse read identifier. This should be the entire
            title line of a FASTQ record, minus any trailing whitespace.
    Returns:
        `str`: The pairend-end read identifier (title line).

    Example:
        Setting read_id to "@COOPER:74:HFTH3BBXX:3:1101:29894:1033 1:N:0:NATGAATC+NGATCTCG" will return
        @COOPER:74:HFTH3BBXX:3:1101:29894:1033 2:N:0:NATGAATC+NGATCTCG
    """
    part1, part2 = read_id.strip().split()
    if part2.startswith("1"):
        part2 = part2.replace("1", "2", 1)
    elif part2.startswith("2"):
        part2 = part2.replace("2", "1", 1)
    else:
        raise Exception("Unknown read number in {title}".format(title=read_id))
    return part1 + " " + part2


[docs]def isForwardRead(seqid):
    """Indicates whether the passed-in read identifier is a forward or reverse read identifier.

    Args:
        seqid: `str`. A read identifier of a FASTQ record.

    Returns:
        `bool`: True if a forward read identifier, False otherwise.
    """
    if seqid.split()[1].startswith("1"):
        return True
    return False


[docs]def yieldRecs(fastqFile, log=sys.stdout, barcodes=[]):
    """
    A generator function that reads a FASTQ file and yields records, one at a time.
    The records to yield can be restricted to the specified set of barcodes.

    Args:
        fastqFile: `str`. Path to the FASTQ file to parse.
        log: A file handle to write log messages to. Defaults to STDOUT.

    Yields:
        A list containing one element per line of a FASTQ record. Each element is whitespace stripped.
    """
    log.write("Parsing " + fastqFile + "\n")
    log.flush()
    fastqFileExt = os.path.splitext(fastqFile)[-1]
    if fastqFileExt == ".gz":
        fh = gzip.open(fastqFile)
    else:
        fh = open(fastqFile)
    data = []
    count = 0
    lineCount = 0
    for line in fh:
        lineCount += 1
        count += 1
        data.append(line.strip())
        if count == 4:
            barcode = data[0].rsplit(":", 1)[-1]
            if barcode in barcodes or not barcodes:
                yield data
            count = 0
            data = []
        if lineCount % 1000000 == 0:
            # every million lines
            log.write(str(datetime.datetime.now()) + ":  " + str(lineCount) + "\n")
            log.flush()
    fh.close()
    log.write("Finished parsing " + fastqFile + "\n")
    log.flush()