Source code for illumina_fastq.utils

# -*- coding: utf-8 -*-

###
# © 2018 The Board of Trustees of the Leland Stanford Junior University
# Nathaniel Watson
# nathankw@stanford.edu
# nathan.watson86@gmail.com
###

# Created Jan, 2017

import sys
import os
import datetime
import gzip
#from memory_profiler import profile

# @profile #used for memory_profiler


[docs]def parseIlluminaFastqAttLine(attLine): """ Given the title line of a FASTQ record, tonizes the line and stores the tokens in a dict. The Illumina FASTQ Att line format (as of CASAVA 1.8 at least) is: @<instrument-name>:<run ID>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read number>:<is filtered>:<control number>:<barcode sequence> Args: attLine: `str`. The title line of a FASTQ record, minus any trailing whitespace. Returns: `dict`. The keys are: 1. instrument, 2. runId, 3. flowcellId, 4. lane, 5. tile, 6. xpos, 7. ypos, 8. readNumber, 9. isFiltered, 10. control, 11. barcode """ uid = attLine.strip() header = uid.lstrip("@").split(":") dico = {} dico["instrument"] = header[0] dico["runId"] = header[1] dico["flowcellId"] = header[2] dico["lane"] = header[3] dico["tile"] = header[4] dico["xpos"] = header[5] ypos, readNumber = header[6].split() dico["ypos"] = ypos dico["readNumber"] = readNumber dico["isFiltered"] = header[7] dico["control"] = header[8] dico["barcode"] = header[9] return dico
[docs]def getPairedendReadId(read_id): """ Given either a forward read or reverse read identifier, returns the corresponding paired-end read identifier. Args: read_id: `str`. The forward read or reverse read identifier. This should be the entire title line of a FASTQ record, minus any trailing whitespace. Returns: `str`: The pairend-end read identifier (title line). Example: Setting read_id to "@COOPER:74:HFTH3BBXX:3:1101:29894:1033 1:N:0:NATGAATC+NGATCTCG" will return @COOPER:74:HFTH3BBXX:3:1101:29894:1033 2:N:0:NATGAATC+NGATCTCG """ part1, part2 = read_id.strip().split() if part2.startswith("1"): part2 = part2.replace("1", "2", 1) elif part2.startswith("2"): part2 = part2.replace("2", "1", 1) else: raise Exception("Unknown read number in {title}".format(title=read_id)) return part1 + " " + part2
[docs]def isForwardRead(seqid): """Indicates whether the passed-in read identifier is a forward or reverse read identifier. Args: seqid: `str`. A read identifier of a FASTQ record. Returns: `bool`: True if a forward read identifier, False otherwise. """ if seqid.split()[1].startswith("1"): return True return False
[docs]def yieldRecs(fastqFile, log=sys.stdout, barcodes=[]): """ A generator function that reads a FASTQ file and yields records, one at a time. The records to yield can be restricted to the specified set of barcodes. Args: fastqFile: `str`. Path to the FASTQ file to parse. log: A file handle to write log messages to. Defaults to STDOUT. Yields: A list containing one element per line of a FASTQ record. Each element is whitespace stripped. """ log.write("Parsing " + fastqFile + "\n") log.flush() fastqFileExt = os.path.splitext(fastqFile)[-1] if fastqFileExt == ".gz": fh = gzip.open(fastqFile) else: fh = open(fastqFile) data = [] count = 0 lineCount = 0 for line in fh: lineCount += 1 count += 1 data.append(line.strip()) if count == 4: barcode = data[0].rsplit(":", 1)[-1] if barcode in barcodes or not barcodes: yield data count = 0 data = [] if lineCount % 1000000 == 0: # every million lines log.write(str(datetime.datetime.now()) + ": " + str(lineCount) + "\n") log.flush() fh.close() log.write("Finished parsing " + fastqFile + "\n") log.flush()