# -*- coding: utf-8 -*-
###
# © 2018 The Board of Trustees of the Leland Stanford Junior University
# Nathaniel Watson
# nathankw@stanford.edu
# nathan.watson86@gmail.com
###
# Created Jan, 2017
import datetime
import gzip
import os
import pdb
import sys
#from memory_profiler import profile
import illumina_fastq.utils as utils
class _FastqParseIter:
def __init__(self, fastqparse_i):
"""
Args:
fastqparse_i: A FastqParse instance.
"""
self.idx = -1
self.fpi = fastqparse_i
self.seqids = self.fpi.lookup.keys()
self.len_seqids = len(self.seqids)
def next(self):
self.idx += 1
if self.idx == self.len_seqids:
raise StopIteration
seqid_hashval = self.seqids[self.idx]
seqid_idx = self.fpi.lookup[seqid_hashval]
return self.fpi._formatRecord(self.fpi.data[seqid_idx])
[docs]class FastqParse:
"""
Parses the records in an Illumina FASTQ file and stores all records or only those having
specific barcodes. The sequence ID, sequence, and quality strings of each FASTQ record are
stored in a list of lists of the form
[ ["seqidA", "ACGT","#AAF"], ["seqidB", "GGAT"," #AAA"] ... ]
This list of lists is stored as self.data. A lookup table (dict) is also stored as
self.lookup. It is of the form
{ "seqidA": indexA, "seqidB": indexB, ... }
where an index gives the position in the list of the record with the given sequence ID.
The sequence ID is stored as the entire title line of a FASTQ record, minus any peripheral whitespace.
Also supports indexing the returned instance object using the header line of a given sequence, i.e.
if @GADGET:77:HFNLTBBXX:8:1101:30462:1279 1:N:0:NNAGCA is the read ID of a record that is present in a FASTQ file
named reads.fq, then the following returns True:
data = FastqParse("reads.fq")
data["@GADGET:77:HFNLTBBXX:8:1101:30462:1279 1:N:0:NNAGCA"] #returns True
Args:
fastq: `str`. Path to the FASTQ file to be parsed. Accepts uncompressed or gzip
compressed with a .gz extension.
log: A file handle for logging. Defaults to STDOUT.
extract_barcodes: `list` of one or more barcodes to extract from the FASTQ file.
If the barcode is duel-indexed, separate with a '+', i.e. 'ATCGGT+GCAGCT',
as this is how it is written in the FASTQ file.
sample_size: `int`. Indicates the number of records from the start of the FASTQ file to
parse. A Falsy value (the default) means that the entire FASTQ file will be parsed.
"""
SEQID_KEY = "seqid"
SEQ_KEY = "seq"
QUAL_KEY = "qual"
# SEQID_IDX, SEQ_IDX, and QUAL_IDX store the index position of the read ID, sequence string, and quality string, respectively,
# of a given sublist in the list self.data.
SEQID_IDX = 0
SEQ_IDX = 1
QUAL_IDX = 2
def __init__(self, fastq, log=sys.stdout, extract_barcodes=[], sample_size=False):
self.fastqFile = fastq
self.barcodes = extract_barcodes
self.sample_size = sample_size
self.log = log
self._parse() # sets self.data
# sets self.lookup.
def __iter__(self):
return _FastqParseIter(self)
def __getitem__(self, seqid):
return self.isRecordPresent(seqid)
def __len__(self):
return len(self.lookup)
[docs] @classmethod
def getPairedendReadId(cls, read_id):
"""
Given either a forward read or reverse read identifier, returns the corresponding paired-end
read identifier.
Args:
read_id: `str`. The forward read or reverse read identifier. This should be the
entire title line of a FASTQ record, minus any trailing whitespace.
Returns:
`str`: The paired-end read identifier (title line).
Example:
Setting read_id to "@COOPER:74:HFTH3BBXX:3:1101:29894:1033 1:N:0:NATGAATC+NGATCTCG" will return
@COOPER:74:HFTH3BBXX:3:1101:29894:1033 2:N:0:NATGAATC+NGATCTCG
"""
return utils.getPairedendReadId(read_id)
@classmethod
def formatRecordForOutput(cls, record):
return "\n".join([record[FastqParse.SEQID_KEY], record[FastqParse.SEQ_KEY],
"+", record[FastqParse.QUAL_KEY]]) + "\n"
def printRecord(self, seqid, outfh):
outfh.write(FastqParse.formatRecordForOutput(self.getRecord(seqid)))
@classmethod
def parseIlluminaFastqAttLine(cls, attLine):
# Illumina FASTQ Att line format (as of CASAVA 1.8 at least):
# @<instrument-name>:<run ID>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read number>:<is filtered>:<control number>:<barcode sequence>
return utils.parseIlluminaFastqAttLine(attLine)
# @profile #used for memory_profiler
def _parse(self):
self.log.write("Parsing " + self.fastqFile + "\n")
self.log.flush()
fastqFileExt = os.path.splitext(self.fastqFile)[-1]
if fastqFileExt == ".gz":
fh = gzip.open(self.fastqFile)
else:
fh = open(self.fastqFile)
self.data = []
self.lookup = {}
count = 0
lineCount = 0
for line in fh:
lineCount += 1
count += 1
line = line.strip()
if count == 1:
#uid = lineCount
uid = line
barcode = line.rsplit(":", 1)[-1]
#self.data[uid] = {"name": line}
#self.data[uid] = {}
elif count == 2:
seq = line
#self.data[uid]["seq"] = line
elif count == 4:
#self.data[uid]["qual"] = line
if barcode in self.barcodes or not self.barcodes:
self.data.append([uid, seq, line])
hash_id = hash(uid)
if hash_id in self.lookup:
raise Exception(
"Found multiple entires for the FASTQ record having title line '{uid}'.".format(
uid=uid))
self.lookup[hash_id] = len(self.data) - 1
if self.sample_size and len(self.lookup) == self.sample_size:
break
count = 0
if lineCount % 1000000 == 0:
# every million lines
self.log.write(str(datetime.datetime.now()) + ": " + str(lineCount) + "\n")
self.log.flush()
fh.close()
self.log.write("Finished parsing " + self.fastqFile + "\n")
self.log.flush()
def isRecordPresent(self, title_line):
if hash(title_line) in self.lookup:
return True
return False
def getRecord(self, title_line):
rec = self.data[self.lookup[hash(title_line)]]
return self._formatRecord(rec)
@classmethod
def isForwardRead(cls, seqid):
return utils.isForwardRead(seqid)
def _formatRecord(self, rec):
"""
Args : rec - A sublist from self.data.
"""
return {FastqParse.SEQID_KEY: rec[FastqParse.SEQID_IDX],
FastqParse.SEQ_KEY: rec[FastqParse.SEQ_IDX], FastqParse.QUAL_KEY: rec[FastqParse.QUAL_IDX]}
def barcodeHist(self):
bcDico = {}
count = 0
for rec in self:
att_line = FastqParse.parseIlluminaFastqAttLine(rec[FastqParse.SEQID_KEY])
barcode = att_line["barcode"]
if barcode not in bcDico:
bcDico[barcode] = 0
bcDico[barcode] += 1
return bcDico