Here's one way using biopython. Note that the Seq object has a number of methods that act just like those of a Python string. One of these is the find() method, which returns the index of the first occurrence of the sequence if found. Replace with rfind() if you want the last occurrence if that makes more sense. This solution uses a generator to avoid storing the entire list of trimmed sequences in memory:
import argparse
import contextlib
import gzip
import sys
from Bio import SeqIO
def trim_fasta(fileobj, sequence, num_bases):
"""
Trim num_bases upstream of the sequence provided
"""
for record in SeqIO.parse(fileobj, 'fasta'):
index = record.seq.find(sequence)
if index != -1 and index > num_bases:
record = record[index-num_bases:]
yield record
def open_file(filename, mode='r'):
if filename.endswith('.gz'):
return gzip.open(filename, mode)
else:
return open(filename, mode)
def get_argument_parser():
parser = argparse.ArgumentParser(add_help=False)
group = parser.add_argument_group('input options')
group.add_argument(
"fasta",
type=str,
metavar="FILE",
help="The input FASTA file",
)
group.add_argument(
"sequence",
type=str,
metavar="STR",
help="The sequence to find and trim",
)
group = parser.add_argument_group('trimming options')
group.add_argument(
"-n",
"--num_bases",
type=int,
metavar="INT",
default=1000,
help="Trim INT bases upstream of input sequence",
)
group = parser.add_argument_group('output options')
group.add_argument(
"-o",
"--output",
type=str,
metavar="FILE",
default='-',
help="Write the filtered output to FILE (default: stdout)",
)
group.add_argument(
"-f",
"--force",
action='store_true',
help="Overwrite the output file if it exists",
)
group = parser.add_argument_group('additional options')
group.add_argument(
"-h",
"--help",
action="help",
help="Show this help message and exit",
)
return parser
def main():
parser = get_argument_parser()
args = parser.parse_args()
with contextlib.ExitStack() as stack:
if args.output != '-':
out = stack.enter_context(open_file(args.output, 'wt' if args.force else 'xt'))
else:
out = sys.stdout
input_fasta = stack.enter_context(open_file(args.fasta))
trimmed_reads = trim_fasta(input_fasta, args.sequence, args.num_bases)
SeqIO.write(trimmed_reads, out, "fasta-2line")
if __name__ == '__main__':
main()
usage: trim_reads.py [-n INT] [-o FILE] [-f] [-h] FILE STR
input options:
FILE The input FASTA file
STR The sequence to find and trim
trimming options:
-n INT, --num_bases INT
Trim INT bases upstream of input sequence
output options:
-o FILE, --output FILE
Write the filtered output to FILE (default: stdout)
-f, --force Overwrite the output file if it exists
additional options:
-h, --help Show this help message and exit