Starting with a FASTA file

Note

These tutorials are still under active development.

Extract the sequences in FASTA format from the genome sequence by a BED files

For example, you have a list of genes and want to achieve the following tasks:

  • Extract the genes from GTF file

  • Get all exons of the selected genes

  • Select the transcript which has the most exon number and keep only this transcript

  • Extract the sequences of these selected exons and concatenate them in a proper order (starting from exon 1 and each sequence starts from 5 prime end)

  • Save the sequences into FASTA file. One file per gene (including multiple exons)

from genomkit import GAnnotation
from collections import Counter
from genomkit import GRegion, GRegions, GSequences

GTF_PATH = "GRCm39/gencode.vM34.basic.annotation.gtf"
FASTA_file = "GRCm39/GRCm39.primary_assembly.genome.fa"
gtf = GAnnotation(file_path=GTF_PATH, file_format="gtf")
genes = gtf.get_regions(element_type="gene")

# Loading the names
with open("gene_names.txt") as f:
    sel_gene_names = [line.strip() for line in f]
# Load genome FASTA
fasta = GSequences(name="GRCm39", load=FASTA_file)
# Iterate genes
for gene_name in sel_gene_names:
    exons = gtf.filter_elements(element_type="exon", attribute="gene_name", value=gene_name)
    transcript_list = []
    for e in exons:
        transcript_list.append(e["transcript_id"])
    counter = Counter(transcript_list)
    most_transcript, count = counter.most_common(1)[0]
    print(most_transcript, count)
    sel_exons = [e for e in exons if e["transcript_id"] == most_transcript]
    # sort exon by their exon number
    sel_exons = sorted(sel_exons, key=lambda x: int(x['exon_number']))
    exon_regions = GRegions(name=gene_name)
    for e in sel_exons:
        exon_regions.add(GRegion(sequence=e["chr"],
                                 start=e["start"],
                                 end=e["end"],
                                 orientation=e["strand"],
                                 name=e["gene_name"]+"_exon"+e["exon_number"]))
    exon_seqs = fasta.extract_seqs_by_regions(regions=exon_regions)
    exon_seqs.write_FASTA(filename="{}.fasta".format(gene_name))