## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( echo = TRUE, warning = FALSE, message = FALSE, fig.width = 8, fig.height = 6 ) ## ----install, eval=FALSE------------------------------------------------------ # # Install pak if not already installed # if (!require("pak")) install.packages("pak") # # Install from GitHub # pak::pkg_install("github::monahton/GencoDymo2") # # # Load the package # library(GencoDymo2) ## ----get_release, eval=FALSE-------------------------------------------------- # # Fetch the most recent human and mouse GENCODE release identifiers # human_release <- get_latest_release("human", verbose = T) # mouse_release <- get_latest_release("mouse", verbose = T) ## ----get_release_ex, echo=FALSE----------------------------------------------- # Get latest human and mouse release cat("Latest human GENCODE release: release_47") cat("Latest human GENCODE release: release_M36") ## ----get-files, eval=FALSE---------------------------------------------------- # # # Download latest human long noncoding RNAs GTF # lnc_47_gtf <- get_gtf( # species = "human", # release_version = human_release, # annotation_type = "long_noncoding_RNAs.gtf.gz", # dest_folder = tempdir() # ) # # # Download previous human release (release_46) for comparison # lnc_46_gtf <- get_gtf( # species = "human", # release_version = "release_46", # annotation_type = "long_noncoding_RNAs.gtf.gz", # dest_folder = tempdir() # ) # # # Download latest mouse primary assembly annotations (GFF3) # mouse_36_gff3 <- get_gff3( # species = "mouse", # release_version = mouse_release, # annotation_type = "primary_assembly.annotation.gff3.gz", # dest_folder = tempdir() # ) ## ----annotation-types, echo=FALSE--------------------------------------------- cat("Valid Annotation Types:\n") valid_annotation_types <- c( "annotation", "basic.annotation", "chr_patch_hapl_scaff.annotation", "chr_patch_hapl_scaff.basic.annotation", "long_noncoding_RNAs", "primary_assembly.annotation", "primary_assembly.basic.annotation", "tRNAs", "polyAs") valid_annotation_types ## ----load-data, eval=FALSE---------------------------------------------------- # # Loading using the stored paths from previous steps # lnc_47_df <- load_file(lnc_47_gtf) # head(lnc_47_df) # # # Alternatively, specify the file path directly # lnc_46_df <- load_file(file.path(tempdir(), "gencode.v46.long_noncoding_RNAs.gtf.gz")) # head(lnc_46_df) # # # Load mouse GFF3 # mouse_pri_36 <- load_file(file.path(tempdir(),"gencode.vM36.primary_assembly.annotation.gff3.gz")) # head(mouse_pri_36) ## ----compare-releases, eval=FALSE--------------------------------------------- # # Compare gene counts between release 47 and 46 # gene_comparison <- compare_release(lnc_47_df, lnc_46_df, type = "gene") # # # Compare exon counts # exon_comparison <- compare_release(lnc_47_df, lnc_46_df, type = "exon") # # # Compare a specific gene biotype (e.g., TEC) using a custom baseline # comparison <- compare_release( # lnc_47_df, # lnc_46_df, # type = "gene", # gene_type = "TEC", # baseline = "count1" # ) ## ----introns, eval=FALSE------------------------------------------------------ # # Human lncRNA introns for release 47 # introns_lnc_47 <- extract_introns(lnc_47_df, verbose = T) # # # Mouse introns (filtering to primary chromosomes first) # mouse_pri_36 <- mouse_pri_36[grepl("^chr", mouse_pri_36$seqnames), ] # mouse_introns_pri_36 <- extract_introns(mouse_pri_36, verbose = T) # ## ----splice-sites, eval=FALSE------------------------------------------------- # # Human # library(BSgenome.Hsapiens.UCSC.hg38) # lnc_47_ss <- assign_splice_sites( # introns_lnc_47, # genome = BSgenome.Hsapiens.UCSC.hg38, # verbose = T # ) # # # Mouse # library(BSgenome.Mmusculus.UCSC.mm39) # mouse_pri_36_ss <- assign_splice_sites( # mouse_introns_pri_36, # genome = BSgenome.Mmusculus.UCSC.mm39, # verbose = T # ) ## ----cryptic, eval=FALSE------------------------------------------------------ # # Identify cryptic (non-canonical) splice sites # cryptic_ss <- find_cryptic_splice_sites( # lnc_47_ss, # genome = BSgenome.Hsapiens.UCSC.hg38, # canonical_donor = "GT", # canonical_acceptor = "AG", # verbose = TRUE # ) ## ----motifs, eval=FALSE------------------------------------------------------- # # Donor motifs (5'ss) # motifs_donor <- extract_ss_motif( # input = lnc_47_ss, # genome = BSgenome.Hsapiens.UCSC.hg38, # type = "5ss", # verbose = T, # save_fasta = T, # output_file = file.path(tempdir(), "lnc_47_5ss_motifs.fa") # ) # # # Acceptor motifs (3'ss) # motifs_acc <- extract_ss_motif( # input = lnc_47_ss, # genome = BSgenome.Hsapiens.UCSC.hg38, # type = "3ss", # verbose = T, # save_fasta = T, # output_file = file.path(tempdir(), "lnc_47_3ss_motifs.fa") # ) ## ----unspliced, eval=FALSE---------------------------------------------------- # ## identify single exon genes and transcripts # single_exon_genes <- extract_single_exon(lnc_47_df, level = "gene") # single_exon_trans <- extract_single_exon(lnc_47_df, level = "transcript") ## ----exon_class, eval=FALSE--------------------------------------------------- # # Assign the ordinal position of exons # lnc_47_class_exons <- classify_exons(lnc_47_df, verbose = TRUE) ## ----eval=FALSE--------------------------------------------------------------- # # Length of spliced transcript # lnc_47_spliced_length <- spliced_trans_length(lnc_47_df) # head(lnc_47_spliced_length) ## ----stat, eval=FALSE--------------------------------------------------------- # # Exon length statistics # lnc_47_exon_stats <- stat_summary(lnc_47_class_exons, type = "exon") # # # Intron length statistics # lnc_47_intron_stats <- stat_summary(introns_lnc_47, type = "intron") ## ----gc-content, eval=FALSE--------------------------------------------------- # # Human # lnc_47_gc <- calculate_gc_content( # lnc_47_df, # genome = BSgenome.Hsapiens.UCSC.hg38, # verbose = TRUE # ) # # Mouse # mouse_pri_36_gc <- calculate_gc_content( # mouse_pri_36, # genome = BSgenome.Mmusculus.UCSC.mm39, # verbose = TRUE # ) ## ----cds, eval=FALSE---------------------------------------------------------- # # Convert to GRanges and extract # library(GenomicRanges) # mouse_pri_36_granges <- GRanges(mouse_pri_36) # mouse_cds_seqs <- extract_cds_sequences( # mouse_pri_36_granges, # BSgenome.Mmusculus.UCSC.mm39, # save_fasta = TRUE, # output_file = file.path(tempdir(), "mouse_pri_36_CDS.fa.gz") # verbose = TRUE # ) ## ----eval=TRUE, echo=FALSE---------------------------------------------------- devtools::session_info()