diff --git a/Project.toml b/Project.toml new file mode 100644 index 000000000..6a964c599 --- /dev/null +++ b/Project.toml @@ -0,0 +1,49 @@ +name = "PhyloNetworks" +uuid = "33ad39ac-ed31-50eb-9b15-43d0656eaa72" +license = "MIT" +version = "0.10.0" + +[deps] +BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" + +[compat] +BioSequences = "≥ 1.0.0" +BioSymbols = "≥ 3.0.0" +CSV = "≥ 0.4.0" +Combinatorics = "≥ 0.7.0" +DataFrames = "≥ 0.13.0" +DataStructures = "≥ 0.9.0" +Distributions = "≥ 0.15.0" +GLM = "≥ 1.0.0" +NLopt = "≥ 0.5.1" +SpecialFunctions = "≥ 0.7.0" +StaticArrays = "≥ 0.8.3" +StatsBase = "≥ 0.26.0" +StatsFuns = "≥ 0.7.0" +StatsModels = "≥ 0.6.0" +julia = "≥ 0.7.0" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/REQUIRE b/REQUIRE deleted file mode 100644 index a048f653f..000000000 --- a/REQUIRE +++ /dev/null @@ -1,15 +0,0 @@ -julia 0.7 -BioSequences 1.0 -BioSymbols 3.0 -Combinatorics 0.7 -CSV 0.4 -DataFrames 0.13 -DataStructures 0.9 -Distributions 0.15.0 -GLM 1.0 -NLopt 0.5.1 -SpecialFunctions 0.7 -StaticArrays 0.8.3 -StatsBase 0.26 -StatsFuns 0.7 -StatsModels 0.3 diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 0a5cb7836..4635f295b 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -9,7 +9,7 @@ const SUITE = BenchmarkGroup() SUITE["nasm"] = BenchmarkGroup(["JC69", "HKY85"]) SUITE["fitDiscreteFixed"] = BenchmarkGroup(["ERSM", "BTSM", "JC69", "HKY85"]) -SUITE["fitDiscrete"] = BenchmarkGroup(["ERSM", "BTSM", "JC69", "HKY85"]) +SUITE["fitdiscrete"] = BenchmarkGroup(["ERSM", "BTSM", "JC69", "HKY85"]) # Add benchmarks to nasm group SUITE["nasm"]["JC69"] = @benchmarkable JC69([0.5]) @@ -20,8 +20,8 @@ SUITE["nasm"]["HKY85"] = @benchmarkable P!(P(m1, 1.0), m1, 3.0) net_dat = readTopology("(((A:2.0,(B:1.0)#H1:0.1::0.9):1.5,(C:0.6,#H1:1.0::0.1):1.0):0.5,D:2.0);") species_alone = ["C","A","B","D"] dat_alone = DataFrame(trait=["hi","lo","lo","hi"]) -SUITE["fitDiscreteFixed"]["ERSM"] = @benchmarkable fitDiscrete(net_dat, :ERSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false) -SUITE["fitDiscreteFixed"]["BTSM"] = @benchmarkable fitDiscrete(net_dat, :BTSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false) +SUITE["fitDiscreteFixed"]["ERSM"] = @benchmarkable fitdiscrete(net_dat, :ERSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false) +SUITE["fitDiscreteFixed"]["BTSM"] = @benchmarkable fitdiscrete(net_dat, :BTSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false) fastafile = joinpath(@__DIR__, "..", "examples", "Ae_bicornis_Tr406_Contig10132.aln") dna_dat, dna_weights = readfastatodna(fastafile, true); @@ -32,14 +32,14 @@ for edge in net_dna.edge #adds branch lengths setGamma!(edge, 0.5) end end -SUITE["fitDiscreteFixed"]["JC69"] = @benchmarkable fitDiscrete(net_dna, :JC69, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false) -SUITE["fitDiscreteFixed"]["HKY85"] = @benchmarkable fitDiscrete(net_dna, :HKY85, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false) - -## fitDiscrete benchmarks -SUITE["fitDiscrete"]["ERSM"] = @benchmarkable fitDiscrete(net_dat, :ERSM, species_alone, dat_alone; optimizeQ=true, optimizeRVAS=true) -SUITE["fitDiscrete"]["BTSM"] = @benchmarkable fitDiscrete(net_dat, :BTSM, species_alone, dat_alone; optimizeQ=true, optimizeRVAS=true) -SUITE["fitDiscrete"]["JC69"] = @benchmarkable fitDiscrete(net_dna, :JC69, dna_dat, dna_weights; optimizeQ=true, optimizeRVAS=true) -SUITE["fitDiscrete"]["HKY85"] = @benchmarkable fitDiscrete(net_dna, :HKY85, dna_dat, dna_weights; optimizeQ=true, optimizeRVAS=true) +SUITE["fitDiscreteFixed"]["JC69"] = @benchmarkable fitdiscrete(net_dna, :JC69, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false) +SUITE["fitDiscreteFixed"]["HKY85"] = @benchmarkable fitdiscrete(net_dna, :HKY85, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false) + +## fitdiscrete benchmarks +SUITE["fitdiscrete"]["ERSM"] = @benchmarkable fitdiscrete(net_dat, :ERSM, species_alone, dat_alone; optimizeQ=true, optimizeRVAS=true) +SUITE["fitdiscrete"]["BTSM"] = @benchmarkable fitdiscrete(net_dat, :BTSM, species_alone, dat_alone; optimizeQ=true, optimizeRVAS=true) +SUITE["fitdiscrete"]["JC69"] = @benchmarkable fitdiscrete(net_dna, :JC69, dna_dat, dna_weights; optimizeQ=true, optimizeRVAS=true) +SUITE["fitdiscrete"]["HKY85"] = @benchmarkable fitdiscrete(net_dna, :HKY85, dna_dat, dna_weights; optimizeQ=true, optimizeRVAS=true) # If a cache of tuned parameters already exists, use it, otherwise, tune and cache # the benchmark parameters. Reusing cached parameters is faster and more reliable diff --git a/docs/src/index.md b/docs/src/index.md index 311d39b83..30723f383 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -48,7 +48,7 @@ Pages = [ "man/multiplealleles.md", "man/trait_tree.md", "man/parsimony.md", - "man/fitDiscrete.md" + "man/fitdiscrete.md" ] Depth = 3 ``` diff --git a/docs/src/lib/public.md b/docs/src/lib/public.md index db5ac6582..b9500324b 100644 --- a/docs/src/lib/public.md +++ b/docs/src/lib/public.md @@ -137,11 +137,11 @@ parsimonyGF Q randomTrait randomTrait! -fitDiscrete +fitdiscrete maxParsimonyNet nstates -setrates! -setalpha! +stationary +empiricalDNAfrequencies ``` ```@meta diff --git a/docs/src/man/fitDiscrete.md b/docs/src/man/fitDiscrete.md index 4be492f6f..d831ae721 100644 --- a/docs/src/man/fitDiscrete.md +++ b/docs/src/man/fitDiscrete.md @@ -1,137 +1,142 @@ +```@setup traitevol_fixednet +using PhyloNetworks +using DataFrames +mkpath("../assets/figures") +``` + # Discrete Trait Evolution With a phylogenetic network structure inferred, we can now estimate how quickly traits -have evolved over time using a likelihood model. These traits should be discrete -characteristics of a species such as feather color, diet type, or dna in aligned -genetic sequences. +have evolved over time using a likelihood model. These traits should be discrete +characteristics of a species such as feather color, diet type, +or DNA in aligned genetic sequences. ## Discrete Trait Data -As with continuous trait evolution, we assume a fixed network, correctly rooted, +As with continuous trait evolution, we assume a fixed network, correctly rooted, with branch lengths proportional to calendar time. We start with a network, then add data about the tips of this network. We allow data of two types. -1. A vector of species names with a data frame of traits: -```@setup fitDiscrete -using PhyloNetworks, DataFrames -mkpath("../assets/figures") -``` - -```@example fitDiscrete -#read in network -simple_net = readTopology("(A:3.0,(B:2.0,(C:1.0,D:1.0):1.0):1.0);"); - -#read in trait data -simple_species = ["C","A","B","D"] -simple_dat = DataFrame(trait=["hi","lo","lo","hi"]) -``` - -If your species names and trait data are in the same data frame, read in your data -frame then subset the data like this: -```@example fitDiscrete -dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]) -simple_species = dat[:species] -simple_dat = DataFrame(trait = dat[:trait]) -``` - -2. To use dna data, read in the network structure then start with a fasta -file. Reading the data from this file using the `readfastatodna` function. This -creates a data frame of dna data and a vector of dna pattern weights. - -```@example fitDiscrete -#read in network -dna_net = readTopology("((((((((((((((Ae_caudata_Tr275:1.0,Ae_caudata_Tr276:1.0):1.0,Ae_caudata_Tr139:1.0):1.0)#H1:1.0::0.6,((((((Ae_longissima_Tr241:1.0,Ae_longissima_Tr242:1.0):1.0,Ae_longissima_Tr355:1.0):1.0,(Ae_sharonensis_Tr265:1.0,Ae_sharonensis_Tr264:1.0):1.0):1.0,((Ae_bicornis_Tr408:1.0,Ae_bicornis_Tr407:1.0):1.0,Ae_bicornis_Tr406:1.0):1.0):1.0,((Ae_searsii_Tr164:1.0,Ae_searsii_Tr165:1.0):1.0,Ae_searsii_Tr161:1.0):1.0):1.0)#H2:1.0::0.6):1.0,(((Ae_umbellulata_Tr266:1.0,Ae_umbellulata_Tr257:1.0):1.0,Ae_umbellulata_Tr268:1.0):1.0,#H1:1.0::0.4):1.0):1.0,((Ae_comosa_Tr271:1.0,Ae_comosa_Tr272:1.0):1.0,(((Ae_uniaristata_Tr403:1.0,Ae_uniaristata_Tr357:1.0):1.0,Ae_uniaristata_Tr402:1.0):1.0,Ae_uniaristata_Tr404:1.0):1.0):1.0):1.0,(((Ae_tauschii_Tr352:1.0,Ae_tauschii_Tr351:1.0):1.0,(Ae_tauschii_Tr180:1.0,Ae_tauschii_Tr125:1.0):1.0):1.0,(#H2:1.0::0.4,((((Ae_mutica_Tr237:1.0,Ae_mutica_Tr329:1.0):1.0,Ae_mutica_Tr244:1.0):1.0,Ae_mutica_Tr332:1.0):1.0)#H4:1.0::0.6):1.0):1.0):1.0,(((T_boeoticum_TS8:1.0,(T_boeoticum_TS10:1.0,T_boeoticum_TS3:1.0):1.0):1.0,T_boeoticum_TS4:1.0):1.0,((T_urartu_Tr315:1.0,T_urartu_Tr232:1.0):1.0,(T_urartu_Tr317:1.0,T_urartu_Tr309:1.0):1.0):1.0):1.0):1.0,(((((Ae_speltoides_Tr320:1.0,Ae_speltoides_Tr323:1.0):1.0,Ae_speltoides_Tr223:1.0):1.0,Ae_speltoides_Tr251:1.0):1.0):1.0,#H4:1.0::0.4):1.0):1.0):1.0,Ta_caputMedusae_TB2:1.0):1.0,S_vavilovii_Tr279:1.0):1.0,Er_bonaepartis_TB1:1.0):1.0,H_vulgare_HVens23:1.0);"); +1. A vector of species names with a data frame of traits: -#read in dna data -fastafile = joinpath(dirname(pathof(PhyloNetworks)), "..","examples", -"Ae_bicornis_Tr406_Contig10132.aln") -dna_dat, dna_weights = readfastatodna(fastafile, true); -``` + ```@example traitevol_fixednet + # read in network + net = readTopology("(A:3,((B:0.4)#H1:1.6::0.92,((C:0.4,#H1:0::0.08):0.6,D:1):1):1);"); + # read in trait data + species = ["C","A","B","D"] + dat = DataFrame(trait=["hi","lo","lo","hi"]) + ``` + + If your species names and trait data are in the same data frame, + read in your data frame then subset the data like this: + ```@example traitevol_fixednet + dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]) + species = dat[:species] + dat = DataFrame(trait = dat[:trait]) + ``` + +2. To use dna data, read in the network structure then start with a fasta + file. Reading the data from this file using the `readfastatodna` function. + This creates a data frame of dna data and a vector of dna pattern weights. + + ```@example traitevol_fixednet + # read in network + dna_net = readTopology("((((((((((((((Ae_caudata_Tr275:1.0,Ae_caudata_Tr276:1.0):1.0,Ae_caudata_Tr139:1.0):1.0)#H1:1.0::0.6,((((((Ae_longissima_Tr241:1.0,Ae_longissima_Tr242:1.0):1.0,Ae_longissima_Tr355:1.0):1.0,(Ae_sharonensis_Tr265:1.0,Ae_sharonensis_Tr264:1.0):1.0):1.0,((Ae_bicornis_Tr408:1.0,Ae_bicornis_Tr407:1.0):1.0,Ae_bicornis_Tr406:1.0):1.0):1.0,((Ae_searsii_Tr164:1.0,Ae_searsii_Tr165:1.0):1.0,Ae_searsii_Tr161:1.0):1.0):1.0)#H2:1.0::0.6):1.0,(((Ae_umbellulata_Tr266:1.0,Ae_umbellulata_Tr257:1.0):1.0,Ae_umbellulata_Tr268:1.0):1.0,#H1:1.0::0.4):1.0):1.0,((Ae_comosa_Tr271:1.0,Ae_comosa_Tr272:1.0):1.0,(((Ae_uniaristata_Tr403:1.0,Ae_uniaristata_Tr357:1.0):1.0,Ae_uniaristata_Tr402:1.0):1.0,Ae_uniaristata_Tr404:1.0):1.0):1.0):1.0,(((Ae_tauschii_Tr352:1.0,Ae_tauschii_Tr351:1.0):1.0,(Ae_tauschii_Tr180:1.0,Ae_tauschii_Tr125:1.0):1.0):1.0,(#H2:1.0::0.4,((((Ae_mutica_Tr237:1.0,Ae_mutica_Tr329:1.0):1.0,Ae_mutica_Tr244:1.0):1.0,Ae_mutica_Tr332:1.0):1.0)#H4:1.0::0.6):1.0):1.0):1.0,(((T_boeoticum_TS8:1.0,(T_boeoticum_TS10:1.0,T_boeoticum_TS3:1.0):1.0):1.0,T_boeoticum_TS4:1.0):1.0,((T_urartu_Tr315:1.0,T_urartu_Tr232:1.0):1.0,(T_urartu_Tr317:1.0,T_urartu_Tr309:1.0):1.0):1.0):1.0):1.0,(((((Ae_speltoides_Tr320:1.0,Ae_speltoides_Tr323:1.0):1.0,Ae_speltoides_Tr223:1.0):1.0,Ae_speltoides_Tr251:1.0):1.0):1.0,#H4:1.0::0.4):1.0):1.0):1.0,Ta_caputMedusae_TB2:1.0):1.0,S_vavilovii_Tr279:1.0):1.0,Er_bonaepartis_TB1:1.0):1.0,H_vulgare_HVens23:1.0);"); + # read in dna data + fastafile = joinpath(dirname(pathof(PhyloNetworks)), "..","examples","Ae_bicornis_Tr406_Contig10132.aln") + dna_dat, dna_weights = readfastatodna(fastafile, true); + dna_dat + dna_weights + ``` ## Choosing a Substitution Model -After reading in your data, choose a model to describe how evolutionary changes -(or substitutions, in the case of DNA) happened over time. We offer a selection -of Markov substitution models to describe the evolutionary process. +After reading in your data, choose a model to describe how evolutionary changes +(or substitutions, in the case of DNA) happened over time. +Available Markov substitution models are described below. -### Generic Trait Models +### Generic Trait Models These models works well for any type of trait we may want to model. For general -trait types, use one of these three models: -`:ERSM` Equal Rates Substitution Model -`:BTSM` Binary Trait Substitution Model -`:TBTSM` Two Binary Trait Substituion Model - -### DNA-Specific Models - -The DNA-specific models are optimized for aligned sequence data. We offer JC69 -and HKY85 models in both relative and absolute versions. The JC69 model was -developed by Jukes and Cantor in 1969 and uses one rate for all type of substitutions. -The HKY85 model was developed in 1985 by Hasegawa, Kishino, & Yano. It treats -transitions differently from transversions. -`:JC69` Jukes Cantor 69 Model -`:HKY85` Hasegawa, Kishino and Yano 1985 - -## Running FitDiscrete - -To infer evolutionary rates, run the `fitDiscrete` function on the network and data. -It will calculate the maximum likelihood score of a network given one or more -discrete trait characters at the tips. Along each edge, evolutionary changes -are modeled with a continous time Markov model, with parameters estimated by -maximizing the likelihood. At each hybrid node, the trait is assumed to be -inherited from the immediate parent (or parents, in the case of a hybrid edge). -If there is a hybrid edge, the trait is modeled according to the parents' weighted -average genetic contributions, as measured by inheritance gamma γ. The model -currently ignores incomplete lineage sorting. +trait types, use one of these three models: +- `:BTSM` Binary Trait Substitution Model (2 states, rates unconstrained) +- `:ERSM` Equal Rates Substitution Model + (`k` states, all transitions possible with equal rates) +- `:TBTSM` Two Binary Trait Substituion Model (though not fully implemented yet) + +### DNA-Specific Models + +The DNA-specific models are optimized for aligned sequence data. +The 4 nucleotide states are from +[BioSymbols](https://github.com/BioJulia/BioSymbols.jl) +(listed [here](http://biojulia.net/BioSymbols.jl/stable/nucleicacids/)). +Each model has a relative and an absolute version. +- `:JC69` Jukes & Cantor 1969 model: one single rate for all transitions. + The relative version has values -1 along the diagonal of the rate matrix + (1 expected transition / unit of time). The absolute version has an extra + parameter to scale the rate matrix. +- `:HKY85` Hasegawa, Kishino & Yano 1985: treats transitions differently + from transversions. + +## Fitting the model + +To infer evolutionary rates, run the `fitdiscrete` function on the network and data. +It will calculate the maximum likelihood score of a fixed network +given one or more discrete trait characters at the tips. +Along each edge, evolutionary changes +are modeled with a continous time Markov model, with parameters estimated by +maximizing the likelihood. At each hybrid node, the trait is assumed to be +inherited from the immediate parent (or parents, in the case of a hybrid node). +At a hybrid node, the trait is assumed to be inherited from one or the other +parent, with probabilities equal to the inheritance γ of each parent edge +(which is given by the network). +The model ignores incomplete lineage sorting (e.g. hemiplasy). ### General Trait Data -```@example fitDiscrete -s1 = fitDiscrete(simple_net, :ERSM, simple_species, simple_dat; optimizeQ=false, -optimizeRVAS=false) -s2 = fitDiscrete(simple_net, :BTSM, simple_species, simple_dat; optimizeQ=false, -optimizeRVAS=false) +```@repl traitevol_fixednet +s1 = fitdiscrete(net, :ERSM, species, dat; optimizeQ=false) +s2 = fitdiscrete(net, :BTSM, species, dat; optimizeQ=false) ``` -In this `fitDiscrete` call, we do not optimize rates or allow for rate variation -across sites. - -If `optimizeQ = true`, the `fitDiscrete` function estimates the evolutionary rate -or rates. Because we didn't allow for rate variation across sites in these models, -we do not optimize the way rates may vary across trait types. - -```@example fitDiscrete -s3 = fitDiscrete(simple_net, :ERSM, simple_species, simple_dat; optimizeQ=true, -optimizeRVAS=true) -s4 = fitDiscrete(simple_net, :BTSM, simple_species, simple_dat; optimizeQ=true, -optimizeRVAS=true) +In this `fitdiscrete` call, we do not optimize rates or allow for rate variation +across sites. The default rates (which act as starting value if rates +were to be optimized) are chosen equal to the inverse of the total edge lengths +in the network (or 1/ntax if all branch lengths are missing). + +If `optimizeQ = true` (which is the default), the `fitdiscrete` +function estimates the parameters of the rate matrix. +Because we didn't allow for rate variation across sites in these models, +there is nothing to optimize in the way rates may vary across traits (sites). + +```@repl traitevol_fixednet +s3 = fitdiscrete(net, :ERSM, species, dat) +s4 = fitdiscrete(net, :BTSM, species, dat) ``` ### DNA Data -For DNA data, use `:JC69` or `:HKY85` models. -```@example fitDiscrete -d1 = fitDiscrete(dna_net, :JC69, dna_dat, dna_weights, :RV; optimizeQ=false, -optimizeRVAS=false) -d2 = fitDiscrete(dna_net, :HKY85, dna_dat, dna_weights, :RV; optimizeQ=false, -optimizeRVAS=false) +For DNA data, use one of `:JC69` or `:HKY85`. +To allow for rate variation across sites, use the `:RV` option. + +```@example traitevol_fixednet +d1 = fitdiscrete(dna_net, :JC69, dna_dat, dna_weights, :RV; optimizeQ=false, optimizeRVAS=false) +d2 = fitdiscrete(dna_net, :HKY85, dna_dat, dna_weights, :RV; optimizeQ=false, optimizeRVAS=false) ``` -In these `fitDiscrete` models, we do not optimize rates (`optimizeQ=false`), but -we do allow for rate variation across sites. +In these `fitdiscrete` models, we do not optimize rates (`optimizeQ=false`), but +we do allow for rate variation across sites, with a default α of 1. ### Rate Variation Across Sites -In its default version, `fitDiscrete` does not allow for rate variation across sites. -To allow for rate variation across sites in your estimate of evolutionary rates -(or rate variation across trait types, in the case of general trait types), -include `:RV`. If you include `:RV` and `optimizeRVAS = true`, the model will -not only allow for rate variation, but it will also optimize how rates vary across -sites. +In its default version, `fitdiscrete` does not allow for rate variation across sites. +To allow for rate variation across sites in your estimate of evolutionary rates +(or rate variation across traits in the case of general traits), +include `:RV`. If you include `:RV` and `optimizeRVAS = true`, +the model will allow for rate variation and +also optimize the parameter α of the distribution of rates across sites. We optimize the evolutionary rates and the way rates vary across sites for the -DNA data here: -```@example fitDiscrete -d3 = fitDiscrete(dna_net, :JC69, dna_dat, dna_weights, :RV; optimizeQ=true, -optimizeRVAS=false) -d4 = fitDiscrete(dna_net, :HKY85, dna_dat, dna_weights, :RV; optimizeQ=true, -optimizeRVAS=false) -``` \ No newline at end of file +DNA data here: +```@repl traitevol_fixednet +d3 = fitdiscrete(dna_net, :JC69, dna_dat, dna_weights, :RV; optimizeRVAS=false) +d4 = fitdiscrete(dna_net, :HKY85, dna_dat, dna_weights, :RV) +``` diff --git a/docs/src/man/fixednetworkoptim.md b/docs/src/man/fixednetworkoptim.md index a8e5b70c8..85114b689 100644 --- a/docs/src/man/fixednetworkoptim.md +++ b/docs/src/man/fixednetworkoptim.md @@ -8,9 +8,9 @@ This is useful if we have a few candidate networks to compare. Each network can be optimized individually, and the network with the best pseudolikelihood can be chosen. -The score being optimized is the pseudo-deviance, i.e. -the negative log pseudo-likelihood up to an additive constant -(the lower the better). +The score being optimized is a pseudo-deviance, i.e. +a multiple of the negative log pseudo-likelihood up to an additive constant +(the lower the better; a pseudo-deviance of 0 corresponds to a perfect fit). Following our example in [Getting a Network](@ref), we can optimize parameters on the true network @@ -29,7 +29,7 @@ raxmlCF = readTrees2CF(raxmltrees, writeTab=false, writeSummary=false) truenet = readTopology("((((D:0.4,C:0.4):4.8,((A:0.8,B:0.8):2.2)#H1:2.2::0.7):4.0,(#H1:0::0.3,E:3.0):6.2):2.0,O:11.2);"); net1alt = topologyMaxQPseudolik!(truenet, raxmlCF); writeTopology(net1alt, round=true) -net1alt.loglik # pseudo deviance, actually +net1alt.loglik # pseudo deviance actually: the lower the better ``` ```@example fixednetworkoptim using PhyloPlots, RCall @@ -54,16 +54,16 @@ the search stops (but the optimization will take longer). It makes no difference on this small data set. ```julia net1par = topologyMaxQPseudolik!(truenet, raxmlCF, ftolRel=1e-10, xtolAbs=1e-10) -net1par.loglik +net1par.loglik # pseudo deviance, actually: the lower the better ``` ## Network Score with no optimization For a network with given branch lengths and γ heritabilies, -we can compute the pseudolikelihood with: +we can compute the pseudolikelihood (well, a pseudo-deviance) with: ```@repl fixednetworkoptim topologyQPseudolik!(truenet,raxmlCF); -truenet.loglik +truenet.loglik # again, pseudo deviance ``` This function is not maximizing the pseudolikelihood, it is simply computing the pseudolikelihood (or deviance) for the given branch lengths and probabilities of diff --git a/docs/src/man/snaq_plot.md b/docs/src/man/snaq_plot.md index f7d167886..f7aba32bf 100644 --- a/docs/src/man/snaq_plot.md +++ b/docs/src/man/snaq_plot.md @@ -89,7 +89,8 @@ The file `net1.networks` contains a list of networks that are slight modificatio of the best (estimated) network `net1`. The modifications changed the direction of one reticulation at a time, by moving the placement of one hybrid node to another node inside the same cycle. -For each modified network, the pseudolikelihood score was calculated. +For each modified network, the pseudolikelihood score was calculated +(the `loglik` or `-Ploglik` values give a pseudo deviance actually). The function name `snaq!` ends with ! because it modifies the argument `raxmlCF` by including the expected CF. Type `?` then `snaq!` to get help on that function. @@ -266,7 +267,7 @@ echo "end of SNaQ run ..." ## choosing the number of hybridizations Each network has a `loglik` attribute, which is its pseudo deviance: -twice the negative log-likelihood up to a constant (the constant is +a multiple of the negative log-likelihood up to a constant (the constant is such that the score is 0 if the network fits the data perfectly). The lower the better. We can plot these scores across hybrid values: ```@example snaqplot diff --git a/docs/src/man/trait_tree.md b/docs/src/man/trait_tree.md index fe4e8ed0e..b75954ea2 100644 --- a/docs/src/man/trait_tree.md +++ b/docs/src/man/trait_tree.md @@ -131,7 +131,7 @@ associated with trait 2 has a high p-value, which means that this coefficient is not significantly different from 0. This is consistent with the way we simulated trait 3. -The function returns an object of type [`PhyloNetworkLinearModel`](@ref)`<:LinPredModel`. +The function returns an object of type [`PhyloNetworkLinearModel`](@ref)`<:GLM.LinPredModel`. It is a subtype of the GLM type `LinPredModel`, which means that all base functions from Julia [StatsBase](https://github.com/JuliaStats/StatsBase.jl) can be applied to it. See the documentation for this type for a list of all diff --git a/src/PhyloNetworks.jl b/src/PhyloNetworks.jl index 4a8d5382f..37aa253a2 100644 --- a/src/PhyloNetworks.jl +++ b/src/PhyloNetworks.jl @@ -2,7 +2,7 @@ __precompile__() module PhyloNetworks - # stdlib (standard libraries): no need to list in REQUIRE or Project.toml + # stdlib (standard libraries) using Dates using Distributed using LinearAlgebra # for LowerTriangular, logdet, diag @@ -10,7 +10,7 @@ module PhyloNetworks using Random using Statistics: mean, quantile - # other libraries, to list as dependencies + # other libraries, indicate compatible version in Project.toml using BioSequences using BioSymbols using Combinatorics: combinations @@ -135,15 +135,16 @@ module PhyloNetworks TwoBinaryTraitSubstitutionModel, JC69, HKY85, nstates, - Q, setrates!, + Q, getlabels, nparams, RateVariationAcrossSites, - setalpha!, randomTrait, randomTrait!, - fitDiscrete, + fitdiscrete, readfastatodna, + stationary, + empiricalDNAfrequencies, ## TICR test ticr, ticr! diff --git a/src/parsimony.jl b/src/parsimony.jl index fd26479d0..70496839f 100755 --- a/src/parsimony.jl +++ b/src/parsimony.jl @@ -411,7 +411,7 @@ function readfastatodna(fastafile::String, countPatterns=false::Bool) species = String[] firstspecies = Bool(true) nsites = 0 - for record in reader #species (row) + for record in reader #by species (row) if firstspecies nsites = length(sequence(record)) for site in 1:nsites # initialize an array for each site @@ -419,7 +419,7 @@ function readfastatodna(fastafile::String, countPatterns=false::Bool) end firstspecies = false end - push!(species, FASTA.identifier(record)) + push!(species, FASTA.identifier(record)) #adds species name to end of species vector length(sequence(record)) == nsites || error("sequences of different length: current sequences is ", length(sequence(record)), " long while first sequence is ", nsites, " long") for site in 1:nsites diff --git a/src/pseudolik.jl b/src/pseudolik.jl index 3eba3cdb7..8121fa2a7 100644 --- a/src/pseudolik.jl +++ b/src/pseudolik.jl @@ -1364,6 +1364,8 @@ function logPseudoLik(quartet::Quartet) suma += -1.e15 else suma += quartet.obsCF[i] == 0 ? 0.0 : 100*quartet.obsCF[i]*log(quartet.qnet.expCF[i]/quartet.obsCF[i]) + # WARNING: 100 should be replaced by -2*ngenes to get the deviance. + # below: negative sign used below in logPseudoLik() when summing up across 4-taxon sets end end ## to account for missing data: diff --git a/src/substitutionModels.jl b/src/substitutionModels.jl index 43b781650..a4b46816d 100644 --- a/src/substitutionModels.jl +++ b/src/substitutionModels.jl @@ -742,7 +742,7 @@ transition/transversion ratio: κ=α/β. The rate transition matrix Q is normali `nparams` returns 1 or 2. In other words: the stationary distribution is not counted in the number of parameters -(and `fitDiscrete` does not optimize the pi values at the moment). +(and `fitdiscrete` does not optimize the pi values at the moment). # examples @@ -785,15 +785,14 @@ struct HKY85 <: NucleicAcidSubstitutionModel #Warning: this constructor should not be used directly. Use the constructor below, # which will call this. all(rate .> 0.) || error("All elements of rate must be positive") - 1 <= length(rate) <= 2 || error("rate is not a valid length for HKY85 model") + 1 <= length(rate) <= 2 || error("rate has invalid length for HKY85 model") + if relative length(rate) == 1 || error("the relative version of HKY85 takes a single rate") + else length(rate) == 2 || error("the absolute version of HKY85 takes 2 rates") + end length(pi) == 4 || error("pi must be of length 4") all(0. .< pi.< 1.) || error("All base proportions must be between 0 and 1") sum(pi) == 1. || error("Base proportions must sum to 1") - if length(rate) == 1 - new(rate, pi, true, eigeninfo) - else - new(rate, pi, false, eigeninfo) - end + new(rate, pi, relative, eigeninfo) end end function HKY85(rate::AbstractVector, pi::Vector{Float64}, relative=true::Bool) @@ -801,7 +800,7 @@ function HKY85(rate::AbstractVector, pi::Vector{Float64}, relative=true::Bool) seteigeninfo!(obj) return obj end -HKY85(rate::Float64, pi::Vector{Float64}, relative=true::Bool) = HKY85([rate], pi, relative) +HKY85(rate::Float64, pi::Vector{Float64}) = HKY85([rate], pi, true) """ for [`HKY85`](@ref): store piR, piY, the 2 non-zero eigenvalues and a scaling factor @@ -977,14 +976,14 @@ Because the mean of the desired continuous Gamma distribution is 1, we use shape α and scale θ=1/α (e.g. rate β=α). The shape parameter is referred to as alpha here. The Gamma distribution is discretized into `ncat` categories. -In each category, the category's rate multiplier is selected as a quantile. +In each category, the category's rate multiplier is a normalized quantile of the gamma distribution. ```jldoctest julia> rv = RateVariationAcrossSites() Rate Variation Across Sites using Discretized Gamma Model alpha: 1.0 categories for Gamma discretization: 4 -ratemultiplier: [0.133531, 0.470004, 0.980829, 2.07944] +ratemultiplier: [0.145784, 0.513132, 1.07083, 2.27025] julia> PhyloNetworks.setalpha!(rv, 2.0) @@ -992,7 +991,13 @@ julia> rv Rate Variation Across Sites using Discretized Gamma Model alpha: 2.0 categories for Gamma discretization: 4 -ratemultiplier: [0.304691, 0.652574, 1.05902, 1.80351] +ratemultiplier: [0.319065, 0.683361, 1.10898, 1.8886] + +julia> RateVariationAcrossSites(2.0, 4) +Rate Variation Across Sites using Discretized Gamma Model +alpha: 2.0 +categories for Gamma discretization: 4 +ratemultiplier: [0.319065, 0.683361, 1.10898, 1.8886] ``` """ mutable struct RateVariationAcrossSites @@ -1001,11 +1006,13 @@ mutable struct RateVariationAcrossSites ratemultiplier::Array{Float64} function RateVariationAcrossSites(alpha = 1.0::Float64, ncat = 4::Int) @assert alpha >= 0.0 "alpha must be >= 0" + @assert ncat > 0 "ncat must be 1 or greater" if ncat == 1 ratemultiplier = [1.0] else - cuts = Vector((0:(ncat-1))/ncat) + repeat([1/2ncat], ncat) + cuts = Vector((0:(ncat-1))/ncat) + repeat([1/(2ncat)], ncat) ratemultiplier = quantile.(Distributions.Gamma(alpha, 1/alpha), cuts) + ratemultiplier ./= mean(ratemultiplier) end new(alpha, ncat, ratemultiplier) end @@ -1022,8 +1029,10 @@ function setalpha!(obj::RateVariationAcrossSites, alpha::Float64) @assert alpha >= 0 "alpha must be a float >= 0" obj.alpha = alpha if obj.ncat > 1 + rv = obj.ratemultiplier cuts = Vector((0:(obj.ncat-1))/obj.ncat) + repeat([1/(2*obj.ncat)], obj.ncat) - obj.ratemultiplier[:] = quantile.(Distributions.Gamma(alpha, 1/alpha), cuts) + rv[:] = quantile.(Distributions.Gamma(alpha, 1/alpha), cuts) + rv ./= mean(rv) end return nothing end @@ -1039,3 +1048,112 @@ end function nparams(obj::RateVariationAcrossSites) return (obj.ncat == 1 ? 0 : 1) end + +""" + empiricalDNAfrequencies(DNAdata::AbstractDataFrame, DNAweights, + correction=true, useambiguous=true) + +Estimate base frequencies in DNA data `DNAdata`, ordered ACGT. + +- `DNAdata`: data frame. All columns are used. If the first column + gives species names, find a way to ignore it before calculating empirical + frequencies, e.g. `empiricalDNAfrequencies(view(DNAdata, 2:ncol(DNAdata)))`. + Data type must be `BioSymbols.DNA` or `Char` or `String`. + WARNING: this is checked on the first column only. +- `DNAweights`: vector of weights, to weigh each column in `DNAdata`. +- `correction`: if `true`, add 1 to each count and 4 to the denominator + for a more stable estimator, similar to Bayes prior of 1/4 and + the Agresti-Coull interval in binomial estimation. +- `useambiguous`: if `true`, ambiguous bases are used (except gaps and Ns). + For example, `Y` adds 0.5 weight to `C` and 0.5 weight to `T`. +""" +function empiricalDNAfrequencies(dnaDat::AbstractDataFrame, dnaWeights::Vector, + correctedestimate=true::Bool, useambiguous=true::Bool) + + # warning: checking first column and first row only + eltypes(dnaDat)[1] == BioSymbols.DNA || eltypes(dnaDat)[1] == Char || + dnaDat[1,1] ∈ string.(BioSymbols.alphabet(DNA)) || + error("empiricalDNAfrequencies requires data of type String, Char, or BioSymbols.DNA") + + # initialize counts: keys same as BioSymbols.ACGT (which are ordered) + prior = correctedestimate ? 1.0 : 0.0 + dnacounts = Dict(DNA_A=>prior, DNA_C=>prior, DNA_G=>prior, DNA_T=>prior) + + convert2dna = eltypes(dnaDat)[1] != BioSymbols.DNA + for j in 1:ncol(dnaDat) # for each column + col = dnaDat[j] + wt = dnaWeights[j] + for nuc in col # for each row + if convert2dna + nuc = convert(DNA, nuc[1]) # if nuc is a string, nuc[1] = 1st character + end + if nuc ∈ BioSymbols.ACGT + dnacounts[nuc] += wt + elseif nuc == DNA_Gap || nuc == DNA_N || !useambiguous + continue # to next row + # else: ambiguity, uses BioSequences definitions + elseif nuc == DNA_M # A or C + dnacounts[DNA_A] += 0.5*wt + dnacounts[DNA_C] += 0.5*wt + elseif nuc == DNA_R # A or G + dnacounts[DNA_A] += 0.5*wt + dnacounts[DNA_G] += 0.5*wt + elseif nuc == DNA_W # A or T/U + dnacounts[DNA_A] += 0.5*wt + dnacounts[DNA_T] += 0.5*wt + elseif nuc == DNA_S # C or G + dnacounts[DNA_C] += 0.5*wt + dnacounts[DNA_G] += 0.5*wt + elseif nuc == DNA_Y # C or T/U + dnacounts[DNA_C] += 0.5*wt + dnacounts[DNA_T] += 0.5*wt + elseif nuc == DNA_K # G or T/U + dnacounts[DNA_G] += 0.5*wt + dnacounts[DNA_T] += 0.5*wt + elseif nuc == DNA_V # A or C or G + dnacounts[DNA_A] += wt/3 + dnacounts[DNA_C] += wt/3 + dnacounts[DNA_G] += wt/3 + elseif nuc == DNA_H # A or C or T + dnacounts[DNA_A] += wt/3 + dnacounts[DNA_C] += wt/3 + dnacounts[DNA_T] += wt/3 + elseif nuc == DNA_D # A or G or T/U + dnacounts[DNA_A] += wt/3 + dnacounts[DNA_G] += wt/3 + dnacounts[DNA_T] += wt/3 + elseif nuc == DNA_B # C or G or T/U + dnacounts[DNA_C] += wt/3 + dnacounts[DNA_G] += wt/3 + dnacounts[DNA_T] += wt/3 + end + end + end + totalweight = sum(values(dnacounts)) + res = [dnacounts[key]/totalweight for key in BioSymbols.ACGT] # to control the order + all(0. .<= res .<= 1.) || error("""weird: empirical base frequency < 0 or > 1""") + return res +end + +""" + stationary(substitutionmodel) + +Stationary distribution of a Markov model +""" +stationary(mod::SM) = error("stationary not defined for $(typeof(mod)).") + +function stationary(mod::JC69) + return [0.25,0.25,0.25,0.25] +end + +function stationary(mod::HKY85) + return mod.pi +end + +function stationary(mod::ERSM) #for ERSM, uniform = stationary + return [1/mod.k for i in 1:mod.k] +end + +function stationary(mod::BTSM) + return [mod.eigeninfo[2], mod.eigeninfo[3]] +end diff --git a/src/traits.jl b/src/traits.jl index b2b515ede..bec4dd3f3 100644 --- a/src/traits.jl +++ b/src/traits.jl @@ -512,7 +512,7 @@ julia> sim = simulate(net, params); # simulate a dataset with shifts julia> using DataFrames # to handle data frames julia> dat = DataFrame(trait = sim[:Tips], tipNames = sim.M.tipNames) -4×2 DataFrames.DataFrame +4×2 DataFrame │ Row │ trait │ tipNames │ │ │ Float64 │ String │ ├─────┼─────────┼──────────┤ @@ -522,7 +522,7 @@ julia> dat = DataFrame(trait = sim[:Tips], tipNames = sim.M.tipNames) │ 4 │ 7.88906 │ D │ julia> dfr_shift = regressorShift(net.node[nodes_shifts], net) # the regressors matching the shifts. -4×3 DataFrames.DataFrame +4×3 DataFrame │ Row │ shift_1 │ shift_8 │ tipNames │ │ │ Float64 │ Float64 │ String │ ├─────┼─────────┼─────────┼──────────┤ @@ -536,7 +536,7 @@ julia> dfr = join(dat, dfr_shift, on=:tipNames); # join data and regressors in a julia> using StatsModels # for statistical model formulas julia> fitBM = phyloNetworklm(@formula(trait ~ shift_1 + shift_8), dfr, net) # actual fit -StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,Array{Float64,2}} +StatsModels.TableRegressionModel{PhyloNetworkLinearModel,Array{Float64,2}} Formula: trait ~ 1 + shift_1 + shift_8 @@ -652,7 +652,7 @@ julia> using Random; Random.seed!(2468); # sets the seed for reproducibility julia> sim = simulate(net, params); # simulate a dataset with shifts julia> dat = DataFrame(trait = sim[:Tips], tipNames = sim.M.tipNames) -4×2 DataFrames.DataFrame +4×2 DataFrame │ Row │ trait │ tipNames │ │ │ Float64 │ String │ ├─────┼─────────┼──────────┤ @@ -662,7 +662,7 @@ julia> dat = DataFrame(trait = sim[:Tips], tipNames = sim.M.tipNames) │ 4 │ 12.6891 │ D │ julia> dfr_hybrid = regressorHybrid(net) # the reressors matching the hybrids. -4×3 DataFrames.DataFrame +4×3 DataFrame │ Row │ shift_6 │ tipNames │ sum │ │ │ Float64 │ String │ Float64 │ ├─────┼─────────┼──────────┼─────────┤ @@ -676,7 +676,7 @@ julia> dfr = join(dat, dfr_hybrid, on=:tipNames); # join data and regressors in julia> using StatsModels julia> fitBM = phyloNetworklm(@formula(trait ~ shift_6), dfr, net) # actual fit -StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,Array{Float64,2}} +StatsModels.TableRegressionModel{PhyloNetworkLinearModel,Array{Float64,2}} Formula: trait ~ 1 + shift_6 @@ -1120,18 +1120,21 @@ end # New type for phyloNetwork regression """ - PhyloNetworkLinearModel<:LinPredModel + PhyloNetworkLinearModel<:GLM.LinPredModel Regression object for a phylogenetic regression. Result of fitting function [`phyloNetworklm`](@ref). -Dominated by the `LinPredModel` class, from package `GLM`. +Dominated by the `GLM.LinPredModel` class, from package `GLM`. The following StatsBase functions can be applied to it: `coef`, `nobs`, `vcov`, `stderror`, `confint`, `coeftable`, `dof_residual`, `dof`, `deviance`, `residuals`, `response`, `predict`, `loglikelihood`, `nulldeviance`, `nullloglikelihood`, `r2`, `adjr2`, `aic`, `aicc`, `bic`. -The following StatsModels functions can also be applied to it: -`ModelFrame`, `ModelMatrix`, `Formula`. +For accessing the model matrix (`object.mm` and `object.mm.m`), +the model frame (`object.mf`) or formula (`object.mf.f`), refer to +[StatsModels](https://juliastats.github.io/StatsModels.jl/stable/) functions, +like `show(object.mf.f)`, `terms(object.mf.f)`, `coefnames(object.mf.f)`, +`terms(object.mf.f.rhs)`, `response(object)` etc. Estimated variance and mean of the BM process used can be retrieved with functions [`sigma2_estim`](@ref) and [`mu_estim`](@ref). @@ -1145,7 +1148,7 @@ An ancestral state reconstruction can be performed from this fitted object using The `PhyloNetworkLinearModel` object has fields: `lm`, `V`, `Vy`, `RL`, `Y`, `X`, `logdetVy`, `ind`, `nonmissing`, `model`, `lambda`. Type in "?PhyloNetworkLinearModel.field" to get help on a specific field. """ -mutable struct PhyloNetworkLinearModel <: LinPredModel +mutable struct PhyloNetworkLinearModel <: GLM.LinPredModel "lm: a GLM.LinearModel object, fitted on the cholesky-tranformend problem" lm::GLM.LinearModel # result of a lm on a matrix "V: a MatrixTopologicalOrder object of the network-induced correlations" @@ -1473,11 +1476,10 @@ end Phylogenetic regression, using the correlation structure induced by the network. -Returns an object of class [`PhyloNetworkLinearModel`](@ref). See documentation for this type and -example to see all the functions that can be applied to it. +Returns an object of class [`PhyloNetworkLinearModel`](@ref). See documentation for this type and example to see all the functions that can be applied to it. # Arguments -* `f::Formula`: formula to use for the regression (see the `DataFrame` package) +* `f::StatsModels.FormulaTerm`: formula to use for the regression * `fr::AbstractDataFrame`: DataFrame containing the data and regressors at the tips. It should have an extra column labelled "tipNames", that gives the names of the taxa for each observation. * `net::HybridNetwork`: phylogenetic network to use. Should have labelled tips. * `model::AbstractString="BM"`: the model to use, "BM" (default) or "lambda" (for Pagel's lambda). @@ -1507,9 +1509,9 @@ julia> using StatsModels # for stat model formulas julia> fitBM = phyloNetworklm(@formula(trait ~ 1), dat, phy); julia> fitBM # Shows a summary -StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,Array{Float64,2}} +StatsModels.TableRegressionModel{PhyloNetworkLinearModel,Array{Float64,2}} -Formula: trait ~ +1 +Formula: trait ~ 1 Model: BM @@ -1633,8 +1635,8 @@ julia> round.(predict(fitBM), digits=5) 4.679 ``` -""" #" -function phyloNetworklm(f::Formula, +""" +function phyloNetworklm(f::StatsModels.FormulaTerm, fr::AbstractDataFrame, net::HybridNetwork; model="BM"::AbstractString, @@ -1681,19 +1683,19 @@ function phyloNetworklm(f::Formula, end # fr = fr[ind, :] end - # Find the regression matrix and answer vector - mf = ModelFrame(f,fr) - if isequal(f.rhs, -1) # If there are no regressors - mm = ModelMatrix(zeros(size(mf.df, 1), 0), [0]) - else - mm = ModelMatrix(mf) - end - Y = convert(Vector{Float64}, StatsModels.model_response(mf)) - # Fit the model (Method copied from DataFrame/src/statsmodels/statsmodels.jl, lines 47-58) - # (then StatsModels/src/statsmodels.jl lines 42-46) - StatsModels.DataFrameRegressionModel(phyloNetworklm(mm.m, Y, net; - nonmissing=mf.nonmissing, model=model, ind=ind, - startingValue=startingValue, fixedValue=fixedValue), mf, mm) + # Find the regression matrix and response vector + data, nonmissing = StatsModels.missing_omit(StatsModels.columntable(fr), f) + sch = StatsModels.schema(f, data) + f = StatsModels.apply_schema(f, sch, PhyloNetworkLinearModel) + mf = ModelFrame(f, sch, data, PhyloNetworkLinearModel) + mm = StatsModels.ModelMatrix(mf) + Y = StatsModels.response(mf) + # Y = convert(Vector{Float64}, StatsModels.response(mf)) + # Y, pred = StatsModels.modelcols(f, fr) + StatsModels.TableRegressionModel( + phyloNetworklm(mm.m, Y, net; nonmissing=nonmissing, model=model, ind=ind, + startingValue=startingValue, fixedValue=fixedValue), + mf, mm) end ### Methods on type phyloNetworkRegression @@ -1775,7 +1777,7 @@ function StatsBase.adjr2(obj::PhyloNetworkLinearModel) end ## REMARK -# As PhyloNetworkLinearModel <: LinPredModel, the following functions are automatically defined: +# As PhyloNetworkLinearModel <: GLM.LinPredModel, the following functions are automatically defined: # aic, aicc, bic ## New quantities @@ -1786,8 +1788,8 @@ end Estimated variance for a fitted object. """ sigma2_estim(m::PhyloNetworkLinearModel) = deviance(m.lm) / nobs(m) -# Need to be adapted manually to DataFrameRegressionModel beacouse it's a new function -sigma2_estim(m::StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,T} where T) = +# Need to be adapted manually to TableRegressionModel beacouse it's a new function +sigma2_estim(m::StatsModels.TableRegressionModel{PhyloNetworkLinearModel,T} where T) = sigma2_estim(m.model) # ML estimate for ancestral state of the BM """ @@ -1806,9 +1808,9 @@ function mu_estim(m::PhyloNetworkLinearModel) return coef(m)[1] end end -# Need to be adapted manually to DataFrameRegressionModel beacouse it's a new function -function mu_estim(m::StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,T} where T) - if (!m.mf.terms.intercept) +# Need to be adapted manually to TableRegressionModel beacouse it's a new function +function mu_estim(m::StatsModels.TableRegressionModel{PhyloNetworkLinearModel,T} where T) + if m.mf.f.rhs.terms[1] != StatsModels.InterceptTerm{true}() error("The fit was done without intercept, so I cannot estimate mu") end return coef(m)[1] @@ -1820,13 +1822,7 @@ end Estimated lambda parameter for a fitted object. """ lambda_estim(m::PhyloNetworkLinearModel) = m.lambda -lambda_estim(m::StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,T} where T) = lambda_estim(m.model) - -### Functions specific to DataFrameRegressionModel -StatsModels.ModelFrame(m::StatsModels.DataFrameRegressionModel) = m.mf -StatsModels.ModelMatrix(m::StatsModels.DataFrameRegressionModel) = m.mm -StatsModels.Formula(m::StatsModels.DataFrameRegressionModel) = Formula(m.mf.terms) -StatsModels.response(m::StatsModels.DataFrameRegressionModel) = response(m.model) +lambda_estim(m::StatsModels.TableRegressionModel{PhyloNetworkLinearModel,T} where T) = lambda_estim(m.model) ### Print the results # Variance @@ -1844,11 +1840,11 @@ function Base.show(io::IO, obj::PhyloNetworkLinearModel) end # For DataFrameModel. see also Base.show in # https://github.com/JuliaStats/StatsModels.jl/blob/master/src/statsmodel.jl -function Base.show(io::IO, model::StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,T} where T) +function Base.show(io::IO, model::StatsModels.TableRegressionModel{PhyloNetworkLinearModel,T} where T) ct = coeftable(model) println(io, "$(typeof(model))") - println(io) - println(io, Formula(model.mf.terms)) + print(io, "\nFormula: ") + println(io, string(model.mf.f)) # formula println(io) println(io, "Model: $(model.model.model)") println(io) @@ -1868,7 +1864,7 @@ end ############################################################################### ############################################################################### -function GLM.ftest(objs::StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,T}...) where T +function GLM.ftest(objs::StatsModels.TableRegressionModel{PhyloNetworkLinearModel,T}...) where T objsModels = [obj.model for obj in objs] return ftest(objsModels...) end @@ -1895,7 +1891,7 @@ data, for models that have more and more effects. Returns a DataFrame object with the anova table. """ -function anova(objs::StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,T}...) where T +function anova(objs::StatsModels.TableRegressionModel{PhyloNetworkLinearModel,T}...) where T objsModels = [obj.model for obj in objs] return(anova(objsModels...)) end @@ -2129,7 +2125,7 @@ end # object fitted by function phyloNetworklm, and some predictors expressed at all the nodes of the network. # # - obj: a PhyloNetworkLinearModel object, or a -# DataFrameRegressionModel{PhyloNetworkLinearModel}, if data frames were used. +# TableRegressionModel{PhyloNetworkLinearModel}, if data frames were used. # - X_n a matrix with as many columns as the number of predictors used, and as # many lines as the number of unknown nodes or tips. # @@ -2369,7 +2365,7 @@ julia> ancStates = ancestralStateReconstruction(fitBM); └ @ PhyloNetworks ~/build/crsl4/PhyloNetworks.jl/src/traits.jl:2163 julia> expectations(ancStates) -31×2 DataFrames.DataFrame +31×2 DataFrame │ Row │ nodeNumber │ condExpectation │ │ │ Int64 │ Float64 │ ├─────┼────────────┼─────────────────┤ @@ -2414,7 +2410,7 @@ julia> predint(ancStates) 1.0695 1.0695 julia> expectationsPlot(ancStates) # format node <-> ancestral state -31×2 DataFrames.DataFrame +31×2 DataFrame │ Row │ nodeNumber │ PredInt │ │ │ Int64 │ Abstract… │ ├─────┼────────────┼───────────┤ @@ -2438,7 +2434,7 @@ julia> expectationsPlot(ancStates) # format node <-> ancestral state julia> plot(phy, :RCall, nodeLabel = expectationsPlot(ancStates)); julia> predintPlot(ancStates) # prediction intervals, in data frame, useful to plot -31×2 DataFrames.DataFrame +31×2 DataFrame │ Row │ nodeNumber │ PredInt │ │ │ Int64 │ Abstract… │ ├─────┼────────────┼───────────────┤ @@ -2474,11 +2470,11 @@ function ancestralStateReconstruction(obj::PhyloNetworkLinearModel) X_n = ones((length(obj.V.internalNodeNumbers) + sum(.!obj.nonmissing), 1)) ancestralStateReconstruction(obj, X_n) end -# For a DataFrameRegressionModel -function ancestralStateReconstruction(obj::StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,T} where T) +# For a TableRegressionModel +function ancestralStateReconstruction(obj::StatsModels.TableRegressionModel{PhyloNetworkLinearModel,T} where T) ancestralStateReconstruction(obj.model) end -function ancestralStateReconstruction(obj::StatsModels.DataFrameRegressionModel{PhyloNetworkLinearModel,T} where T, X_n::Matrix) +function ancestralStateReconstruction(obj::StatsModels.TableRegressionModel{PhyloNetworkLinearModel,T} where T, X_n::Matrix) ancestralStateReconstruction(obj.model, X_n) end diff --git a/src/traitsLikDiscrete.jl b/src/traitsLikDiscrete.jl index 27117cf62..c8b24aa21 100644 --- a/src/traitsLikDiscrete.jl +++ b/src/traitsLikDiscrete.jl @@ -3,7 +3,7 @@ Subtype of `StatsBase.StatisticalModel`, to fit discrete data to a model of trait substitution along a network. -See [`fitDiscrete`](@ref) to fit a trait substitution model to discrete data. +See [`fitdiscrete`](@ref) to fit a trait substitution model to discrete data. It returns an object of type `StatisticalSubstitutionModel`, to which standard functions can be applied, like `loglikelihood(object)`, `aic(object)` etc. """ @@ -44,9 +44,10 @@ StatsBase.dof(obj::SSM) = nparams(obj.model) + nparams(obj.ratemodel) function Base.show(io::IO, obj::SSM) disp = "$(typeof(obj)):\n" disp *= string(obj.model) - disp *= "$(obj.nsites) traits, $(length(obj.trait)) species, " + disp *= "$(obj.nsites) traits, $(length(obj.trait)) species\n" if obj.ratemodel.ncat != 1 - disp *= "with gamma variable rate model according to $(obj.ratemodel) with alpha = $(obj.ratemodel.alpha) and $(obj.ratemodel.ncat) categories " + disp *= "variable rates across sites ~ discretized gamma with\n alpha=$(obj.ratemodel.alpha)" + disp *= "\n $(obj.ratemodel.ncat) categories\n rate multipliers: $(obj.ratemodel.ratemultiplier)\n" end disp *= "on a network with $(obj.net.numHybrids) reticulations" if !ismissing(obj.loglik) @@ -65,14 +66,14 @@ end # weights """ - fitDiscrete(net, model, tipdata) - fitDiscrete(net, model, RateVariationAcrossSites, tipdata) - fitDiscrete(net, model, species, traits) - fitDiscrete(net, model, RateVariationAcrossSites, species, traits) - fitDiscrete(net, model, dnadata, dnapatternweights) - fitDiscrete(net, model, RateVariationAcrossSites, dnadata, dnapatternweights) - fitDiscrete(net, modSymbol, species, traits) - fitDiscrete(net, modSymbol, dnadata, dnapatternweights) + fitdiscrete(net, model, tipdata) + fitdiscrete(net, model, RateVariationAcrossSites, tipdata) + fitdiscrete(net, model, species, traits) + fitdiscrete(net, model, RateVariationAcrossSites, species, traits) + fitdiscrete(net, model, dnadata, dnapatternweights) + fitdiscrete(net, model, RateVariationAcrossSites, dnadata, dnapatternweights) + fitdiscrete(net, modSymbol, species, traits) + fitdiscrete(net, modSymbol, dnadata, dnapatternweights) Calculate the maximum likelihood (ML) score of a network given one or more discrete characters at the tips. Along each edge, transitions @@ -109,6 +110,8 @@ Optional arguments (default): - tolerance values to control when the optimization is stopped: `ftolRel` (1e-12), `ftolAbs` (1e-10) on the likelihood, and `xtolRel` (1e-10), `xtolAbs` (1e-10) on the model parameters. +- bounds for the alpha parameter of the Gamma distribution of rates across sites: + `alphamin=0.05`, `alphamax=500`. - `verbose` (false): if true, more information is output. # examples: @@ -122,22 +125,24 @@ julia> using DataFrames julia> dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]); -julia> fit1 = fitDiscrete(net, m1, dat; optimizeQ=true, optimizeRVAS=false) +julia> fit1 = fitdiscrete(net, m1, dat) PhyloNetworks.StatisticalSubstitutionModel: Binary Trait Substitution Model: rate lo→hi α=0.27222 rate hi→lo β=0.34981 -1 traits, 4 species, on a network with 1 reticulations +1 traits, 4 species +on a network with 1 reticulations log-likelihood: -2.7277 julia> tips = Dict("A" => "lo", "B" => "lo", "C" => "hi", "D" => "hi"); -julia> fit2 = fitDiscrete(net, m1, tips; optimizeRVAS=false, xtolRel=1e-16, xtolAbs=1e-16, ftolRel=1e-16) +julia> fit2 = fitdiscrete(net, m1, tips; xtolRel=1e-16, xtolAbs=1e-16, ftolRel=1e-16) PhyloNetworks.StatisticalSubstitutionModel: Binary Trait Substitution Model: rate lo→hi α=0.27222 rate hi→lo β=0.34981 -1 traits, 4 species, on a network with 1 reticulations +1 traits, 4 species +on a network with 1 reticulations log-likelihood: -2.7277 ``` @@ -154,7 +159,7 @@ julia> tips = Dict("sp1" => BioSymbols.DNA_A, "sp2" => BioSymbols.DNA_A, "sp3" = julia> mJC69 = JC69([0.25], false); -julia> fitJC69 = fitDiscrete(net, mJC69, tips; optimizeQ=true, optimizeRVAS=false) +julia> fitJC69 = fitdiscrete(net, mJC69, tips) PhyloNetworks.StatisticalSubstitutionModel: Jukes and Cantor 69 Substitution Model, absolute rate version @@ -165,16 +170,17 @@ rate matrix Q: C 0.0974 * 0.0974 0.0974 G 0.0974 0.0974 * 0.0974 T 0.0974 0.0974 0.0974 * -1 traits, 4 species, on a network with 0 reticulations +1 traits, 4 species +on a network with 0 reticulations log-likelihood: -4.99274 -julia> rv = RateVariationAcrossSites(1.0, 4) +julia> rv = RateVariationAcrossSites() Rate Variation Across Sites using Discretized Gamma Model alpha: 1.0 categories for Gamma discretization: 4 -ratemultiplier: [0.133531, 0.470004, 0.980829, 2.07944] +ratemultiplier: [0.145784, 0.513132, 1.07083, 2.27025] -julia> fitDiscrete(net, mJC69, rv, tips; optimizeQ=false, optimizeRVAS=false) +julia> fitdiscrete(net, mJC69, rv, tips; optimizeQ=false, optimizeRVAS=false) PhyloNetworks.StatisticalSubstitutionModel: Jukes and Cantor 69 Substitution Model, absolute rate version @@ -185,22 +191,23 @@ rate matrix Q: C 0.0833 * 0.0833 0.0833 G 0.0833 0.0833 * 0.0833 T 0.0833 0.0833 0.0833 * -1 traits, 4 species, with gamma variable rate model according to Rate Variation Across Sites using Discretized Gamma Model -alpha: 1.0 -categories for Gamma discretization: 4 -ratemultiplier: [0.133531, 0.470004, 0.980829, 2.07944] - with alpha = 1.0 and 4 categories on a network with 0 reticulations -log-likelihood: -5.26879 +1 traits, 4 species +variable rates across sites ~ discretized gamma with + alpha=1.0 + 4 categories + rate multipliers: [0.145784, 0.513132, 1.07083, 2.27025] +on a network with 0 reticulations +log-likelihood: -5.2568 ``` """ -function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, #tips::Dict no ratemodel version +function fitdiscrete(net::HybridNetwork, model::SubstitutionModel, #tips::Dict no ratemodel version tips::Dict; kwargs...) ratemodel = RateVariationAcrossSites(1.0, 1) - fitDiscrete(net, model, ratemodel, tips; kwargs...) + fitdiscrete(net, model, ratemodel, tips; kwargs...) end #tips::Dict version with ratemodel -function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, +function fitdiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, tips::Dict; kwargs...) species = String[] dat = Vector{Int}[] # indices of trait labels @@ -216,13 +223,13 @@ function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::Ra end #dat::DataFrame, no rate model version -function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, dat::DataFrame; kwargs...) +function fitdiscrete(net::HybridNetwork, model::SubstitutionModel, dat::DataFrame; kwargs...) ratemodel = RateVariationAcrossSites(1.0, 1) - fitDiscrete(net, model, ratemodel, dat; kwargs...) + fitdiscrete(net, model, ratemodel, dat; kwargs...) end #dat::DataFrame with rate model version -function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, +function fitdiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, dat::DataFrame; kwargs...) i = findfirst(isequal(:taxon), DataFrames.names(dat)) @@ -241,13 +248,13 @@ function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::Ra end #species, dat version, no ratemodel -function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, species::Array{String}, dat::DataFrame; kwargs...) +function fitdiscrete(net::HybridNetwork, model::SubstitutionModel, species::Array{String}, dat::DataFrame; kwargs...) ratemodel = RateVariationAcrossSites(1.0, 1) - fitDiscrete(net, model, ratemodel, species, dat; kwargs...) + fitdiscrete(net, model, ratemodel, species, dat; kwargs...) end #species, dat version with ratemodel -function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, +function fitdiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, species::Array{String}, dat::DataFrame; kwargs...) dat2 = traitlabels2indices(dat, model) # vec of vec, indices o, net = check_matchtaxonnames!(copy(species), dat2, net) @@ -255,19 +262,24 @@ function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::Ra end #wrapper: species, dat version with model symbol -function fitDiscrete(net::HybridNetwork, modSymbol::Symbol, species::Array{String}, dat::DataFrame, rvSymbol=:noRV::Symbol; kwargs...) +function fitdiscrete(net::HybridNetwork, modSymbol::Symbol, + species::Array{String}, dat::DataFrame, rvSymbol=:noRV::Symbol; kwargs...) rate = startingrate(net) labels = learnLabels(modSymbol, species, dat) if modSymbol == :JC69 - model = JC69([rate], true) + model = JC69([1.0], true) # 1.0 instead of rate because relative version elseif modSymbol == :HKY85 - model = HKY85([rate, rate], [0.25, 0.25, 0.25, 0.25], true) + model = HKY85([1.0], # transition/transversion rate ratio + empiricalDNAfrequencies(dat, repeat([1.], inner=ncol(dat))), + true) elseif modSymbol == :ERSM model = EqualRatesSubstitutionModel(length(labels), rate, labels); elseif modSymbol == :BTSM model = BinaryTraitSubstitutionModel([rate, rate], labels) elseif modSymbol == :TBTSM model = TwoBinaryTraitSubstitutionModel([rate, rate, rate, rate, rate, rate, rate, rate], labels) + else + error("model $modSymbol is unknown or not implemented yet") end if rvSymbol == :RV @@ -275,22 +287,22 @@ function fitDiscrete(net::HybridNetwork, modSymbol::Symbol, species::Array{Strin else rvas = RateVariationAcrossSites(1.0, 1) end - - fitDiscrete(net, model, rvas, species, dat; kwargs...) + fitdiscrete(net, model, rvas, species, dat; kwargs...) end #dnadata with dnapatternweights version, no ratemodel -function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, dnadata::DataFrame, +function fitdiscrete(net::HybridNetwork, model::SubstitutionModel, dnadata::DataFrame, dnapatternweights::Array{Float64}; kwargs...) ratemodel = RateVariationAcrossSites(1.0, 1) - fitDiscrete(net, model, ratemodel, dnadata, dnapatternweights; kwargs...) + fitdiscrete(net, model, ratemodel, dnadata, dnapatternweights; kwargs...) end #dnadata with dnapatternweights version with ratemodel -function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, +function fitdiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, dnadata::DataFrame, dnapatternweights::Array{Float64}; kwargs...) dat2 = traitlabels2indices(dnadata[2:end], model) - # dat2 = traitlabels2indices(view(dnadata, 2:ncol(dnadata)), model) #removed view bc it wasnt recognized as DF by traitlabels2indices + # dat2 = traitlabels2indices(view(dnadata, 2:ncol(dnadata)), model) #removed view bc it wasnt recognized + #as DF by traitlabels2indices # this doesnt work, but was attempting to uses view to avoid a shallow copy and # be more space efficient #produces a vec of vec, indices @@ -301,18 +313,23 @@ function fitDiscrete(net::HybridNetwork, model::SubstitutionModel, ratemodel::Ra end #wrapper for dna data -function fitDiscrete(net::HybridNetwork, modSymbol::Symbol, dnadata::DataFrame, dnapatternweights::Array{Float64}, rvSymbol=:noRV::Symbol; kwargs...) +function fitdiscrete(net::HybridNetwork, modSymbol::Symbol, dnadata::DataFrame, + dnapatternweights::Array{Float64}, rvSymbol=:noRV::Symbol; kwargs...) rate = startingrate(net) if modSymbol == :JC69 - model = JC69([rate], true) + model = JC69([1.0], true) # 1.0 instead of rate because relative version (relative = true) elseif modSymbol == :HKY85 - model = HKY85([rate, rate], [0.25, 0.25, 0.25, 0.25], true) + model = HKY85([1.0], # transition/transversion rate ratio + empiricalDNAfrequencies(view(dnadata, 2:ncol(dnadata)), dnapatternweights), + true) elseif modSymbol == :ERSM model = EqualRatesSubstitutionModel(4, rate, [BioSymbols.DNA_A, BioSymbols.DNA_C, BioSymbols.DNA_G, BioSymbols.DNA_T]); elseif modSymbol == :BTSM error("Binary Trait Substitution Model supports only two trait states, but dna data has four states.") elseif modSymbol == :TBTSM error("Two Binary Trait Substitution Model does not support dna data: it supports two sets of potentially correlated two trait states.") + else + error("model $modSymbol is unknown or not implemented yet") end if rvSymbol == :RV @@ -321,22 +338,23 @@ function fitDiscrete(net::HybridNetwork, modSymbol::Symbol, dnadata::DataFrame, rvas = RateVariationAcrossSites(1.0, 1) end - fitDiscrete(net, model, rvas, dnadata, dnapatternweights; kwargs...) + fitdiscrete(net, model, rvas, dnadata, dnapatternweights; kwargs...) end """ fit(StatisticalSubstitutionModel, net, model, traits; kwargs...) fit!(StatisticalSubstitutionModel; kwargs...) -Internal function called by [`fitDiscrete`](@ref): with same key word arguments `kwargs`. -But dangerous: `traits` should be a vector of vectors as for [`fitDiscrete`](@ref) +Internal function called by [`fitdiscrete`](@ref): with same key word arguments `kwargs`. +But dangerous: `traits` should be a vector of vectors as for [`fitdiscrete`](@ref) **but** here `traits` need to contain the *indices* of trait values corresponding to the indices in `getlabels(model)`, and species should appear in `traits` in the order corresponding to the node numbers in `net`. See [`traitlabels2indices`](@ref) to convert trait labels to trait indices. -**Warning**: does *not* perform checks. [`fitDiscrete`](@ref) calls this function +**Warning**: does *not* perform checks. [`fitdiscrete`](@ref) calls this function after doing checks, preordering nodes in the network, making sure nodes have consecutive numbers, species are matched between data and network etc. +Like, snaq!(), we estimate the network starting from tree (or network) `astraltree`. """ function StatsBase.fit(::Type{SSM}, net::HybridNetwork, model::SubstitutionModel, ratemodel::RateVariationAcrossSites, trait::AbstractVector; kwargs...) @@ -375,7 +393,8 @@ end function fit!(obj::SSM; optimizeQ=true::Bool, optimizeRVAS=true::Bool,verbose=false::Bool, NLoptMethod=:LD_MMA::Symbol, ftolRel=fRelBL::Float64, ftolAbs=fAbsBL::Float64, - xtolRel=xRelBL::Float64, xtolAbs=xAbsBL::Float64) + xtolRel=xRelBL::Float64, xtolAbs=xAbsBL::Float64, + alphamin=0.05, alphamax=500) all(x -> x >= 0.0, [e.length for e in obj.net.edge]) || error("branch lengths should be >= 0") all(x -> x >= 0.0, [e.gamma for e in obj.net.edge]) || error("gammas should be >= 0") if optimizeQ && nparams(obj.model) <1 @@ -386,25 +405,23 @@ function fit!(obj::SSM; optimizeQ=true::Bool, optimizeRVAS=true::Bool,verbose=fa @debug "Rate model has one rate category, so there are no parameters to optimize. optimizeRVAS will be set to false." optimizeRVAS = false end - if !optimizeQ & !optimizeRVAS - #? rates need to be set? + if !optimizeQ && !optimizeRVAS discrete_corelikelihood!(obj) verbose && println("loglik = $(loglikelihood(obj)) under fixed parameters, no optimization") - return obj # return fmax,xmax,ret + return obj end counter = [0] if optimizeQ function loglikfun(x::Vector{Float64}, grad::Vector{Float64}) # modifies obj counter[1] += 1 - setrates!(obj.model, x) #replaced assignment here for efficiency will be setalpha! + setrates!(obj.model, x) res = discrete_corelikelihood!(obj) verbose && println("loglik: $res, model rates: $x") length(grad) == 0 || error("gradient not implemented") return res end - # build optimizer for Q - # optimize Q under fixedRVAS - # set-up optimization object + # optimize Q under fixed RVAS parameters + # set-up optimization object for Q NLoptMethod=:LN_COBYLA # no gradient # :LN_COBYLA for (non)linear constraits, :LN_BOBYQA for bound constraints nparQ = nparams(obj.model) @@ -425,15 +442,13 @@ function fit!(obj::SSM; optimizeQ=true::Bool, optimizeRVAS=true::Bool,verbose=fa if optimizeRVAS function loglikfunRVAS(alpha::Vector{Float64}, grad::Vector{Float64}) # modifies obj counter[1] += 1 - setalpha!(obj.ratemodel, alpha[1]) #replaced assignment here for efficiency will be setalpha! + setalpha!(obj.ratemodel, alpha[1]) # obj.ratemodel.alpha is a scalar, alpha is a vector within here res = discrete_corelikelihood!(obj) verbose && println("loglik: $res, rate variation model shape parameter alpha: $(alpha[1])") length(grad) == 0 || error("gradient not implemented") return res end - # build optimizer for RAS - # optimize RAS - # set-up optimization object + # set-up optimization object for RVAS parameter NLoptMethod=:LN_COBYLA # no gradient # :LN_COBYLA for (non)linear constraits, :LN_BOBYQA for bound constraints nparRVAS = nparams(obj.ratemodel) @@ -444,24 +459,21 @@ function fit!(obj::SSM; optimizeQ=true::Bool, optimizeRVAS=true::Bool,verbose=fa NLopt.xtol_abs!(optRVAS,xtolAbs) NLopt.maxeval!(optRVAS,1000) # max number of iterations # NLopt.maxtime!(optRVAS, t::Real) - NLopt.lower_bounds!(optRVAS, zeros(Float64, nparRVAS)) - # fixit: set upper bound depending on branch lengths in network? + NLopt.lower_bounds!(optRVAS, fill(alphamin, (nparRVAS,)) ) # for 0 as lower bound: zeros(Float64, nparRVAS) + NLopt.upper_bounds!(optRVAS, fill(alphamax, (nparRVAS,)) ) # delete to remove upper bound counter[1] = 0 NLopt.max_objective!(optRVAS, loglikfunRVAS) fmax, xmax, ret = NLopt.optimize(optRVAS, [obj.ratemodel.alpha]) # optimization here! verbose && println("RVAS: got $(round(fmax, digits=5)) at $(round.(xmax, digits=5)) after $(counter[1]) iterations (return code $(ret))") end if optimizeQ && optimizeRVAS - # optimize Q under RVAS - # fixit: set upper bound depending on branch lengths in network? + # optimize Q under fixed RVAS parameters: a second time counter[1] = 0 - #NLopt.max_objective!(optQ, loglikfun) - fmax, xmax, ret = NLopt.optimize(optQ, obj.model.rate) # optimization here! + fmax, xmax, ret = NLopt.optimize(optQ, obj.model.rate) verbose && println("got $(round(fmax, digits=5)) at $(round.(xmax, digits=5)) after $(counter[1]) iterations (return code $(ret))") - # optimize RVAS + # optimize RVAS under fixed Q: a second time counter[1] = 0 - #NLopt.max_objective!(optRVAS, loglikfunRVAS) - fmax, xmax, ret = NLopt.optimize(optRVAS, [obj.ratemodel.alpha]) # optimization here! + fmax, xmax, ret = NLopt.optimize(optRVAS, [obj.ratemodel.alpha]) verbose && println("RVAS: got $(round(fmax, digits=5)) at $(round.(xmax, digits=5)) after $(counter[1]) iterations (return code $(ret))") end return obj @@ -471,6 +483,7 @@ end """ discrete_corelikelihood!(obj::StatisticalSubstitutionModel; whichtrait=:all) discrete_corelikelihood_tree!(obj, t::Integer, traitrange::AbstractArray) + Calculate the likelihood and update `obj.loglik` for discrete characters on a network (or on a single tree: `t`th tree displayed in the network, for the second form). Update forward and direct partial likelihoods while doing so. @@ -538,7 +551,11 @@ function discrete_corelikelihood_tree!(obj::SSM, t::Integer, traitrange::Abstrac end end if ni==1 # root is first index in nodes changed - logprior = [-log(k) for i in 1:k] # uniform prior at root + if typeof(obj.model) == NASM + logprior = log.(stationary(obj.model)) + else #other trait models + logprior = [-log(k) for i in 1:k] # uniform prior at root + end loglik = logsumexp(logprior + view(forwardlik, :,nnum)) # log P{data for ci | tree t} if iratemultiplier == 1 currentloglik = loglik @@ -593,17 +610,26 @@ function traitlabels2indices(data::AbstractVector, model::SubstitutionModel) end return A end + function traitlabels2indices(data::Union{AbstractMatrix,DataFrame}, model::SubstitutionModel) A = Vector{Vector{Union{Missings.Missing,Int}}}(undef, 0) # indices of trait labels labs = getlabels(model) isDNA = typeof(labs) == Array{DNA,1} ntraits = size(data,2) #starting with spcies column - for i in 1:size(data,1) # go row by row + for i in 1:size(data,1) # go row by row (which is species by species) V = Vector{Union{Missings.Missing,Int}}(undef, ntraits) for j in 1:ntraits vi = missing # value index @inbounds l = data[i,j] # value label + if !isDNA && ismissing(l) + vi = missing + elseif isDNA #else if DNA + if typeof(l) == String #takes string and converts to a Char so that we can convert to DNA + l = Vector{Char}(l)[1] + end + l = convert(DNA, l) + end if !ismissing(l) vi = findfirst(isequal(l), labs) if vi == nothing @@ -619,7 +645,7 @@ function traitlabels2indices(data::Union{AbstractMatrix,DataFrame}, V[j] = vi end push!(A, V) - end + end return A end @@ -631,7 +657,7 @@ Return a new network (`net` is *not* modified) with tips matching those in speci if some species in `net` have no data, these species are pruned from the network. The network also has its node names reset, such that leaves have nodes have consecutive numbers starting at 1, with leaves first. -Used by [`fitDiscrete`](@ref) to build a new [`StatisticalSubstitutionModel`](@ref). +Used by [`fitdiscrete`](@ref) to build a new [`StatisticalSubstitutionModel`](@ref). """ function check_matchtaxonnames!(species::AbstractVector, dat::AbstractVector, net::HybridNetwork) # 1. basic checks for dimensions and types @@ -682,7 +708,7 @@ end Estimate the marginal probability of ancestral states for discrete character number `trait`, or for the active trait if `trait` is unspecified: `obj.activesite`. The parameters of the [`StatisticalSubstitutionModel`](@ref) object `obj` -must first be fitted using [`fitDiscrete`](@ref), and ancestral state reconstruction +must first be fitted using [`fitdiscrete`](@ref), and ancestral state reconstruction is conditional on the estimated parameters. If these parameters were estimated using all traits, they are used as is to do ancestral state reconstruction of the particular `trait` of interest. @@ -712,10 +738,10 @@ julia> using DataFrames julia> dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]); -julia> fit1 = fitDiscrete(net, m1, dat); +julia> fit1 = fitdiscrete(net, m1, dat); julia> asr = ancestralStateReconstruction(fit1) -9×4 DataFrames.DataFrame +9×4 DataFrame │ Row │ nodenumber │ nodelabel │ lo │ hi │ │ │ Int64 │ String │ Float64 │ Float64 │ ├─────┼────────────┼───────────┼──────────┼──────────┤ @@ -731,7 +757,7 @@ julia> asr = ancestralStateReconstruction(fit1) julia> round.(exp.(fit1.postltw), digits=6) # marginal (posterior) probability that the trait evolved on each displayed tree 2-element Array{Float64,1}: - 0.919831 + 0.919831 0.080169 julia> using PhyloPlots @@ -787,7 +813,11 @@ function discrete_backwardlikelihood_tree!(obj::SSM, t::Integer, trait::Integer) bkdlik = view(obj.backwardlik,:,:,t) k = nstates(obj.model) bkwtmp = Vector{Float64}(undef, k) # to hold bkw lik without parent edge transition - logprior = [-log(k) for i in 1:k] + if typeof(obj.model) == NASM + logprior = log.(stationary(obj.model)) + else #trait models + logprior = [-log(k) for i in 1:k] # uniform prior at root + end for ni in 1:length(tree.nodes_changed) # pre-order traversal to calculate backwardlik n = tree.nodes_changed[ni] nnum = n.number @@ -816,18 +846,18 @@ end """ learnLabels(model::SubstitutionModel, species::Array{String}, dat::DataFrame) -Learn label names and length from data for fitDiscrete wrapper function with species vector and trait dataframe data +Learn label names and length from data for fitdiscrete wrapper function with species vector and trait dataframe data """ function learnLabels(modSymbol::Symbol, species::Array{String}, dat::DataFrame) labels = unique(reshape(convert(Matrix, dat), 1, size(dat)[1]*size(dat)[2])) if modSymbol == :BTSM - length(labels) == 2 || error("Binary Trait Substitution Model supports traits with two states. These data have more than two states.") + length(labels) == 2 || error("Binary Trait Substitution Model supports traits with two states. These data have do not have two states.") elseif modSymbol == :TBTSM - unique(dat[1]) == 2 && unique(dat[2] == 2) || error("Two Binary Trait Substitution Model supports two traits with two states each. These data have more than two states.") + unique(dat[1]) == 2 && unique(dat[2] == 2) || error("Two Binary Trait Substitution Model supports two traits with two states each.") elseif modSymbol == :HKY85 - uppercase(join(sort(labels))) == "ACGT" || error("HKY85 requires that trait data are dna bases A, C, G, and T") + occursin(uppercase(join(sort(labels))), "ACGT") || error("HKY85 requires that trait data are dna bases A, C, G, and T") elseif modSymbol == :JC69 - uppercase(join(sort(labels))) == "ACGT" || error("JC69 requires that trait data are dna bases A, C, G, and T") + occursin(uppercase(join(sort(labels))), "ACGT") || error("JC69 requires that trait data are dna bases A, C, G, and T") end return labels end @@ -836,8 +866,9 @@ end startingrate(net) Estimate an evolutionary rate appropriate for the branch lengths in the network, -which should be a good starting value before optimization in `fitDiscrete`, +which should be a good starting value before optimization in `fitdiscrete`, assuming approximately 1 change across the entire tree. +If all edge lengths are missing, set starting rate to 1/(number of taxa). """ function startingrate(net::HybridNetwork) totaledgelength = 0.0 @@ -847,7 +878,7 @@ function startingrate(net::HybridNetwork) end end if totaledgelength == 0.0 # as when all edge lengths are missing - totaledgelength = 0.25 + totaledgelength = net.numTaxa end return 1.0/totaledgelength end diff --git a/test/test_traitLikDiscrete.jl b/test/test_traitLikDiscrete.jl index afdb2ba83..d4229920d 100644 --- a/test/test_traitLikDiscrete.jl +++ b/test/test_traitLikDiscrete.jl @@ -103,10 +103,10 @@ end # test on a tree #= likelihood calculated in R using a fixed Q matrix, first with ace() then -with fitDiscrete(), then with fitMK(). problem: they give different results, +with fitdiscrete(), then with fitMK(). problem: they give different results, see http://blog.phytools.org/2015/09/the-difference-between-different.html - ace: misses log(#states) in its log-likelihood -- fitDiscrete in geiger: uses empirical prior at root, not stationary dist, +- fitdiscrete in geiger: uses empirical prior at root, not stationary dist, but "lik" object is very flexible - fitMk is correct. also great for 2 correlated binary traits library(ape) @@ -119,7 +119,7 @@ print(fitER$loglik - log(2), digits=17) # -2.6638002683925799 print(fitER$rates, digits=17) # rates = 0.3743971742794559 print(fitER$lik.anc, digits=17)# posterior probs of states at nodes: 3x2 matrix (3 internal nodes, 2 states) library(geiger) -fitER = fitDiscrete(mytree, states, model="ER") +fitER = fitdiscrete(mytree, states, model="ER") print(fitER$opt$q12, digits=17) # rates = 0.36836216513047726 print(fitER$opt$lnL, digits=17) # log-likelihood = -2.6626566310743804 lik = fitER$lik @@ -128,7 +128,7 @@ library(phytools) Q2 = matrix(c(-1,1,1,-1),2,2)*fitER$opt$q12 fit2 = fitMk(mytree, states, model="ER", fixedQ=Q2) print(fit2$logLik, digits=17) # log-likelihood = -2.6638637960257574 -fitER = fitDiscrete(mytree, states, model="ARD") +fitER = fitdiscrete(mytree, states, model="ARD") lik = fitER$lik Q = c(0.29885191850718751, 0.38944304456937912) # q12, q21 lik(Q, root="given", root.p=Q[2:1]/sum(Q)) # -2.6457428692377234 @@ -138,32 +138,32 @@ lik(Q, root="flat") # -2.6754091090953693 .1,.7: -3.3291679800706073 optim(Q, lik, lower=1e-8, control=list(fnscale=-1), root="flat") # rates = 0.29993140042699212 0.38882902905265493 loglik=-2.6447247349802496 states=c(1,2,1); names(states)=c("A","B","D") -fitER = fitDiscrete(mytree, states, model="ARD"); lik = fitER$lik +fitER = fitdiscrete(mytree, states, model="ARD"); lik = fitER$lik lik(Q, root="flat") # -2.1207856874033491 =# net = readTopology("(A:3.0,(B:2.0,(C:1.0,D:1.0):1.0):1.0);"); tips = Dict("A" => "lo", "B" => "lo", "C" => "hi", "D" => "hi"); #? this is supposed to be an AbstractVector, is a Dict{String,String} m1 = EqualRatesSubstitutionModel(2,[0.36836216513047726], ["lo", "hi"]); -fit1 = (@test_logs fitDiscrete(net, m1, tips; optimizeQ=false, optimizeRVAS=false)); +fit1 = (@test_logs fitdiscrete(net, m1, tips; optimizeQ=false, optimizeRVAS=false)); @test_logs show(devnull, fit1) @test StatsBase.loglikelihood(fit1) ≈ -2.6638637960257574 atol=2e-4 species = ["G","C","A","B","D"] dat1 = DataFrame(trait = ["hi","hi","lo","lo","hi"], species = species) m2 = BinaryTraitSubstitutionModel(0.2, 0.3, ["lo", "hi"]) -fit2 = (@test_logs fitDiscrete(net, m2, dat1; optimizeQ=false, optimizeRVAS=false)) +fit2 = (@test_logs fitdiscrete(net, m2, dat1; optimizeQ=false, optimizeRVAS=false)) @test fit2.trait == [[1],[1],[2],[2]] @test StatsBase.loglikelihood(fit2) ≈ -2.6754091090953693 atol=2e-4 originalstdout = stdout redirect_stdout(open("/dev/null", "w")) #OPTIMIZES RATES -fit2 = @test_logs fitDiscrete(net, m2, dat1; optimizeQ=true, optimizeRVAS=false, verbose=true) # 65 iterations +fit2 = @test_logs fitdiscrete(net, m2, dat1; optimizeQ=true, optimizeRVAS=false, verbose=true) # 65 iterations redirect_stdout(originalstdout) @test fit2.model.rate ≈ [0.29993140042699212, 0.38882902905265493] atol=2e-4 @test StatsBase.loglikelihood(fit2) ≈ -2.6447247349802496 atol=2e-4 m2.rate[:] = [0.2, 0.3]; dat2 = DataFrame(trait1= ["hi","hi","lo","lo","hi"], trait2=["hi",missing,"lo","hi","lo"]); -fit3 = (@test_logs fitDiscrete(net, m2, species, dat2; optimizeQ=false, optimizeRVAS=false)) +fit3 = (@test_logs fitdiscrete(net, m2, species, dat2; optimizeQ=false, optimizeRVAS=false)) @test fit3.loglik ≈ (-2.6754091090953693 - 2.1207856874033491) PhyloNetworks.fit!(fit3; optimizeQ=true, optimizeRVAS=false) @@ -192,7 +192,7 @@ function traitprobabilities(model, net, ntraits=10) npatterns = i lik = Float64[] for i in 1:npatterns - fit = fitDiscrete(net, model, dat[[:species, Symbol("x",i)]]; optimizeQ=false, optimizeRVAS=false) + fit = fitdiscrete(net, model, dat[[:species, Symbol("x",i)]]; optimizeQ=false, optimizeRVAS=false) push!(lik, fit.loglik) end return dat, prop, lik @@ -224,20 +224,20 @@ d = DataFrame(species=["D","C","B","A"], x1=[1,1,1,1], x2=[1,2,2,1], x3=[2,2,2,2 x11=[1,2,1,1], x12=[2,2,1,2], x13=[2,1,1,2], x14=[1,2,2,2], x15=[1,1,1,2], x16=[2,1,2,2]) lik = Float64[] for i in 1:16 - fit = fitDiscrete(net, m1, d[[:species, Symbol("x",i)]]; optimizeQ=false, optimizeRVAS=false) + fit = fitdiscrete(net, m1, d[[:species, Symbol("x",i)]]; optimizeQ=false, optimizeRVAS=false) push!(lik, fit.loglik) end @test lik ≈ [-1.6218387598967712, -3.008066347196894, -4.3943604143403245, -3.008199100743402, -3.70121329832901, -3.0081981601869483, -2.315051933868397, -2.314985711030534, -3.0081988850020873, -3.0081983709272504, -2.3150512090547584, -3.70134532205944, -3.008132923628349, -3.7012134632082083, -2.3149859724945876, -3.7013460518770915] -fit1 = fitDiscrete(net, m1, d[[:species, :x6]]; optimizeRVAS=false) +fit1 = fitdiscrete(net, m1, d[[:species, :x6]]; optimizeRVAS=false) # with parameter estimation net = readTopology("(((A:2.0,(B:1.0)#H1:0.1::0.9):1.5,(C:0.6,#H1:1.0::0.1):1.0):0.5,D:2.0);") m1 = BinaryTraitSubstitutionModel([1.0, 1.0], ["lo", "hi"]) dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]) -fit1 = fitDiscrete(net, m1, dat; optimizeQ=false, optimizeRVAS=false) +fit1 = fitdiscrete(net, m1, dat; optimizeQ=false, optimizeRVAS=false) @test fit1.loglik ≈ -2.77132013004859 PhyloNetworks.fit!(fit1; optimizeQ=true, optimizeRVAS=false) @test fit1.model.rate ≈ [0.2722263130324768, 0.34981109618902395] atol=1e-4 @@ -248,7 +248,7 @@ function simulateManyTraits_estimate(ntraits) res, lab = randomTrait(m1, net; ntraits=ntraits) tips = findall(in(tipLabels(net)), lab) # indices of tips: columns in res dat = DataFrame(transpose(res[:,tips])); species = lab[tips] - return fitDiscrete(net, m1, species, dat; optimizeRVAS = false) + return fitdiscrete(net, m1, species, dat; optimizeRVAS = false) end # simulateManyTraits_estimate(100000) # α=1.1124637623451075, β=0.5604529225895175, loglik=-25587.1 with ntraits=10000 @@ -271,8 +271,8 @@ asr = ancestralStateReconstruction(fit1) end # end of testset, fixed topology @testset "testing readfastatodna" begin -#fastafile = joinpath(@__DIR__, "../..", "dev/PhyloNetworks/examples", "test_8_withrepeatingsites.aln") fastafile = joinpath(@__DIR__, "..", "examples", "test_8_withrepeatingsites.aln") +#fastafile = abspath(joinpath(dirname(Base.find_package("PhyloNetworks")), "..", "examples", "test_8_withrepeatingsites.aln")) dat, weights = readfastatodna(fastafile, true); @test weights == [3.0, 1.0, 1.0, 2.0, 1.0] #check that no columns are repeated, only correct columns removed @@ -280,6 +280,7 @@ dat, weights = readfastatodna(fastafile, true); #test on data with no repeated site patterns fastafile = joinpath(@__DIR__, "..", "examples", "Ae_bicornis_8sites.aln") +#fastafile = abspath(joinpath(dirname(Base.find_package("PhyloNetworks")), "..", "examples", "Ae_bicornis_8sites.aln")) dat, weights = readfastatodna(fastafile, true); #check that weights are correct @test weights == ones(Float64, 8) @@ -287,7 +288,7 @@ dat, weights = readfastatodna(fastafile, true); @test ncol(dat) == 9 end #testing readfastatodna -@testset "testing NucleicAcidSubsitutionModels" begin +@testset "NucleicAcidSubsitutionModels" begin #test NASM models basics mJC69 = JC69(0.5, false); @@ -300,12 +301,15 @@ mJC69 = JC69(0.5, false); 0.121646 0.121646 0.635063 0.121646; 0.121646 0.121646 0.121646 0.635063] atol=1e-5 +@test_throws ErrorException HKY85([0.5, 0.5], [0.25, 0.25, 0.25, 0.25], true) +@test_throws ErrorException HKY85([.1,.1,.1], [0.25, 0.25, 0.25, 0.25], false) +@test_throws ErrorException HKY85([0.5], [0.25, 0.25, 0.25, 0.25], false) mHKY85 = HKY85([0.5, 0.5], [0.25, 0.25, 0.25, 0.25], false) @test Q(mHKY85) ≈ [ -0.375 0.125 0.125 0.125; 0.125 -0.375 0.125 0.125; 0.125 0.125 -0.375 0.125; 0.125 0.125 0.125 -0.375] atol=1e-5 -mHKY85rel = HKY85([0.5], [0.25, 0.25, 0.25, 0.25], true) +mHKY85rel = HKY85(0.5, [0.25, 0.25, 0.25, 0.25]) @test Q(mHKY85rel) ≈ [-1.0 0.4 0.2 0.4; 0.4 -1.0 0.4 0.2; 0.2 0.4 -1.0 0.4; @@ -329,60 +333,91 @@ mHKY85rel = HKY85([0.5], [0.25, 0.25, 0.25, 0.25], true) @test_logs show(devnull, mHKY85) end # end of testing NASMs -@testset "testing fitDiscrete for NucleicAcidSubsitutionModels & RateVariationAcrossSites" begin -# test fitDiscrete with NASM # +@testset "fitdiscrete for NucleicAcidSubsitutionModels & RateVariationAcrossSites" begin +# test fitdiscrete with NASM # #there are 3 alignments in PhyloNetworks/examples net = readTopology("(A:3.0,(B:2.0,(C:1.0,D:1.0):1.0):1.0);"); tips = Dict("A" => BioSymbols.DNA_A, "B" => BioSymbols.DNA_A, "C" => BioSymbols.DNA_G, "D" => BioSymbols.DNA_G); #without optimization (confirmed with ape ace() function and phangorn) mJC69 = JC69(0.2923350741254221, false) #ace() gives Q matrix cell, not rate. lambda = (1.0/3.0)*obj.rate[1] so rate = 3*0.097445024708474035 = 0.2923350741254221 -fitJC69 = fitDiscrete(net, mJC69, tips; optimizeQ=false, optimizeRVAS=false); +fitJC69 = fitdiscrete(net, mJC69, tips; optimizeQ=false); @test loglikelihood(fitJC69) ≈ -4.9927386890207304 atol=2e-6 #ace() from ape pkg and our method agree here. ##without optimize at 0.25 mJC69 = JC69([0.25], false) -fitJC69 = fitDiscrete(net, mJC69, tips; optimizeQ=false, optimizeRVAS=false); +fitJC69 = fitdiscrete(net, mJC69, tips; optimizeQ=false); @test loglikelihood(fitJC69) ≈ -4.99997 atol=2e-3 #from phangorn #with optimization (confirmed with ape ace() function and phangorn) mJC69 = JC69([0.25], false) -fitJC69 = fitDiscrete(net, mJC69, tips; optimizeQ=true, optimizeRVAS=false); +fitJC69 = fitdiscrete(net, mJC69, tips; optimizeQ=true) @test Q(fitJC69.model)[1,2] ≈ 0.097445024708474035 atol = 2e-3 #from ace() in ape pkg @test loglikelihood(fitJC69) ≈ -4.9927386890207304 atol=2e-6 #from ace() + log(4) #without optimization (confirmed with phangorn function) mHKY85 = HKY85([4.0/3, 4.0/3], [0.25, 0.25, 0.25, 0.25], false); #absolute -fitHKY85 = fitDiscrete(net, mHKY85, tips; optimizeQ=false, optimizeRVAS=false); +fitHKY85 = fitdiscrete(net, mHKY85, tips; optimizeQ=false); @test loglikelihood(fitHKY85) ≈ -5.365777014 atol = 2e-8 mHKY85 = HKY85([1.0], [0.25, 0.25, 0.25, 0.25], true); #relative -fitHKY85 = fitDiscrete(net, mHKY85, tips; optimizeQ=false, optimizeRVAS=false); +fitHKY85 = fitdiscrete(net, mHKY85, tips; optimizeQ=false); @test loglikelihood(fitHKY85) ≈ -5.365777014 atol = 2e-8 #with optimization (confirmed with ape ace() function) mHKY85 = HKY85([0.5, 0.1], [0.25, 0.25, 0.25, 0.25], false); #absolute -fitHKY85 = fitDiscrete(net, mHKY85, tips; optimizeQ=true, optimizeRVAS=false); -@test fitHKY85.model.rate[1] ≈ 1.4975887229148119 atol = 2e-3 +@time fitHKY85 = fitdiscrete(net, mHKY85, tips; optimizeQ=true) +@test fitHKY85.model.rate[1] ≈ 1.4975887229148119 atol = 2e-4 @test loglikelihood(fitHKY85) ≈ -3.3569474489525244 atol = 2e-8 # test RateVariationAcrossSites with NASM -rv = RateVariationAcrossSites(1.0, 4); +@test RateVariationAcrossSites(2.0, 4).ratemultiplier ≈ [0.3190650334827019,0.6833612017749638,1.108977045226681,1.888596719515653] +rv = RateVariationAcrossSites() +@test rv.ratemultiplier ≈ [0.14578435573294748,0.5131315935152865,1.0708310452240655,2.270253005527701] +PhyloNetworks.setalpha!(rv, 2.0) +@test rv.ratemultiplier ≈ [0.3190650334827019,0.6833612017749638,1.108977045226681,1.888596719515653] @test_logs show(devnull, rv) rv.ratemultiplier[:] = [0.1369538, 0.4767519, 1.0000000, 2.3862944] # NOTE: phangorn calculates gamma quantiles differently, so I assign them for testing mJC69 = JC69([1.0], true) -fitJC69rv = fitDiscrete(net, mJC69, rv, tips; optimizeQ=false, optimizeRVAS=false, ftolRel=1e-20); +fitJC69rv = fitdiscrete(net, mJC69, rv, tips; optimizeQ=false, optimizeRVAS=false, ftolRel=1e-20); @test loglikelihood(fitJC69rv) ≈ -5.26390008 atol = 2e-8 -fitJC69rvOpt = fitDiscrete(net, mJC69, rv, tips, optimizeQ=false, optimizeRVAS=true); +fitJC69rvOpt = fitdiscrete(net, mJC69, rv, tips, optimizeQ=false, optimizeRVAS=true); mHKY85 = HKY85([4.0/3, 4.0/3], [0.25, 0.25, 0.25, 0.25], false); -fitHKY85rv = fitDiscrete(net, mHKY85, rv, tips; optimizeQ=false, optimizeRVAS=false); +fitHKY85rv = fitdiscrete(net, mHKY85, rv, tips; optimizeQ=false, optimizeRVAS=false); @test loglikelihood(fitHKY85rv) ≈ -5.2639000803742979 atol = 2e-5 #from phangorn -fitHKY85rvOpt = fitDiscrete(net, mHKY85, rv, tips; optimizeQ=false, optimizeRVAS=true); +fitHKY85rvOpt = fitdiscrete(net, mHKY85, rv, tips; optimizeQ=false, optimizeRVAS=true); -#tests wrappers +## TEST WRAPPERS ## +#for species, trait data +net_dat = readTopology("(((A:2.0,(B:1.0)#H1:0.1::0.9):1.5,(C:0.6,#H1:1.0::0.1):1.0):0.5,D:2.0);") +dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]) +species_alone = ["C","A","B","D"] +dat_alone = DataFrame(trait=["hi","lo","lo","hi"]) +net_tips = readTopology("(A:3.0,(B:2.0,(C:1.0,D:1.0):1.0):1.0);"); +@test_throws ErrorException fitdiscrete(net_dat, :bogus, species_alone, dat_alone); +s1 = fitdiscrete(net_dat, :ERSM, species_alone, dat_alone; optimizeQ=false) +@test_logs show(devnull, s1) +s1 = fitdiscrete(net_dat, :ERSM, species_alone, dat_alone, :RV; optimizeQ=false, optimizeRVAS=false) +@test_logs show(devnull, s1) +s2 = fitdiscrete(net_dat, :BTSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false) +@test_logs show(devnull, s2) +@test_throws ErrorException fitdiscrete(net_dat, :TBTSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false) +dna_alone = DataFrame(trait=['A','C','C','A']) +s3 = fitdiscrete(net_dat, :JC69, species_alone, dna_alone, :RV; optimizeRVAS=false, ftolRel=.1,ftolAbs=.2,xtolRel=.1,xtolAbs=.2) # 1 site: no info to optimize RVAS +@test s3.model.relative +@test s3.ratemodel.alpha == 1.0 +@test_logs show(devnull, s3) +s4 = fitdiscrete(net_dat, :HKY85, species_alone, dna_alone, :RV; optimizeRVAS=false, ftolRel=.1,ftolAbs=.2,xtolRel=.1,xtolAbs=.2) +@test s4.model.relative +@test s4.model.pi == [3,3,1,1]/8 +@test s4.ratemodel.alpha == 1.0 +@test_logs show(devnull, s4) + +#for dna data (output of fastatodna) fastafile = joinpath(@__DIR__, "..", "examples", "Ae_bicornis_Tr406_Contig10132.aln") +#fastafile = abspath(joinpath(dirname(Base.find_package("PhyloNetworks")), "..", "examples", "Ae_bicornis_Tr406_Contig10132.aln")) dna_dat, dna_weights = readfastatodna(fastafile, true); net_dna = readTopology("((((((((((((((Ae_caudata_Tr275,Ae_caudata_Tr276),Ae_caudata_Tr139))#H1,#H2),(((Ae_umbellulata_Tr266,Ae_umbellulata_Tr257),Ae_umbellulata_Tr268),#H1)),((Ae_comosa_Tr271,Ae_comosa_Tr272),(((Ae_uniaristata_Tr403,Ae_uniaristata_Tr357),Ae_uniaristata_Tr402),Ae_uniaristata_Tr404))),(((Ae_tauschii_Tr352,Ae_tauschii_Tr351),(Ae_tauschii_Tr180,Ae_tauschii_Tr125)),(((((((Ae_longissima_Tr241,Ae_longissima_Tr242),Ae_longissima_Tr355),(Ae_sharonensis_Tr265,Ae_sharonensis_Tr264)),((Ae_bicornis_Tr408,Ae_bicornis_Tr407),Ae_bicornis_Tr406)),((Ae_searsii_Tr164,Ae_searsii_Tr165),Ae_searsii_Tr161)))#H2,#H4))),(((T_boeoticum_TS8,(T_boeoticum_TS10,T_boeoticum_TS3)),T_boeoticum_TS4),((T_urartu_Tr315,T_urartu_Tr232),(T_urartu_Tr317,T_urartu_Tr309)))),(((((Ae_speltoides_Tr320,Ae_speltoides_Tr323),Ae_speltoides_Tr223),Ae_speltoides_Tr251))H3,((((Ae_mutica_Tr237,Ae_mutica_Tr329),Ae_mutica_Tr244),Ae_mutica_Tr332))#H4))),Ta_caputMedusae_TB2),S_vavilovii_Tr279),Er_bonaepartis_TB1),H_vulgare_HVens23);"); for edge in net_dna.edge #adds branch lengths @@ -391,42 +426,26 @@ for edge in net_dna.edge #adds branch lengths setGamma!(edge, 0.5) end end -net_dat = readTopology("(((A:2.0,(B:1.0)#H1:0.1::0.9):1.5,(C:0.6,#H1:1.0::0.1):1.0):0.5,D:2.0);") -dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]) -species_alone = ["C","A","B","D"] -dat_alone = DataFrame(trait=["hi","lo","lo","hi"]) - -net_tips = readTopology("(A:3.0,(B:2.0,(C:1.0,D:1.0):1.0):1.0);"); - -## TEST WRAPPERS ## - #for species, trait data -s1 = fitDiscrete(net_dat, :ERSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false); -@test_logs show(devnull, s1) -s1 = fitDiscrete(net_dat, :ERSM, species_alone, dat_alone, :RV; optimizeQ=false, optimizeRVAS=false); -@test_logs show(devnull, s1) -s2 = fitDiscrete(net_dat, :BTSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false); -@test_logs show(devnull, s2) -@test_throws ErrorException fitDiscrete(net_dat, :TBTSM, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false); -@test_throws ErrorException fitDiscrete(net_dat, :JC69, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false); -@test_throws ErrorException fitDiscrete(net_dat, :HKY85, species_alone, dat_alone; optimizeQ=false, optimizeRVAS=false); - - #for dna data (output of fastatodna) -d1 = fitDiscrete(net_dna, :ERSM, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false); +d1 = (@test_logs (:warn, r"^the network contains taxa with no data") (:warn, r"^resetting edge numbers") fitdiscrete(net_dna, + :ERSM, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false)) @test_logs show(devnull, d1) -@test_throws ErrorException fitDiscrete(net_dna, :BTSM, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false); -@test_throws ErrorException fitDiscrete(net_dna, :TBTSM, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false); -d2 = fitDiscrete(net_dna, :JC69, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false); +@test_throws ErrorException fitdiscrete(net_dna, :BTSM, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false); +@test_throws ErrorException fitdiscrete(net_dna, :TBTSM, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false); +d2 = (@test_logs (:warn, r"^the network contains taxa with no data") (:warn, r"^resetting edge numbers") fitdiscrete(net_dna, + :JC69, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false)) @test_logs show(devnull, d2) -d2 = fitDiscrete(net_dna, :JC69, dna_dat, dna_weights, :RV; optimizeQ=false, optimizeRVAS=false); +d2 = (@test_logs (:warn, r"^the network contains taxa with no data") (:warn, r"^resetting edge numbers") fitdiscrete(net_dna, + :JC69, dna_dat, dna_weights, :RV; optimizeQ=false, optimizeRVAS=false)) @test_logs show(devnull, d2) -d3 = fitDiscrete(net_dna, :HKY85, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false); +d3 = (@test_logs (:warn, r"^the network contains taxa with no data") (:warn, r"^resetting edge numbers") fitdiscrete(net_dna, + :HKY85, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false)) @test_logs show(devnull, d3) -end #testing NASM with rate variation +end #testing fitdiscrete for NucleicAcidSubsitutionModels & RateVariationAcrossSites -@testset "testing readfastatodna with NASM and RateVariationAcrossSites" begin -#fastafile = joinpath(@__DIR__, "../..", "dev/PhyloNetworks/examples", "Ae_bicornis_Tr406_Contig10132.aln") +@testset "readfastatodna with NASM and RateVariationAcrossSites" begin fastafile = joinpath(@__DIR__, "..", "examples", "Ae_bicornis_Tr406_Contig10132.aln") +#fastafile = abspath(joinpath(dirname(Base.find_package("PhyloNetworks")), "..", "examples", "Ae_bicornis_Tr406_Contig10132.aln")) dna_dat, dna_weights = readfastatodna(fastafile, true); dna_net_top = readTopology("((((((((((((((Ae_caudata_Tr275,Ae_caudata_Tr276),Ae_caudata_Tr139))#H1,#H2),(((Ae_umbellulata_Tr266,Ae_umbellulata_Tr257),Ae_umbellulata_Tr268),#H1)),((Ae_comosa_Tr271,Ae_comosa_Tr272),(((Ae_uniaristata_Tr403,Ae_uniaristata_Tr357),Ae_uniaristata_Tr402),Ae_uniaristata_Tr404))),(((Ae_tauschii_Tr352,Ae_tauschii_Tr351),(Ae_tauschii_Tr180,Ae_tauschii_Tr125)),(((((((Ae_longissima_Tr241,Ae_longissima_Tr242),Ae_longissima_Tr355),(Ae_sharonensis_Tr265,Ae_sharonensis_Tr264)),((Ae_bicornis_Tr408,Ae_bicornis_Tr407),Ae_bicornis_Tr406)),((Ae_searsii_Tr164,Ae_searsii_Tr165),Ae_searsii_Tr161)))#H2,#H4))),(((T_boeoticum_TS8,(T_boeoticum_TS10,T_boeoticum_TS3)),T_boeoticum_TS4),((T_urartu_Tr315,T_urartu_Tr232),(T_urartu_Tr317,T_urartu_Tr309)))),(((((Ae_speltoides_Tr320,Ae_speltoides_Tr323),Ae_speltoides_Tr223),Ae_speltoides_Tr251))H3,((((Ae_mutica_Tr237,Ae_mutica_Tr329),Ae_mutica_Tr244),Ae_mutica_Tr332))#H4))),Ta_caputMedusae_TB2),S_vavilovii_Tr279),Er_bonaepartis_TB1),H_vulgare_HVens23);"); @@ -434,23 +453,76 @@ for edge in dna_net_top.edge #adds branch lengths setLength!(edge,1.0) end -nasm_model = JC69([0.5], false); -@test_throws ErrorException fitDiscrete(dna_net_top, nasm_model, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false) - -#Fixes the gamma error (creates a network) +nasm_model = JC69([0.3], false); # relative=false: absolute version +rv = RateVariationAcrossSites(1.0, 2); # 2 rates to go faster +# below: error because missing gammas +@test_throws ErrorException fitdiscrete(dna_net_top, nasm_model, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false) +# set gamma at the 3 reticulations, to fix error above setGamma!(dna_net_top.edge[6],0.6) setGamma!(dna_net_top.edge[7],0.6) setGamma!(dna_net_top.edge[58],0.6) -dna_net = fitDiscrete(dna_net_top, nasm_model, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=false) - -dna_net_optQ = fitDiscrete(dna_net_top, nasm_model, dna_dat, dna_weights; optimizeQ=true, optimizeRVAS=false) - -dna_net_optRVAS = fitDiscrete(dna_net_top, nasm_model, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=true) +dna_net = (@test_logs (:warn, r"^the network contains taxa with no data") (:warn, r"^resetting edge numbers") fitdiscrete(dna_net_top, + nasm_model, dna_dat, dna_weights; optimizeQ=false)) +@test dna_net.model.rate == nasm_model.rate +@test dna_net.ratemodel.ratemultiplier == [1.0] +dna_net_optQ = (@test_logs (:warn, r"^the network contains taxa with no data") (:warn, r"^resetting edge numbers") fitdiscrete(dna_net_top, + nasm_model, rv, dna_dat, dna_weights; optimizeQ=true, optimizeRVAS=false, ftolRel=.1, ftolAbs=.2, xtolRel=.1, xtolAbs=.2)) +@test dna_net_optQ.model.rate != nasm_model.rate +@test dna_net_optQ.ratemodel.alpha == 1.0 +dna_net_optRVAS = (@test_logs (:warn, r"^the network contains taxa with no data") (:warn, r"^resetting edge numbers") fitdiscrete(dna_net_top, + nasm_model, rv, dna_dat, dna_weights; optimizeQ=false, optimizeRVAS=true, ftolRel=.1, ftolAbs=.2, xtolRel=.1, xtolAbs=.2)) +@test dna_net_optRVAS.model.rate == nasm_model.rate +@test dna_net_optRVAS.ratemodel.alpha != 1.0 +@test dna_net_optRVAS.ratemodel.ratemultiplier ≈ [0.02, 1.98] atol=0.05 +originalstdout = stdout +redirect_stdout(open("/dev/null", "w")) +dna_net_opt_both = (@test_logs (:warn, r"^the network contains taxa with no data") (:warn, r"^resetting edge numbers") fitdiscrete(dna_net_top, + nasm_model, rv, dna_dat, dna_weights; optimizeQ=true, optimizeRVAS=true, ftolRel=.1, ftolAbs=.2, xtolRel=.1, xtolAbs=.2, verbose=true)) +redirect_stdout(originalstdout) +@test dna_net_opt_both.model.rate != nasm_model.rate +@test dna_net_opt_both.ratemodel.alpha != 1.0 +# for this example: all NaN values if no lower bound on RVAS's alpha, because it goes to 0 +@test dna_net_opt_both.ratemodel.ratemultiplier ≈ [1e-4, 2.0] atol=0.02 +@test dna_net_opt_both.loglik > -3100. # should be ~ -2901.3 -- but low tolerance: just check it's not horrible +# with default strict tolerance values: takes *much* longer, alpha=0.05, JC rate = 0.00293, loglik = -2535.618 +end # of testing readfastatodna with NASM and RateVariationAcrossSites + +@testset "stationary and empiricalDNAfrequencies" begin + +BTSM_1 = BinaryTraitSubstitutionModel(1.0, 2.0); +ERSM_1 = EqualRatesSubstitutionModel(4, 3.0, ["S1","S2","S3","S4"]); +@test PhyloNetworks.stationary(BTSM_1) ≈ [0.6666666666666666, 0.3333333333333333] atol=1e-6 +@test PhyloNetworks.stationary(ERSM_1) == [0.25, 0.25, 0.25, 0.25] + +JC69_1 = JC69(0.5, false); +@test PhyloNetworks.stationary(JC69_1) == [0.25, 0.25, 0.25, 0.25] +HKY85_1 = HKY85([0.5, 0.5], [0.2, 0.3, 0.25, 0.25], false) +@test PhyloNetworks.stationary(HKY85_1) == [0.2, 0.3, 0.25, 0.25] + +# test empiricalDNAfrequencies with string type +# Bayesian correction by default: more stable and avoids zeros +dna_String = view(DataFrame(A = ["s1", "s2"], site1 = ["A", "A"], site2 = ["G", "T"]), 2:3) +@test PhyloNetworks.empiricalDNAfrequencies(dna_String, [1, 1]) ≈ [3,1,2,2]/(4+4) +# with char type +dna_Char = view(DataFrame(A = ["s1", "s2"], site1 = ['A', 'A'], site2 = ['G', 'T']), 2:3) +@test PhyloNetworks.empiricalDNAfrequencies(dna_Char, [1, 1]) ≈ [3,1,2,2]/(4+4) +# uncorrected estimate +@test PhyloNetworks.empiricalDNAfrequencies(dna_Char, [1, 1], false) ≈ [2,0,1,1]/4 +# with ambiguous sites +dna_Char = DataFrame(site1 = ['A','A','Y'], site2 = ['G','T','V']) +@test PhyloNetworks.empiricalDNAfrequencies(dna_Char, [1, 1], false, false) ≈ [2,0,1,1]/4 +@test PhyloNetworks.empiricalDNAfrequencies(dna_Char, [1, 1], false) ≈ [2+1/3,1/2+1/3,1+1/3,1+1/2]/6 +# with DNA type and weights +#fastafile = abspath(joinpath(dirname(Base.find_package("PhyloNetworks")), "..", "examples", "test_8_withrepeatingsites.aln")) +fastafile = joinpath(@__DIR__, "..", "examples", "test_8_withrepeatingsites.aln") +dat, weights = readfastatodna(fastafile, true); +@test PhyloNetworks.empiricalDNAfrequencies(view(dat, 2:6), weights) ≈ [0.21153846153846154, 0.3076923076923077, 0.40384615384615385, 0.07692307692307693] atol=1e-9 -dna_net_opt_both = fitDiscrete(dna_net_top, nasm_model, dna_dat, dna_weights; optimizeQ=true, optimizeRVAS=true) +#test PhyloNetworks.empiricalDNAfrequencies with bad type +dna_bad = view(DataFrame(A = ["s1", "s2"], trait1 = ["hi", "lo"], trait2 = ["lo", "hi"]), 2:3) +@test_throws ErrorException PhyloNetworks.empiricalDNAfrequencies(dna_bad, [1, 1]) -end #of testing readfastatodna with NASM and RateVariationAcrossSites +end #testing stationary and empiricalDNAfrequencies functions end # of nested testsets -