# pylint: disable = reimported
# ruff: noqa: F811
# ------------------------ OTHER ONES I HAVE INSTALLED ----------------------- #
from pytorch_optimizer import ADOPT, AdaBelief #, ...
from OPTAMI import GradientDescent, SimilarTriangles, CubicRegularizedNewton, BasicTensorMethod, DampedNewton, NesterovAcceleration, NearOptimalAcceleration, ProxPointSegmentSearch, NATA, Optimal
from heavyball import ForeachSFAdamW, PaLMForeachSFAdamW, ForeachADOPT, ForeachMuon, ForeachLaProp, MuonLaProp, ForeachSOAP, PaLMForeachSOAP, PrecondScheduleForeachSOAP, PrecondSchedulePaLMForeachSOAP, ForeachPSGDKron, ForeachPurePSGD, ForeachCachedDelayedPSGDKron, ForeachCachedPSGDKron, ForeachDelayedPSGD
from schedulefree import SGDScheduleFreeClosure, AdamWScheduleFreeClosure, RAdamScheduleFreeClosure, ScheduleFreeWrapper
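# A minimal usage sketch for the *ScheduleFreeClosure variants above, assuming they follow
# the closure-accepting step() protocol of torch.optim.LBFGS (which is what the "Closure"
# suffix suggests). `opt`, `model`, `loss_fn`, `inputs`, `targets` are hypothetical stand-ins.
def _schedulefree_closure_step(opt, model, loss_fn, inputs, targets):
    def closure():
        opt.zero_grad()                         # clear gradients before each evaluation
        loss = loss_fn(model(inputs), targets)
        loss.backward()                         # populate .grad for the schedule-free update
        return loss
    return opt.step(closure)                    # the optimizer invokes the closure itself
# e.g. opt = AdamWScheduleFreeClosure(model.parameters(), lr=1e-3)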
from timm.optim import AdaBelief, Adafactor #, ...
# ----------------------------------- MINE ----------------------------------- #
from torchzero.optim import Adagrad, AdamW #, ...
# ----------------------------------- misc ----------------------------------- #
from .Optimizer_PyTorch import AdaBound, AdaBoundW, Adam, ErrorFeedbackSGD, ExtraAdam, ExtraSGD, OptimisticAdam, OMD, SGD, Storm
from .PersonalCodeRepository import SVRG, ErrorFeedbackSGD
from .sota_data_augmentation_and_optimizers import RAdam, DeepMemory, Lookahead
from .Awesome_Optimizers import * # insane number of them
from .moai import * # insane number of them
from .collie import AdaLomo, Adan, Lion, Lomo, SophiaG
from .pyutils import Adam_GC, DAdaptAdam, DAdaptSGD, GLD, Lookahead, Prodigy, RAdam, SAM, SGD_GC, SMTP
from .Best_Deep_Learning_Optimizers import madgrad_wd, Ranger, Sls, Adahessian, AdaMod, DeepMemory, DiffGrad, diffRGrad, DiffMod
from .over9000 import AdaBelief, AdaMod, Adan, Apollo, DiffGrad, Lamb, Lookahead, Madam, MADGRAD, AdamW, RAdam, PlainRAdam, Novograd, Ralamb, Ranger, RangerLars
from .cringe_live import AdaAbs, AdaptiveCompass, Clybius, Compass, DOPE, ExMachina, FARMSCropV3, FCompass, SAVEUS
from .Personalized_Optimizers import FARMSCrop, FARMSCropV2, FCompass, FishMonger, FMARSCrop, FMARSCrop_ExMachina, FMARSCropV2
# ----------------------------------- repos ---------------------------------- #
from .kron_torch import Kron
# Kronecker-factored preconditioner
from .MEKF_MAME import MEKF, MEKF_MA, Lookahead
# Modified Extended Kalman Filter with generalized exponential Moving Average
from .NGD_SGD import NGD
# Natural gradient descent
from .psgd_torch import LRA, Affine, Kron, Newton, XMat
# Preconditioned stochastic gradient descent (PSGD)
from .psiDAG import UniversalSGD
# Universal Stochastic Gradient Method
from .RiemannianSGD import HyperboloidRSGD, PoincareRSGD
# Non-Euclidean space GD
from .StochasticMirrorDescent import SMD_compress, SMD_qnorm
# Stochastic Mirror Descent
from .SUG.SUG import SUG
# Adaptive stochastic gradient method based on the universal gradient method
from .VTTCG import VTTCG, AdaBelief
# Variable three-term conjugate gradient method
from .FAdam import FAdam, AnyPrecisionAdamW
# Fisher Adam
from .dfw import DFW
# Deep Frank Wolfe
from .coolmomentum import Coolmom, Coolmomentum, Coollin
# CoolMomentum: a method for stochastic optimization by Langevin dynamics with simulated annealing
from .bgd import BGD
# Bayesian Gradient Descent
from .torchimize import GNA
# Gauss-Newton algorithm
from .autosgm import AutoSGM
# AutoSGM: A Unifying Framework for Accelerated Learning
from .torch_kfac import KFACOptimizer, EKFACOptimizer
from .KFAC import KFAC, EKFAC, GKFAC
from .torch_kfac2 import KFAC # MAYBE GOOD
# Kronecker-Factored Approximate Curvature
from .SGDPH.sgdph import sgdph
# SGD with Partial Hessian
from .LaplacianSmoothing_GradientDescent import LS_SGD # doesn't work
from .LS_MCMC import LSpSGLD, LSSGLD, pSGLD, SGLD # LSpSGLD and LSSGLD require a "vecs" argument, whatever that is
from .DP_LSSGD import LSSGD, LSSGDTorch # don't work
from .dlt import LSSGD, LSSGDTorch # don't work
# Laplacian Smoothing Gradient Descent
from .adashift import AdaShift
# AdaShift: Decorrelation and Convergence of Adaptive Learning Rate Methods
from .soap import SOAP
# Shampoo with Adam in the Preconditioner's eigenbasis (SOAP).
from .PAL import PalOptimizer
# PAL - Parabolic Approximation Line Search for DNNs
from .LABPAL import GOLSI, LabPal, PalOptimizer, PLS, Sls, SLS
# The Large-Batch Parabolic Approximation Line Search (LABPAL)
from .lion import Lion, LionForEach # LionForEach is not in __init__ by default so idk if it is tested
# EvoLved Sign Momentum (Symbolic Discovery of Optimization Algorithms)
from .adam_atan2 import AdamAtan2, AdamAtan2ForEach, AdamAtan2WithWassersteinReg # Only AdamAtan2 in __init__
# Adam with atan2 instead of epsilon (Scaling Exponents Across Parameterizations and Optimizers)
from .grokfast import GrokFastAdamW
# Grokfast: Accelerated Grokking by Amplifying Slow Gradients
from .lbfgs import LBFGSNew, LBFGSB
# Improved LBFGS and LBFGS-B optimizers.
from .AdEMAMix import AdEMAMix, AdEMAMixDistributedShampoo
# The AdEMAMix Optimizer: Better, Faster, Older (mixture of two EMAs)
from .parameterfree import COCOB, KT, cKT
# Parameter-Free Optimizers
from .SimulatedAnnealing import SimulatedAnnealing
# Simulated Annealing
from .Positive_Negative_Momentum import PNM, AdaPNM
# Positive-Negative Momentum: Manipulating Stochastic Gradient Noise to Improve Generalization
from .AngularGrad import AdaBelief, diffgrad, cosangulargrad, tanangulargrad
# AngularGrad: A New Optimization Technique for Angular Convergence of Convolutional Neural Networks
from .PIDOptimizer import PIDOptimizer
# A PID Controller Approach for Stochastic Optimization of Deep Networks
from .esgd import ESGD
# Stochastic non-convex second order optimizer
from .pytorch_soo import * # a lot of them
# Second Order Optimizers for Machine Learning
from .curveball import CurveBall, CurveBallInterleave
# Small Steps and Giant Leaps: Minimal Newton Solvers for Deep Learning
from .torch_second_order import GradientDescent, LevenbergMarquardt
# Levenberg–Marquardt algorithm
from .grnewt import NewtonSummary, NewtonSummaryVanilla, NewtonSummaryFB, NewtonSummaryUniformAvg
# Adapting Newton's Method to Neural Networks through a Summary of Higher-Order Derivatives
from .pytorch_storm import STORM
# stochastic first order trust region method
from .pytorch_trish import TRish
# A Stochastic Trust Region Algorithm Based on Careful Step Normalization
from .fate_llm import ZerothOrderOptimizer, KSeedZerothOrderOptimizer
# "This optimizer performs a `random` walk update for the parameters of the model."
from .FederatedScope_FedKSeed import MeZOBiasOptimizer
from .fusion_bench import MeZO
# MeZO (memory-efficient zeroth-order optimization)
from .NewtonCG import NewtonCG
# Newton-CG algorithm with backtracking line-search
from .dreamplace import NesterovAcceleratedGradientOptimizer
# Nesterov's implementation of the ePlace algorithm (???) (THIS IS NOT NESTEROV MOMENTUM, IT'S NESTEROV SOMETHING ELSE)
from .sls_ffa import Sls, SlsAcc, SlsEg, SVRG, AdaBound, CocobBackprop, CocobOns, PLS
# Stochastic line search (fork with more stuff)
from .sps import Sps
# Stochastic Polyak Step-size
from .ada_sls import AdaSLS
# Adaptive Gradient Methods Converge Faster with Over-Parameterization
from .sls import Sls, SlsAcc, SlsEg
# Stochastic line search
from .chop import PGD, PGDMadry, S3CM, PairwiseFrankWolfe, FrankWolfe
# constrained optimization for PyTorch
from .ncg_optimizer import LCG, BASIC
from .ncg_optimizer_ApostolosGreece import LCG, BASIC # fork, seems to have some kinds of changes
# nonlinear conjugate gradient
from .LPF_SGD import EntropySGD2, EntropyAdam, EntropySGD, SAM
# Low-Pass Filtering SGD for Recovering Flat Optima (but I don't think it has an LPF-SGD optimizer, unless EntropySGD is one)
from .optimizer import SAM, NelderMead, PatternSearch
# bro made a Nelder-Mead (Loss Landscapes are All You Need: Neural Network Generalization Can Be Explained Without the Implicit Bias of Gradient Descent)
from .convis import FiniteDifferenceGradientOptimizer
# apparently second-order finite differences
from .fullbatch import AdaptiveGradientClipping, FISTA, FISTALineSearch, SGDLineSearch, LARS, LBFGS, SAM, SGD_AGC, RestartingLineSearch, NonMonotoneLinesearch, WolfeGradientDescent
# Training vision models with full-batch gradient descent and regularization
from .peps_torch_feat_czx import SGD_MOD, LBFGS_MOD
# SGD with backtracking line search
from .Target_Based_Surrogates_For_Stochastic_Optimization import Ada_FMDOpt, Adam_FMDOpt, Diag_Ada_FMDOpt, GULF2, LSOpt, MD_FMDOpt, Online_Newton_FMDOpt, Sadagrad, SGD_FMDOpt, SLS_FMDOpt, SVRG
# Target Based Surrogates For Stochastic Optimization (some crazy stuff)
from .SDLS import SDLS
# Armijo backtracking line search on training DNNs
from .hessianfree import HessianFree
# Deep learning via Hessian-free optimization (needs backpack installed)
from .salsa.SaLSA import SaLSA
# SALSA - Stable Armijo Line Search Adaptation
from .nitorch import OGM, BacktrackingLineSearch
# optimizers from a neuroimaging library
from .qori_aziz_sa import SimulatedAnnealing
# SA from someone's homework
from .neural_net_optimizers import GeneticAlgorithm, ParticleSwarm
# derivative-free optimization
from .NNAIF import CMAES, EMNA, IMFIL, NNAIF, SGPGD, RESNETEULER
# Neural Network Accelerated Implicit Filtering: Integrating Neural Network Surrogates With Provably Convergent Derivative Free Optimization Methods
from .befree import CurveBall, HessianFree, Newton, SimplifiedHessian
# On the New method of Hessian-free second-order optimization
from .bayesian_snn import BayesBiSNN, GaussianBayesOptimizer
# Bayesian Continual Learning via Spiking Neural Networks (I think it needs layers from that lib too)
from .ML_APTS import APTS, LocalTR, TR, TRAdam
# Additively preconditioned trust-region strategies for machine learning
from .torchmin import Minimizer, ScipyMinimizer
from .pytorch_minimize import MinimizeWrapper, BasinHoppingWrapper, DualAnnealingWrapper, DifferentialEvolutionWrapper
# scipy minimize (ha ha mine is better)
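# A hedged usage sketch for the SciPy-backed wrappers above. This assumes MinimizeWrapper
# takes the parameters plus a dict of arguments forwarded to scipy.optimize.minimize and is
# driven through a closure-based step(), like torch.optim.LBFGS; the exact argument names
# are assumptions, check the repo before relying on them.
def _scipy_minimize_sketch(model, loss_fn, inputs, targets):
    opt = MinimizeWrapper(model.parameters(),
                          minimizer_args=dict(method="CG", options={"maxiter": 100}))
    def closure():
        opt.zero_grad()                         # scipy re-evaluates loss/grad many times
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        return loss
    return opt.step(closure)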
from .geoopt import SGRHMC, RHMC, RSGLD, RiemannianAdam, RiemannianLineSearch, RiemannianSGD, SparseRiemannianAdam, SparseRiemannianSGD
# Riemannian Adaptive Optimization Methods (maybe only works on geoopt layers, idk)
from .pykg2vec import RiemannianOptimizer
# from "Python library for knowledge graph embedding" (but I changed it to affect all layers)
from .M_FAC import MFAC
# M-FAC: Efficient Matrix-Free Approximations of Second-Order Information
from .ddpnopt import Step, RmsDDP, AdamDDP
# DDPNOpt: Differential Dynamic Programming Neural Optimizer
from .singd import SINGD
# KFAC-like Structured Inverse-Free Natural Gradient Descent
from .sirfshampoo import SIRFShampoo
# SIRFShampoo: Structured inverse- and root-free Shampoo in PyTorch
from .StructuredNGD_DL import KFACOptimizer, LocalOptimizer
# Matrix-multiplication-only KFAC (Simplifying Momentum-based Positive-definite Submanifold Optimization)
from .Muon import Muon
# MomentUm Orthogonalized by Newton-Schulz.
from .orth_optim import orthogonalise
# Orthogonalising gradients to speed up neural network optimisation. `orthogonalise(AdamW)(model.parameters(), lr = 1e-3)`
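# Expanding the one-liner in the comment above into a hedged sketch: orthogonalise appears
# to wrap an optimizer *class* and return a class whose gradients are orthogonalised before
# the base update. `model` is a hypothetical stand-in, not a name defined in this file.
def _orthogonalised_adamw(model):
    OrthAdamW = orthogonalise(AdamW)               # wrap the optimizer class, not an instance
    return OrthAdamW(model.parameters(), lr=1e-3)  # then construct it like a normal optimizer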
from .torch_pso import ParticleSwarmOptimizer, GenerationalPSO, AutotuningPSO, RingTopologyPSO, ChaoticPSO, GenericPSO, AcceleratedPSO, SineCosineAlgorithm, ImprovedSineCosineAlgorithm
# Particle Swarm Optimization
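# A hedged sketch for the gradient-free swarm optimizers above, assuming their step() takes
# a closure that just returns the loss (no backward needed, since particles are scored by
# loss value alone); constructor arguments beyond the parameters are left at their defaults
# because they differ per class. `model`, `loss_fn`, `inputs`, `targets` are stand-ins.
def _pso_step(model, loss_fn, inputs, targets):
    opt = ParticleSwarmOptimizer(model.parameters())
    def closure():
        # gradient-free: only a loss value is needed to evaluate each particle
        return loss_fn(model(inputs), targets)
    return opt.step(closure)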
from .langevin_sampling import SGLD, pSGLD
# Sampling with gradient-based Markov Chain Monte Carlo approaches
from .adopt import ADOPT
# Modified Adam Can Converge with Any β2 with the Optimal Rate
from .fsdp_optimizers import SOAP, Kron, Muon, KronMars
# optimizers with FSDP support
from .NGPlus import NGPlus, o_NGPlus, o_NGPlus_Block, create_oNG_optimizer
# NG+: A new second-order optimizer for deep learning
from .MARS_AdamW import MarsAdamW
# MARS: Unleashing the Power of Variance Reduction for Training Large Models
from .MSAM import AdamW, AdamW_MSAM, AdamW_SAM, ESAM, LookSAM, MSAM, SAM, SGD
# Momentum-SAM: Sharpness Aware Minimization without Computational Overhead
from .adasub import SubHes
# Stochastic Optimization Using Second-Order Information in Low-Dimensional Subspaces
from .MomSPS import MomSPS, MomSPS_smooth
# Stochastic Polyak Step-sizes and Momentum
from .momo import Momo, MomoAdam
# Momentum Models for Adaptive Learning Rates
from .DIMAT import CDMSGD, CDSGD, CGA, DSMA, LDSGD, SGP, SwarmSGD
# Decentralized Iterative Merging-And-Training for Deep Learning Models
from .Noise_stability_optimization import BSAM, NSM, SAM, RSAM
# noise stability optimization algorithm, Hessian-based regularization approach for finding flat minima (NSM)
from .Exponentiated_Gradient import EGPM
# exponentiated gradient (EG) algorithm and plus-minus variant
from .zeroptim import MeZO, SmartES
# zero-order optimization techniques
from .GDPolyak import GDPolyak
# Gradient descent with adaptive stepsize converges (nearly) linearly under fourth-order growth
from .APROX import Truncated, TruncatedAdagrad
# APROX: Robust Stochastic Optimization Algorithms
from .SVRG_Pytorch import SVRG
# efficient variant of SVRG that relies on mini-batching, implemented in PyTorch
from .poincare_embeddings import RiemannianSGD
# actually working Riemannian SGD
from .tram_optimizer import TRAM
# Trust Region Aware Minimization
from .gsam import GSAM
# Surrogate Gap Guided Sharpness-Aware Minimization
from .ReinventingWheel import FTRLP
# FTRL-proximal algorithm (Follow-the-Regularized-Leader and Mirror Descent: Equivalence Theorems and L1 Regularization, H. B. McMahan, AISTATS 2011)
from .OLoptim import FTML, FTRL_Proximal, OSD, SGDOL_global, SGD_globLR, STORM
# Online & Stochastic optimization algorithms for deep learning
from .metaopt import SGD_Multi_LR, SGD_Quotient_LR
# Online hyperparameter optimization by real-time recurrent learning
from .core_optimizer import CoRe
# Continual Resilient (CoRe) Optimizer
from .Seminar import Ada_Grad, FTRL, nAda_Grad, nFTRL, nKT, nOGD, OGD
# "Implementation of different algorithms and their normalized counterparts in the pytorch framework"
from .Recommendation_System_Method_Reproduce import FTRL
from .Code import FTRL, OBC
from .ftrl import FTRL
from .DP_FTRL import FTRLOptimizer # official implementation by Google
# Follow-the-Regularized-Leader
from .smart_grid import AdaX
# AdaX: Adaptive Gradient Descent with Exponential Long Term Memory
from .nerf_atlas import UniformAdam
# something crazy with solves and a Laplacian matrix??
from .mlopt import Adahessian, Goldstein, Normalized_Optimizer, OneBit_Adam, SAM, Alternate_SAM, Alternate_SAM_v2, Alternate_SAM_v3, AdamS_v1, ASAM_ON, Sketch_Adam, SophiaG, Sophus, GN_DOM_SGD, GN_BULK_SGD, DOM_SGD, BULK_SGD
# crazy stuff (no descriptions)
from .subgd import PCAOptimizer
# Few-Shot Learning by Dimensionality Reduction in Gradient Space (needs some kind of config)
from .RFR_NeurIPS23 import RFR
# robust fairness regularization (RFR) - Chasing Fairness under Distribution Shift: a Model Weight Perturbation Approach
from .A_Deep_Learning_Optimizer_Based_on_Grunwald_Letnikov_Fractional_Order_Definition import FCSGD_G_L, FCAdam_G_L
# A Deep Learning Optimizer Based on Grunwald-Letnikov Fractional Order Definition
from .VFOGD_PF_and_Its_Application_in_Deep_Learning import VFOSGD_PF, VFOAdam_PF
# VFOGD_PF and Its Application in Deep Learning
from .staleness_corrected_momentum import SCMSGD, SCMTDProp, OptChain, FixedSGD
# Correcting Momentum in Temporal Difference Learning
from .DPSGD import DPSGD
# PyTorch implementation of tf.privacy.DPGradientDescentGaussianOptimizer
from .DPtorch import JDPSGD
# Improving Deep Learning with Differential Privacy using Gradient Encoding and Denoising
from .optimizer2 import AdaBound, AdaGC, AdaMod, Adan, Yogi
# Adaptive Optimization Algorithm with Gradient Bias Correction (AdaGC)
from .ProxSPS import SPS, ProxAdam
# Polyak step sizes with weight decay in PyTorch
from .bb_dl import BB
# Barzilai-Borwein-based Adaptive Learning Rate for Deep Learning
from .Adaptive_learning_rate_optimization_algorithm_with_dynamic_bound_based_on_Barzilai_Borwein_method import BBbound, AdaBBbound
# Adaptive learning rate optimization algorithm with dynamic bound based on Barzilai-Borwein method
from .mL_BFGS import SlimQN, BlockSlimQN, KFACOptimizer, LBFGSOptimizer, SGDOptimizer
# A Momentum-based L-BFGS for Distributed Large-Scale Neural Network Optimization
from .Noisy_SGD import GGDO1, GGDO2, GGDO3, GGDO4, GGDO5, pSGLD, SGLD
# Adaptively Preconditioned Stochastic Gradient Langevin Dynamics
from .adamaio import AdamAIO
# All-In-One Adam
from .adams import Adams, AdamUCB, AdamCB
# Exploiting Uncertainty of Loss Landscape for Stochastic Optimization
from .AdaTS import AdaTS, AdaITS, AdamTS, YOGITS
# ???? can't find anything about an "AdaTS algorithm"
from .MNSAM import MNSAM, SAM, SUM, Adan
# Sharpness-Aware Minimization Method with Momentum Acceleration for Deep Neural Networks
from .innaprop import INNAprop
# INNAprop, a second-order optimization algorithm for deep learning
from .M3Learning import AdaHessian, TRCG
# Trust-Region Conjugate Gradient
from .opt_for_pinns import Adam_LBFGS, Adam_LBFGS_GD, Adam_LBFGS_NNCG, ALRMAG, GD, NysNewtonCG, PolyakGD, PolyakLBFGS, SketchyGN, SketchySGD
# a bunch of stuff from "Challenges in Training PINNs: A Loss Landscape Perspective"
from .deepxde import NNCG
# NysNewtonCG, a damped Newton-CG method that uses Nyström preconditioning
from .alternating_layers import DampedNewton
# seemingly a good DampedNewton (they apply it to the final layer, the rest is optimized by a 1st-order method)
from .nanoGPTSLS import AdamSLS, KenSLS
# training nanoGPT with SLS
from .Skoltech3D import BatchBFGS, BatchGD
# batch BFGS? what's that (these don't inherit from Optimizer but have the same signature)
from .ICNN_verification import SdLBFGS, SdLBFGS0
# Stochastic quasi-Newton methods for nonconvex stochastic optimization
from .ZO_AdaMM_vs_FO_AdaMM import AdaMM
# ON THE CONVERGENCE OF ADAM AND BEYOND
from .AdaSTE import BayesBiNN, FenBPOpt, FenBPOptQuad, FenBPOptProx, MDTanhOpt
# AdaSTE: An adaptive Straight-Through Estimator to Train Binary Neural Networks, Training Binary Neural Networks using the Bayesian Learning Rule
from .alopex import Alopex
# ALgorithm Of Pattern EXtraction (ALOPEX) 99/B version (gradient free)
from .statopt import QHM, SALSA, SSLS, SASA, SLOPE
# ??? idk
from .superpolyak import SuperPolyak, NewtonCG
# SuperPolyak subgradient method - first-order method for solving (possibly) nonsmooth equations/optimization problems
from .GaussNewtonPolyak import GNP
# A linearly convergent Gauss-Newton subgradient method for ill-conditioned problems
from .ntd import NTD, Polyak
# Normal-Tangent-Descent (A nearly linearly convergent first-order method for nonsmooth functions with quadratic growth)
from .nuqls import LaplaceGGN
# Uncertainty Quantification with the Empirical Tangent Kernel
from .SimuRLacra import GSS
# Golden Section Search (I think this is gradient-free and for univariate functions)
from .gcopt import GCOptimizer
# Gaussian continuation optimizer (wraps another optimizer, and ultra-recent)
from .k_fac import KFACOptimizer, KFACIDOptimizer, SKFACOptimizer, EKFACOptimizer, KBFGSOptimizer, KBFGSLOptimizer, KBFGSL2LOOPOptimizer, KBFGSLMEOptimizer, NGDOptimizer
# biggest K-FAC repo (I fixed all acc_stats)
from .proxyprox import ProxyProx
# konstmish's mysterious ProxyProx (has step as well as inner_step methods)
from .SWANOptimizer import SWAN
# SWAN (SGD with Whitening And Normalization)
from .sparse_szo import DuelingEvolutionOptimizer, VanillaEvolutionOptimizer, OneSideEvolutionOptimizer, TwoSideEvolutionOptimizer, FirstOrderOptimizer, FirstOrderBanditOptimizer
# Sparse Perturbations for Improved Convergence in Stochastic Zeroth-Order Optimization
from .PSGD_Nuon import Nuon, AutoNuon
# uses single-sided whitening that is dynamic and learned instead of instantaneous like Muon
from .coherent_gradients import RA3, RM3, M3
# Weak and Strong Gradient Directions: Explaining Memorization, Generalization, and Hardness of Examples at Scale
from .eva import Eva, EvaExperimental, KFAC, AdaKFAC, AdaKFAC2, KFACSAM, MFAC, Shampoo
# Eva: Practical Second-order Optimization with Kronecker-vectorized Approximation (pretty sure they modify the gradient and don't update params)
from .natural_galore import SubSpaceAdamW
# GaLore extension - Natural Gradient Descent in a low-rank subspace
from .galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit
# Memory-Efficient LLM Training by Gradient Low-Rank Projection
from .compass_optimizer import CompassExperimental4Bit, CompassExperimental8Bit, CompassExperimental8BitBNB, Compasstic, LPFAdamW, AdamW, RMSProp
# A modification of the original AdamW optimizer, replacing the momentum moment with a smoothing filter.
from .sgd_sai import SGD_sai
# No More Adam: Learning Rate Scaling at Initialization is All You Need
from .unrl import EligibilityTraceOptimizer, KFACOptimizer
# optimizers from a Reinforcement Learning algorithms library
from .second_order_optimization_NQS import SecondOrderOpt
# Second-order Optimisation strategies for neural network quantum states
from .ldadamw_torch import LDAdamW
# Low-Dimensional Adam - Adaptive Optimization from Low-Dimensional Gradient Statistics.
from .pydrsom import DRSOMB, DRSOMB2, DRSOMK, DRSOMVec, KDRSOM
# dimension-reduced second-order method (DRSOM)
from .AdaGL import AdaGL, FractionalSmoothLoss
# deep learning optimizer that combines fractional-order calculus with adaptive techniques, using Grünwald–Letnikov derivatives
from .mkor import MKOR
# Momentum-Enabled Kronecker-Factor-Based Optimizer Using Rank-1 Updates
from .sn_sm import GenericOptim, AdamWSN, AdamWSNG
# GenericOptim is maybe the one; Subset-Norm and Subspace-Momentum: Faster Memory-Efficient Adaptive Optimization with Convergence Guarantees
from .OptML_Project import Adasub, Adahessian
# Comparison of second-order optimizers on transformers
from .MARS import MARS, ADOPT, Muon, AdamW
# MARS (Make vAriance Reduction Shine)
from .pytorch_velo import VeLO
# learned LSTM optimizer (just a PyTorch wrapper for the JAX optimizer)
from .mctorch import ConjugateGradient
# the other optimizers from this library are for manifolds only, this one works on any layers
from .modded_nanogpt import Kron
# Kron fork by evanatyourservice with recent changes
from .smplifyx import TrustRegionNewtonCG, LBFGS
# trust region Newton-CG
from .widedeepnetworks import ESS, HMC
# Gaussian Process Behaviour in Wide Deep Neural Networks (zeroth order?)
from .alf import NeroPlus, AdamTF
# keeps the norm of each parameter vector fixed and its mean at zero during the optimization process
from .SOAP_MUON import SOAP_Muon
# SOAP + Muon = SOAP_Muon
from .psgd_kron_lra import KronLRA
# LRA per Kron factor
from .psgd_kron_contraction import Kron
# joint learning of Xilin Li's criterion 3 as well as Madeleine Udell's contraction factor on the Lie group
from .Moonlight import Muon
# Muon with lr normalization based on param size and maybe other stuff
from .FedPD import PSVRG, PSGD, FedPD_SGD, FedPD_VR
# Federated Primal-Dual Algorithm
from .llmart import GreedyCoordinateGradient
# greedy coordinate gradient
from .EOPC import Rosen
# Optimizing Mode Connectivity for Class Incremental Learning
from .Autoenv import IEKF
# iterative extended Kalman filter optimizer
from .fastr import FastrD, FastrN, STORMplus, StormPlus
# Fully Adaptive STochastic Recursive-momentum
from .NeuralNetwork import SLBI, SLBI2, SLBI_ADAM_ToolBox, SLBI_SGD_ToolBox
from .DessiLBI import SLBI, SLBI_ToolBox
# Exploring Structural Sparsity of Deep Networks via Inverse Scale Spaces
from .dowg import DoWG, CDoWG
# DoWG Unleashed: An Efficient Universal Parameter-Free Gradient Descent Method
from .archai import CocobBackprop, CocobOns, Lamb
# Microsoft's NAS lib
from .coin_betting import SGDOL, Cocob, Recursive, Regralizer, Scinol2, ONSBet
# Parameter-free coin betting optimizers
from .dolphinflow import DolphinFlow
# recent Muon/AdamW-like, has a bunch of settings to tune https://github.com/cognitivecomputations/dolphinflow-optimizer
from .neosr import adamw_win, adan_sf, adamw_sf, adan, soap_sf, fsam
# from a super-resolution lib, plus stuff adapted from heavyball
from .recpre import SOAP, LionW, SophiaG, Lilith, ELLISAdam, IVON, ZeroShampooWithAdamGraftingOptimizer, OrthogonalNesterov
# recurrent pretraining
from .supertrainer2k import Adalite, Lilith
# idk
from .wu_nature_comms_2024 import NewStyleBatchFISTAOptim, NewStyleSingleFISTAOptim
# something insane
from .dd4ml import APTS, APTS_D, TrustRegion, TrustRegionLegacy
# Additively preconditioned trust-region strategies for machine learning. Requires some type of config and some type of subdomain_optimizer
from .koaning_io_more_descent_less_gradient import KeepStepping, KeepVaulting
# keeps stepping on a single batch, or maybe it was supposed to be a line search, idk
from .CR import COMP
# Compact representations for recursive Hessian matrix estimates (similar to LBFGS)
from .MaxFactor import MaxFactor
# ultra recent
from .scion import Scion
# Training Deep Learning Models with Norm-Constrained LMOs.
from .rapp import RAPPsgd, RAPPadam, ExtraAdagrad, ExtraAdam, ExtraSGD, EGplusAdam, EGplusSGD, LA, AdamLA, ExtraSGDLA, ExtraAdamLA, EGplusLA, EGplusAdamLA
# Stable Nonconvex-Nonconcave Training via Linear Interpolation
from .storm_plus import STORMplus
# STORM+
from .AccSGD import AccSGD
# On the insufficiency of existing momentum schemes for Stochastic Optimization
from .AdaInject import AdaBelief, AdaBeliefInject, AdamInject, diffGrad, diffGradInject, Radam, RadamInject
# AdaInject: Injection Based Adaptive Gradient Descent Optimizers for Convolutional Neural Networks
from .PowerSign_and_AddSign import AddSign, PowerSign
# https://github.com/Neoanarika/Implementing-the-PowerSign-and-AddSign-rule
from .AddSign_PowerSign_in_PyTorch import AddSign, PowerSign, LinearInternalDecay, CosineInternalDecay, RestartCosineInternalDecay
# https://github.com/cydonia999/AddSign_PowerSign_in_PyTorch
# Neural Optimizer Search with Reinforcement Learning
from .neumann_optimizer import Neumann, Neumann2
# https://github.com/jayybhatt/neumann-optimizer
# A Practical Optimization Algorithm for Deep Neural Networks (implicitly computes the inverse Hessian of each mini-batch to produce descent directions)
from .neural_search_optimizer import Optimizer_1
# https://github.com/daviddao/pytorch-neural-search-optimizer
# Neural Optimizer Search's Optimizer_1
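# Most of the classes imported above subclass torch.optim.Optimizer and can be driven by the
# standard PyTorch loop below. This is a generic sketch for smoke-testing a first-order
# optimizer, not specific to any repo; closure-based, zeroth-order and trust-region methods
# need their own calling conventions (see the hedged sketches earlier in this file).
import torch

def _smoke_test(opt_cls, steps=10, **opt_kwargs):
    model = torch.nn.Linear(4, 1)                       # tiny throwaway model
    opt = opt_cls(model.parameters(), **opt_kwargs)
    inputs, targets = torch.randn(32, 4), torch.randn(32, 1)
    for _ in range(steps):
        opt.zero_grad()
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        loss.backward()
        opt.step()
    return loss.item()
# e.g. _smoke_test(ADOPT, lr=1e-3)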