
Merge pull request #51 from PumasAI/docstring
Add Docstrings
ChrisRackauckas authored Apr 14, 2022
2 parents c4a4d74 + 8a8556c commit 2485545
Showing 14 changed files with 225 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "SimpleChains"
uuid = "de6bee2f-e2f4-4ec7-b6ed-219cc6f6e9e5"
authors = ["Chris Elrod <[email protected]> and contributors"]
version = "0.2.1"
version = "0.2.2"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ using SimpleChains, BenchmarkTools
x = rand(24, 200); # 24 inputs per 200 observations

# 2 responses each per 200 observations
y = StrideArray{Float64}(undef, (static(2),200)) .= randn.() .* 10;
y = Matrix{Float64}(undef, 2, 200) .= randn.() .* 10;

schain = SimpleChain(
static(24), # input dimension (optional)
5 changes: 5 additions & 0 deletions src/activation.jl
@@ -1,7 +1,12 @@


# Elementwise transforms
"""
Activation(activation)
Applies the `activation` function elementwise.
"""
struct Activation{F}
f::F
end
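For reference, a quick usage sketch of the `Activation` layer documented above (not part of this diff; the chain layout and sizes are hypothetical):

```julia
using SimpleChains

# A linear layer without a fused activation, followed by a standalone
# elementwise `tanh` via `Activation`.
chain = SimpleChain(
  static(4),                      # hypothetical input dimension
  TurboDense(identity, 8),
  SimpleChains.Activation(tanh),
  TurboDense(identity, 2),
)
p = SimpleChains.init_params(chain)
chain(rand(Float32, 4, 16), p)    # 16 hypothetical observations
```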
12 changes: 12 additions & 0 deletions src/conv.jl
@@ -680,6 +680,18 @@ function convlayeradjA!(
end
end

"""
Conv(activation, dims::Tuple{Vararg{Integer}}, outputdim::Integer)
Performs a convolution with `dims` and maps it to `outputdim` output channels, then
adds a bias (one per `outputdim`) and applies `activation` elementwise.
E.g., `Conv(relu, (5, 5), 16)` performs a `5 × 5` convolution, and maps the input
channels to 16 output channels, before adding a bias and applying `relu`.
Weights are randomly initialized using the (Xavier) Glorot uniform distribution.
The bias is zero-initialized.
"""
struct Conv{F,D<:Tuple{Vararg{Integer}},O<:Integer}
dim::D
outputdim::O
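To make the `Conv` docstring concrete, here is a small convolutional chain (not part of this diff; the image size, channel counts, and batch size are made up for the sketch):

```julia
using SimpleChains

convchain = SimpleChain(
  (static(28), static(28), static(1)),              # 28×28 input, 1 channel
  SimpleChains.Conv(SimpleChains.relu, (5, 5), 6),  # 5×5 kernel, 6 output channels
  SimpleChains.MaxPool(2, 2),
  SimpleChains.Flatten(3),
  TurboDense(identity, 10),
)
p = SimpleChains.init_params(convchain)  # Glorot-uniform conv weights, zero biases
x = rand(Float32, 28, 28, 1, 4)          # hypothetical batch of 4 images
convchain(x, p)
```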
8 changes: 7 additions & 1 deletion src/dense.jl
@@ -1,8 +1,14 @@

"""
TurboDense{B}(outputdim, activation)
TurboDense{B=true}(activation, outputdim::Integer)
Linear (dense) layer.
- `B` specifies whether the layer includes a bias term.
- The `activation` function is applied elementwise to the result.
- `outputdim` indicates how many dimensions the input is mapped to.
Weights are randomly initialized using the (Xavier) Glorot normal distribution.
The bias is zero-initialized.
"""
struct TurboDense{B,I<:Integer,F}
f::F
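A brief sketch of the two `TurboDense` forms described in the docstring above (the `B` type parameter toggles the bias; layer sizes are hypothetical and not part of this diff):

```julia
using SimpleChains

mlp = SimpleChain(
  static(4),
  TurboDense(tanh, 8),             # B defaults to true: weights plus a bias
  TurboDense{false}(identity, 2),  # no bias term on the output layer
)
p = SimpleChains.init_params(mlp)  # Glorot-normal weights, zero-initialized biases
mlp(rand(Float32, 4, 32), p)
```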
2 changes: 1 addition & 1 deletion src/dropout.jl
@@ -1,7 +1,7 @@
using VectorizedRNG

"""
Dropout(p) # 0 < p < 1
Dropout(p) # 0 < p < 1
Dropout layer.
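A minimal sketch of `Dropout` placed between two dense layers (hypothetical sizes; not part of this diff):

```julia
using SimpleChains

chain = SimpleChain(
  static(16),
  TurboDense(SimpleChains.relu, 32),
  SimpleChains.Dropout(0.5),   # drop each activation with probability 0.5
  TurboDense(identity, 4),
)
p = SimpleChains.init_params(chain)
```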
17 changes: 16 additions & 1 deletion src/flatten.jl
@@ -1,5 +1,20 @@


"""
Flatten{N}()
Flattens the first `N` dimensions. E.g.,
```julia
julia> Flatten{2}()(rand(2,3,4))
6×4 Matrix{Float64}:
0.0609115 0.597285 0.279899 0.888223
0.0667422 0.315741 0.351003 0.805629
0.678297 0.350817 0.984215 0.399418
0.125801 0.566696 0.96873 0.57744
0.331961 0.350742 0.59598 0.741998
0.26345 0.144635 0.076433 0.330475
```
"""
struct Flatten{N} end
Flatten(N) = Flatten{convert(Int, N)::Int}()
@generated _dec(::Flatten{N}) where {N} = Flatten{N - 1}()
19 changes: 19 additions & 0 deletions src/loss.jl
@@ -1,6 +1,12 @@
abstract type AbstractLoss{Y} end

has_loss(sc::SimpleChain) = last(sc.layers) isa AbstractLoss
"""
add_loss(chn, l::AbstractLoss)
Add the loss function `l` to the simple chain. The loss function
should hold the target you're trying to fit.
"""
function add_loss(sc::SimpleChain, l::AbstractLoss)
id = chain_input_dims(sc)
if has_loss(sc)
@@ -49,6 +55,11 @@ function layer_output_size(::Val{T}, sl::AbstractLoss, s) where {T}
_layer_output_size_no_temp(Val{T}(), sl, s)
end

"""
SquaredLoss(target)
Calculates half of the mean squared loss relative to the target.
"""
struct SquaredLoss{Y} <: AbstractLoss{Y}
y::Y
end
@@ -93,7 +104,11 @@ function (sl::SquaredLoss{<:AbstractArray{<:Number}})(arg::AbstractArray{T,N}, p
T(0.5/size(arg,N)) * s, p, pu
end

"""
AbsoluteLoss
Calculates the mean absolute loss relative to the target.
"""
struct AbsoluteLoss{Y} <: AbstractLoss{Y}
y::Y
end
@@ -144,7 +159,11 @@ function (sl::AbstractLoss{<:AbstractArray{<:AbstractArray}})(arg, p, pu)
return s, p, pu
end

"""
LogitCrossEntropyLoss
Calculates mean logit cross-entropy loss.
"""
struct LogitCrossEntropyLoss{Y<:Union{AbstractVector{UInt32},Nothing}} <: AbstractLoss{Y}
y::Y
end
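To tie the loss docstrings together, a hedged sketch of attaching losses with `add_loss` (not part of this diff; the data, sizes, and class count are hypothetical):

```julia
using SimpleChains

x = rand(Float32, 3, 100)
y = randn(Float32, 2, 100)

chain = SimpleChain(static(3), TurboDense(tanh, 8), TurboDense(identity, 2))

# Regression: the loss holds the targets, and the chain then returns the loss value.
regloss = SimpleChains.add_loss(chain, SimpleChains.SquaredLoss(y))
p = SimpleChains.init_params(regloss)
regloss(x, p)   # half of the mean squared error

# Classification: LogitCrossEntropyLoss expects class indices as a UInt32 vector.
labels = rand(UInt32(1):UInt32(4), 100)
clschain = SimpleChain(static(3), TurboDense(tanh, 8), TurboDense(identity, 4))
clsloss = SimpleChains.add_loss(clschain, SimpleChains.LogitCrossEntropyLoss(labels))
```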
5 changes: 5 additions & 0 deletions src/maxpool.jl
@@ -1,4 +1,9 @@

"""
MaxPool(dims::Tuple{Vararg{Integer}})
Calculates the maximum of pools of size `dims`.
"""
struct MaxPool{D} end
MaxPool(x::Tuple{Vararg{Integer}}) = MaxPool{map(Int, x)}()
MaxPool(x::Vararg{Integer}) = MaxPool{map(Int, x)}()
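Both constructor methods shown directly below the docstring accept the pooling window, e.g.:

```julia
SimpleChains.MaxPool(2, 2)   # equivalent to SimpleChains.MaxPool((2, 2))
```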
92 changes: 89 additions & 3 deletions src/optimize.jl
@@ -1,6 +1,11 @@

abstract type AbstractOptimizer end

"""
ADAM(η = 0.001, β = (0.9, 0.999))
ADAM optimizer.
"""
struct ADAM <: AbstractOptimizer
η::Float64
β::Tuple{Float64,Float64}
@@ -57,7 +62,10 @@ function update!(g::AbstractVector, opt, Xp, layers, pen, sx, p, pm, optbuffer,
end
function chain_valgrad_thread!((g, Xp, layers, p, pm, mpt), start, stop)
batchsize = size(Xp, ndims(Xp))
start > stop && return nothing
if start > stop
fill!(g, zero(eltype(g)))
return nothing
end
off = start - 1
nt = size(g, static(2))
goff = stride(g, static(2)) * sizeof(eltype(g)) * off
@@ -119,7 +127,10 @@ function shuffle_chain_valgrad_thread!(

fm1 = off * batchsize + pstart + min(r, off)
lastdim = batchsize + (start <= r)
((lastdim > 0) & (subrangelen > 0)) || return nothing
if !((lastdim > 0) & (subrangelen > 0))
# fill!(g, 0)
return nothing
end
l = fm1 + lastdim

loss = last(layers)
@@ -167,6 +178,49 @@ function shuffle_update!(
pstop,
)
nthread = size(g, static(2))
#=
batchsize = pstop - pstart
if batchsize < nthread
gpb = preserve_buffer(g)
GC.@preserve gpb begin
if batchsize == 1
gv = PtrArray(pointer(g), (length(p),))
return shuffle_update!(
gv,
opt,
Xp,
layers,
pen,
sx,
p,
pm,
optbuffer,
mpt,
perm,
pstart,
pstop,
)
else
gm = PtrArray(stridedpointer(g), (length(p), batchsize), Val{(true, false)}())
return shuffle_update!(
gm,
opt,
Xp,
layers,
pen,
sx,
p,
pm,
optbuffer,
mpt,
perm,
pstart,
pstop,
)
end
end
end
=#
Polyester.batch(
shuffle_chain_valgrad_thread!,
(nthread, nthread),
@@ -215,7 +269,6 @@ function shuffle_update!(
update!(opt, optbuffer, p, g)
end


function train_unbatched!(g, p, _chn::Chain, X, opt::AbstractOptimizer, t::AbstractArray)
if g isa AbstractMatrix && size(g,2) == 1
gpb = preserve_buffer(g)
@@ -241,6 +294,19 @@ end
end
p
end
"""
train_unbatched!(g::AbstractVecOrMat, p, chn, X, opt, iters)
Train without batching inputs.
Arguments:
- `g`, a pre-allocated gradient buffer. It can be allocated with `similar(p)` (if you want to run single threaded), or `alloc_threaded_grad(chn, size(X))` (the `size(X)` argument is only necessary if the input dimension was not specified when constructing the chain). If a matrix, the number of columns gives how many threads to use. Do not use more threads than the batch size would allow.
- `p` is the parameter vector. It is updated in place. It should be pre-initialized, e.g. with `init_params`/`init_params!`. This is to allow calling `train_unbatched!` several times to train in increments.
- `chn` is the `SimpleChain`. It must include a loss (see `SimpleChains.add_loss`) containing the target information (dependent variables) you're trying to fit.
- `X` the training data input argument (independent variables).
- `opt` is the optimizer. Currently, only `SimpleChains.ADAM` is supported.
- `iters`, how many iterations to train for.
"""
function train_unbatched!(g, p, _chn::Chain, X, opt::AbstractOptimizer, iters::Int)
if g isa AbstractMatrix && size(g,2) == 1
gpb = preserve_buffer(g)
@@ -320,6 +386,20 @@ end
@inline view_slice_last(X::AbstractArray{<:Any,3}, r) = view(X, :, :, r)
@inline view_slice_last(X::AbstractArray{<:Any,4}, r) = view(X, :, :, :, r)
@inline view_slice_last(X::AbstractArray{<:Any,5}, r) = view(X, :, :, :, :, r)
"""
train_batched!(g::AbstractVecOrMat, p, chn, X, opt, iters; batchsize = nothing)
Train while batching the inputs.
Arguments:
- `g`, a pre-allocated gradient buffer. It can be allocated with `similar(p)` (if you want to run single threaded), or `alloc_threaded_grad(chn, size(X))` (the `size(X)` argument is only necessary if the input dimension was not specified when constructing the chain). If a matrix, the number of columns gives how many threads to use. Do not use more threads than the batch size would allow.
- `p` is the parameter vector. It is updated in place. It should be pre-initialized, e.g. with `init_params`/`init_params!`. This is to allow calling `train_batched!` several times to train in increments.
- `chn` is the `SimpleChain`. It must include a loss (see `SimpleChains.add_loss`) containing the target information (dependent variables) you're trying to fit.
- `X` the training data input argument (independent variables).
- `opt` is the optimizer. Currently, only `SimpleChains.ADAM` is supported.
- `iters`, how many iterations to train for.
- `batchsize` keyword argument: the size of the batches to use. If `batchsize = nothing`, it'll try to do a half-decent job of picking the batch size for you. However, this is not well optimized at the moment.
"""
function train_batched!(
g::AbstractVecOrMat,
p::AbstractVector,
@@ -350,6 +430,10 @@ function train_batched!(
else
batchsize
end
if N_bs >= N
train_unbatched!(g, p, _chn, X, opt, iters)
return p
end
tgt_batch_len = tsprod(Base.front(size(tgt))) * N_bs
X_batch_len = tsprod(Base.front(sx)) * N_bs
sxb = (Base.front(sx)..., N_bs)
@@ -383,6 +467,8 @@ function train_batched!(
doff = 0
while true
doffnext = doff + N_bs
# doffnext > N && break
# batchstop = doffnext
batchstop::Int = min(doffnext, N)
# @show doff:batchstop
shuffle_update!(
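A hedged end-to-end sketch of the `ADAM`, `train_unbatched!`, and `train_batched!` docstrings above (not part of this diff; the data, sizes, learning rate, and iteration counts are hypothetical):

```julia
using SimpleChains

x = rand(Float32, 3, 200)
y = randn(Float32, 2, 200)

chain     = SimpleChain(static(3), TurboDense(tanh, 16), TurboDense(identity, 2))
chainloss = SimpleChains.add_loss(chain, SimpleChains.SquaredLoss(y))

p = SimpleChains.init_params(chainloss)
g = similar(p)   # single-threaded gradient buffer; see the docstrings for the threaded form

# Full-batch training for 1_000 iterations.
SimpleChains.train_unbatched!(g, p, chainloss, x, SimpleChains.ADAM(1e-3), 1_000)

# Mini-batch training; `batchsize = nothing` would let the library pick one.
SimpleChains.train_batched!(g, p, chainloss, x, SimpleChains.ADAM(), 5_000; batchsize = 32)
```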
10 changes: 10 additions & 0 deletions src/penalty.jl
@@ -74,6 +74,11 @@ getλ(::NoPenalty) = nothing
@inline apply_penalty(Λ, p, _) = apply_penalty(Λ, p)
@inline apply_penalty!(g, Λ, p, _) = apply_penalty!(g, Λ, p)

"""
L1Penalty(λ)
Applies an L1 penalty of `λ` to the parameters, i.e. penalizing them by their absolute values.
"""
struct L1Penalty{NN,T} <: AbstractPenalty{NN}
chn::NN
λ::T
@@ -109,6 +114,11 @@ function apply_penalty!(
l
end

"""
L2Penalty(λ)
Applies an L2 penalty of `λ` to the parameters, i.e. penalizing them by their squares.
"""
struct L2Penalty{NN,T} <: AbstractPenalty{NN}
chn::NN
λ::T
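A sketch of the penalty constructors documented above; construction follows the docstrings, while wrapping a chain (suggested by the `chn` field visible in the structs) is an assumption about usage, not something shown in this diff:

```julia
using SimpleChains

chain     = SimpleChain(static(3), TurboDense(tanh, 8), TurboDense(identity, 2))
y         = randn(Float32, 2, 50)
chainloss = SimpleChains.add_loss(chain, SimpleChains.SquaredLoss(y))

l1 = SimpleChains.L1Penalty(0.05)   # L1 penalty with λ = 0.05 (absolute values of the parameters)
l2 = SimpleChains.L2Penalty(0.10)   # L2 penalty with λ = 0.10 (squares of the parameters)

# Assumed usage: the penalty wraps the chain it regularizes via its `chn` field.
penalized = SimpleChains.L2Penalty(chainloss, 0.10)
```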
36 changes: 36 additions & 0 deletions src/simple_chain.jl
@@ -2,6 +2,29 @@
struct InputDimUnknown end
const InputDim = Union{InputDimUnknown,Tuple{Vararg{Integer}}}

"""
SimpleChain([inputdim::Union{Integer,Tuple{Vararg{Integer}}}, ] layers)
Construct a SimpleChain. The optional `inputdim` argument allows `SimpleChains` to check
the size of inputs. Making these `static` will allow `SimpleChains` to infer size
and loop bounds at compile time.
Batch size generally should not be included in the `input dim`.
If `inputdim` is not specified, some methods, e.g. `init_params`, will require
passing the size as an additional argument, because the number of parameters may be
a function of the input size (e.g., for a `TurboDense` layer).
The `layers` argument holds various `SimpleChains` layers, e.g. `TurboDense`, `Conv`,
`Activation`, `Flatten`, `Dropout`, or `MaxPool`. It may optionally terminate in an
`AbstractLoss` layer.
These objects are callable, e.g.
```julia
c = SimpleChain(...);
p = SimpleChains.init_params(c);
c(X, p) # X are the independent variables, and `p` the parameter vector.
```
"""
struct SimpleChain{N,I<:InputDim,L<:Tuple{Vararg{Any,N}}}
inputdim::I
layers::L
@@ -224,6 +247,13 @@ function chain_input_dims(chn::SimpleChain, inputdim::Tuple{Vararg{Integer}})
_try_static(chain_input_dims(chn), inputdim)
end


"""
SimpleChains.init_params!(chn, p, id = nothing)
Randomly initializes the parameter vector `p` with input dim `id`. The input dim does not need to be specified if it was provided to the chain object itself.
See the documentation of the individual layers to see how they are initialized, but it is generally via (Xavier) Glorot uniform or normal distributions.
"""
function init_params!(chn::SimpleChain, x::AbstractVector, id = nothing)
GC.@preserve x init_params!(chn.layers, pointer(x), chain_input_dims(chn, id))
return x
@@ -241,6 +271,12 @@ function init_params(
_id = chain_input_dims(Λ, id)
init_params!(Λ, Vector{T}(undef, numparam(Λ, id)), chain_input_dims(Λ, _id))
end
"""
SimpleChains.init_params(chn[, id = nothing][, ::Type{T} = Float32])
Creates a parameter vector of element type `T` with size determined by `id` (the argument is not required if it was provided to the chain object itself).
See the documentation of the individual layers to see how they are initialized, but it is generally via (Xavier) Glorot uniform or normal distributions.
"""
function init_params(Λ::SimpleChain, ::Type{T}) where {T}
init_params(Λ, nothing, T)
end
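A short sketch of `init_params` with an explicit input dimension, per the docstrings above (not part of this diff); passing `id` as a tuple mirrors `chain_input_dims(chn, inputdim::Tuple{Vararg{Integer}})`, and the exact form is an assumption:

```julia
using SimpleChains

# Input dimension omitted at construction time...
chain = SimpleChain(TurboDense(tanh, 8), TurboDense(identity, 2))

# ...so it is supplied to `init_params` instead (3 inputs, Float32 parameters).
p = SimpleChains.init_params(chain, (static(3),), Float32)
chain(rand(Float32, 3, 10), p)
```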

3 comments on commit 2485545

@chriselrod
Contributor


@ChrisRackauckas new release so users have docstrings?

@ChrisRackauckas
Member Author


@JuliaRegistrator register()

@JuliaRegistrator


Registration pull request created: JuliaRegistries/General/58552

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.2.2 -m "<description of version>" 248554581c7f4e45f56b934dea42b2937abfbd76
git push origin v0.2.2
