Skip to content

Commit

Permalink
Document sunecasp's changes, and adding parameter for number of itera…
Browse files Browse the repository at this point in the history
…tions
  • Loading branch information
Benjamin Schmidt committed Jan 8, 2016
1 parent 49ee73a commit 6900dd9
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 7 deletions.
16 changes: 12 additions & 4 deletions R/word2vec.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
##' @title Train a model by word2vec.
##' @param train_file Path of a single .txt file for training. Tokens are split on spaces.
##' @param output_file Path of the output file.
##' @param vectors The number of dimensions of each output word vector. Defaults to 100. More dimensions may be useful with large files.
##' @param threads Number of threads to run the training process on. Defaults to 1; values up to the number of cores on your machine may be useful.
##' @param window The size of the window (in words) to use in training.
##' @param classes Number of classes for k-means clustering. Not documented/tested.
##' @param cbow If 1, use a continuous-bag-of-words model instead of skip-grams. Defaults to 0, i.e. skip-grams (recommended for newcomers).
##' @param min_count Minimum times a word must appear to be included in the samples. High values help reduce model size.
##' @param iter Number of passes to make over the corpus in training.
##' @return A word2vec object.
##' @author Jian Li <\email{rweibo@@sina.com}>, Ben Schmidt <\email{bmchmidt@@gmail.com}>
##' @references \url{https://code.google.com/p/word2vec/}
Expand All @@ -22,10 +29,11 @@
##' @examples \dontrun{
##' model = word2vec(system.file("examples", "rfaq.txt", package = "tmcn.word2vec"))
##' }
train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,threads=1,window=12,classes=0,cbow=0,min_count=5)
train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,threads=1,window=12,
classes=0,cbow=0,min_count=5,iter=5)
{
if (!file.exists(train_file)) stop("Can't find the training file!")
if (file.exists(output_file)) return(read.vectors(output_file))
if (file.exists(output_file)) stop("The output file '", output_file , "' already exists: delete or give a new destination.")

train_dir <- dirname(train_file)

Expand Down Expand Up @@ -53,8 +61,8 @@ train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,t
window=as.character(window),
classes=as.character(classes),
cbow=as.character(cbow),
min_count=as.character(min_count)

min_count=as.character(min_count),
iter=as.character(iter)
)

read.vectors(output_file)
Expand Down
8 changes: 5 additions & 3 deletions src/tmcn_word2vec.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

void tmcn_word2vec(char *train_file0, char *output_file0,
char *binary0, char *dims0, char *threads,
char *window0, char *classes0, char *cbow0, char *min_count0)
char *window0, char *classes0, char *cbow0,
char *min_count0, char *iter0)
{
int i;
layer1_size = atoll(dims0);
Expand All @@ -14,6 +15,7 @@ void tmcn_word2vec(char *train_file0, char *output_file0,
classes = atoi(classes0);
cbow = atoi(cbow0);
min_count = atoi(min_count0);
iter = atoll(iter0);
strcpy(train_file, train_file0);
strcpy(output_file, output_file0);

Expand All @@ -34,8 +36,8 @@ void tmcn_word2vec(char *train_file0, char *output_file0,

void CWrapper_word2vec(char **train_file, char **output_file,
char **binary, char **dims, char **threads,
char **window, char **classes, char **cbow, char **min_count)
char **window, char **classes, char **cbow, char **min_count, char **iter)
{
tmcn_word2vec(*train_file, *output_file, *binary, *dims, *threads,*window,*classes,*cbow,*min_count);
tmcn_word2vec(*train_file, *output_file, *binary, *dims, *threads,*window,*classes,*cbow,*min_count,*iter);
}

0 comments on commit 6900dd9

Please sign in to comment.