Document sunecasp's changes, and add a parameter for the number of iterations
bmschmidt · Jan 8, 2016 · 6900dd9 · 6900dd9
1 parent 49ee73a
commit 6900dd9
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 7 deletions.
diff --git a/R/word2vec.R b/R/word2vec.R
@@ -12,6 +12,13 @@
 ##' @title Train a model by word2vec.
 ##' @param train_file Path of a single .txt file for training. Tokens are split on spaces.
 ##' @param output_file Path of the output file.
+##' @param vectors The number of vectors to output. Defaults to 100. More vectors may be useful with large files.
+##' @param threads Number of threads to run training process on. Defaults to 1; up to the number of cores on your machine may be useful.
+##' @param window The size of the window (in words) to use in training. Defaults to 12.
+##' @param classes Number of classes for k-means clustering. Not documented/tested.
+##' @param cbow If 1, use a continuous-bag-of-words model instead of skip-grams. Defaults to 0, i.e. skip-grams (recommended for newcomers).
+##' @param min_count Minimum number of times a word must appear to be included in the samples. Defaults to 5. High values help reduce model size.
+##' @param iter Number of passes to make over the corpus in training. Defaults to 5.
 ##' @return A word2vec object.
 ##' @author Jian Li <\email{rweibo@@sina.com}>, Ben Schmidt <\email{bmchmidt@@gmail.com}>
 ##' @references \url{https://code.google.com/p/word2vec/}
@@ -22,10 +29,11 @@
 ##' @examples \dontrun{
 ##' model = word2vec(system.file("examples", "rfaq.txt", package = "tmcn.word2vec"))
 ##' }
-train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,threads=1,window=12,classes=0,cbow=0,min_count=5)
+train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,threads=1,window=12,
+                           classes=0,cbow=0,min_count=5,iter=5)
 {
   if (!file.exists(train_file)) stop("Can't find the training file!")
-  if (file.exists(output_file)) return(read.vectors(output_file))
+  if (file.exists(output_file)) stop("The output file '", output_file , "' already exists: delete or give a new destination.")
 
   train_dir <- dirname(train_file)
 
@@ -53,8 +61,8 @@ train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,t
             window=as.character(window),
             classes=as.character(classes),
             cbow=as.character(cbow),
-            min_count=as.character(min_count)
-
+            min_count=as.character(min_count),
+            iter=as.character(iter)
   )
 
   read.vectors(output_file)

diff --git a/src/tmcn_word2vec.c b/src/tmcn_word2vec.c
@@ -4,7 +4,8 @@
 
 void tmcn_word2vec(char *train_file0, char *output_file0,
                    char *binary0, char *dims0, char *threads,
-                   char *window0, char *classes0, char *cbow0, char *min_count0)
+                   char *window0, char *classes0, char *cbow0,
+                   char *min_count0, char *iter0)
 {
 	int i;
   layer1_size = atoll(dims0);
@@ -14,6 +15,7 @@ void tmcn_word2vec(char *train_file0, char *output_file0,
 	classes = atoi(classes0);
 	cbow = atoi(cbow0);
 	min_count = atoi(min_count0);
+	iter = atoll(iter0);
 	strcpy(train_file, train_file0);
 	strcpy(output_file, output_file0);
 
@@ -34,8 +36,8 @@ void tmcn_word2vec(char *train_file0, char *output_file0,
 
 void CWrapper_word2vec(char **train_file, char **output_file,
                        char **binary, char **dims, char **threads,
-                       char **window, char **classes, char **cbow, char **min_count)
+                       char **window, char **classes, char **cbow, char **min_count, char **iter)
 {
-    tmcn_word2vec(*train_file, *output_file, *binary, *dims, *threads,*window,*classes,*cbow,*min_count);
+    tmcn_word2vec(*train_file, *output_file, *binary, *dims, *threads,*window,*classes,*cbow,*min_count,*iter);
 }