From 6900dd99f0562753bf71a5ff273ee2f5a7a37f9a Mon Sep 17 00:00:00 2001
From: Benjamin Schmidt
Date: Fri, 8 Jan 2016 15:31:13 -0500
Subject: [PATCH] Document sunecasp's changes, and adding parameter for number of iterations

---
 R/word2vec.R        | 16 ++++++++++++----
 src/tmcn_word2vec.c |  8 +++++---
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/R/word2vec.R b/R/word2vec.R
index 3817c99..6b81ebe 100644
--- a/R/word2vec.R
+++ b/R/word2vec.R
@@ -12,6 +12,13 @@
 ##' @title Train a model by word2vec.
 ##' @param train_file Path of a single .txt file for training. Tokens are split on spaces.
 ##' @param output_file Path of the output file.
+##' @param vectors The number of vectors to output. Defaults to 100. More vectors may be useful with large files.
+##' @param threads Number of threads to run training process on. Defaults to 1; up to the number of cores on your machine may be useful.
+##' @param window The size of the window (in words) to use in training.
+##' @param classes Number of classes for k-means clustering. Not documented/tested.
+##' @param cbow If 1, use a continuous-bag-of-words model instead of skip-grams. Defaults to false (recommended for newcomers).
+##' @param min_count Minimum times a word must appear to be included in the samples. High values help reduce model size.
+##' @param iter Number of passes to make over the corpus in training.
 ##' @return A word2vec object.
 ##' @author Jian Li <\email{rweibo@@sina.com}>, Ben Schmidt <\email{bmchmidt@@gmail.com}>
 ##' @references \url{https://code.google.com/p/word2vec/}
@@ -22,10 +29,11 @@
 ##' @examples \dontrun{
 ##' model = word2vec(system.file("examples", "rfaq.txt", package = "tmcn.word2vec"))
 ##' }
-train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,threads=1,window=12,classes=0,cbow=0,min_count=5)
+train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,threads=1,window=12,
+                           classes=0,cbow=0,min_count=5,iter=5)
 {
   if (!file.exists(train_file)) stop("Can't find the training file!")
-  if (file.exists(output_file)) return(read.vectors(output_file))
+  if (file.exists(output_file)) stop("The output file '", output_file , "' already exists: delete or give a new destination.")
 
   train_dir <- dirname(train_file)
 
@@ -53,8 +61,8 @@ train_word2vec <- function(train_file, output_file = "vectors.txt",vectors=100,t
                window=as.character(window),
                classes=as.character(classes),
                cbow=as.character(cbow),
-               min_count=as.character(min_count)
-
+               min_count=as.character(min_count),
+               iter=as.character(iter)
   )
 
   read.vectors(output_file)
diff --git a/src/tmcn_word2vec.c b/src/tmcn_word2vec.c
index c01a3ab..34b37ae 100644
--- a/src/tmcn_word2vec.c
+++ b/src/tmcn_word2vec.c
@@ -4,7 +4,8 @@
 
 void tmcn_word2vec(char *train_file0, char *output_file0,
                    char *binary0, char *dims0, char *threads,
-                   char *window0, char *classes0, char *cbow0, char *min_count0)
+                   char *window0, char *classes0, char *cbow0,
+                   char *min_count0, char *iter0)
 {
   int i;
   layer1_size = atoll(dims0);
@@ -14,6 +15,7 @@ void tmcn_word2vec(char *train_file0, char *output_file0,
   classes = atoi(classes0);
   cbow = atoi(cbow0);
   min_count = atoi(min_count0);
+  iter = atoll(iter0);
 
   strcpy(train_file, train_file0);
   strcpy(output_file, output_file0);
@@ -34,8 +36,8 @@ void tmcn_word2vec(char *train_file0, char *output_file0,
 
 void CWrapper_word2vec(char **train_file, char **output_file,
                        char **binary, char **dims, char **threads,
-                       char **window, char **classes, char **cbow, char **min_count)
+                       char **window, char **classes, char **cbow, char **min_count, char **iter)
 {
-  tmcn_word2vec(*train_file, *output_file, *binary, *dims, *threads,*window,*classes,*cbow,*min_count);
+  tmcn_word2vec(*train_file, *output_file, *binary, *dims, *threads,*window,*classes,*cbow,*min_count,*iter);
 }
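
For reference, a minimal usage sketch of the patched R interface. This follows the
roxygen example above but is only an illustration: the output path, the cleanup step,
and the value iter=10 are assumptions for the example, not part of the patch.

    # Illustrative sketch, assuming the package loads as tmcn.word2vec and
    # exports train_word2vec() with the signature introduced by this patch.
    library(tmcn.word2vec)

    # With this patch, train_word2vec() stops if the output file already exists
    # (instead of silently re-reading it), so clear any stale vectors first.
    if (file.exists("vectors.txt")) file.remove("vectors.txt")

    model <- train_word2vec(
      train_file  = system.file("examples", "rfaq.txt", package = "tmcn.word2vec"),
      output_file = "vectors.txt",
      vectors     = 100,  # dimensionality of the output vectors
      threads     = 1,    # raise toward the number of cores for large corpora
      window      = 12,   # context window, in words
      cbow        = 0,    # 0 = skip-gram (default), 1 = continuous bag of words
      min_count   = 5,    # drop words appearing fewer than 5 times
      iter        = 10    # the new argument: passes over the corpus (default 5)
    )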