Train an NGram Model
Will take the ngrams, reading and writing from the SQLite database and train the model utilizing Katz-Backoff methodology on the likelihood of the next word.
See code below
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
#ngramTrainer Class library(methods) library(stringr) library(plyr) # ---------------------------------------------------------- #' class for parsing the ngrams for training #' @param ngrams ngrams character list of ngrams #' @param ngramsparsed data.frame with ngrams separated by word #' ngramTrainer <- setRefClass("ngramTrainer", fields = list(ngrams = "character" ,ngramsparsed = "data.frame" ,ngramguesses = "data.frame" ,ngramscores = "data.frame" ,ngramrow = "data.frame" ,ngramsize = "numeric" ,sqlitedb = "character" ,sqlModel = "sqlTrainer" )) ngramTrainer$methods( # ---------------------------------------------------------- #' initialize method for prepping the ngrams for training initialize = function(ngramslist = "list", ngramdbase = "character") { ngrams <<- unlist(ngramslist) sqlitedb <<- ngramdbase # convert into dataframe for processing tempmatrix <- matrix(.self$ngrams, ncol = 1 ,nrow = length(.self$ngrams)) ngramsparsed <<- data.frame(ngram = tempmatrix, stringsAsFactors = FALSE) rm(tempmatrix) #split into individual words split_words <- str_split(.self$ngramsparsed[,1] ,"_" ,simplify = TRUE) #edit the dataframe so each word into its own column ngramsparsed <<- ngramsparsed %>% mutate(word_1 = split_words[,1] ,word_2 = split_words[,2] ,word_3 = split_words[,3] ,word_4 = split_words[,4]) rm(split_words) #set the database location sqlModel$database <<- .self$sqlitedb } ,runQueries = function(currentrow = "data.frame") { ngramrow <<- currentrow #message(c("processing: ",ngramrow$ngram)) #run query sqlModel$query <<- switch(currentrow$V1, "1" = sqlModel$queryNgramTwo(.self$ngramrow) ,"2" = sqlModel$queryNgramTwo(.self$ngramrow) ,"3" = sqlModel$queryNgramThree(.self$ngramrow) ,"4" = sqlModel$queryNgramFour(.self$ngramrow) ) queryresults <- sqlModel$selectQuery() return(queryresults) } ,processQueryResults = function() { #process query message("..... processing query results") ngramguesses <<- ngramguesses %>% group_by(ngram) %>% mutate(frequency_relative = frequency / sum(frequency) ,frequency_log = log10(frequency/ sum(frequency)) ,frequency_log_scores = case_when(V1 == 1 ~ frequency_log * 0.2 ,V1 == 2 ~ frequency_log * 0.4 ,V1 == 3 ~ frequency_log * 0.6 ,V1 == 4 ~ frequency_log * 0.8 ,TRUE ~ 0 ) ,ranking = rank(-frequency_log_scores, 100) ) # TO-DO multiply by weighting factors } ,scoreQueryResults = function(currentrow = "data.frame") { #score results rowid <- match(currentrow$ngram, .self$ngramguesses$dbase_ngram) #message(c("measureResults: ", currentrow$ngram, " rank: ", .self$ngramguesses[rowid, 8] )) return(as.data.frame(c(.self$ngramguesses[rowid, "ranking"] ,.self$ngramguesses[rowid, "frequency_relative"] ,.self$ngramguesses[rowid, "frequency_log"]))) } ,storeQueryResults = function() { #store results message("..... storing score results") ngramscores <<- ngramsparsed %>% select(ngram , ranking , frequency_relative , frequency_log , ngram_size = V1) queryresults <- sqlModel$writeTable(output_dataframe = .self$ngramscores , remote_table ="ngrams_scores") } ,ngramSize = function(ngramrow = "data.frame") { ngramsize <<- sum( nzchar(ngramrow$word_1) +nzchar(ngramrow$word_2) +nzchar(ngramrow$word_3) +nzchar(ngramrow$word_4) ) return(.self$ngramsize) } ) |
Leave a Reply