Naive Bayes実装してみた

Naive Bayesくらい作ったことないと、ということで実装してみた。

にあるように、最初にやるには不向き。理由は、

  • 解釈とモデル設計が難しい(その代わり理論を学ぶモチベになる)
  • ゼロ頻度問題やらスムージングやらが面倒

# Multinomial Naive Bayes classifier with two scoring modes selected by
# the :option setting:
#
# * "normal"     - P(feature | category) is estimated from the samples of
#                  the category itself.
# * "complement" - the estimate for a category is built from the counts of
#                  every *other* category, and the resulting log-likelihood
#                  is subtracted from the log-prior.
#
# Both modes use add-one (Laplace) smoothing. The constructor trains on the
# teaching data, prints debug information and immediately labels the target
# data set.
#
# Collaborator interface, as used by this class:
# * teaching_data.dictionary  - responds to each_key; keys are feature names
# * teaching_data.class_dic   - responds to each_key / [] / keys; the values
#                               are summed as per-category sample counts
# * teaching_data.data_set and data_set.data_set - enumerable of samples
# * sample.features           - responds to each_key / []; the values are used
#                               as weights (presumably occurrence counts)
# * sample.classification     - category label, readable and writable
class NaiveBayes < Classifier
  include Math

  # NOTE: :evidence is exposed but never assigned anywhere in this class.
  attr_accessor :data_set, :times, :prior, :likelihood, :evidence, :posterior,
                :times_i, :sum, :times_c, :option

  # Trains the model and classifies the target data set.
  #
  # @param options [Array] only the first element is used; it must be a Hash
  #   with the keys
  #   :teaching_data (required) - labelled training data,
  #   :data_set (optional)      - data to classify; defaults to a deep copy
  #                               of the teaching data,
  #   :option (optional)        - "normal" (default) or "complement".
  # @raise [RuntimeError] when no teaching data is supplied or :option is
  #   not one of the supported modes
  def initialize(*options)
    options = options[0]
    unless options.is_a?(Hash) && options.key?(:teaching_data)
      raise "error : teaching_data must be given"
    end

    @teaching_data = options[:teaching_data]
    @data_set =
      if options.key?(:data_set)
        options[:data_set]
      else
        # Deep copy so that classify! does not overwrite the training labels.
        Marshal.load(Marshal.dump(@teaching_data))
      end
    @option = options.key?(:option) ? options[:option] : "normal"

    # Train and classify according to the selected mode.
    case @option
    when "normal"
      normal_bayes
    when "complement"
      complement_bayes
    else
      raise "error: invalid option"
    end
  end

  # Runs the full train/debug/classify pipeline. The likelihood variant that
  # is actually built depends on @option (see set_likelihood!).
  def normal_bayes
    train_and_classify
  end

  # Runs the full train/debug/classify pipeline. The likelihood variant that
  # is actually built depends on @option (see set_likelihood!).
  def complement_bayes
    train_and_classify
  end

  # Prints the trained counters and the per-category likelihood tables to
  # stdout for debugging. Every category of the teaching data is listed.
  def test
    puts "sum:#{@sum}"
    puts "times_c:#{@times_c}"
    puts "times_i:#{@times_i}"
    puts "words:#{@words}"
    @teaching_data.class_dic.each_key do |category|
      puts "lh;#{category}:#{@likelihood[category]}"
      puts "times_c;#{category}:#{@times_c[category]}"
    end
  end

  # Builds the count tables from the teaching data:
  #
  # * @times[feature][category] - accumulated feature weight per category
  # * @times_i[feature]         - total per feature
  # * @times_c[category]        - total per category
  # * @sum                      - grand total
  def set_times!
    @sum = 0
    @times = {}
    @times_i = {}
    @times_c = {}

    @teaching_data.dictionary.each_key do |incident|
      @times[incident] = {}
    end

    @teaching_data.data_set.each do |single_data|
      category = single_data.classification
      single_data.features.each_key do |incident|
        # A feature missing from the dictionary is still counted instead of
        # failing on a nil table.
        per_category = (@times[incident] ||= {})
        # Every sample, including the first one seen for this pair,
        # contributes its full feature value.
        per_category[category] =
          per_category.fetch(category, 0) + single_data.features[incident]
      end
    end

    # Totals start at 0; smoothing is applied later when the likelihoods
    # are computed.
    @times.each_key { |incident| @times_i[incident] = 0 }
    @teaching_data.class_dic.each_key { |category| @times_c[category] = 0 }

    @times.each do |incident, per_category|
      per_category.each do |category, count|
        if count > 0
          @times_i[incident] += count
          @times_c[category] += count
        end
        @sum += count
      end
    end
  end

  # Classifies every sample of +data_set+ in place.
  #
  # @param data_set [#data_set] container whose #data_set yields the samples
  # @return the same +data_set+ object
  def classify_all!(data_set)
    data_set.data_set.each do |single_data|
      classify!(single_data)
    end
    data_set
  end

  # Computes the posterior scores of +single_data+ for the current mode,
  # stores the best-scoring category in single_data.classification and
  # prints it.
  #
  # @param single_data a sample exposing #features and #classification=
  # @return the chosen category, or nil when there is no category to score
  def classify!(single_data)
    case @option
    when "normal"
      normal_posterior!(single_data)
    when "complement"
      complement_posterior!(single_data)
    end

    # On ties the first category in iteration order wins.
    best = @posterior[single_data].max_by { |_category, score| score }
    result = best && best.first
    puts "class:#{result}"
    single_data.classification = result
    result
  end

  # Replaces @posterior with the complement scores of +single_data+:
  #   log P(category) - sum(weight * log P(feature | other categories))
  # Each score is printed as it is computed.
  def complement_posterior!(single_data)
    @posterior = { single_data => {} }
    @teaching_data.class_dic.each_key do |category|
      log_likelihood = weighted_log_likelihood(single_data, category) do |incident|
        c_laplace_smoothing(category, incident)
      end
      @posterior[single_data][category] = log(@prior[category]) - log_likelihood
      puts "#{category}:#{@posterior[single_data][category]}"
    end
  end

  # Replaces @posterior with the standard scores of +single_data+:
  #   log P(category) + sum(weight * log P(feature | category))
  # Each score is printed as it is computed.
  def normal_posterior!(single_data)
    @posterior = { single_data => {} }
    @teaching_data.class_dic.each_key do |category|
      log_likelihood = weighted_log_likelihood(single_data, category) do |_incident|
        n_laplace_smoothing(category)
      end
      @posterior[single_data][category] = log(@prior[category]) + log_likelihood
      puts "#{category}:#{@posterior[single_data][category]}"
    end
  end

  # Stores the vocabulary size (number of distinct features) in @words.
  def set_words!
    @words = @times_i.keys.size
  end

  # Smoothed complement probability for a (feature, category) pair that has
  # no entry in the likelihood table, i.e. the feature never occurred in
  # +category+. Mirrors the formula of c_set_likelihood! with a
  # within-category count of 0.
  #
  # @param category the candidate category
  # @param incident the feature; when omitted or unknown to the model its
  #   total count is taken as 0
  # @return [Float]
  def c_laplace_smoothing(category, incident = nil)
    (@times_i.fetch(incident, 0) + 1.0) / (@sum - @times_c[category] + @words)
  end

  # Smoothed probability of a feature that never occurred in +category+.
  #
  # @return [Float]
  def n_laplace_smoothing(category)
    1.0 / (@times_c[category] + @words)
  end

  # Builds @likelihood[category][feature] for the current mode. Does nothing
  # when @option is neither "normal" nor "complement".
  def set_likelihood!
    case @option
    when "normal"
      n_set_likelihood!
    when "complement"
      c_set_likelihood!
    end
  end

  # Complement likelihoods: the smoothed share of the feature among all
  # counts that do NOT belong to the category.
  def c_set_likelihood!
    @likelihood = empty_likelihood_tables
    @times.each do |incident, per_category|
      per_category.each do |category, count|
        @likelihood[category][incident] =
          (@times_i[incident] - count + 1) * 1.0 / (@sum - @times_c[category] + @words)
      end
    end
  end

  # Standard likelihoods: the smoothed share of the feature among the counts
  # of the category.
  def n_set_likelihood!
    @likelihood = empty_likelihood_tables
    @times.each do |incident, per_category|
      per_category.each do |category, count|
        @likelihood[category][incident] =
          (count + 1) * 1.0 / (@times_c[category] + @words)
      end
    end
  end

  # Builds @prior[category] as the add-one smoothed share of the category's
  # samples among all training samples.
  def set_prior!
    class_dic = @teaching_data.class_dic
    total = 0
    class_dic.each_key { |category| total += class_dic[category] }

    @prior = {}
    class_dic.each_key do |category|
      @prior[category] = (class_dic[category] + 1) * 1.0 / (total + class_dic.keys.size)
    end
  end

  private

  # Shared pipeline behind normal_bayes and complement_bayes.
  def train_and_classify
    set_times!
    set_words!
    set_likelihood!
    set_prior!
    test
    classify_all!(@data_set)
  end

  # Sums weight * log(probability) over the features of +single_data+ for
  # +category+; yields the feature to obtain the smoothed fallback when the
  # likelihood table has no entry for it.
  def weighted_log_likelihood(single_data, category)
    table = @likelihood[category]
    total = 0.0
    single_data.features.each_key do |incident|
      probability = table.key?(incident) ? table[incident] : yield(incident)
      total += log(probability) * single_data.features[incident]
    end
    total
  end

  # Returns one empty likelihood table per category, so that categories
  # without any counted feature can still be scored via smoothing.
  def empty_likelihood_tables
    tables = {}
    @teaching_data.class_dic.each_key { |category| tables[category] = {} }
    @times.each_value do |per_category|
      per_category.each_key { |category| tables[category] ||= {} }
    end
    tables
  end
end