Naive Bayes実装してみた - christopherの日記

Naive Bayesくらい作ったことないと、ということで実装してみた。
にあるように、最初にやるには不向き。理由は、
解釈とモデル設計が難しい（その代わり理論を学ぶモチベになる）
ゼロ頻度問題やらスムージングやらが面倒
# Naive Bayes classifier with two scoring modes: the standard multinomial
# model ("normal") and the complement variant ("complement").
#
# All work happens in the constructor: feature counts are gathered from the
# teaching data, likelihoods and priors are estimated with add-one (Laplace)
# smoothing, and every sample of the target data set is labelled in place.
#
# Expected collaborators (as used by this class):
# * teaching data responds to +dictionary+ (each_key), +class_dic+
#   (each_key / [] / keys, mapping a class to its sample count) and
#   +data_set+ (each).
# * each sample responds to +features+ (each_key / [], feature => count),
#   +classification+ and +classification=+.
class NaiveBayes < Classifier
  include Math

  attr_accessor :data_set, :times, :prior, :likelihood, :evidence, :posterior, :times_i, :sum, :times_c, :option

  # Trains the model and classifies the data set.
  #
  # The single positional argument must be a Hash with:
  # * +:teaching_data+ (required) - the labelled training data.
  # * +:data_set+ (optional) - data to classify; defaults to a deep copy of
  #   the teaching data so the training labels are not overwritten.
  # * +:option+ (optional) - "normal" (default) or "complement".
  #
  # Raises RuntimeError when the teaching data is missing or the option is
  # not one of the supported modes.
  def initialize(*options)
    options = options[0]

    # A non-Hash argument cannot carry teaching data either, so report the
    # real problem instead of falling through to "invalid option".
    unless options.is_a?(Hash) && options.key?(:teaching_data)
      raise "error : teaching_data must be given"
    end

    @teaching_data = options[:teaching_data]
    @data_set = if options.key?(:data_set)
                  options[:data_set]
                else
                  Marshal.load(Marshal.dump(@teaching_data))
                end
    @option = options.key?(:option) ? options[:option] : "normal"

    # calculating probabilities
    case @option
    when "normal"
      normal_bayes
    when "complement"
      complement_bayes
    else
      raise "error: invalid option"
    end
  end

  # Runs training and classification. The model flavour is selected by
  # @option inside set_likelihood! and classify!, so this is the same
  # pipeline as complement_bayes. Returns the classified data set.
  def normal_bayes
    build_model_and_classify
  end

  # Runs training and classification; see normal_bayes.
  def complement_bayes
    build_model_and_classify
  end

  # Prints the trained counters and, for each category, its likelihood table
  # and total count (debug aid).
  #
  # categories:: categories to dump; defaults to every class present in the
  #              teaching data.
  def test(categories = @teaching_data.class_dic.keys)
    puts "sum:#{@sum}"
    puts "times_c:#{@times_c}"
    puts "times_i:#{@times_i}"
    puts "words:#{@words}"
    categories.each do |category|
      puts "lh;#{category}:#{@likelihood[category]}"
      puts "times_c;#{category}:#{times_c[category]}"
    end
  end

  # Builds the count tables from the teaching data:
  # * @times[incident][category] - count of the incident within the category
  # * @times_i[incident]         - count of the incident over all categories
  # * @times_c[category]         - count of all incidents within the category
  # * @sum                       - grand total of all counts
  def set_times!
    @sum = 0
    @times = {}
    @times_i = {}
    # Default of 0 so a class that shows up in the samples but is missing
    # from class_dic is accumulated instead of crashing on nil.
    @times_c = Hash.new(0)

    @teaching_data.dictionary.each_key do |incident_name|
      @times[incident_name] = {}
    end

    @teaching_data.data_set.each do |single_data|
      category = single_data.classification
      single_data.features.each_key do |incident|
        # Tolerate features that are absent from the dictionary.
        per_category = (@times[incident] ||= {})
        # Always add the real feature count, including on the first
        # occurrence of the (incident, category) pair.
        per_category[category] = per_category.fetch(category, 0) + single_data.features[incident]
      end
    end

    @times.each_key do |incident|
      @times_i[incident] = 0
    end

    @teaching_data.class_dic.each_key do |category|
      @times_c[category] = 0
    end

    @times.each do |incident, per_category|
      per_category.each do |category, count|
        # updating the per-incident and per-category totals
        if count > 0
          @times_i[incident] += count
          @times_c[category] += count
        end
        @sum += count
      end
    end
  end

  # Classifies every sample of +data_set+ in place and returns +data_set+.
  def classify_all!(data_set)
    data_set.data_set.each do |single_data|
      classify!(single_data)
    end
    data_set
  end

  # Computes the posterior scores for +single_data+, picks the category with
  # the highest score (the first one wins on ties), stores it in the sample's
  # +classification+ and returns it. Prints the chosen class.
  def classify!(single_data)
    case @option
    when "normal"
      normal_posterior!(single_data)
    when "complement"
      complement_posterior!(single_data)
    end

    result = nil
    max = -Float::INFINITY
    @posterior[single_data].each do |category, score|
      if score > max
        result = category
        max = score
      end
    end

    puts "class:#{result}"
    single_data.classification = result
    result
  end

  # Sets @posterior[single_data][category] for the complement model:
  # log P(category) minus the log-likelihood of the sample under the
  # complement of the category. Prints each score.
  def complement_posterior!(single_data)
    @posterior = {}
    @posterior[single_data] = {}
    @teaching_data.class_dic.each_key do |category|
      likelihood = weighted_log_likelihood(single_data, category) do |incident|
        c_laplace_smoothing(category, incident)
      end
      @posterior[single_data][category] = log(@prior[category]) - likelihood
      puts "#{category}:#{@posterior[single_data][category]}"
    end
  end

  # Sets @posterior[single_data][category] for the standard model:
  # log P(category) plus the log-likelihood of the sample under the
  # category. Prints each score.
  def normal_posterior!(single_data)
    @posterior = {}
    @posterior[single_data] = {}
    @teaching_data.class_dic.each_key do |category|
      likelihood = weighted_log_likelihood(single_data, category) do |_incident|
        n_laplace_smoothing(category)
      end
      @posterior[single_data][category] = log(@prior[category]) + likelihood
      puts "#{category}:#{@posterior[single_data][category]}"
    end
  end

  # Stores the vocabulary size (number of distinct incidents) in @words.
  def set_words!
    @words = @times_i.size
  end

  # Smoothed complement probability for an incident that never occurred in
  # +category+: (count of the incident outside the category + 1) divided by
  # (total count outside the category + vocabulary size).
  #
  # incident:: the feature being smoothed; when omitted or unknown its
  #            outside count is taken as 0.
  def c_laplace_smoothing(category, incident = nil)
    # The incident never occurred in the category, so every one of its
    # occurrences lies outside the category.
    outside_count = @times_i.fetch(incident, 0)
    (outside_count + 1.0) / (@sum - @times_c[category] + @words)
  end

  # Smoothed probability for an incident that never occurred in +category+:
  # 1 / (total count within the category + vocabulary size).
  def n_laplace_smoothing(category)
    1.0 / (@times_c[category] + @words)
  end

  # Sets @likelihood[category][incident] using the estimator that matches
  # @option.
  def set_likelihood!
    case @option
    when "normal"
      n_set_likelihood!
    when "complement"
      c_set_likelihood!
    end
  end

  # Complement model: for every observed (incident, category) pair,
  # (count of the incident outside the category + 1) /
  # (total count outside the category + vocabulary size).
  def c_set_likelihood!
    @likelihood = {}
    @times.each do |incident, per_category|
      per_category.each do |category, count|
        table = (@likelihood[category] ||= {})
        table[incident] = (@times_i[incident] - count + 1) * 1.0 / (@sum - @times_c[category] + @words)
      end
    end
  end

  # Standard model: for every observed (incident, category) pair,
  # (count of the incident within the category + 1) /
  # (total count within the category + vocabulary size).
  def n_set_likelihood!
    @likelihood = {}
    @times.each do |incident, per_category|
      per_category.each do |category, count|
        table = (@likelihood[category] ||= {})
        table[incident] = (count + 1) * 1.0 / (@times_c[category] + @words)
      end
    end
  end

  # Sets @prior[category] = (samples of the category + 1) /
  # (total samples + number of categories).
  def set_prior!
    @prior = {}
    class_counts = @teaching_data.class_dic

    # num of samples
    total = 0
    class_counts.each_key do |category|
      total += class_counts[category]
    end

    category_count = class_counts.keys.size
    class_counts.each_key do |category|
      @prior[category] = (class_counts[category] + 1) * 1.0 / (total + category_count)
    end
  end

  private

  # Shared training/classification pipeline behind normal_bayes and
  # complement_bayes.
  def build_model_and_classify
    set_times!
    set_words!
    set_likelihood!
    set_prior!
    test
    classify_all!(@data_set)
  end

  # Sums log(probability) * feature count over the sample's features. The
  # probability comes from @likelihood[category] when the incident is listed
  # there, otherwise from the block (the smoothing fallback), which receives
  # the incident.
  def weighted_log_likelihood(single_data, category)
    # A category without any recorded incidents has no table; smooth all.
    table = @likelihood[category] || {}
    total = 0.0
    single_data.features.each_key do |incident|
      probability = table.key?(incident) ? table[incident] : yield(incident)
      total += log(probability) * single_data.features[incident]
    end
    total
  end
end