Class Cass::Analysis

  1. cass/lib/cass/analysis.rb
Parent: Object

Various methods used to conduct analyses on one or more Documents. The primary processing stream is run_spec, which is essentially a wrapper around the other methods for conducting one- and two-sample tests.

Attributes

contexts [RW]
docs [RW]
targets [RW]

Public class methods

bootstrap_test (doc, contrasts, output_file, n_boot, opts={})

Do a bootstrap test comparing the bootstrapped distribution to zero.

  • doc: The Document object to analyze
  • contrasts: an array of Contrast objects to apply
  • output_file: name of output file
  • n_boot: number of bootstrap iterations to run
  • opts: an optional hash of additional settings. Currently, only ‘verbose’ and ‘normalize_weights’ apply here.

[show source]
# File cass/lib/cass/analysis.rb, line 141
    # Run a one-sample bootstrap test on a single document and write one
    # tab-delimited line per contrast per iteration to output_file.
    #
    # * doc: the Document to analyze. The code calls cooccurrence, resample,
    #   clone and clines= on it.
    # * contrasts: a single Contrast or an array of Contrasts.
    # * output_file: path of the results file (created or overwritten).
    # * n_boot: number of bootstrap iterations to run.
    # * opts: optional settings; only 'verbose' and 'normalize_weights' are read.
    #
    # Returns n_boot (the value of Integer#times).
    def self.bootstrap_test(doc, contrasts, output_file, n_boot, opts={})

      # Merge options with defaults
      opts = {'verbose'=>true, 'normalize_weights'=>false }.merge(opts)
      normalize = opts['normalize_weights']
      header = %w[contrast result_id doc_name pair_1 pair_2 pair_3 pair_4 interaction_term]

      # Block form guarantees the handle is closed, even if an iteration raises.
      File.open(output_file, 'w') { |outf|
        outf.puts(header.join("\t"))
        # Unbuffered writes, so partial results are on disk during long runs.
        outf.sync = true

        doc.cooccurrence(normalize)

        # Accept a bare Contrast (or subclass instance) as well as an array.
        contrasts = [contrasts] if contrasts.is_a?(Contrast)

        # Observed values first. NOTE(review): the header has more columns than
        # are written explicitly here, so Contrast#apply presumably returns a
        # tab-joined string supplying the rest -- confirm against Contrast.
        contrasts.each { |c|
          observed = c.apply(doc)
          outf.puts "#{c.words.join(".")}\tobserved\t#{observed}"
        }

        # Work on a copy so the caller's document keeps its original clines.
        d1 = doc.clone
        n_boot.times { |i|
          puts "Running bootstrap iteration #{i+1}..." if opts['verbose']
          d1.clines = doc.resample(true)
          # d1.context = Context.new(d1)   # Currently uses the same context; can uncomment
          d1.cooccurrence(normalize)
          contrasts.each { |c|
            res = c.apply(d1)
            outf.puts "#{c.words.join(".")}\tboot_#{i+1}\t#{res}"
          }
        }
      }
    end
p_values (input_file, mode='boot', mean=true)

Takes the results of a bootstrap or permutation test as input and saves a file summarizing the corresponding p-values.

  • input_file: path to the results of the bootstrapping/permutation analysis
  • mode: indicates the source analysis type. Must be either ‘boot’ or ‘perm’
  • mean: boolean variable indicating whether or not to compute the mean across all contrasts
[show source]
# File cass/lib/cass/analysis.rb, line 188
    # Summarize the results file of a bootstrap or permutation run as
    # two-tailed p-values and write them to "<basename>_p_values.txt".
    #
    # * input_file: path to the tab-delimited results file. The first line is
    #   treated as a header and skipped.
    # * mode: 'boot' or 'perm'. In 'boot' mode rows are grouped by their third
    #   column and the p-value counts iterations greater than zero. In any other
    #   mode all rows are grouped under input_file and the p-value counts
    #   iterations that the observed value meets or exceeds.
    # * mean: when true, also emit a 'mean' row averaging across contrasts.
    #
    # The output is written to the current working directory, because only the
    # basename of input_file is used to build the output name.
    # Returns nil.
    def self.p_values(input_file, mode='boot', mean=true)
      lines = File.readlines(input_file)
      lines.shift   # drop the header row
      buffer = ["file\tcontrast\tN_permutations\tvalue\tp-value"]

      # Group rows of [contrast, iteration label, value] by source name.
      tests = {}
      lines.each { |l|
        fields = l.strip.split(/\t/)
        next if fields.empty?   # tolerate blank or trailing empty lines
        row = [fields[0], fields[1], fields[-1].to_f]
        fname = mode == 'boot' ? fields[2] : input_file
        (tests[fname] ||= []) << row
      }

      tests.each { |fname, rows|
        dists, obs, means = {}, {}, []
        rows.each { |test, iter, val|
          if iter == 'observed'
            obs[test] = val
          else
            (dists[test] ||= []) << val
            if mean
              # Iteration labels end in a 1-based counter (e.g. boot_12).
              i = iter[/\d+$/].to_i-1
              means[i] = (means[i] || 0) + val
            end
          end
        }

        # The mean divides by the number of observed contrasts. It is skipped
        # when there are none, which would otherwise divide by zero.
        if mean && !obs.empty?
          n_obs = obs.size
          means.map! { |m| m/n_obs }
          dists['mean'] = means
          obs['mean'] = obs.values.inject(0) {|sum, e| sum+e }/n_obs
        end

        dists.each { |k, v|
          o = obs[k]
          hits = v.select { |e| mode == 'perm' ? o >= e : e > 0 }.size
          # Fold to the smaller tail, then double for a two-tailed value.
          pval = hits.to_f / v.size
          pval = 1 - pval if pval > 0.5
          buffer << [fname, k, v.size, o, pval*2].join("\t")
        }
      }

      base = File.basename(input_file, '.txt')
      # Block form flushes and closes the file before returning.
      File.open("#{base}_p_values.txt", 'w') { |f| f.puts buffer }
    end
parse_contrasts (contrast_file)

Parse contrast file. Takes a filename as input and returns an array of Contrasts.

[show source]
# File cass/lib/cass/analysis.rb, line 85
    # Read contrast_file and return an array with one Contrast per non-blank
    # line, as produced by Contrast.parse.
    #
    # Lines from readlines keep their trailing newline, so blankness is tested
    # on the stripped text. The unmodified line is still what Contrast.parse
    # receives.
    def self.parse_contrasts(contrast_file)
      File.readlines(contrast_file).
        reject { |l| l.strip.empty? }.
        map { |l| Contrast.parse(l) }
    end
permutation_test (doc1, doc2, contrasts, output_file, n_perm, opts={})

Run a permutation test comparing two Documents.

  • doc1, doc2: The two Documents to compare
  • contrasts: an array of Contrasts used to compare the documents
  • output_file: name of output file
  • n_perm: number of permutations to run
  • opts: an optional hash of additional settings. Currently, only ‘verbose’ and ‘normalize_weights’ apply here.

[show source]
# File cass/lib/cass/analysis.rb, line 96
    # Run a two-sample permutation test and write one tab-delimited line per
    # contrast per permutation to output_file.
    #
    # * doc1, doc2: the two Documents to compare. doc1's context object is
    #   modified in place and then shared by both documents.
    # * contrasts: an array of Contrasts.
    # * output_file: path of the results file (created or overwritten).
    # * n_perm: number of permutations to run.
    # * opts: optional settings; only 'verbose' and 'normalize_weights' are read.
    #
    # Returns n_perm (the value of Integer#times).
    def self.permutation_test(doc1, doc2, contrasts, output_file, n_perm, opts={})

      # Merge options with defaults
      opts = {'verbose'=>true, 'normalize_weights'=>false }.merge(opts)
      normalize = opts['normalize_weights']

      # Merge contexts: keep only words present in both documents and use that
      # one context for both. Could change this later to allow different
      # contexts for each document, but that would make processing
      # substantially slower.
      context = doc1.context
      context.words = context.words & doc2.context.words
      context.index_words
      doc1.context, doc2.context = context, context

      # Generate cooccurrence matrices for the observed data.
      doc1.cooccurrence(normalize)
      doc2.cooccurrence(normalize)

      # Block form guarantees the handle is closed, even if a permutation raises.
      File.open(output_file, 'w') { |outf|
        outf.puts "contrast\titeration\t#{doc1.name}\t#{doc2.name}\tdifference"
        # Unbuffered writes, so partial results are on disk during long runs.
        outf.sync = true

        # Save observed values
        contrasts.each { |c|
          res1, res2, diff = compare_docs(c, doc1, doc2)
          outf.puts "#{c.words.join(".")}\tobserved\t#{res1}\t#{res2}\t#{diff}"
        }

        # Run permutations on copies so the originals keep their own clines.
        d1, d2 = doc1.clone, doc2.clone
        n_perm.times { |i|
          puts "Running permutation #{i+1}..." if opts['verbose']
          d1.clines, d2.clines = permute_labels(doc1.clines, doc2.clines)
          d1.cooccurrence(normalize)
          d2.cooccurrence(normalize)
          contrasts.each { |c|
            res1, res2, diff = compare_docs(c, d1, d2)
            outf.puts "#{c.words.join(".")}\tperm_#{i+1}\t#{res1}\t#{res2}\t#{diff}"
          }
        }
      }
    end
run_spec (spec_file='default.spec')

Read and parse the specifications for an analysis, then run the analysis. Only does basic error checking for now...

[show source]
# File cass/lib/cass/analysis.rb, line 13
    # Load an analysis specification, build the documents it names and run
    # the requested one- or two-sample analysis.
    #
    # * spec_file: path to a Ruby file that is loaded to define the settings
    #   constants. This method reads CONTRAST_FILE, FILES, STATS, N_PERM,
    #   TEST_TYPE and OUTPUT_ROOT directly; the optional constants listed
    #   below are copied into the options hash when defined.
    #
    # Aborts the process if the spec file, the contrast file or any input
    # file is missing, or if a two-sample test is not given exactly two files.
    # Returns nil.
    def self.run_spec(spec_file='default.spec')

      # Basic error checking
      abort("Error: can't find spec file (#{spec_file}).") if !File.exist?(spec_file)
      load spec_file
      abort("Error: can't find contrast file (#{CONTRAST_FILE}).") if !File.exist?(CONTRAST_FILE)

      # Create options hash, keyed by the lower-cased constant name.
      opts = {}
      # Ruby 1.9 returns constants as symbols, 1.8.6 uses strings, so standardize
      consts = Module.constants.map { |c| c.to_s }
      %w[PARSE_TEXT N_PERM N_BOOT MAX_LINES RECODE CONTEXT_SIZE MIN_PROP STOP_FILE NORMALIZE_WEIGHTS VERBOSE].each { |c|
        opts[c.downcase] = Module.const_get(c) if consts.include?(c)
      }

      # VERBOSE is optional in the spec, so check that it exists before reading it.
      verbose = defined?(VERBOSE) && VERBOSE

      if verbose
        puts "\nRunning CASS with the following options:"
        opts.each { |k,v| puts "\t#{k}: #{v}" }
      end

      # Load contrasts once. compact drops any nil entries the parser may return.
      contrasts = parse_contrasts(CONTRAST_FILE).compact
      puts "\nFound #{contrasts.size} contrasts." if verbose

      # Targets are the unique words mentioned by any contrast.
      targets = contrasts.map { |c| c.words }.flatten.uniq
      puts "\nFound #{targets.size} target words." if verbose

      # Read in files and create documents, named after the last path component.
      docs = FILES.map { |f|
        abort("Error: can't find input file #{f}.") if !File.exist?(f)
        puts "\nReading in file #{f}..."
        Document.new(f.split(/\//)[-1], targets, File.read(f), opts)
      }

      # Make sure N_PERM is zero if we don't want stats
      # NOTE(review): the one-sample branch also uses N_PERM as its bootstrap
      # count; N_BOOT is only copied into opts. Confirm this is intended.
      n_perm = STATS ? N_PERM : 0

      # One or two-sample test?
      case TEST_TYPE
      when 1
        docs.each { |d|
          results = "#{OUTPUT_ROOT}_#{File.basename(d.name, '.txt')}_results.txt"
          puts "\nRunning one-sample analysis on document '#{d.name}'."
          puts "Generating #{n_perm} bootstraps..." if verbose and STATS
          bootstrap_test(d, contrasts, results, n_perm, opts)
          p_values(results, 'boot', true) if STATS
        }

      when 2
        abort("Error: in order to run a permutation test, you need to pass exactly two files as input.") if FILES.size != 2 or docs.size != 2
        results = "#{OUTPUT_ROOT}_results.txt"
        puts "Running two-sample comparison between '#{File.basename(FILES[0])}' and '#{File.basename(FILES[1])}'." if verbose
        puts "Generating #{n_perm} permutations..." if verbose and STATS
        permutation_test(docs[0], docs[1], contrasts, results, n_perm, opts)
        # As in the one-sample branch, skip the summary when stats are off:
        # with zero permutations there is no distribution to summarize.
        p_values(results, 'perm', true) if STATS

      # No other test types implemented for now.
      else

      end
      puts "Done!"

    end