Ruby: Java document processor

Tika is a utility that parses proprietary formats such as Microsoft Word into plain text. This is valuable for any application that needs to search or index the contents of such documents.


require "json"
require "shellwords"
require "subexec"

# Helpers that shell out to the Tika command-line jar (run with `java -jar`)
# to pull metadata and plain text out of a document.
#
# All methods are callable directly on the module (Joe.extract_metadata(path))
# because the module extends itself.
module Joe
  extend self

  # Location of the Tika jar, relative to Rails.root.
  TIKA_JAR = "bin/tika-app-1.13.jar".freeze

  # Seconds the java process may run before Subexec gives up on it.
  TIKA_TIMEOUT = 5

  # JSON document returned by extract_metadata when Tika's output is unusable.
  METADATA_ERROR_JSON =
    "{ \"Exception\": \"Metadata cannot be parsed - see log for full details.\" }".freeze

  # Runs Tika with `--metadata --json` against +file+.
  #
  # @param file [String] path to the document (expanded to an absolute path)
  # @return [String] Tika's JSON output, or METADATA_ERROR_JSON's content when
  #   the command fails, yields no output, or yields something that is not JSON
  def extract_metadata(file)
    output = run_tika(file, "--metadata", "--json")
    Rails.logger.info(">>>>>>>>>>>>>> metadata: #{output}")
    return output if JSON?(output)

    Rails.logger.error("Tika metadata output is not valid JSON: #{output.inspect}")
    METADATA_ERROR_JSON.dup
  rescue StandardError => e
    # For now errors are consumed and reported through the fallback document.
    # Only StandardError is rescued so signals and SystemExit still propagate.
    Rails.logger.error(e.message)
    METADATA_ERROR_JSON.dup
  end

  # Runs Tika with `--text` against +file+.
  #
  # @param file [String] path to the document (expanded to an absolute path)
  # @return [String] the extracted text; an empty string when the command
  #   produced no output at all
  def extract_plain_text(file)
    output = run_tika(file, "--text")
    # NOTE(review): a nil output presumably means Subexec killed the process
    # on timeout - confirm against the subexec gem.
    Rails.logger.error("Tika returned no output for #{file.inspect}") if output.nil?

    text = output.to_s
    Rails.logger.info(">>>>>>>>>>>>>> plain text:  #{text}")
    text
  end

  # Tells whether +string+ parses as a JSON document.
  #
  # @param string [String, nil] candidate JSON text
  # @return [Boolean] true when JSON.parse accepts it (including documents
  #   whose parsed value is nil or false), false when parsing raises
  def JSON?(string)
    JSON.parse(string)
    true
  rescue StandardError
    false
  end

  private

  # Builds and runs the Tika command line, returning whatever Subexec captured.
  # Every argument is shell-escaped so paths containing spaces or shell
  # metacharacters are passed to java as a single, literal argument.
  def run_tika(file, *flags)
    jar = File.join(Rails.root.to_s, TIKA_JAR)
    command = Shellwords.join(["java", "-jar", jar, *flags, File.expand_path(file)])
    Subexec.run(command, :timeout => TIKA_TIMEOUT).output
  end
end