Ruby: Java document processor
Tika
is a utility that parses proprietary formats like Microsoft Word
into plain text. This can prove valuable for supporting a variety of search features in any application.
# Thin wrapper around the Tika command-line app jar kept under Rails.root/bin.
# Every call shells out to `java -jar` through Subexec and returns whatever
# output Subexec captured from the process.
module Joe
  extend self

  require "json"
  require "shellwords"
  require "subexec"

  # Location of the Tika app jar, relative to Rails.root.
  TIKA_JAR = "bin/tika-app-1.13.jar".freeze

  # Timeout, in seconds, handed to Subexec for every Tika run.
  TIKA_TIMEOUT = 5

  # Extracts a document's metadata as a JSON string.
  #
  # Failures are deliberately consumed for now: if the command cannot be run,
  # or its output does not pass JSON?, the error is logged and a JSON
  # placeholder describing the failure is returned instead.
  #
  # @param file [String] path to the document (expanded to an absolute path)
  # @return [String] the JSON printed by Tika, or the placeholder JSON
  def extract_metadata(file)
    output = run_tika(file, "--metadata", "--json")
    Rails.logger.info(">>>>>>>>>>>>>> metadata: #{output}")
    # Use the raw output as the error message so it ends up in the error log.
    raise output.to_s unless JSON?(output)

    output
  rescue StandardError => e
    # StandardError (not Exception) so signals and exit requests still
    # propagate. The placeholder is built here rather than written back onto
    # the Subexec result, which does not exist when the run itself failed.
    Rails.logger.error(e.message)
    "{ \"Exception\": \"Metadata cannot be parsed - see log for full details.\" }"
  end

  # Extracts a document's content as plain text.
  #
  # @param file [String] path to the document (expanded to an absolute path)
  # @return [String] the output captured from the Tika run
  #   (NOTE(review): presumably never nil - confirm against Subexec)
  def extract_plain_text(file)
    output = run_tika(file, "--text")
    Rails.logger.info(">>>>>>>>>>>>>> plain text: #{output}")
    output
  end

  # Reports whether +string+ parses as JSON to a truthy value. Never raises.
  #
  # The result is the truthiness of the parsed value, so input that parses to
  # nil or false is reported as false, as is anything JSON.parse rejects
  # (including nil and other non-string input).
  #
  # @param string [String, nil] candidate JSON text
  # @return [Boolean]
  def JSON?(string)
    !!JSON.parse(string)
  rescue StandardError
    false
  end

  private

  # Runs the Tika jar with the given flags against +file+ and returns the
  # output captured by Subexec.
  #
  # Every argument is shell-escaped, so paths containing spaces, quotes or
  # other shell metacharacters reach Tika as a single argument.
  def run_tika(file, *flags)
    command = [
      "java", "-jar", File.join(Rails.root.to_s, TIKA_JAR),
      *flags,
      File.expand_path(file)
    ].shelljoin

    Subexec.run(command, :timeout => TIKA_TIMEOUT).output
  end
end
Comment