#------------------------------------------------------------
# TITLE: example-REXML-problem.rb
# AUTHOR: MAS
# DATE: 2012-11-04
# VERSION: 1.0
#------------------------------------------------------------
# DESCRIPTION:
# ...
#
# This program illustrates a problem with REXML. The input
# file contains HTML entities like ’. These are
# converted on the fly to their literal Unicode
# code point, which is NOT the same as their
# original HTML function. Note that these
# transliterations seem to occur only when using ANY
# form of XPath expression.
#
#------------------------------------------------------------
# REQUIREMENTS:
# Ruby 1.9 (for UTF ability)
# REXML capability (may be installed already with 1.9?)
#------------------------------------------------------------
# LIABILITY:
# USE AT YOUR OWN RISK!!
#------------------------------------------------------------
# USAGE:
# Run program as: ruby example-REXML-problem.rb
# Input/output file names can be specified in the
# variables below.
#------------------------------------------------------------
# CHANGE THESE VARIABLES AS NEEDED:
input_source_file = "sample-REXML-input.xml"
output_source_file = "sample-REXML-output.xml"
# NOTE(review): this_year and data_version are not referenced by any code
# visible in this file; presumably they were interpolated into the output
# template below -- confirm before removing them.
this_year = 2012
data_version = '1815' # Match with whatever current CN puts out
#------------------------------------------------------------
# AVOID CHANGING ANYTHING BELOW THIS LINE UNLESS YOU KNOW
# WHAT YOU'RE DOING
#------------------------------------------------------------
require 'date'
require "rexml/document"
require 'cgi'
include REXML

# Returns the text of the child element called +name+ under +element+,
# or an empty string when that child is absent or has no text content.
# This keeps a single incomplete <node> from aborting the whole run
# with a NoMethodError on nil.
def child_text(element, name)
  child = element.elements[name]
  child ? child.text.to_s : ""
end

# Parse the input. The block form of File.open closes the handle as soon
# as REXML has consumed it.
doc = File.open(input_source_file) { |file| Document.new(file) }

# Every <node> element, at any depth, becomes one <note> in the output.
# This XPath lookup is where the entity conversion described in the
# header is observed.
names = XPath.match( doc, "//node" )

# Skeleton of the output document.
# NOTE(review): the original template text was lost from this file. The
# code below only requires a <notebook> root element, so a minimal one is
# used here -- restore the original attributes/children if they are known.
string = <<EOF
<notebook>
</notebook>
EOF

doc_out = Document.new string
doc_out << XMLDecl.new
#tags = doc_out.elements["notebook/tags"]
notes = doc_out.elements["notebook"]

# Stamp format is YYYYMMDD followed by a fixed "T080000" time part.
strStamp = DateTime.now.strftime("%Y%m%dT080000")

names.each do |element|
  title = child_text(element, "title").strip
  article = child_text(element, "article")
  #puts article
  el = notes.add_element "note"
  el.attributes["title"] = title
  el.attributes["tags"] = "IMPORT"
  el.attributes["created"] = strStamp
  el.attributes["modified"] = strStamp
  el.attributes["source"] = ''
  el.attributes["link"] = ''
  el.attributes["plainText"] = '0'
  # The article body is wrapped in a CDATA section inside the note.
  el.add_text(CData.new(article))
end

#File.open(output_source_file, 'wt:UTF-16LE') do |f|
# I output here in UTF-8 so that the user can observe and
# confirm the stated problem with code conversion. The
# conversion problem has actually already occurred by
# the time we have reached this point.
File.open(output_source_file, 'wt:UTF-8') do |f|
  doc_out.write f
end