class PDF::Reader::PageTextReceiver
Builds a UTF-8 string of all the text on a single page by processing all the operaters in a content stream.
Constants
- SPACE
Attributes
options[R]
state[R]
Public Instance Methods
content()
click to toggle source
deprecated
# File lib/pdf/reader/page_text_receiver.rb, line 73 def content mediabox = @page.rectangles[:MediaBox] PageLayout.new(runs, mediabox).to_s end
invoke_xobject(label)
click to toggle source
XObjects
# File lib/pdf/reader/page_text_receiver.rb, line 112 def invoke_xobject(label) @state.invoke_xobject(label) do |xobj| case xobj when PDF::Reader::FormXObject then xobj.walk(self) end end end
move_to_next_line_and_show_text(str)
click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 98 def move_to_next_line_and_show_text(str) # ' @state.move_to_start_of_next_line show_text(str) end
page=(page)
click to toggle source
starting a new page
# File lib/pdf/reader/page_text_receiver.rb, line 43 def page=(page) @state = PageState.new(page) @page = page @content = [] @characters = [] end
runs(opts = {})
click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 50 def runs(opts = {}) runs = @characters if rect = opts.fetch(:rect, @page.rectangles[:CropBox]) runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect) end if opts.fetch(:skip_zero_width, true) runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs) end if opts.fetch(:skip_overlapping, true) runs = OverlappingRunsFilter.exclude_redundant_runs(runs) end if opts.fetch(:merge, true) runs = merge_runs(runs) end runs end
set_spacing_next_line_show_text(aw, ac, string)
click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 103 def set_spacing_next_line_show_text(aw, ac, string) # " @state.set_word_spacing(aw) @state.set_character_spacing(ac) move_to_next_line_and_show_text(string) end
show_text(string)
click to toggle source
Text Showing Operators
record text that is drawn on the page
# File lib/pdf/reader/page_text_receiver.rb, line 82 def show_text(string) # Tj (AWAY) internal_show_text(string) end
show_text_with_positioning(params)
click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 86 def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)] params.each do |arg| if arg.is_a?(String) internal_show_text(arg) elsif arg.is_a?(Numeric) @state.process_glyph_displacement(0, arg, false) else # skip it end end end
Private Instance Methods
apply_rotation(x, y)
click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 148 def apply_rotation(x, y) if @page.rotate == 90 tmp = x x = y y = tmp * -1 elsif @page.rotate == 180 y *= -1 x *= -1 elsif @page.rotate == 270 tmp = y y = x x = tmp * -1 end return x, y end
group_chars_into_runs(chars)
click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 174 def group_chars_into_runs(chars) chars.each_with_object([]) do |char, runs| if runs.empty? runs << char elsif runs.last.mergable?(char) runs[-1] = runs.last + char else runs << char end end end
internal_show_text(string)
click to toggle source
# File lib/pdf/reader/page_text_receiver.rb, line 123 def internal_show_text(string) PDF::Reader::Error.validate_type_as_malformed(string, "string", String) if @state.current_font.nil? raise PDF::Reader::MalformedPDFError, "current font is invalid" end glyphs = @state.current_font.unpack(string) glyphs.each_with_index do |glyph_code, index| # paint the current glyph newx, newy = @state.trm_transform(0,0) newx, newy = apply_rotation(newx, newy) utf8_chars = @state.current_font.to_utf8(glyph_code) # apply to glyph displacment for the current glyph so the next # glyph will appear in the correct position glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code) th = 1 scaled_glyph_width = glyph_width * @state.font_size * th unless utf8_chars == SPACE @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars) end @state.process_glyph_displacement(glyph_width, 0, utf8_chars == SPACE) end end
merge_runs(runs)
click to toggle source
take a collection of TextRun
objects and merge any that are in close proximity
# File lib/pdf/reader/page_text_receiver.rb, line 166 def merge_runs(runs) runs.group_by { |char| char.y.to_i }.map { |y, chars| group_chars_into_runs(chars.sort) }.flatten.sort end