class PDF::Reader::PageLayout
Takes a collection of TextRun
objects and renders them into a single string that best approximates the way they'd appear on a rendered PDF page.
The media box should be a four-number array describing the dimensions of the page to be rendered, as given by the page's MediaBox attribute.
Constants
- DEFAULT_FONT_SIZE
Public Class Methods
new(runs, mediabox)
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 17
# Builds a layout from a collection of text runs and the page's media box.
#
# runs     - collection of run objects; each must respond to font_size,
#            mean_character_width and x. Redundant runs are filtered out and
#            the remainder merged via merge_runs before being stored.
# mediabox - four-element array; indices 0/2 are used for the page width and
#            indices 1/3 for the page height.
#
# Raises ArgumentError when mediabox is nil.
def initialize(runs, mediabox)
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?

  @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))

  # mean always returns a number (0 for an empty collection), so a zero check
  # is the only fallback that can ever trigger.
  average_font_size = mean(@runs.map(&:font_size))
  @mean_font_size = average_font_size == 0 ? DEFAULT_FONT_SIZE : average_font_size
  @mean_glyph_width = mean(@runs.map(&:mean_character_width))

  @page_width = mediabox[2] - mediabox[0]
  @page_height = mediabox[3] - mediabox[1]

  # Left-most run position; 0 when there are no runs at all.
  @x_offset = @runs.map(&:x).min || 0
end
Public Instance Methods
to_s()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 29
# Renders the runs onto a grid of row_count rows by col_count columns of
# spaces and returns the result as a single newline-joined string.
#
# Returns "" when there are no runs or the grid has no rows. Leading and
# trailing blank rows are dropped (via interesting_rows) and every remaining
# row is right-stripped.
def to_s
  return "" if @runs.empty?
  return "" if row_count.zero?

  page = Array.new(row_count) { " " * col_count }
  @runs.each do |run|
    x_pos = ((run.x - @x_offset) / col_multiplier).round
    y_pos = row_count - (run.y / row_multiplier).round
    next unless y_pos.between?(0, row_count) && x_pos.between?(0, col_count)

    # y_pos is 1-based from the top of the page. A run that rounds up to the
    # very top edge yields y_pos == 0; clamp it to the first row rather than
    # letting index -1 wrap around to the last row of the page.
    row_index = [y_pos - 1, 0].max
    local_string_insert(page[row_index], run.text, x_pos)
  end
  interesting_rows(page).map(&:rstrip).join("\n")
end
Private Instance Methods
col_count()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 66
# Number of character columns in the text grid: the page width divided by the
# mean glyph width, scaled by 1.05 and rounded down. The value is computed
# once and cached in @col_count.
def col_count
  return @col_count if @col_count

  glyphs_per_row = @page_width / @mean_glyph_width
  @col_count = (glyphs_per_row * 1.05).floor
end
col_multiplier()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 74
# Page width divided by col_count, as a Float. to_s divides a run's
# horizontal offset by this value to obtain its column. Cached in
# @col_multiplier after the first call.
def col_multiplier
  return @col_multiplier if @col_multiplier

  width = @page_width.to_f
  @col_multiplier = width / col_count.to_f
end
each_line() { |y, collection| ... }
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 86
# Sorts the runs, groups them by the integer part of their y value, and
# yields each (y, runs_on_that_line) pair to the caller's block.
#
# Returns an array holding the block's result for every group.
def each_line(&block)
  lines = @runs.sort.group_by { |run| run.y.to_i }
  lines.map do |y, collection|
    yield y, collection
  end
end
group_chars_into_runs(chars)
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 104
# Walks chars in order and folds each element into the previous run when
# that run reports mergable?(element); otherwise the element starts a new run.
#
# chars - ordered collection whose elements respond to mergable?(other)
#         and +(other).
#
# Returns a new array of runs. The chars argument is left untouched (the
# earlier implementation emptied it with shift).
def group_chars_into_runs(chars)
  chars.each_with_object([]) do |char, runs|
    if !runs.empty? && runs.last.mergable?(char)
      runs[-1] = runs.last + char
    else
      runs << char
    end
  end
end
interesting_rows(rows)
click to toggle source
given an array of strings, return a new array with empty rows from the beginning and end removed.
interesting_rows([ "", "one", "two", "" ]) => [ "one", "two" ]
# File lib/pdf/reader/page_layout.rb, line 51
# Given an array of strings, returns a new Array without the blank rows at
# the beginning and end. A row counts as blank when it is empty after strip.
# Blank rows between the first and last non-blank row are kept.
#
#   interesting_rows([ "", "one", "two", "" ]) => [ "one", "two" ]
#
# Returns [] when every row is blank or rows is empty.
def interesting_rows(rows)
  first_line_with_text = rows.index { |row| !row.strip.empty? }
  return [] unless first_line_with_text

  last_line_with_text = rows.rindex { |row| !row.strip.empty? }
  # Slice directly so callers get an Array; the previous block-less `.map`
  # handed back an Enumerator instead.
  rows[first_line_with_text..last_line_with_text]
end
local_string_insert(haystack, needle, index)
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 118
# Overwrites part of haystack in place with a copy of needle, starting at
# index and covering needle.length characters.
def local_string_insert(haystack, needle, index)
  last_index = index + needle.length - 1
  haystack[index..last_index] = String.new(needle)
end
mean(collection)
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 78
# Arithmetic mean of the values in collection.
#
# Returns the Integer 0 for an empty collection, otherwise the sum of the
# elements divided by the element count as a Float.
def mean(collection)
  return 0 if collection.size == 0

  total = collection.inject(0) { |sum, value| sum + value }
  total / collection.size.to_f
end
merge_runs(runs)
click to toggle source
take a collection of TextRun
objects and merge any that are in close proximity
# File lib/pdf/reader/page_layout.rb, line 96
# Groups the runs by the integer part of their y value, sorts each group and
# hands it to group_chars_into_runs, then flattens the per-group results
# into one sorted array.
def merge_runs(runs)
  lines = runs.group_by { |run| run.y.to_i }
  merged_lines = lines.map do |_y, line_runs|
    group_chars_into_runs(line_runs.sort)
  end
  merged_lines.flatten.sort
end
row_count()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 62
# Number of text rows in the grid: the page height divided by the mean font
# size, rounded down. Computed once and cached in @row_count.
def row_count
  return @row_count if @row_count

  rows_per_page = @page_height / @mean_font_size
  @row_count = rows_per_page.floor
end
row_multiplier()
click to toggle source
# File lib/pdf/reader/page_layout.rb, line 70
# Page height divided by row_count, as a Float. to_s divides a run's y value
# by this number to obtain its row. Cached in @row_multiplier after the
# first call.
def row_multiplier
  return @row_multiplier if @row_multiplier

  height = @page_height.to_f
  @row_multiplier = height / row_count.to_f
end