Hello all,

The below code is too slow for practical use. I need it to run at least

20 times faster. Perhaps that is possible with some C code? I have no

experience with writing Ruby extensions. What are the pitfalls? Which

part of the code should be ported? Any pointers to get me started?

Cheers,

Martin

#!/usr/bin/env ruby

# IUPAC nucleotide pair ambiguity equivalents are saved in an

# array of bit fields.

BIT_A = 1 << 0

BIT_T = 1 << 1

BIT_C = 1 << 2

BIT_G = 1 << 3

EQUAL = Array.new(256, 0)

EQUAL[‘A’.ord] = BIT_A

EQUAL[‘T’.ord] = BIT_T

EQUAL[‘U’.ord] = BIT_T

EQUAL[‘C’.ord] = BIT_C

EQUAL[‘G’.ord] = BIT_G

EQUAL[‘M’.ord] = (BIT_A|BIT_C)

EQUAL[‘R’.ord] = (BIT_A|BIT_G)

EQUAL[‘W’.ord] = (BIT_A|BIT_T)

EQUAL[‘S’.ord] = (BIT_C|BIT_G)

EQUAL[‘Y’.ord] = (BIT_C|BIT_T)

EQUAL[‘K’.ord] = (BIT_G|BIT_T)

EQUAL[‘B’.ord] = (BIT_C|BIT_G|BIT_T)

EQUAL[‘D’.ord] = (BIT_A|BIT_G|BIT_T)

EQUAL[‘H’.ord] = (BIT_A|BIT_C|BIT_T)

EQUAL[‘V’.ord] = (BIT_A|BIT_C|BIT_G)

EQUAL[‘N’.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)

# Module containing code to locate nucleotide patterns in sequences

allowing for

# ambiguity codes and a given maximum edit distance.

# Insertions are nucleotides found in the pattern but not in the

sequence.

# Deletions are nucleotides found in the sequence but not in the

pattern.

# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):

# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf

module PatternMatcher

# str.match(pattern[, pos[, max_edit_distance]])

# → Match or nil

# Method to locate the next pattern match starting from a given

position. A match

# is allowed to contain a given maximum edit distance. If a match is

located a

# Match object will be returned otherwise nil.

def match(pattern, pos = 0, max_edit_distance = 0)

@pattern = pattern

@pos = pos

@max_edit_distance = max_edit_distance

@vector = vector_init

```
while @pos < @seq.length
vector_update
return match_new if match_found?
@pos += 1
end
```

end

# str.scan(pattern[, pos[, max_edit_distance]])

# → Array

# str.scan(pattern[, pos[, max_edit_distance]]) { |match|

# block

# }

# → Match

# Method to iterate through a sequence to locate pattern matches

starting

# from a given position and allowing for a maximum edit distance.

# Matches found in block context return the Match object. Otherwise

matches are

# returned in an Array.

def scan(pattern, pos = 0, max_edit_distance = 0)

matches = []

```
while match = match(pattern, pos, max_edit_distance)
if block_given?
yield match
else
matches << match
end
pos = match.pos + 1
end
return matches unless block_given?
```

end

private

# Method to initailize the score vector and return this.

def vector_init

vector = []

```
(0 ... @pattern.length + 1).each do |i|
vector[i] = Score.new(matches = 0, mismatches = 0, insertions = i)
end
vector
```

end

# Method to update the score vector.

def vector_update

new_vector = @vector.dup

```
(0 ... @pattern.length).each do |i|
if match?(@seq[@pos], @pattern[i])
new_vector[i + 1] = @vector[i].dup
new_vector[i + 1].matches += 1
else
mismatch = @vector[i].dup
insertion = new_vector[i].dup
deletion = @vector[i + 1].dup
if deletion?(mismatch, insertion, deletion)
deletion.deletions += 1
new_vector[i + 1] = deletion
elsif mismatch?(mismatch, insertion, deletion)
mismatch.mismatches += 1
new_vector[i + 1] = mismatch
elsif insertion?(mismatch, insertion, deletion)
insertion.insertions += 1
new_vector[i + 1] = insertion
end
end
end
@vector = new_vector
```

end

# Method to determine if a match occurred.

def match?(char1, char2)

(EQUAL[char1.upcase.ord] & EQUAL[char2.upcase.ord]) != 0

end

# Method to determine if a mismatch occured.

def mismatch?(mismatch, insertion, deletion)

if mismatch.edit_distance <= insertion.edit_distance and

mismatch.edit_distance <= deletion.edit_distance

true

end

end

# Method to determine if an insertion occured.

def insertion?(mismatch, insertion, deletion)

if insertion.edit_distance <= mismatch.edit_distance and

insertion.edit_distance <= deletion.edit_distance

true

end

end

# Method to determine if a deletion occured.

def deletion?(mismatch, insertion, deletion)

if deletion.edit_distance <= mismatch.edit_distance and

deletion.edit_distance <= insertion.edit_distance

true

end

end

# Method to print the score vector.

def vector_print

@vector.each do |s|

puts s

end

```
puts
```

end

# Method that returns a Match object initialized with

# information from the score vector.

def match_new

matches = @vector.last.matches

mismatches = @vector.last.mismatches

insertions = @vector.last.insertions

deletions = @vector.last.deletions

length = @pattern.length - insertions + deletions

pos = @pos - length + 1

match = @seq[pos … pos + length]

```
Match.new(pos, match, matches, mismatches, insertions, deletions,
```

length)

end

# Method that determines if a match was found by analyzing the score

vector.

def match_found?

if @vector.last.edit_distance <= @max_edit_distance

true

end

end

# Class to instantiate Score objects that holds score information.

class Score

attr_accessor :matches, :mismatches, :insertions, :deletions

```
def initialize(matches = 0, mismatches = 0, insertions = 0,
```

deletions = 0)

@matches = matches

@mismatches = mismatches

@insertions = insertions

@deletions = deletions

end

```
# Method to calculate and return the edit distance.
def edit_distance
self.mismatches + self.insertions + self.deletions
end
private
def to_s
"(#{[self.matches, self.mismatches, self.insertions,
```

self.deletions].join(‘,’)})"

end

end

# Class for creating Match objects which contain the description of a

# match between a nucleotide sequence and a pattern.

class Match

attr_reader :pos, :match, :matches, :mismatches, :insertions,

:deletions, :edit_distance, :length

```
def initialize(pos, match, matches, mismatches, insertions,
```

deletions, length)

@pos = pos

@match = match

@matches = matches

@mismatches = mismatches

@insertions = insertions

@deletions = deletions

@edit_distance = mismatches + insertions + deletions

@length = length

end

end

end