Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,28 @@ AIDamerauLevenshteinDistanceTest >> testDistanceToUsingAIDamerauLevenshteinDista

{ #category : 'tests' }
AIDamerauLevenshteinDistanceTest >> testFillFirstTwoRowsAndColumns [
	"Checks that #fillFirstTwoRowsAndColumnsWith:and: writes the sentinel value into the first row and column, the counters 0..3 into the second row and column, and that every remaining cell compares equal to nil."

	| max expected |
	damerauLevenshtein distanceMatrix: (CTArray2D width: 5 height: 5).

	"Sentinel = size of 'AAAAA' + size of 'BBBBB'."
	max := 10.

	expected := CTArray2D width: 5 height: 5.
	expected atAllPut: nil.

	"First row and first column hold the sentinel."
	1 to: 5 do: [ :i |
		expected atColumn: 1 atRow: i put: max.
		expected atColumn: i atRow: 1 put: max ].

	"Second row and second column count up from 0 (cell 2@2 is 0 in both directions)."
	2 to: 5 do: [ :i |
		expected atColumn: i atRow: 2 put: i - 2.
		expected atColumn: 2 atRow: i put: i - 2 ].

	damerauLevenshtein fillFirstTwoRowsAndColumnsWith: 'AAAAA' and: 'BBBBB'.
	self assert: damerauLevenshtein distanceMatrix equals: expected
]
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ AIEuclideanDistanceTest >> testEuclideanDistanceThreeDimensions [

self
assert: (metric distanceBetween: #( -5.1 4 -3.1 ) and: #( 4 5.9 -2.2 ))
closeTo: 9.3397
closeTo: 9.339700209321496
]

{ #category : '*AI-EditDistances-Tests' }
AIEuclideanDistanceTest >> testEuclideanDistanceTwoDimensions [

self
assert: (metric distanceBetween: #( -3.54 7 ) and: #( -11.64 9.9 ))
closeTo: 8.603488.
closeTo: 8.603487664894978.

self
assert: (metric distanceBetween: #( 0 1 ) and: #( 1 0))
Expand Down
17 changes: 0 additions & 17 deletions src/AI-EditDistances/AICosineSimilarityDistance.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,3 @@ AICosineSimilarityDistance >> distanceBetween: anArray and: anotherArray [
^ num / (size1 * size2)

]

{ #category : 'api' }
AICosineSimilarityDistance >> distanceBetween: firstCollection and: secondCollection [
	"Answer the cosine similarity of two numeric collections of equal size: their dot product divided by the product of their Euclidean norms.

	Signals an error when the sizes differ or when either collection holds a non-numeric element.
	Two zero vectors answer 1.0. When exactly one vector is zero the quotient is undefined; 0.0 is answered instead of letting the division by zero fail.

	NOTE(review): this selector is also defined earlier in this file for the same class (distanceBetween: anArray and: anotherArray) - confirm which definition is meant to be kept."

	| dotProduct normA normB |
	firstCollection size = secondCollection size ifFalse: [
		self error: 'Collections must have the same length' ].
	(firstCollection allSatisfy: [ :x | x isNumber ]) ifFalse: [
		self error: 'First collection contains non-numeric elements' ].
	(secondCollection allSatisfy: [ :x | x isNumber ]) ifFalse: [
		self error: 'Second collection contains non-numeric elements' ].

	dotProduct := (firstCollection with: secondCollection collect: [ :a :b | a * b ]) sum.
	normA := (firstCollection collect: [ :x | x * x ]) sum sqrt.
	normB := (secondCollection collect: [ :x | x * x ]) sum sqrt.

	"Zero vectors are considered identical."
	(normA = 0 and: [ normB = 0 ]) ifTrue: [ ^ 1.0 ].

	"Only one norm is zero: the denominator would be zero, so answer 0.0 explicitly."
	(normA = 0 or: [ normB = 0 ]) ifTrue: [ ^ 0.0 ].

	^ dotProduct / (normA * normB)
]
142 changes: 70 additions & 72 deletions src/AI-EditDistances/AIDamerauLevenshteinDistance.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ Class {

{ #category : 'private' }
AIDamerauLevenshteinDistance >> calculateMinValue: cost at: i at: j using: lastMatchingRow and: lastMatchColumn [

"Here we calculate the value of all the possible operations we can do (addition, deletion, substitution and transposition).

- Each operation costs 1.
Expand All @@ -41,74 +40,71 @@ AIDamerauLevenshteinDistance >> calculateMinValue: cost at: i at: j using: lastM
- We make sure our lastMatchingRow and lastMatchColumn variables are greater than 1 so when calculating the value of this we don't get a (0,0) cell - since it does not exist."

| addition deletion substitution transposition minValue upperCell leftCell upperLeftCell |
upperCell := distanceMatrix at: i at: j - 1.
addition := upperCell + 1.

leftCell := distanceMatrix at: i - 1 at: j.
deletion := leftCell + 1.
upperLeftCell := distanceMatrix at: i - 1 at: j - 1.
substitution := upperLeftCell + cost.

transposition := (lastMatchingRow > 1 and: [ lastMatchColumn > 1 ])
ifTrue: [ (distanceMatrix at: lastMatchingRow - 1 at: lastMatchColumn - 1)
+ (i - lastMatchingRow - 1)
+ 1
+ (j - lastMatchColumn - 1) ]
ifFalse: [ distanceMatrix at: 1 at: 1 ].

minValue := { addition . deletion . substitution . transposition } min.

distanceMatrix at: i at: j put: minValue
upperCell := distanceMatrix atColumn: j atRow: i - 1.
addition := upperCell + 1.
leftCell := distanceMatrix atColumn: j - 1 atRow: i.
deletion := leftCell + 1.
upperLeftCell := distanceMatrix atColumn: j - 1 atRow: i - 1.
substitution := upperLeftCell + cost.
transposition := Float infinity.
(lastMatchingRow > 1 and: [ lastMatchColumn > 1 ])
ifTrue: [
transposition := (distanceMatrix atColumn: lastMatchColumn - 1 atRow: lastMatchingRow - 1)
+ (i - lastMatchingRow - 1)
+ 1
+ (j - lastMatchColumn - 1) ].
minValue := { addition . deletion . substitution . transposition } min.
distanceMatrix atColumn: j atRow: i put: minValue
]

{ #category : 'private' }
AIDamerauLevenshteinDistance >> damerauLevenshteinAlgorithmFor: firstString and: secondString [
	"Here we implement the Damerau-Levenshtein algorithm, filling distanceMatrix cell by cell.

	- lastMatchingRow indicates the last previous row holding the current column's character (1 when there is none).
	- lastMatchColumn indicates the last column in this row where the characters matched (1 when there is none).
	- Matrix indexes are shifted by 2 with respect to string indexes, hence the loops starting at 3 and the 'i - 2' / 'j - 2' lookups.
	(Reminder: column's characters belong to the second string and row's characters to the first string)"

	| cost lastMatchColumn previousMatchColumn secondStringChar firstStringChar lastMatchingRow charPositions |
	self initializeDistanceMatrixWith: firstString and: secondString.
	charPositions := Dictionary new.

	3 to: distanceMatrix height do: [ :i |
		firstStringChar := firstString at: i - 2.
		lastMatchColumn := 1.

		3 to: distanceMatrix width do: [ :j |
			secondStringChar := secondString at: j - 2.
			lastMatchingRow := charPositions at: secondStringChar ifAbsent: 1.

			"The transposition term must use the match column known BEFORE looking at column j. Using j itself makes the term (j - lastMatchColumn - 1) equal to -1 and under-counts, e.g. 'aa' against 'a' would cost 0."
			previousMatchColumn := lastMatchColumn.

			cost := secondStringChar = firstStringChar
				        ifTrue: [ 0 ]
				        ifFalse: [ 1 ].
			cost = 0 ifTrue: [ lastMatchColumn := j ].

			self
				calculateMinValue: cost
				at: i
				at: j
				using: lastMatchingRow
				and: previousMatchColumn ].

		"Row i becomes the latest row holding firstStringChar only once the whole row is done."
		charPositions at: firstStringChar put: i ]
]

{ #category : 'api' }
AIDamerauLevenshteinDistance >> distanceBetween: firstString and: secondString [
	"Answer the Damerau-Levenshtein distance between the two strings.
	When one of them is empty the answer is the size of the other one; otherwise the distance matrix is computed and its bottom-right cell is answered."

	firstString isEmpty ifTrue: [ ^ secondString size ].
	secondString isEmpty ifTrue: [ ^ firstString size ].

	self damerauLevenshteinAlgorithmFor: firstString and: secondString.

	^ distanceMatrix
		  atColumn: distanceMatrix width
		  atRow: distanceMatrix height
]

{ #category : 'accessing' }
Expand All @@ -123,31 +119,33 @@ AIDamerauLevenshteinDistance >> distanceMatrix: aCollection [

{ #category : 'private' }
AIDamerauLevenshteinDistance >> fillFirstTwoRowsAndColumnsWith: firstString and: secondString [
	"It fills the first row and column with the maxDistance value (sum of both string sizes) and the second row and column with values starting with 0.
	Cells outside the first two rows and columns are not written here."

	| maxDistance |
	maxDistance := firstString size + secondString size.

	"First column and first row: sentinel value."
	1 to: distanceMatrix height do: [ :i |
		distanceMatrix atColumn: 1 atRow: i put: maxDistance ].
	1 to: distanceMatrix width do: [ :j |
		distanceMatrix atColumn: j atRow: 1 put: maxDistance ].

	"Second column and second row: 0, 1, 2, ... Both loops write 0 into cell 2@2, so no separate assignment is needed for it."
	2 to: distanceMatrix height do: [ :i |
		distanceMatrix atColumn: 2 atRow: i put: i - 2 ].
	2 to: distanceMatrix width do: [ :j |
		distanceMatrix atColumn: j atRow: 2 put: j - 2 ]
]

{ #category : 'private' }
AIDamerauLevenshteinDistance >> initializeDistanceMatrixWith: firstString and: secondString [
	"Create the distance matrix with one row per character of firstString and one column per character of secondString, plus two extra rows and columns, set every cell to 0 and then fill the first two rows and columns."

	distanceMatrix := CTArray2D
		                  width: secondString size + 2
		                  height: firstString size + 2.
	distanceMatrix atAllPut: 0.
	self fillFirstTwoRowsAndColumnsWith: firstString and: secondString
]
28 changes: 17 additions & 11 deletions src/AI-EditDistances/AIEpisodeDistance.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,24 @@ AIEpisodeDistance >> longestCommonSubsequenceLength: firstString and: secondStri
| m n dp |
m := firstString size.
n := secondString size.

dp := CTArray2D rows: m + 1 columns: n + 1.
1 to: m + 1 do: [ :i | dp at: i at: 1 put: 0 ].
1 to: n + 1 do: [ :j | dp at: 1 at: j put: 0 ].


dp := CTArray2D width: n + 1 height: m + 1.

1 to: m + 1 do: [ :i | dp atColumn: 1 atRow: i put: 0 ].
1 to: n + 1 do: [ :j | dp atColumn: j atRow: 1 put: 0 ].

1 to: m do: [ :i |
1 to: n do: [ :j |
(firstString at: i) = (secondString at: j)
ifTrue: [ dp at: i + 1 at: j + 1 put: ((dp at: i at: j) + 1) ]
ifFalse: [ dp at: i + 1 at: j + 1 put: ((dp at: i + 1 at: j) max: (dp at: i at: j + 1)) ]
]
].

^ dp at: m + 1 at: n + 1
ifTrue: [
dp atColumn: j + 1 atRow: i + 1 put: (dp atColumn: j atRow: i) + 1 ]
ifFalse: [
dp
atColumn: j + 1
atRow: i + 1
put:
((dp atColumn: j atRow: i + 1) max:
(dp atColumn: j + 1 atRow: i)) ] ] ].

^ dp atColumn: n + 1 atRow: m + 1
]
52 changes: 27 additions & 25 deletions src/AI-EditDistances/AILevenshteinDistance.class.st
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,27 @@ Class {
AILevenshteinDistance >> distanceBetween: firstString and: secondString [
	"Answer the Levenshtein distance between the two strings: the bottom-right cell of the matrix built by #distanceMatrixBasedOn:and:."

	| distanceMatrix |
	"If one of the strings is empty, return the other string's size"
	firstString isEmpty ifTrue: [ ^ secondString size ].
	secondString isEmpty ifTrue: [ ^ firstString size ].

	distanceMatrix := self
		                  distanceMatrixBasedOn: firstString
		                  and: secondString.

	^ distanceMatrix
		  atColumn: distanceMatrix width
		  atRow: distanceMatrix height
]

{ #category : 'private' }
AILevenshteinDistance >> distanceMatrixBasedOn: firstString and: secondString [

| distanceMatrix |
| distanceMatrix |
distanceMatrix := CTArray2D
rows: secondString size + 1
columns: firstString size + 1.

width: firstString size + 1
height: secondString size + 1.
self fillFirstRowAndColumn: distanceMatrix.

self fillStartingFromSecondRowAndColumn: distanceMatrix
Expand All @@ -48,30 +49,31 @@ AILevenshteinDistance >> distanceMatrixBasedOn: firstString and: secondString [
AILevenshteinDistance >> fillCellInMatrix: aMatrix at: i at: j basedOn: firstString and: secondString [
	"Store in row i, column j of aMatrix the cheapest of the three neighbouring cells (left, upper, upper-left) plus the price of the matching step.
	Columns follow firstString and rows follow secondString; matrix indexes are one ahead of string indexes."

	| cost leftCell upperCell upperLeftCell |
	"Setting the cost: 0 when the two characters are equal, 1 otherwise"
	cost := (firstString at: j - 1) = (secondString at: i - 1)
		        ifTrue: [ 0 ]
		        ifFalse: [ 1 ].

	leftCell := (aMatrix atColumn: j - 1 atRow: i) + 1.
	upperCell := (aMatrix atColumn: j atRow: i - 1) + 1.
	upperLeftCell := (aMatrix atColumn: j - 1 atRow: i - 1) + cost.

	"Calculate the min between the left, upper-left, and upper cells of our current cell"
	aMatrix
		atColumn: j
		atRow: i
		put: { leftCell. upperCell. upperLeftCell } min
]

{ #category : 'private' }
AILevenshteinDistance >> fillFirstRowAndColumn: aMatrix [
	"Fill the first row and column starting with 0: the cell in row i of the first column gets i - 1, and the cell in column j of the first row gets j - 1."

	1 to: aMatrix height do: [ :i |
		aMatrix atColumn: 1 atRow: i put: i - 1 ].

	1 to: aMatrix width do: [ :j |
		aMatrix atColumn: j atRow: 1 put: j - 1 ]
]

{ #category : 'private' }
Expand All @@ -85,4 +87,4 @@ AILevenshteinDistance >> fillStartingFromSecondRowAndColumn: aMatrix basedOn: fi
at: j
basedOn: firstString
and: secondString ] ].
]
]
Loading