📦 calda / corpus-assisted-google-translate

📄 tfidf.swift · 55 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import Foundation

// an implementation of the tf-idf (term frequency — inverse document frequency) algorithm
// http://www.tfidf.com

/// Computes the term frequency of `term` inside `document`.
///
/// The number of (diacritic-insensitive, via `String.count(of:)`) occurrences of `term` is divided by
/// the number of term-sized slots in the document: the document's word count divided by the term's
/// word count, rounded down. `words` is declared outside this file; it is only used for its `count`.
///
/// - Parameters:
///   - term: the word or phrase to look for.
///   - document: the text to search.
/// - Returns: occurrences divided by available slots, or `0` when the document has no slot for the
///   term (empty document, empty term, or a document with fewer words than the term). Without that
///   guard the division would produce NaN or infinity.
func termFrequency(of term: String, in document: String) -> Double {
    let totalTermsInDocument = floor(Double(document.words.count) / Double(term.words.count))

    // a zero, infinite or NaN denominator means the term cannot meaningfully occur in this document
    guard totalTermsInDocument.isFinite, totalTermsInDocument > 0 else {
        return 0
    }

    let numberOfAppearancesInDocument = Double(document.count(of: term))
    return numberOfAppearancesInDocument / totalTermsInDocument
}

/// Computes the inverse document frequency of `term` across `allDocuments`.
///
/// A document counts as containing the term when a diacritic-insensitive search finds it, which is
/// the same matching rule `String.count(of:)` uses when counting occurrences for the term frequency.
///
/// - Parameters:
///   - term: the word or phrase to look for.
///   - allDocuments: every document in the corpus.
/// - Returns: `log(total documents / documents containing term)`, or `0` when no document contains
///   the term (including an empty corpus), so the result is never infinite or NaN.
func inverseDocumentFrequency(of term: String, allDocuments: [String]) -> Double {
    let numberOfDocumentsWithTerm = allDocuments.filter {
        $0.range(of: term, options: [.diacriticInsensitive]) != nil
    }.count

    // dividing by zero would give log(inf) = inf (or NaN for an empty corpus)
    guard numberOfDocumentsWithTerm > 0 else {
        return 0
    }

    return log(Double(allDocuments.count) / Double(numberOfDocumentsWithTerm))
}

/// Computes the tf-idf score of `term` for `document` relative to `allDocuments`.
///
/// - Returns: the product of `termFrequency(of:in:)` and `inverseDocumentFrequency(of:allDocuments:)`.
func tfidf(of term: String, in document: String, allDocuments: [String]) -> Double {
    let frequencyInDocument = termFrequency(of: term, in: document)
    let rarityAcrossCorpus = inverseDocumentFrequency(of: term, allDocuments: allDocuments)
    return frequencyInDocument * rarityAcrossCorpus
}


/// Builds one space-separated string from the translated text of `entries`.
///
/// Entries whose `translatedText` is `nil` are skipped. Each remaining translation is trimmed of
/// surrounding whitespace and newlines, then has a fixed set of punctuation marks removed.
func contiguousDocument(from entries: [StringsEntry]) -> String {
    // punctuation that isn't meaningful on the word-level
    let ignoredPunctuation = [".", ",", "(", ")", "?", "¿", "!", "¡"]

    var cleanedTranslations: [String] = []
    for entry in entries {
        guard let translation = entry.translatedText else {
            continue
        }

        let trimmed = translation.trimmingCharacters(in: .whitespacesAndNewlines)
        let stripped = ignoredPunctuation.reduce(trimmed) { partial, mark in
            partial.replacingOccurrences(of: mark, with: "")
        }
        cleanedTranslations.append(stripped)
    }

    return cleanedTranslations.joined(separator: " ")
}


extension String {
    
    /// Counts the non-overlapping, diacritic-insensitive occurrences of `stringToFind` in this string.
    ///
    /// The search moves forward from the end of each match, so only occurrences that are present in
    /// the original text are counted. (Deleting each match and searching again from the start would
    /// let the leftover text form new matches, e.g. "aabb" would report two occurrences of "ab".)
    ///
    /// - Parameter stringToFind: the text to look for.
    /// - Returns: the number of occurrences; `0` when `stringToFind` is empty.
    func count(of stringToFind: String) -> Int {
        guard !stringToFind.isEmpty else {
            return 0
        }
        
        var count = 0
        var searchStart = startIndex
        // the `!foundRange.isEmpty` check guarantees forward progress on every iteration
        while searchStart < endIndex,
              let foundRange = range(of: stringToFind,
                                     options: [.diacriticInsensitive],
                                     range: searchStart..<endIndex),
              !foundRange.isEmpty {
            count += 1
            searchStart = foundRange.upperBound
        }
        return count
    }
}