DD.WellWorkover.Cloud/AsbCloudInfrastructure/Services/WellOperationImport/FileParser/StringSimilarity/CosineSimilarity.cs
Степанов Дмитрий 5896af9c81 Добавил алгоритм проверки сходства строк
Для поиска похожих ключей по значению ячейки используется алгоритм шинглов.
2023-09-04 14:03:49 +05:00

99 lines
2.4 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace AsbCloudInfrastructure.Services.WellOperationImport.FileParser.StringSimilarity;
/// <summary>
/// Cosine similarity between strings represented as k-shingle profiles
/// (a map from every substring of length <see cref="K"/> to its occurrence count).
/// </summary>
public class CosineSimilarity
{
    private const int DefaultK = 2;

    // Cached so the pattern is not re-parsed on every call. Keeps only lower-case Latin
    // letters, Cyrillic letters in the а-я range, and digits; everything else, including
    // whitespace, is removed.
    // NOTE(review): 'ё' lies outside the а-я range and is therefore stripped as well -
    // confirm whether that is intended.
    private static readonly Regex NonAlphanumericRegex =
        new Regex("[^a-zа-я0-9]", RegexOptions.Compiled);

    /// <summary>Shingle length used by <see cref="GetProfile"/>.</summary>
    protected int K { get; }

    /// <summary>Creates a similarity calculator with the given shingle length.</summary>
    /// <param name="k">Shingle length; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="k"/> is zero or negative.</exception>
    public CosineSimilarity(int k)
    {
        if (k <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(k), "k should be positive!");
        }

        K = k;
    }

    /// <summary>Creates a similarity calculator with the default shingle length (2).</summary>
    public CosineSimilarity() : this(DefaultK) { }

    /// <summary>
    /// Computes the cosine of the angle between two shingle profiles:
    /// dot product divided by the product of the Euclidean norms.
    /// </summary>
    /// <param name="profile1">First shingle profile.</param>
    /// <param name="profile2">Second shingle profile.</param>
    /// <returns>
    /// The cosine similarity, or 0 when either profile has a zero norm (for example an
    /// empty profile), where the ratio would otherwise be NaN.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either profile is <c>null</c>.</exception>
    public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
    {
        if (profile1 is null)
        {
            throw new ArgumentNullException(nameof(profile1));
        }

        if (profile2 is null)
        {
            throw new ArgumentNullException(nameof(profile2));
        }

        var denominator = Norm(profile1) * Norm(profile2);

        // An empty profile has norm 0; without this guard 0/0 would yield NaN,
        // which silently fails every threshold comparison made by a caller.
        if (denominator == 0)
        {
            return 0;
        }

        return DotProduct(profile1, profile2) / denominator;
    }

    /// <summary>
    /// Builds the shingle profile of a string: every substring of length <see cref="K"/>
    /// of the cleaned text, mapped to the number of times it occurs.
    /// </summary>
    /// <param name="s">Source text; may be <c>null</c> or blank.</param>
    /// <returns>
    /// The profile; empty when <paramref name="s"/> is null/blank or when the cleaned
    /// text is shorter than <see cref="K"/>.
    /// </returns>
    public Dictionary<string, int> GetProfile(string s)
    {
        var shingles = new Dictionary<string, int>();

        if (string.IsNullOrWhiteSpace(s))
        {
            return shingles;
        }

        var cleanString = Stemming(s);
        var lastStart = cleanString.Length - K;

        for (var start = 0; start <= lastStart; start++)
        {
            var shingle = cleanString.Substring(start, K);

            // TryGetValue leaves count at 0 for a shingle that has not been seen yet.
            shingles.TryGetValue(shingle, out var count);
            shingles[shingle] = count + 1;
        }

        return shingles;
    }

    /// <summary>
    /// Normalizes text before shingling: lower-cases it (culture-invariant) and removes
    /// every character that is not a-z, а-я or 0-9.
    /// </summary>
    /// <param name="s">Non-null source text.</param>
    /// <returns>The cleaned text; empty when at most one character survives cleaning.</returns>
    private static string Stemming(string s)
    {
        // Invariant lower-casing: culture-specific rules (e.g. Turkish 'I' -> 'ı') would
        // turn ASCII letters into characters that the filter below then discards.
        var cleaned = NonAlphanumericRegex.Replace(s.ToLowerInvariant(), string.Empty);

        // The filter strips spaces too, so the text can never be split into words here;
        // the previous Split(' ') + "length > 1" filter only ever blanked a result of at
        // most one character. That effective behaviour is kept as-is.
        // NOTE(review): it looks like dropping one-letter words was the goal - confirm
        // before changing, since it would alter similarity scores for existing callers.
        return cleaned.Length > 1 ? cleaned : string.Empty;
    }

    /// <summary>Euclidean (L2) norm of the profile's count vector.</summary>
    private static double Norm(IDictionary<string, int> profile)
    {
        double sumOfSquares = 0;

        foreach (var entry in profile)
        {
            // Promote to double before multiplying so large counts cannot overflow int.
            sumOfSquares += 1.0 * entry.Value * entry.Value;
        }

        return Math.Sqrt(sumOfSquares);
    }

    /// <summary>Dot product of two profiles over the shingles they have in common.</summary>
    private static double DotProduct(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
    {
        // Iterate the smaller profile and probe the larger one to minimize lookups.
        var smallProfile = profile2;
        var largeProfile = profile1;

        if (profile1.Count < profile2.Count)
        {
            smallProfile = profile1;
            largeProfile = profile2;
        }

        double sum = 0;

        foreach (var entry in smallProfile)
        {
            if (largeProfile.TryGetValue(entry.Key, out var otherCount))
            {
                sum += 1.0 * entry.Value * otherCount;
            }
        }

        return sum;
    }
}