forked from ddrilling/AsbCloudServer
Степанов Дмитрий
5896af9c81
Для поиска похожих ключей по значению ячейки используется алгоритм шинглов.
99 lines
2.4 KiB
C#
99 lines
2.4 KiB
C#
using System;
|
||
using System.Collections.Generic;
|
||
using System.Linq;
|
||
using System.Text.RegularExpressions;
|
||
|
||
namespace AsbCloudInfrastructure.Services.WellOperationImport.FileParser.StringSimilarity;
|
||
|
||
/// <summary>
/// Computes the cosine similarity between strings represented as
/// character k-shingle frequency profiles (shingle -> number of occurrences).
/// </summary>
public class CosineSimilarity
{
    private const int DefaultK = 2;

    // Built once instead of on every call. Matches every character that is NOT
    // a lowercase Latin letter, a lowercase Cyrillic letter in the а-я range, or a digit.
    // Whitespace and punctuation are matched too, so they are removed from the text.
    private static readonly Regex NonAlphanumeric = new Regex("[^a-zа-я0-9]", RegexOptions.Compiled);

    /// <summary>
    /// Shingle length (number of characters per shingle).
    /// </summary>
    protected int K { get; }

    /// <summary>
    /// Creates an instance that builds profiles from shingles of length <paramref name="k"/>.
    /// </summary>
    /// <param name="k">Shingle length; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="k"/> is zero or negative.</exception>
    public CosineSimilarity(int k)
    {
        if (k <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(k), "k should be positive!");
        }

        K = k;
    }

    /// <summary>
    /// Creates an instance with the default shingle length of 2.
    /// </summary>
    public CosineSimilarity() : this(DefaultK) { }

    /// <summary>
    /// Returns the cosine of the angle between two shingle profiles:
    /// dot product divided by the product of the Euclidean norms.
    /// </summary>
    /// <param name="profile1">First profile (shingle -> count).</param>
    /// <param name="profile2">Second profile (shingle -> count).</param>
    /// <returns>
    /// The cosine similarity, or 0 when either profile has a zero norm
    /// (for example, an empty profile), where the cosine is undefined.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either profile is <c>null</c>.</exception>
    public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
    {
        if (profile1 is null)
        {
            throw new ArgumentNullException(nameof(profile1));
        }

        if (profile2 is null)
        {
            throw new ArgumentNullException(nameof(profile2));
        }

        var denominator = Norm(profile1) * Norm(profile2);

        // An exact comparison is intended here: the product is exactly 0.0 only when
        // one of the norms is 0.0. Dividing would yield NaN (0.0 / 0.0), and NaN makes
        // every comparison against a threshold false, so report "no similarity" instead.
        if (denominator == 0)
        {
            return 0;
        }

        return DotProduct(profile1, profile2) / denominator;
    }

    /// <summary>
    /// Builds the shingle profile of a string: the string is normalized and every
    /// substring of length <see cref="K"/> is counted.
    /// </summary>
    /// <param name="s">Source text; may be <c>null</c>, empty or whitespace.</param>
    /// <returns>
    /// A new dictionary mapping each shingle to its number of occurrences. It is empty
    /// when <paramref name="s"/> is null/blank or the normalized text is shorter than <see cref="K"/>.
    /// </returns>
    public Dictionary<string, int> GetProfile(string s)
    {
        var shingles = new Dictionary<string, int>();

        if (string.IsNullOrWhiteSpace(s))
        {
            return shingles;
        }

        var cleanString = Stemming(s);

        for (var i = 0; i <= cleanString.Length - K; i++)
        {
            var shingle = cleanString.Substring(i, K);

            // TryGetValue leaves count at 0 for a shingle that has not been seen yet.
            shingles.TryGetValue(shingle, out var count);
            shingles[shingle] = count + 1;
        }

        return shingles;
    }

    /// <summary>
    /// Normalizes text before shingling: lowercases it and removes every character
    /// other than a-z, а-я and 0-9.
    /// </summary>
    /// <param name="s">Non-null source text.</param>
    /// <returns>
    /// The normalized text, or an empty string when at most one character remains.
    /// </returns>
    private static string Stemming(string s)
    {
        // Invariant lowercasing keeps the result independent of the current culture
        // (under tr-TR, ToLower() maps 'I' to 'ı', which the filter would then drop).
        var cleaned = NonAlphanumeric.Replace(s.ToLowerInvariant(), string.Empty);

        // NOTE(review): spaces are removed by the filter above, so the text is always
        // handled as a single token. The previous implementation split the cleaned text
        // on ' ' and dropped tokens of length <= 1, which had the same effect as this
        // check. If the intent was to drop single-character words, confirm with the
        // callers first, since that would change the resulting similarity values.
        return cleaned.Length > 1 ? cleaned : string.Empty;
    }

    /// <summary>
    /// Euclidean norm of a profile: square root of the sum of squared counts.
    /// </summary>
    private static double Norm(IDictionary<string, int> profile)
    {
        double sumOfSquares = 0;

        foreach (var entry in profile)
        {
            // Convert to double before multiplying so that large counts cannot overflow int.
            double count = entry.Value;
            sumOfSquares += count * count;
        }

        return Math.Sqrt(sumOfSquares);
    }

    /// <summary>
    /// Dot product of two profiles: sum over shared shingles of the product of their counts.
    /// </summary>
    private static double DotProduct(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
    {
        // Iterate over the smaller profile and look up in the larger one to minimize lookups.
        var smallProfile = profile2;
        var largeProfile = profile1;

        if (profile1.Count < profile2.Count)
        {
            smallProfile = profile1;
            largeProfile = profile2;
        }

        double sum = 0;

        foreach (var entry in smallProfile)
        {
            if (largeProfile.TryGetValue(entry.Key, out var otherCount))
            {
                sum += 1.0 * entry.Value * otherCount;
            }
        }

        return sum;
    }
}
|