From 5896af9c81036ec086153c78d4456e26b671050a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A1=D1=82=D0=B5=D0=BF=D0=B0=D0=BD=D0=BE=D0=B2=20=D0=94?=
 =?UTF-8?q?=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9?=
Date: Mon, 4 Sep 2023 14:03:49 +0500
Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=B8=D0=BB=20?=
 =?UTF-8?q?=D0=B0=D0=BB=D0=B3=D0=BE=D1=80=D0=B8=D1=82=D0=BC=20=D0=BF=D1=80?=
 =?UTF-8?q?=D0=BE=D0=B2=D0=B5=D1=80=D0=BA=D0=B8=20=D1=81=D1=85=D0=BE=D0=B4?=
 =?UTF-8?q?=D1=81=D1=82=D0=B2=D0=B0=20=D1=81=D1=82=D1=80=D0=BE=D0=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Для поиска похожих ключей по значению ячейки используется алгоритм шинглов.
---
Review note: the hunk below was amended during review; the blob id on the
"index" line still refers to the pre-review file content.

 .../StringSimilarity/CosineSimilarity.cs      | 144 +++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 AsbCloudInfrastructure/Services/WellOperationImport/FileParser/StringSimilarity/CosineSimilarity.cs

diff --git a/AsbCloudInfrastructure/Services/WellOperationImport/FileParser/StringSimilarity/CosineSimilarity.cs b/AsbCloudInfrastructure/Services/WellOperationImport/FileParser/StringSimilarity/CosineSimilarity.cs
new file mode 100644
index 00000000..ae69f7f9
--- /dev/null
+++ b/AsbCloudInfrastructure/Services/WellOperationImport/FileParser/StringSimilarity/CosineSimilarity.cs
@@ -0,0 +1,144 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
+
+namespace AsbCloudInfrastructure.Services.WellOperationImport.FileParser.StringSimilarity;
+
+/// <summary>
+/// Cosine similarity of strings represented as profiles of k-character shingles.
+/// </summary>
+public class CosineSimilarity
+{
+    private const int DefaultK = 2;
+
+    // Everything except lowercase Latin/Cyrillic letters, digits and whitespace.
+    // 'ё' is listed explicitly because it lies outside the а-я code point range.
+    private static readonly Regex NonWordChars = new Regex("[^a-zа-яё0-9\\s]", RegexOptions.Compiled);
+
+    // A run of whitespace; separates the words of the cleaned text.
+    private static readonly Regex WhitespaceRun = new Regex("\\s+", RegexOptions.Compiled);
+
+    /// <summary>Shingle length in characters.</summary>
+    protected int K { get; }
+
+    /// <summary>Creates a calculator that uses shingles of length <paramref name="k"/>.</summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="k"/> is not positive.</exception>
+    public CosineSimilarity(int k)
+    {
+        if (k <= 0)
+        {
+            throw new ArgumentOutOfRangeException(nameof(k), "k should be positive!");
+        }
+
+        K = k;
+    }
+
+    /// <summary>Creates a calculator that uses the default shingle length (2).</summary>
+    public CosineSimilarity() : this(DefaultK) { }
+
+    /// <summary>
+    /// Computes the cosine of the angle between two shingle profiles.
+    /// </summary>
+    /// <returns>
+    /// Dot product divided by the product of the norms; 0 when either profile
+    /// has zero norm (for example, is empty), instead of NaN from 0/0.
+    /// </returns>
+    /// <exception cref="ArgumentNullException">A profile is null.</exception>
+    public double Similarity(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
+    {
+        if (profile1 is null)
+        {
+            throw new ArgumentNullException(nameof(profile1));
+        }
+
+        if (profile2 is null)
+        {
+            throw new ArgumentNullException(nameof(profile2));
+        }
+
+        var normProduct = Norm(profile1) * Norm(profile2);
+
+        // GetProfile returns an empty profile for blank input; avoid dividing by zero.
+        if (normProduct == 0)
+        {
+            return 0;
+        }
+
+        return DotProduct(profile1, profile2) / normProduct;
+    }
+
+    /// <summary>
+    /// Builds the shingle profile of a string: each distinct substring of length K
+    /// of the normalized text mapped to its number of occurrences.
+    /// </summary>
+    /// <returns>The profile; empty when the input is null, blank, or shorter than K after normalization.</returns>
+    public Dictionary<string, int> GetProfile(string s)
+    {
+        var shingles = new Dictionary<string, int>();
+
+        if (string.IsNullOrWhiteSpace(s))
+        {
+            return shingles;
+        }
+
+        var cleanString = Stemming(s);
+
+        for (var i = 0; i <= cleanString.Length - K; i++)
+        {
+            var shingle = cleanString.Substring(i, K);
+            shingles[shingle] = shingles.TryGetValue(shingle, out var count) ? count + 1 : 1;
+        }
+
+        return shingles;
+    }
+
+    // Lowercases the text, strips everything but letters, digits and whitespace,
+    // drops single-character words and glues the remaining words together.
+    private static string Stemming(string s)
+    {
+        var cleaned = NonWordChars.Replace(s.ToLowerInvariant(), string.Empty);
+        var words = WhitespaceRun.Split(cleaned).Where(word => word.Length > 1);
+        return string.Concat(words);
+    }
+
+    // Euclidean norm of the profile's occurrence counts.
+    private static double Norm(IDictionary<string, int> profile)
+    {
+        double agg = 0;
+
+        foreach (var entry in profile)
+        {
+            // Multiply as double so large counts cannot overflow int.
+            agg += 1.0 * entry.Value * entry.Value;
+        }
+
+        return Math.Sqrt(agg);
+    }
+
+    // Sum of count products over the shingles present in both profiles.
+    private static double DotProduct(IDictionary<string, int> profile1, IDictionary<string, int> profile2)
+    {
+        // Iterate the smaller profile and probe the larger one to minimize lookups.
+        var smallProfile = profile2;
+        var largeProfile = profile1;
+
+        if (profile1.Count < profile2.Count)
+        {
+            smallProfile = profile1;
+            largeProfile = profile2;
+        }
+
+        double agg = 0;
+
+        foreach (var entry in smallProfile)
+        {
+            if (largeProfile.TryGetValue(entry.Key, out var count))
+            {
+                agg += 1.0 * entry.Value * count;
+            }
+        }
+
+        return agg;
+    }
+}