// DD.WellWorkover.Cloud/AsbCloudInfrastructure/Services/WellOperationImport/FileParser/WellOperationGazpromKhantosExcelParser.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using AsbCloudApp.Data.WellOperationImport;
using AsbCloudApp.Data.WellOperationImport.Options;
using AsbCloudApp.Exceptions;
using AsbCloudApp.Services.WellOperationImport;
using AsbCloudInfrastructure.Services.WellOperationImport.Constants;
using AsbCloudInfrastructure.Services.WellOperationImport.FileParser.StringSimilarity;
using ClosedXML.Excel;
namespace AsbCloudInfrastructure.Services.WellOperationImport.FileParser;
public class WellOperationGazpromKhantosExcelParser : IWellOperationExcelParser<WellOperationImportGazpromKhantosOptionsDto>
{
/// <summary>
/// Values of a single worksheet row, held until they are converted into a <see cref="RowDto"/>.
/// </summary>
private class Operation
{
    /// <summary>Number of the worksheet row the values were read from.</summary>
    public int RowNumber { get; set; }

    /// <summary>Text of the category-info cell; may be <c>null</c>.</summary>
    public string? CategoryInfo { get; set; }

    /// <summary>Value of the section-diameter cell; rows are grouped into sections by it.</summary>
    public double SectionDiameter { get; set; }

    /// <summary>Value of the depth cell.</summary>
    public double Depth { get; set; }

    /// <summary>Value of the duration cell; it is applied as hours when the row date is computed.</summary>
    public double Duration { get; set; }

    /// <summary>Value of the date cell; the row date is this value minus <see cref="Duration"/> hours.</summary>
    public DateTime Date { get; set; }
}
/// <summary>Similarity measure used to compare cell text with dictionary keys.</summary>
private readonly CosineSimilarity cosineSimilarity = new CosineSimilarity();

/// <summary>
/// Entries of the embedded <c>Operations.txt</c> resource (<c>key=value</c> lines);
/// used to resolve a row's category from its category-info text.
/// </summary>
private readonly Dictionary<string, string> operationDict = InitDict("Operations.txt", '=');

/// <summary>
/// Entries of the embedded <c>Sections.txt</c> resource (<c>key=value</c> lines);
/// a section name is chosen when its key occurs in the category-info text of a diameter group.
/// </summary>
private readonly Dictionary<string, string> sectionDict = InitDict("Sections.txt", '=');

/// <summary>
/// Entries of the embedded <c>OperationAttributes.txt</c> resource (<c>key=value</c> lines);
/// used to recognise header cells when locating the attribute columns.
/// </summary>
private readonly Dictionary<string, string> operationAttributesDict = InitDict("OperationAttributes.txt", '=');
/// <summary>
/// Opens the Excel workbook contained in <paramref name="stream"/> and parses the sheet
/// and row range described by <paramref name="options"/>.
/// </summary>
/// <param name="stream">Stream holding the workbook file.</param>
/// <param name="options">Sheet name and start/end rows to parse.</param>
/// <returns>The parsed sheet.</returns>
public SheetDto Parse(Stream stream, WellOperationImportGazpromKhantosOptionsDto options)
{
    // The workbook is disposed as soon as parsing has produced its result.
    using (var excelWorkbook = new XLWorkbook(stream))
    {
        return ParseWorkBook(excelWorkbook, options);
    }
}
/// <summary>
/// Validates the row range in <paramref name="options"/>, looks up the requested worksheet
/// and parses it.
/// </summary>
/// <param name="workbook">Opened workbook to read from.</param>
/// <param name="options">Sheet name and the inclusive start/end row numbers.</param>
/// <returns>The parsed sheet.</returns>
/// <exception cref="ArgumentInvalidException">
/// <c>StartRow</c> or <c>EndRow</c> is outside 1..1048576, or <c>EndRow</c> is less than <c>StartRow</c>.
/// </exception>
private SheetDto ParseWorkBook(IXLWorkbook workbook, WellOperationImportGazpromKhantosOptionsDto options)
{
    // Upper bound accepted for a row number (matches the row limit of an .xlsx worksheet).
    const int maxRowNumber = 1048576;

    if (options.StartRow is < 1 or > maxRowNumber)
        throw new ArgumentInvalidException(nameof(options.StartRow), "Некорректное значение начальной строки");

    if (options.EndRow is < 1 or > maxRowNumber)
        throw new ArgumentInvalidException(nameof(options.EndRow), "Некорректное значение конечной строки");

    // The range is inclusive, so the end row may equal the start row but must not precede it.
    if (options.EndRow < options.StartRow)
        throw new ArgumentInvalidException(nameof(options.EndRow), "Конечный номер строки не может быть меньше начального");

    var sheet = workbook.GetWorksheet(options.SheetName);

    return ParseSheet(sheet, options.StartRow, options.EndRow);
}
/// <summary>
/// Reads the operations in rows <paramref name="startRow"/>..<paramref name="endRow"/> (inclusive)
/// of <paramref name="sheet"/> and converts them into the rows of a <see cref="SheetDto"/>.
/// </summary>
/// <param name="sheet">Worksheet to read.</param>
/// <param name="startRow">First row number to read.</param>
/// <param name="endRow">Last row number to read.</param>
/// <returns>
/// A sheet DTO named after the worksheet. When the attribute columns cannot be located,
/// only the name is set.
/// </returns>
/// <exception cref="FileFormatException">
/// At least one row raised a <see cref="FileFormatException"/> while its cells were read;
/// the message joins all collected messages with CRLF.
/// </exception>
private SheetDto ParseSheet(IXLWorksheet sheet, int startRow, int endRow)
{
    var operationAttributes = GetOperationAttributes(sheet.RowsUsed());

    if (operationAttributes is null)
        return new SheetDto { Name = sheet.Name };

    var rowsCount = endRow - startRow + 1;

    var operations = new List<Operation>();

    var cellValuesErrors = new List<string>();

    for (int i = 0; i < rowsCount; i++)
    {
        var xlRow = sheet.Row(startRow + i);

        try
        {
            operations.Add(new Operation
            {
                RowNumber = xlRow.RowNumber(),
                CategoryInfo = xlRow.Cell(operationAttributes[OperationAttributes.CategoryInfo]).GetCellValue<string?>(),
                SectionDiameter = xlRow.Cell(operationAttributes[OperationAttributes.SectionDiameter]).GetCellValue<double>(),
                Depth = xlRow.Cell(operationAttributes[OperationAttributes.Depth]).GetCellValue<double>(),
                Duration = xlRow.Cell(operationAttributes[OperationAttributes.Duration]).GetCellValue<double>(),
                Date = xlRow.Cell(operationAttributes[OperationAttributes.Date]).GetCellValue<DateTime>()
            });
        }
        catch (FileFormatException ex)
        {
            // Keep reading so that every failing row is reported in a single exception below.
            cellValuesErrors.Add(ex.Message);
        }
    }

    if (cellValuesErrors.Any())
        throw new FileFormatException(string.Join("\r\n", cellValuesErrors));

    return new SheetDto()
    {
        Name = sheet.Name,
        Rows = BuildRows()
    };

    // Resolves one section name per distinct section diameter.
    List<(double Diameter, string Name)> BuildSections()
    {
        // All category-info texts of a diameter group are concatenated and searched for dictionary keys.
        var groupedOperations = operations.GroupBy(o => o.SectionDiameter)
            .Select(s => new
            {
                Diameter = s.Key,
                CategoryInfo = string.Concat(s.Select(o => o.CategoryInfo))
            });

        // Section names that may be assigned to more than one diameter group.
        // NOTE(review): compared with the lower-cased dictionary value — confirm the first
        // letter of this literal is the same (Cyrillic) character as in Sections.txt.
        var repeatedSections = new[] { "xвостовик" };

        var sections = new List<(double Diameter, string Name)>();

        // Names already assigned to earlier groups; maintained incrementally instead of being rebuilt per group.
        var usedSectionNames = new HashSet<string>();

        foreach (var groupedOperation in groupedOperations)
        {
            // First dictionary entry whose key occurs in the group's text and whose name is
            // either still unused or allowed to repeat; Value is null when nothing matches.
            var sectionName = sectionDict.FirstOrDefault(item =>
                groupedOperation.CategoryInfo.Contains(item.Key) &&
                (!usedSectionNames.Contains(item.Value) || repeatedSections.Contains(item.Value.ToLowerInvariant()))).Value;

            sections.Add((groupedOperation.Diameter, sectionName));
            usedSectionNames.Add(sectionName);
        }

        return sections;
    }

    // Converts the collected operations into output rows.
    IEnumerable<RowDto> BuildRows()
    {
        if (!operations.Any())
            return Enumerable.Empty<RowDto>();

        // Sections depend only on the complete operation list, so they are resolved once
        // here rather than once per row.
        var sections = BuildSections();

        var rows = new List<RowDto>(operations.Count);

        for (int i = 0; i < operations.Count; i++)
        {
            var currentOperation = operations[i];
            // The last operation has no successor, so its end depth equals its own depth.
            var nextOperation = i + 1 < operations.Count ? operations[i + 1] : currentOperation;

            rows.Add(new RowDto
            {
                Number = currentOperation.RowNumber,
                Section = sections.FirstOrDefault(s => Math.Abs(s.Diameter - currentOperation.SectionDiameter) < 0.1).Name,
                Category = GetValueDictionary(operationDict, currentOperation.CategoryInfo, 0.3),
                CategoryInfo = currentOperation.CategoryInfo,
                DepthStart = currentOperation.Depth,
                DepthEnd = nextOperation.Depth,
                Duration = currentOperation.Duration,
                // The row date is the cell date shifted back by the duration, taken as hours.
                Date = currentOperation.Date.AddHours(-currentOperation.Duration)
            });
        }

        return rows;
    }
}
/// <summary>
/// Scans <paramref name="xlRows"/> for the row whose cells name the operation attributes
/// and maps each recognised attribute to its column number.
/// </summary>
/// <param name="xlRows">Rows to search, in order.</param>
/// <returns>
/// Attribute-to-column map taken from the first row that yields at least five attributes
/// (or from the last row scanned when none does), provided it holds exactly five entries;
/// otherwise <c>null</c>.
/// </returns>
private IDictionary<string, int>? GetOperationAttributes(IXLRows xlRows)
{
    const int requiredAttributesCount = 5;

    Dictionary<string, int>? columnsByAttribute = null;

    foreach (var row in xlRows)
    {
        // Every row starts from an empty map; only a single row can provide the header.
        columnsByAttribute = new Dictionary<string, int>();

        foreach (var headerCell in row.CellsUsed().ToArray())
        {
            var attribute = GetValueDictionary(operationAttributesDict, headerCell.GetCellValue<string>(), 0.7);

            // Unrecognised cells are skipped; for a repeated attribute the first column wins.
            if (attribute is not null)
                columnsByAttribute.TryAdd(attribute, headerCell.Address.ColumnNumber);
        }

        if (columnsByAttribute.Count >= requiredAttributesCount)
            break;
    }

    if (columnsByAttribute is null || columnsByAttribute.Count != requiredAttributesCount)
        return null;

    return columnsByAttribute;
}
/// <summary>
/// Finds the entry of <paramref name="dict"/> whose key is most similar to
/// <paramref name="cellValue"/> (cosine similarity) and returns its value when the similarity
/// reaches <paramref name="minSimilarity"/>.
/// </summary>
/// <param name="dict">Candidate keys and the values they map to.</param>
/// <param name="cellValue">Text to match; null or whitespace yields <c>null</c>.</param>
/// <param name="minSimilarity">
/// Minimum similarity required for a match. When it is <c>null</c> the method returns <c>null</c>.
/// </param>
/// <returns>The value of the best-matching entry, or <c>null</c> when there is no acceptable match.</returns>
private string? GetValueDictionary(IDictionary<string, string> dict, string? cellValue, double? minSimilarity)
{
    if (string.IsNullOrWhiteSpace(cellValue))
        return null;

    // MaxBy throws InvalidOperationException on an empty sequence of value tuples,
    // so an empty dictionary is reported as "no match" instead.
    if (dict.Count == 0)
        return null;

    var cellProfile = cosineSimilarity.GetProfile(cellValue);

    var bestMatch = dict
        .Select(item => (
            similarity: cosineSimilarity.Similarity(cellProfile, cosineSimilarity.GetProfile(item.Key)),
            value: item.Value))
        .MaxBy(candidate => candidate.similarity);

    return minSimilarity.HasValue && bestMatch.similarity >= minSimilarity ? bestMatch.value : null;
}
/// <summary>
/// Loads a dictionary from the embedded resource of the executing assembly whose name ends
/// with <paramref name="fileName"/>. Every non-blank line is split on
/// <paramref name="separator"/>; the first part is the key, the second the value, both trimmed.
/// </summary>
/// <param name="fileName">Resource name suffix, e.g. <c>Operations.txt</c>.</param>
/// <param name="separator">Character that separates key and value within a line.</param>
/// <returns>The key/value pairs read from the resource.</returns>
/// <exception cref="InvalidOperationException">No matching embedded resource exists or it cannot be opened.</exception>
/// <exception cref="InvalidDataException">A non-blank line does not contain <paramref name="separator"/>.</exception>
/// <exception cref="ArgumentException">The resource contains the same key twice.</exception>
private static Dictionary<string, string> InitDict(string fileName, char separator)
{
    var assembly = Assembly.GetExecutingAssembly();

    var resourceName = assembly.GetManifestResourceNames()
        .FirstOrDefault(n => n.EndsWith(fileName, StringComparison.Ordinal))
        ?? throw new InvalidOperationException($"Embedded resource '{fileName}' was not found.");

    using var stream = assembly.GetManifestResourceStream(resourceName)
        ?? throw new InvalidOperationException($"Embedded resource '{resourceName}' could not be opened.");

    using var reader = new StreamReader(stream);

    // Split on both CR and LF so that CRLF, LF-only and CR-only resources all parse line by line.
    return reader.ReadToEnd()
        .Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .Select(line =>
        {
            var parts = line.Split(separator);
            if (parts.Length < 2)
                throw new InvalidDataException($"Line '{line}' of resource '{resourceName}' does not contain '{separator}'.");
            return parts;
        })
        .ToDictionary(parts => parts[0].Trim(), parts => parts[1].Trim());
}
}