| 注册
请输入搜索内容

热门搜索

Java Linux MySQL PHP JavaScript Hibernate jQuery Nginx
ybny
10年前发布

C#通过编辑距离计算两个字符串的相似度

编辑距离的算法是首先由俄国科学家Levenshtein提出的,故又叫 Levenshtein Distance。一个字符串可以通过增加一个字符,删除一个字符,替换一个字符得到另外一个字符串,假设,我们把从字符串A转换成字符串B,前面3种操 作所执行的最少次数称为AB相似度
如 abc adc 度为 1
ababababa babababab 度为 2
abcd acdb 度为2

using System;  using System.Text.RegularExpressions;  using System.Threading.Tasks;     namespace Levenshtein  {      /// <summary>      /// 分析完成事件委托      /// </summary>      /// <param name="sim">相似度</param>      public delegate void AnalyzerCompletedHander(double sim);         /// <summary>      /// 文章相似度工具      /// </summary>      public class LevenshteinDistance:IDisposable      {          private string str1;          private string str2;          private int[,] index;          int k;          Task<double> task;             /// <summary>          /// 分析完成事件          /// </summary>          public event AnalyzerCompletedHander AnalyzerCompleted;             /// <summary>          /// 获取或设置文章1          /// </summary>          public string Str1          {              get { return str1; }              set              {                  str1 = Format(value);                  index = new int[str1.Length, str2.Length];              }          }             /// <summary>          /// 获取或设置文章2          /// </summary>          public string Str2          {              get { return str2; }              set              {                  str2 = Format(value);                  index = new int[str1.Length, str2.Length];              }          }             /// <summary>          /// 运算总次数          /// </summary>          public int TotalTimes          {              get { return str1.Length * str2.Length; }          }             /// <summary>          /// 是否完成          /// </summary>          public bool IsCompleted          {              get { return task.IsCompleted; }          }             /// <summary>          /// 实例化          /// </summary>          /// <param name="str1">文章1</param>          /// <param name="str2">文章2</param>          public LevenshteinDistance(string str1, string str2)          {              this.str1 = Format(str1);              this.str2 = Format(str2);              index = new int[str1.Length, str2.Length];          }             public LevenshteinDistance()          {          }             /// <summary>          /// 异步开始任务          /// </summary>          public void Start()          {              task = new Task<double>(Analyzer);              task.Start();              task.ContinueWith(o => Completed(o.Result));          }             /// <summary>          /// 同步开始任务          /// </summary>          /// <returns>相似度</returns>          public double StartAyns()          {              task = new Task<double>(Analyzer);              task.Start();              task.Wait();              return task.Result;          }             private void Completed(double s)          {              if (AnalyzerCompleted != null)              {                  AnalyzerCompleted(s);              }          }             private double Analyzer()          {              if (str1.Length == 0 || str2.Length == 0)                  return 0;              for (int i = 0; i < str1.Length; i++)              {                  for (int j = 0; j < str2.Length; j++)                  {                      k = str1[i] == str2[j] ? 0 : 1;                      if (i == 0&&j==0)                      {                          continue;                      }                      else if (i == 0)                      {                          index[i, j] = k + index[i, j - 1];                          continue;                      }                      else if (j == 0)                      {                          index[i, j] = k + index[i - 1, j];                          continue;                      }                      int temp = Min(index[i, j - 1],                          index[i - 1, j],                          index[i - 1, j - 1]);                      index[i, j] = temp + k;                  }              }              float similarty = 1 - (float)index[str1.Length - 1, str2.Length - 1]                  / (str1.Length > str2.Length ? str1.Length : str2.Length);              return similarty;          }             private string Format(string str)          {              str = Regex.Replace(str, @"[^a-zA-Z0-9\u4e00-\u9fa5\s]", "");              return str;          }             private int Min(int a, int b, int c)          {              int temp = a < b ? a : b;              temp = temp < c ? temp : c;              return temp;          }             public void Dispose()          {              task.Dispose();          }      }  }