Last active
January 27, 2022 19:33
-
-
Save ChuckSavage/e1c61839300abd2d90873b01da9d7575 to your computer and use it in GitHub Desktop.
Natural string comparer for files too
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Diagnostics; | |
using System.IO; | |
using System.Linq; | |
using System.Numerics; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
namespace SeaRisenLib2.Text; | |
public class NaturalStringComparer : IComparer<string> | |
{ | |
// original: @"(?<=\D)(?=\d)|(?<=\d)(?=\D)" | |
/* | |
* Deciphering the regex pattern - (?<=_)(?=_)(?<=\W)|(?=\W)|(?<=\D)(?=\d)|(?<=\d)(?=\D) | |
* since _ is a part of D (letters), check for it first | |
* (?<=\W) - if the previous character is a non-word *-+=) | |
* (?=\W) - or if the next character is a non-word | |
* (?<=\D)(?=\d) - or if the previous character is a non-digit ABC, or the next character is a digit 123 | |
* (?<=\d)(?=\D) - or if the previous character is a digit 123, or the next character is not a digit ABC | |
*/ | |
private static readonly Regex _re = new Regex(@"(?<=_)|(?=_)|(?<=\W)|(?=\W)|(?<=\D)(?=\d)|(?<=\d)(?=\D)", RegexOptions.Compiled); | |
private static readonly Regex _reAlnum = new Regex(@"[^\w]|[_]", RegexOptions.Compiled); | |
private NaturalStringComparer() { } | |
public static NaturalStringComparer Instance { get; } = new(); | |
public int Compare(string x, string y) | |
{ | |
return Compare(x, y, false); | |
} | |
/// <summary> | |
/// Compare two strings, optionally alphanumeric compare only. Returns a value indicating | |
/// one is less than, greather or equal to the other. | |
/// </summary> | |
/// <param name="x"></param> | |
/// <param name="y"></param> | |
/// <param name="alphanumericOnly"></param> | |
/// <returns></returns> | |
public int Compare(string x, string y, bool alphanumericOnly) | |
{ | |
if (string.IsNullOrWhiteSpace(x)) | |
if (string.IsNullOrWhiteSpace(y)) | |
return 0; // both null | |
else | |
return -1; // x is null but y contains a value | |
else if (string.IsNullOrWhiteSpace(y)) // if y is null, but x contains a value | |
return 1; | |
int r = string.Compare(x, y, StringComparison.CurrentCultureIgnoreCase); | |
if (r == 0) return 0; // they're equal, no need to fine tune the comparison | |
var fX = FileParts(x); | |
var fY = FileParts(y); | |
if (fX.extension == "" || fY.extension == "") | |
return Compare2(x, y, alphanumericOnly); | |
r = string.Compare(fX.name, fY.name, StringComparison.CurrentCultureIgnoreCase); | |
// if equal, compare extensions and be done with it | |
if (r != 0) | |
{ | |
r = Compare2(fX.name, fY.name, alphanumericOnly); | |
if (r != 0) return r; | |
} | |
return string.Compare(fX.extension, fY.extension, StringComparison.CurrentCultureIgnoreCase); | |
// compare by enumerating the parts, instead of (the old vs of) getting the full part list for each string on each compare | |
static int Compare2(string x, string y, bool alphanumericOnly) | |
{ | |
IEnumerable<string> px = Parse(x), py = Parse(y); | |
IEnumerator<string> ex = px.GetEnumerator(), ey = py.GetEnumerator(); | |
bool bx = ex.MoveNext(), by = ey.MoveNext(); | |
int r; | |
// first compare only alphanumeric values | |
try | |
{ | |
char cx, cy; | |
while (bx && by) | |
{ | |
redocx: | |
cx = ex.Current[0]; | |
if (bx && !char.IsLetterOrDigit(cx)) | |
{ | |
bx = ex.MoveNext(); | |
goto redocx; | |
} | |
redocy: | |
cy = ey.Current[0]; | |
if (by && !char.IsLetterOrDigit(cy)) | |
{ | |
by = ey.MoveNext(); | |
goto redocy; | |
} | |
if (!bx || !by) break; | |
if (char.IsLetter(cx) || char.IsLetter(cy)) | |
{ | |
r = string.Compare(ex.Current, ey.Current, StringComparison.CurrentCultureIgnoreCase); | |
if (r != 0) return r; | |
} | |
// both should be numbers? | |
else if (BigInteger.TryParse(ex.Current, out BigInteger ix) && BigInteger.TryParse(ey.Current, out BigInteger iy)) | |
{ | |
r = ix.CompareTo(iy); | |
if (r != 0) return r; | |
r = CompareLength(ex.Current, ey.Current); | |
if (r != 0) return r; | |
} | |
bx = ex.MoveNext(); | |
by = ey.MoveNext(); | |
} | |
} | |
finally | |
{ | |
ex.Dispose(); | |
ey.Dispose(); | |
} | |
if (bx) return 1; | |
if (by) return -1; | |
if (alphanumericOnly) return 0; // both were equal | |
// if bx and by are both false, then the strings without the separators are equal, now | |
// compare the non-alphanumeric values too | |
ex = px.GetEnumerator(); // ex.Reset() doesn't work, so (dispose above and) get the enumerator again | |
ey = py.GetEnumerator(); | |
bx = ex.MoveNext(); | |
by = ey.MoveNext(); | |
try | |
{ | |
while (bx && by) | |
{ | |
if (BigInteger.TryParse(ex.Current, out BigInteger ix) && BigInteger.TryParse(ey.Current, out BigInteger iy)) | |
{ | |
r = ix.CompareTo(iy); | |
if (r != 0) return r; | |
r = CompareLength(ex.Current, ey.Current); | |
} | |
else | |
r = string.Compare(ex.Current, ey.Current, StringComparison.CurrentCultureIgnoreCase); | |
if (r != 0) return r; | |
bx = ex.MoveNext(); | |
by = ey.MoveNext(); | |
} | |
} | |
finally | |
{ | |
ex.Dispose(); | |
ey.Dispose(); | |
} | |
if (bx) return 1; | |
if (by) return -1; | |
return 0; | |
// enumerate the string, returning only what we need to get the comparison | |
IEnumerable<string> Parse(string IN) | |
{ | |
int t = -1; // 0 word, 1 number, 2 whitespace or other non-alphanumeric value | |
int s = 0; // start of part | |
for (int i = 0; i < IN.Length; i++) | |
{ | |
switch (t) | |
{ | |
case 0: | |
if (char.IsLetter(IN[i])) | |
continue; | |
break; | |
case 1: | |
if (char.IsDigit(IN[i])) | |
continue; | |
break; | |
case 2: | |
if (!char.IsLetterOrDigit(IN[i])) // whitespace or other non-alphanumeric value | |
continue; | |
break; | |
default: | |
if (char.IsLetter(IN[i])) t = 0; | |
else if (char.IsDigit(IN[i])) t = 1; | |
else t = 2; | |
continue; | |
} | |
yield return IN.Substring(s, i - s); | |
s = i--; // i needs to point to previous char, after updating s | |
t = -1; | |
} | |
if (s < IN.Length) | |
yield return IN.Substring(s, IN.Length - s); | |
} | |
} | |
} | |
/// <summary> | |
/// Naturally compare two values, by default making it an alphanumeric compare. | |
/// </summary> | |
/// <param name="IN"></param> | |
/// <param name="pattern"></param> | |
/// <param name="alphanumericOnly"></param> | |
/// <returns></returns> | |
public static bool Contains(string IN, string pattern, bool alphanumericOnly = true) | |
{ | |
if (IN.Length < pattern.Length) | |
return false; | |
int length = IN.Length - pattern.Length; | |
for (int i = 0; i <= length; i++) | |
{ | |
if (0 == Instance.Compare(IN.Substring(i, pattern.Length), pattern, alphanumericOnly)) | |
return true; | |
} | |
return false; | |
} | |
/// <summary> | |
/// Naturally compare two values, by default making it an alphanumeric compare. | |
/// </summary> | |
/// <param name="IN"></param> | |
/// <param name="pattern"></param> | |
/// <param name="alphanumericOnly"></param> | |
/// <returns></returns> | |
public static bool EndsWith(string IN, string pattern, bool alphanumericOnly = true) | |
{ | |
if (IN.Length < pattern.Length) | |
return false; | |
return 0 == Instance.Compare(IN.SubstringSafe(IN.Length - pattern.Length, pattern.Length), pattern, alphanumericOnly); | |
} | |
/// <summary> | |
/// Naturally compare two values, by default making it an alphanumeric compare. | |
/// </summary> | |
/// <param name="IN"></param> | |
/// <param name="pattern"></param> | |
/// <param name="alphanumericOnly"></param> | |
/// <returns></returns> | |
public static bool StartsWith(string IN, string pattern, bool alphanumericOnly = true) | |
{ | |
if (IN.Length < pattern.Length) | |
return false; | |
return 0 == Instance.Compare(IN.SubstringSafe(0, pattern.Length), pattern, alphanumericOnly); | |
} | |
// based off of stackoverflow version | |
public int Compare_OriginalSlowerVersion(string x, string y) | |
{ | |
if (string.IsNullOrWhiteSpace(x)) | |
if (string.IsNullOrWhiteSpace(y)) | |
return 0; // both null | |
else | |
return -1; // x is null but y contains a value | |
if (string.IsNullOrWhiteSpace(y)) return 1; // y is null but x contains a value | |
var fX = FileParts(x); | |
var fY = FileParts(y); | |
if (fX.extension == "" || fY.extension == "") | |
return Compare2(x, y); | |
int r = Compare2(fX.name, fY.name); | |
if (r != 0) return r; | |
return Compare2(fX.extension, fY.extension); | |
static int Compare2(string x, string y, bool alphanum = true) | |
{ | |
string xOrig = x, yOrig = y; | |
// compare first on only alphanumeric values, then if no differences | |
// compare the delimiters | |
//if (Debugger.IsAttached && new[] { x, y }.Any(s => s.Contains("2-4RedHead+286029"))) | |
// Debugger.Break(); | |
x = x.ToUpperInvariant(); | |
y = y.ToUpperInvariant(); | |
if (alphanum) | |
{ | |
AlphaNumericOnly(ref x); | |
AlphaNumericOnly(ref y); | |
} | |
var cl = CompareSubstring(x, y); | |
if (cl.success && cl.result != 0) return cl.result; | |
// if substring values are equal, dig deeper | |
var a = SplitIntoParts(x, alphanum); | |
var b = SplitIntoParts(y, alphanum); | |
int i = 0; | |
while (i < a.Length && i < b.Length) | |
{ | |
var (isInt, result) = CompareValue(a[i], b[i]); | |
if (isInt && result == 0) | |
{ | |
// if is int, compare say 001 vs 1 | |
result = CompareLength(a[i], b[i]); | |
} | |
if (result != 0) return result; | |
// if equal in value, dig deeper | |
++i; | |
} | |
i = 0; | |
while (i < a.Length && i < b.Length) | |
{ | |
cl = CompareSubstring(a[i], b[i]); | |
if (cl.success && cl.result != 0) return cl.result; | |
// if parts are equal, check deeper parts | |
++i; | |
} | |
// re-run first compare, returning whatever result we get | |
var r = CompareSubstring(x, y).result; | |
if (!alphanum || r != 0) return r; | |
return Compare2(xOrig, yOrig, false); | |
} | |
static (bool success, int result) CompareSubstring(string x, string y) | |
{ | |
int result = string.Compare(x, 0, y, 0, Math.Min(x.Length, y.Length)); | |
bool isInt = false; | |
if (result != 0) | |
(isInt, result) = CompareValue(x, y); | |
if (isInt) | |
if (result == 0) | |
return (true, CompareLength(x, y)); | |
else | |
return (true, result); | |
return (false, CompareLength(x, y)); | |
} | |
static (bool isInt, int result) CompareValue(string x, string y) | |
{ | |
if (BigInteger.TryParse(x, out BigInteger a) && BigInteger.TryParse(y, out BigInteger b)) | |
return (true, a.CompareTo(b)); | |
return (false, x.CompareTo(y)); | |
} | |
static void AlphaNumericOnly(ref string IN) | |
{ | |
IN = _reAlnum.Replace(IN, ","); | |
IN = Regex.Replace(IN, ",+", ","); | |
IN = IN.Trim(','); | |
} | |
static string[] SplitIntoParts(string IN, bool alphanum) | |
{ | |
return _re.Split(IN) | |
// don't sort commas with other values (or themselves) | |
.Where(s => !alphanum || s != ",") | |
.ToArray(); | |
} | |
} | |
static int CompareLength(string x, string y) | |
{ | |
if (x.Length != y.Length) | |
return x.Length < y.Length ? -1 : 1; | |
return 0; | |
} | |
/// <summary> | |
/// Get the fullname, name and extension of the path as separate parts. | |
/// </summary> | |
/// <param name="path"></param> | |
/// <returns></returns> | |
static (string name, string extension) FileParts(string path) | |
{ | |
string extension = Path.GetExtension(path); | |
if (string.IsNullOrEmpty(extension)) | |
return (path, extension); | |
string name = Path.GetFileNameWithoutExtension(path); | |
string fullname = Path.Combine(Path.GetDirectoryName(path), name); | |
return (fullname, extension); | |
} | |
} | |
========== UNIT TESTS go in separate file within a test project ============== | |
using System; | |
using Microsoft.VisualStudio.TestTools.UnitTesting; | |
using System.Linq; | |
using SeaRisenLib2.Text; | |
namespace TestProject | |
{ | |
[TestClass] | |
public class NaturalStringComparerTest | |
{ | |
[TestMethod] | |
public void TestMethod1() | |
{ | |
string[] list = { "01", "1", "2", "001", "002" }; | |
string[] expected = { "1", "01", "001", "2", "002" }; | |
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray(); | |
CollectionAssert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod2() | |
{ | |
string[] list = { | |
"01.jpg", | |
"1.png", | |
"2.png", | |
"001.png", | |
"002.jpg", | |
"01.png", | |
"2.jpg" | |
}; | |
string[] expected = { | |
"1.png", | |
"01.jpg", | |
"01.png", | |
"001.png", | |
"2.jpg", | |
"2.png", | |
"002.jpg" | |
}; | |
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray(); | |
CollectionAssert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod3() | |
{ | |
string[] list = { | |
"image 12.jpg", | |
"image-1.jpg", | |
"image_11.jpg", | |
"image+2.jpg", | |
"image,20.jpg", | |
}; | |
string[] expected = { | |
"image-1.jpg", | |
"image+2.jpg", | |
"image_11.jpg", | |
"image 12.jpg", | |
"image,20.jpg", | |
}; | |
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray(); | |
CollectionAssert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod4() | |
{ | |
string[] list = { | |
"0_c1c9c_d3dd2e76_orig 2.psd", | |
"0_c1c76_ebedbd57_orig.jpg", | |
"0_c1c77_d2c42e68_orig.jpg", | |
"0_c1c9c_d3dd2e76_orig.jpg", | |
"0_c1c85_1840d3b_orig.jpg", | |
"0_c1c92_a4060a6d_orig.jpg", | |
"0_c1c94_68939054_orig.jpg", | |
"0_c1c97_9a75dd3f_orig.jpg", | |
"0_c1ca1_c559fedc_orig.jpg", | |
"0_c79a4_90493d35_orig.jpg", | |
"0_9ddd7_bc745e8c_XXXL.JPG", | |
"0_9ddd8_7491ab8c_XXL.JPG", | |
"0_9ddd8_7491ab8c_XXL-lines-scale-6_00x-gigapixel (1).JPG", | |
"0_9ddd8_7491ab8c_XXL-lines-scale-6_00x-gigapixel.JPG", | |
"0_c1c8a_f69af695_orig.jpg", | |
}; | |
string[] expected = { | |
"0_9ddd7_bc745e8c_XXXL.JPG", | |
"0_9ddd8_7491ab8c_XXL.JPG", | |
"0_9ddd8_7491ab8c_XXL-lines-scale-6_00x-gigapixel.JPG", | |
"0_9ddd8_7491ab8c_XXL-lines-scale-6_00x-gigapixel (1).JPG", | |
"0_c1c8a_f69af695_orig.jpg", | |
"0_c1c9c_d3dd2e76_orig.jpg", | |
"0_c1c9c_d3dd2e76_orig 2.psd", | |
"0_c1c76_ebedbd57_orig.jpg", | |
"0_c1c77_d2c42e68_orig.jpg", | |
"0_c1c85_1840d3b_orig.jpg", | |
"0_c1c92_a4060a6d_orig.jpg", | |
"0_c1c94_68939054_orig.jpg", | |
"0_c1c97_9a75dd3f_orig.jpg", | |
"0_c1ca1_c559fedc_orig.jpg", | |
"0_c79a4_90493d35_orig.jpg", | |
}; | |
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray(); | |
CollectionAssert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod5() | |
{ | |
string[] list = { | |
"1+quite+artsy+288829.webp", | |
"1+quite+artsy+288829_result.jpg", | |
"1+paint+28229.webp", | |
"1-lion-01.webp", | |
"1-lion-11.webp", | |
"1-lion-36-1160x2606.webp", | |
"1-lion-41.webp", | |
"01.jpg", | |
"110208043338528059.jpg", | |
"100122859906.jpg", | |
"1303190918328716010989741.jpg", | |
"1303190918348716010989742.jpg", | |
"12154530903_f5cfcdab52_b.jpg", | |
"2-4RedHead+286029.webp", | |
"2+beach+fun+281529.webp", | |
}; | |
string[] expected = { | |
"1-lion-01.webp", | |
"1-lion-11.webp", | |
"1-lion-36-1160x2606.webp", | |
"1-lion-41.webp", | |
"1+paint+28229.webp", | |
"1+quite+artsy+288829.webp", | |
"1+quite+artsy+288829_result.jpg", | |
"01.jpg", | |
"2-4RedHead+286029.webp", | |
"2+beach+fun+281529.webp", | |
"12154530903_f5cfcdab52_b.jpg", | |
"100122859906.jpg", | |
"110208043338528059.jpg", | |
"1303190918328716010989741.jpg", | |
"1303190918348716010989742.jpg", | |
}; | |
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray(); | |
CollectionAssert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod6() | |
{ | |
string[] list = { | |
"1+paint_28228.webp", | |
"2-4RedHead+286029.webp", | |
"2+beach+fun+281529.webp", | |
"1-lion-01.webp", | |
"1-paint+28229.webp", | |
}; | |
string[] expected = { | |
"1-lion-01.webp", | |
"1+paint_28228.webp", | |
"1-paint+28229.webp", | |
"2-4RedHead+286029.webp", | |
"2+beach+fun+281529.webp", | |
}; | |
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray(); | |
CollectionAssert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod7() | |
{ | |
string[] list = { | |
"0003-2af55d6.jpg", | |
"3_67.jpg", | |
"03.jpg", | |
"3+ten.jpg", | |
}; | |
string[] expected = { | |
"3_67.jpg", | |
"3+ten.jpg", | |
"03.jpg", | |
"0003-2af55d6.jpg", | |
}; | |
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray(); | |
CollectionAssert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod8() | |
{ | |
string[] list = { | |
"7_", | |
"7", | |
"_7", | |
}; | |
string[] expected = { | |
"_7", | |
"7", | |
"7_", | |
}; | |
var actual = list.OrderBy(s => s, NaturalStringComparer.Instance).ToArray(); | |
CollectionAssert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod9() | |
{ | |
string[] list = { | |
@"c:\path\to\file\42.jpg", | |
"42.jpg", | |
}; | |
int expected = 1; | |
int actual = NaturalStringComparer.Instance.Compare(list[0], list[1]); | |
Assert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod10() | |
{ | |
string[] list = { | |
@"c:\path\to\file\42.jpg", | |
@"d:\path\to\file\42.jpg", | |
}; | |
int expected = -1; | |
int actual = NaturalStringComparer.Instance.Compare(list[0], list[1]); | |
Assert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod11() | |
{ | |
string[] list = { | |
@"contains", | |
@"contains", | |
}; | |
bool expected = true; | |
bool actual = NaturalStringComparer.Contains(list[0], list[1]); | |
Assert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod12() | |
{ | |
string[] list = { | |
@"startswith", | |
@"startswith", | |
}; | |
bool expected = true; | |
bool actual = NaturalStringComparer.StartsWith(list[0], list[1]); | |
Assert.AreEqual(expected, actual); | |
} | |
[TestMethod] | |
public void TestMethod13() | |
{ | |
string[] list = { | |
@"endswith", | |
@"endswith", | |
}; | |
bool expected = true; | |
bool actual = NaturalStringComparer.EndsWith(list[0], list[1]); | |
Assert.AreEqual(expected, actual); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment