Skip to content

Back to Snippets

similarityDS()

Computes the Dice-Sørensen coefficient of two strings, a measure of string similarity based on the character bigrams they share (compared case-insensitively).

/**
 * Computes the Dice-Sørensen coefficient of two strings as a measure of
 * their similarity, based on the multisets of their character bigrams.
 *
 * The comparison is case-insensitive. Falsy inputs (`null`, `undefined`,
 * `''`, ...) are treated as the empty string.
 *
 * @param {string} strA - First string to compare.
 * @param {string} strB - Second string to compare.
 * @returns {number} A value in [0, 1]: `2 * |A ∩ B| / (|A| + |B|)`, where A
 *   and B are the bigram multisets. When neither string is long enough to
 *   contain a bigram, returns 1 if the normalized strings are identical and
 *   0 otherwise (instead of the NaN that 0 / 0 would produce).
 */
function similarityDS(strA, strB) {

	// Counts how many times each two-character substring occurs in `str`.
	// A Map is used so that bigram keys never interact with Object.prototype.
	function bigrams(str) {
		const counts = new Map();
		for (let i = 0; i < str.length - 1; i += 1) {
			const pair = str.substring(i, i + 2);
			counts.set(pair, (counts.get(pair) || 0) + 1);
		}
		return counts;
	}

	const textA = (strA || '').toLowerCase();
	const textB = (strB || '').toLowerCase();

	// A string of length n has n - 1 bigrams (none when n < 2).
	const sizeA = Math.max(textA.length - 1, 0);
	const sizeB = Math.max(textB.length - 1, 0);
	const total = sizeA + sizeB;

	// No bigrams on either side: the ratio is undefined (0 / 0), so fall back
	// to plain equality of the normalized strings.
	if (total === 0) {
		return textA === textB ? 1 : 0;
	}

	const countsA = bigrams(textA);
	const countsB = bigrams(textB);

	// Multiset intersection: each shared bigram contributes the smaller of
	// its two occurrence counts.
	let shared = 0;
	for (const [pair, countA] of countsA) {
		shared += Math.min(countA, countsB.get(pair) || 0);
	}

	return 2 * shared / total;
}