const isWordBoundary = (char) => /[A-Z]/.test(char) || /[-_\/.]/.test(char) || /\s/.test(char); const isCaseTransition = (prev, curr) => { const prevIsUpper = prev.toLowerCase() !== prev; const currIsUpper = curr.toLowerCase() !== curr; return ( prevIsUpper && currIsUpper && prev.toLowerCase() !== curr.toLowerCase() ); }; const findBestSubsequenceMatch = (query, target) => { const n = query.length; const m = target.length; if (n === 0 || m === 0) return null; const positions = []; const memo = new Map(); const key = (qIdx, tIdx, gap) => `${qIdx}:${tIdx}:${gap}`; const findBest = (qIdx, tIdx, currentGap) => { if (qIdx === n) { return { done: true, positions: [...positions], gap: currentGap }; } const memoKey = key(qIdx, tIdx, currentGap); if (memo.has(memoKey)) { return memo.get(memoKey); } let bestResult = null; for (let i = tIdx; i < m; i++) { if (target[i] === query[qIdx]) { positions.push(i); const gap = qIdx === 0 ? 0 : i - positions[positions.length - 2] - 1; const newGap = currentGap + gap; if (newGap > m) { positions.pop(); continue; } const result = findBest(qIdx + 1, i + 1, newGap); positions.pop(); if (result && (!bestResult || result.gap < bestResult.gap)) { bestResult = result; if (result.gap === 0) break; } } } memo.set(memoKey, bestResult); return bestResult; }; const result = findBest(0, 0, 0); if (!result) return null; const consecutive = (() => { let c = 1; for (let i = 1; i < result.positions.length; i++) { if (result.positions[i] === result.positions[i - 1] + 1) { c++; } } return c; })(); return { positions: result.positions, consecutive, score: calculateMatchScore(query, target, result.positions, consecutive), }; }; const calculateMatchScore = (query, target, positions, consecutive) => { const n = positions.length; const m = target.length; if (n === 0) return 0; let score = 1.0; const startBonus = (m - positions[0]) / m; score += startBonus * 0.5; let gapPenalty = 0; for (let i = 1; i < n; i++) { const gap = positions[i] - positions[i - 1] - 1; if (gap > 0) { gapPenalty += Math.min(gap / m, 1.0) * 0.3; } } score -= gapPenalty; const consecutiveBonus = consecutive / n; score += consecutiveBonus * 0.3; let boundaryBonus = 0; for (let i = 0; i < n; i++) { const char = target[positions[i]]; if (i === 0 || isWordBoundary(char)) { boundaryBonus += 0.05; } if (i > 0) { const prevChar = target[positions[i - 1]]; if (isCaseTransition(prevChar, char)) { boundaryBonus += 0.03; } } } score = Math.min(1.0, score + boundaryBonus); const lengthPenalty = Math.abs(query.length - n) / Math.max(query.length, m); score -= lengthPenalty * 0.2; return Math.max(0, Math.min(1.0, score)); }; const fuzzyMatch = (query, target) => { const lowerQuery = query.toLowerCase(); const lowerTarget = target.toLowerCase(); if (lowerQuery.length === 0) return null; if (lowerTarget.length === 0) return null; if (lowerTarget === lowerQuery) { return 1.0; } if (lowerTarget.includes(lowerQuery)) { const ratio = lowerQuery.length / lowerTarget.length; return 0.8 + ratio * 0.2; } const match = findBestSubsequenceMatch(lowerQuery, lowerTarget); if (!match) { return null; } return Math.min(1.0, match.score); }; self.onmessage = function (e) { const { messageId, type, data } = e.data; const respond = (type, data) => { self.postMessage({ messageId, type, data }); }; const respondError = (error) => { self.postMessage({ messageId, type: "error", error: error.message || String(error), }); }; try { if (type === "tokenize") { const text = typeof data === "string" ? data : ""; const words = text.toLowerCase().match(/\b[a-zA-Z0-9_-]+\b/g) || []; const tokens = words.filter((word) => word.length > 2); const uniqueTokens = Array.from(new Set(tokens)); respond("tokens", uniqueTokens); } else if (type === "search") { const { query, limit = 10 } = data; if (!query || typeof query !== "string") { respond("results", []); return; } const rawQuery = query.toLowerCase(); const text = typeof query === "string" ? query : ""; const words = text.toLowerCase().match(/\b[a-zA-Z0-9_-]+\b/g) || []; const searchTerms = words.filter((word) => word.length > 2); let documents = []; if (typeof data.documents === "string") { documents = JSON.parse(data.documents); } else if (Array.isArray(data.documents)) { documents = data.documents; } else if (typeof data.transferables === "string") { documents = JSON.parse(data.transferables); } if (!Array.isArray(documents) || documents.length === 0) { respond("results", []); return; } const useFuzzySearch = rawQuery.length >= 3; if (searchTerms.length === 0 && rawQuery.length < 3) { respond("results", []); return; } const pageMatches = new Map(); // Pre-compute lower-case strings for each document const processedDocs = documents.map((doc, docId) => { const title = typeof doc.title === "string" ? doc.title : ""; const content = typeof doc.content === "string" ? doc.content : ""; return { docId, doc, lowerTitle: title.toLowerCase(), lowerContent: content.toLowerCase(), }; }); // First pass: Score pages with fuzzy matching processedDocs.forEach(({ docId, doc, lowerTitle, lowerContent }) => { let match = pageMatches.get(docId); if (!match) { match = { doc, pageScore: 0, matchingAnchors: [] }; pageMatches.set(docId, match); } if (useFuzzySearch) { const fuzzyTitleScore = fuzzyMatch(rawQuery, lowerTitle); if (fuzzyTitleScore !== null) { match.pageScore += fuzzyTitleScore * 100; } const fuzzyContentScore = fuzzyMatch(rawQuery, lowerContent); if (fuzzyContentScore !== null) { match.pageScore += fuzzyContentScore * 30; } } // Token-based exact matching searchTerms.forEach((term) => { if (lowerTitle.includes(term)) { match.pageScore += lowerTitle === term ? 20 : 10; } if (lowerContent.includes(term)) { match.pageScore += 2; } }); }); // Second pass: Find matching anchors pageMatches.forEach((match) => { const doc = match.doc; if ( !doc.anchors || !Array.isArray(doc.anchors) || doc.anchors.length === 0 ) { return; } doc.anchors.forEach((anchor) => { if (!anchor || !anchor.text) return; const anchorText = anchor.text.toLowerCase(); let anchorMatches = false; if (useFuzzySearch) { const fuzzyScore = fuzzyMatch(rawQuery, anchorText); if (fuzzyScore !== null && fuzzyScore >= 0.4) { anchorMatches = true; } } if (!anchorMatches) { searchTerms.forEach((term) => { if (anchorText.includes(term)) { anchorMatches = true; } }); } if (anchorMatches) { match.matchingAnchors.push(anchor); } }); }); const results = Array.from(pageMatches.values()) .filter((m) => m.pageScore > 5) .sort((a, b) => b.pageScore - a.pageScore) .slice(0, limit); respond("results", results); } } catch (error) { respondError(error); } };