import {
  normalizeBody,
  normalizeHeadings,
  normalizeLinks,
  normalizeSpacing,
  normalizeStyles,
  removeImages,
  splitTextBlocks,
} from '../lib'
import { removeComments } from './comments'
import { normalizeLists } from './lists'

// CKEditor: https://github.com/ckeditor/ckeditor5/tree/master/packages/ckeditor5-paste-from-office
// mammoth: https://www.npmjs.com/package/mammoth
// https://github.com/mdmjg/slate-docx-deserializer
// https://stackoverflow.com/questions/2875027/clean-microsoft-word-pasted-text-using-javascript

/**
 * Signatures checked by `isMatch`. All patterns are case-insensitive and carry
 * no `g`/`y` flag, so repeated `test()` calls keep no state between inputs.
 */
const matchers = [
  // A `generator` meta tag whose content is "Microsoft Word" plus a number
  /<meta\s*name="?generator"?\s*content="?microsoft\s*word\s*\d+"?\/?>/i,
  // Declaration of the `o` XML namespace with a Microsoft schema URN
  /xmlns:o="urn:schemas-microsoft-com/i,
]

/**
 * Reports whether the given markup contains any of the known signatures.
 *
 * @param html - Raw HTML markup to inspect.
 * @returns `true` as soon as one signature matches, otherwise `false`.
 */
export const isMatch = (html: string) => {
  for (const pattern of matchers) {
    if (pattern.test(html)) {
      return true
    }
  }

  return false
}

/**
 * Cleans up an HTML string and returns the resulting body markup.
 *
 * The markup is whitespace-collapsed, parsed with `DOMParser`, passed through
 * a fixed sequence of helpers that mutate the parsed document in place, and
 * then serialized again. The helper order is preserved deliberately because
 * every helper operates on the same document.
 *
 * @param html - Raw HTML markup to normalize.
 * @returns The `innerHTML` of the parsed document's `<body>` after cleanup.
 */
export function normalizeHtml(html: string) {
  // Replace non-breaking space entities with plain spaces, then squash every
  // run of whitespace down to a single space.
  const collapsed = html.replaceAll('&nbsp;', ' ').replace(/\s+/g, ' ')

  const doc = new DOMParser().parseFromString(collapsed, 'text/html')

  // Drop the first element flagged as a Table of Contents, if there is one
  doc.querySelector('[docparttype="Table of Contents"]')?.remove()

  // Strip the id attribute from every element that carries one
  doc.querySelectorAll<HTMLElement>('[id]').forEach((element) => {
    element.removeAttribute('id')
  })

  // Discard content that should not be kept
  removeComments(doc)
  removeImages(doc)

  // Bring headings, links and lists into a consistent shape
  normalizeHeadings(doc)
  normalizeLinks(doc)
  normalizeLists(doc)

  // Final pass over body structure, text blocks, spacing and styles
  normalizeBody(doc)
  splitTextBlocks(doc)
  normalizeSpacing(doc)
  normalizeStyles(doc)

  return doc.body.innerHTML
}
