/**
 * WordFilter.js
 *
 * Released under LGPL License.
 * Copyright (c) 1999-2017 Ephox Corp. All rights reserved
 *
 * License: http://www.tinymce.com/license
 * Contributing: http://www.tinymce.com/contributing
 */
/**
 * This class parses word HTML into proper TinyMCE markup.
 *
 * @class tinymce.pasteplugin.WordFilter
 * @private
 */
define(
  'tinymce.plugins.paste.core.WordFilter',
  [
    'tinymce.core.html.DomParser',
    'tinymce.core.html.Node',
    'tinymce.core.html.Schema',
    'tinymce.core.html.Serializer',
    'tinymce.core.util.Tools',
    'tinymce.plugins.paste.api.Settings',
    'tinymce.plugins.paste.core.Utils'
  ],
  function (DomParser, Node, Schema, Serializer, Tools, Settings, Utils) {
    /**
     * Detects whether the specified HTML content carries one of the markup signatures
     * left behind by MS Word, Office 365 or Google docs.
     *
     * @param {String} content HTML content to inspect.
     * @return {Boolean} true if one of the known signatures is found, otherwise false.
     */
    function isWordContent(content) {
      // Times New Roman font tags, Mso classes, mso- styles or the w:WordDocument tag (case insensitive)
      var wordSignature = /<font face="Times New Roman"|class="?Mso|style="[^"]*\bmso-|style='[^'']*\bmso-|w:WordDocument/i;
      // Elements using the OutlineElement class (case sensitive)
      var outlineElementSignature = /class="OutlineElement/;
      // Elements using a docs-internal-guid- id (case sensitive)
      var docsGuidSignature = /id="?docs\-internal\-guid\-/;

      if (wordSignature.test(content)) {
        return true;
      }

      if (outlineElementSignature.test(content)) {
        return true;
      }

      return docsGuidSignature.test(content);
    }
    /**
     * Checks if the specified text starts with an ordered list marker such as
     * "1. ", "a. ", "B) " or "iv. ". Leading spaces and non-breaking spaces are ignored.
     *
     * @param {String} text Text to check.
     * @return {Boolean} true if the text starts with a list marker, otherwise false.
     */
    function isNumericList(text) {
      var i;
      // Each marker must be followed by a space or a non-breaking space
      var patterns = [
        /^[IVXLMCD]{1,2}\.[ \u00a0]/,  // Roman upper case
        /^[ivxlmcd]{1,2}\.[ \u00a0]/,  // Roman lower case
        /^[a-z]{1,2}[\.\)][ \u00a0]/,  // Alphabetical a-z
        /^[A-Z]{1,2}[\.\)][ \u00a0]/,  // Alphabetical A-Z
        /^[0-9]+\.[ \u00a0]/,          // Numeric lists
        /^[\u3007\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d]+\.[ \u00a0]/, // Japanese
        /^[\u58f1\u5f10\u53c2\u56db\u4f0d\u516d\u4e03\u516b\u4e5d\u62fe]+\.[ \u00a0]/  // Chinese
      ];

      // Strip leading spaces/nbsp so the patterns can be anchored to the start
      text = text.replace(/^[\u00a0 ]+/, '');

      for (i = 0; i < patterns.length; i++) {
        if (patterns[i].test(text)) {
          return true;
        }
      }

      // Always return a real boolean, never undefined
      return false;
    }
    /**
     * Checks if the specified text starts with a bullet character, optionally
     * preceded by whitespace or non-breaking spaces.
     *
     * @param {String} text Text to check.
     * @return {Boolean} true if the text starts with a bullet character, otherwise false.
     */
    function isBulletList(text) {
      var bulletStartPattern = /^[\s\u00a0]*[\u2022\u00b7\u00a7\u25CF]\s*/;

      return bulletStartPattern.test(text);
    }
    /**
     * Converts fake bullet and numbered lists to real semantic OL/UL.
     *
     * Root level paragraphs whose text starts with a bullet or an ordered list marker, or
     * that carry a _listLevel property, are renamed to LI and grouped into UL/OL elements.
     * The _listLevel property of a paragraph controls the nesting of the generated lists.
     *
     * @param {tinymce.html.Node} node Root node to convert children of.
     */
    function convertFakeListsToProperLists(node) {
      // currentListNode: list that receives the next item, prevListNode: list a nested list is
      // attached to, lastLevel: list level of the previously converted paragraph
      var currentListNode, prevListNode, lastLevel = 1;

      // Returns the concatenated values of all text nodes (type 3) found in the specified node
      function getText(node) {
        var txt = '';

        if (node.type === 3) {
          return node.value;
        }

        if ((node = node.firstChild)) {
          do {
            txt += getText(node);
          } while ((node = node.next));
        }

        return txt;
      }

      // Removes the regExp match from the first text node that matches it. Returns false as
      // soon as a text node was trimmed, this also stops the traversal, otherwise true.
      function trimListStart(node, regExp) {
        if (node.type === 3) {
          if (regExp.test(node.value)) {
            node.value = node.value.replace(regExp, '');
            return false;
          }
        }

        if ((node = node.firstChild)) {
          do {
            if (!trimListStart(node, regExp)) {
              return false;
            }
          } while ((node = node.next));
        }

        return true;
      }

      // Removes nodes flagged with _listIgnore, nodes below a removed node are not visited
      function removeIgnoredNodes(node) {
        if (node._listIgnore) {
          node.remove();
          return;
        }

        if ((node = node.firstChild)) {
          do {
            removeIgnoredNodes(node);
          } while ((node = node.next));
        }
      }

      // Turns the paragraph into a LI element and places it in a list named listName (ul/ol).
      // A start attribute is added to newly created lists when start is greater than 1.
      function convertParagraphToLi(paragraphNode, listName, start) {
        // Paragraphs without a level stay on the level of the previous item
        var level = paragraphNode._listLevel || lastLevel;

        // Handle list nesting
        if (level !== lastLevel) {
          if (level < lastLevel) {
            // Move to parent list
            if (currentListNode) {
              currentListNode = currentListNode.parent.parent;
            }
          } else {
            // Create new list
            prevListNode = currentListNode;
            currentListNode = null;
          }
        }

        if (!currentListNode || currentListNode.name !== listName) {
          prevListNode = prevListNode || currentListNode;
          currentListNode = new Node(listName, 1);

          if (start > 1) {
            currentListNode.attr('start', '' + start);
          }

          paragraphNode.wrap(currentListNode);
        } else {
          currentListNode.append(paragraphNode);
        }

        paragraphNode.name = 'li';

        // Append list to previous list if it exists
        if (level > lastLevel && prevListNode) {
          prevListNode.lastChild.append(currentListNode);
        }

        lastLevel = level;

        // Remove start of list item "1. " or "· " etc
        removeIgnoredNodes(paragraphNode);
        trimListStart(paragraphNode, /^\u00a0+/);
        trimListStart(paragraphNode, /^\s*([\u2022\u00b7\u00a7\u25CF]|\w+\.)/);
        trimListStart(paragraphNode, /^\u00a0+/);
      }

      // Build a list of all root level elements before we start
      // altering them in the loop below.
      var elements = [], child;
      for (child = node.firstChild; child; child = child.next) {
        elements.push(child);
      }

      for (var i = 0; i < elements.length; i++) {
        var element = elements[i];

        if (element.name === 'p' && element.firstChild) {
          // Collect the text contents of the paragraph
          var nodeText = getText(element);

          // Detect unordered lists look for bullets
          if (isBulletList(nodeText)) {
            convertParagraphToLi(element, 'ul');
            continue;
          }

          // Detect ordered lists 1., a. or ixv.
          if (isNumericList(nodeText)) {
            // Parse OL start number. The pattern is anchored to the leading marker so a
            // number appearing later in the text, "a. see 12. foo", isn't used as start.
            var matches = /^[\u00a0 ]*([0-9]+)\./.exec(nodeText);
            var start = 1;

            if (matches) {
              start = parseInt(matches[1], 10);
            }

            convertParagraphToLi(element, 'ol', start);
            continue;
          }

          // Convert paragraphs marked as lists but doesn't look like anything
          if (element._listLevel) {
            convertParagraphToLi(element, 'ul', 1);
            continue;
          }

          currentListNode = null;
        } else {
          // If the root level element isn't a p tag which can be
          // processed by convertParagraphToLi, it interrupts the
          // lists, causing a new list to start instead of having
          // elements from the next list inserted above this tag.
          prevListNode = currentListNode;
          currentListNode = null;
        }
      }
    }
    /**
     * Filters the style attribute value of a node. MS specific styles are converted to
     * their W3C names or dropped, and only styles that should be retained are kept.
     *
     * Side effects on the node: mso-list styles set the _listLevel/_listIgnore properties,
     * comment related styles remove the node and bold/italic styles wrap the node in
     * b/i elements.
     *
     * @param {tinymce.Editor} editor Editor instance used to parse/serialize the styles.
     * @param {Object} validStyles Optional name/value map of the style names to retain.
     * @param {tinymce.html.Node} node Node that owns the style attribute.
     * @param {String} styleValue Raw style attribute value.
     * @return {String} Serialized styles to keep or null if there is nothing left.
     */
    function filterStyles(editor, validStyles, node, styleValue) {
      var outputStyles = {}, matches, styles = editor.dom.parseStyle(styleValue);
      // Doesn't change while iterating so only look it up once
      var retainAllStyles = Settings.getRetainStyleProps(editor) === "all";

      Tools.each(styles, function (value, name) {
        // Convert various MS styles to W3C styles
        switch (name) {
          case 'mso-list':
            // Parse out list indent level for lists. Only the value of this declaration is
            // checked, otherwise an unrelated declaration like "margin:0in 0in 10pt" that
            // comes first in the style attribute would be picked up as the level.
            matches = /\w+ \w+([0-9]+)/i.exec(value);
            if (matches) {
              node._listLevel = parseInt(matches[1], 10);
            }

            // Remove these nodes <span style="mso-list:Ignore">o</span>
            // Since the span gets removed we mark the text node and the span
            if (/Ignore/i.test(value) && node.firstChild) {
              node._listIgnore = true;
              node.firstChild._listIgnore = true;
            }

            break;

          case "horiz-align":
            name = "text-align";
            break;

          case "vert-align":
            name = "vertical-align";
            break;

          case "font-color":
          case "mso-foreground":
            name = "color";
            break;

          case "mso-background":
          case "mso-highlight":
            name = "background";
            break;

          case "font-weight":
          case "font-style":
            // Kept regardless of validStyles so they can be converted to b/i elements below
            if (value !== "normal") {
              outputStyles[name] = value;
            }
            return;

          case "mso-element":
            // Remove track changes code
            if (/^(comment|comment-list)$/i.test(value)) {
              node.remove();
              return;
            }

            break;
        }

        if (name.indexOf('mso-comment') === 0) {
          node.remove();
          return;
        }

        // Never allow mso- prefixed names
        if (name.indexOf('mso-') === 0) {
          return;
        }

        // Output only valid styles
        if (retainAllStyles || (validStyles && validStyles[name])) {
          outputStyles[name] = value;
        }
      });

      // Convert bold style to "b" element
      if (/(bold)/i.test(outputStyles["font-weight"])) {
        delete outputStyles["font-weight"];
        node.wrap(new Node("b", 1));
      }

      // Convert italic style to "i" element
      if (/(italic)/i.test(outputStyles["font-style"])) {
        delete outputStyles["font-style"];
        node.wrap(new Node("i", 1));
      }

      // Serialize the styles and see if there is something left to keep
      outputStyles = editor.dom.serializeStyle(outputStyles, node.name);
      if (outputStyles) {
        return outputStyles;
      }

      return null;
    }
    /**
     * Filters word HTML into proper TinyMCE markup. The content is first cleaned using
     * regular expressions, then parsed with a schema built from the word valid elements
     * setting, filtered on style/class/del/a nodes and finally serialized back to HTML.
     *
     * @param {tinymce.Editor} editor Editor instance to read the settings from.
     * @param {String} content Word HTML content to filter.
     * @return {String} Filtered HTML content.
     */
    var filterWordContent = function (editor, content) {
      var retainStyleProperties, validStyles;

      retainStyleProperties = Settings.getRetainStyleProps(editor);
      if (retainStyleProperties) {
        validStyles = Tools.makeMap(retainStyleProperties.split(/[, ]/));
      }

      // Remove basic Word junk
      content = Utils.filter(content, [
        // Remove apple new line markers
        /<br class="?Apple-interchange-newline"?>/gi,

        // Remove google docs internal guid markers
        /<b[^>]+id="?docs-internal-[^>]*>/gi,

        // Word comments like conditional comments etc
        /<!--[\s\S]+?-->/gi,

        // Remove comments, scripts (e.g., msoShowComment), XML tag, VML content,
        // MS Office namespaced tags, and a few other tags
        /<(!|script[^>]*>.*?<\/script(?=[>\s])|\/?(\?xml(:\w+)?|img|meta|link|style|\w:\w+)(?=[\s\/>]))[^>]*>/gi,

        // Convert <s> into <strike> for line-through
        [/<(\/?)s>/gi, "<$1strike>"],

        // Replace &nbsp; entities with the nbsp character since it's easier to handle.
        // The pattern has to match the entity, not a literal whitespace character.
        [/&nbsp;/gi, "\u00a0"],

        // Convert <span style="mso-spacerun:yes">___</span> to string of alternating
        // breaking/non-breaking spaces of same length
        [/<span\s+style\s*=\s*"\s*mso-spacerun\s*:\s*yes\s*;?\s*"\s*>([\s\u00a0]*)<\/span>/gi,
          function (str, spaces) {
            return (spaces.length > 0) ?
              spaces.replace(/./, " ").slice(Math.floor(spaces.length / 2)).split("").join("\u00a0") : "";
          }
        ]
      ]);

      var validElements = Settings.getWordValidElements(editor);

      // Setup strict schema
      var schema = new Schema({
        valid_elements: validElements,
        valid_children: '-li[p]'
      });

      // Add style/class attribute to all element rules since the user might have removed them from
      // paste_word_valid_elements config option and we need to check them for properties
      Tools.each(schema.elements, function (rule) {
        /*eslint dot-notation:0*/
        if (!rule.attributes["class"]) {
          rule.attributes["class"] = {};
          rule.attributesOrder.push("class");
        }

        if (!rule.attributes.style) {
          rule.attributes.style = {};
          rule.attributesOrder.push("style");
        }
      });

      // Parse HTML into DOM structure
      var domParser = new DomParser({}, schema);

      // Filter styles to remove "mso" specific styles and convert some of them
      domParser.addAttributeFilter('style', function (nodes) {
        var i = nodes.length, node;

        while (i--) {
          node = nodes[i];
          node.attr('style', filterStyles(editor, validStyles, node, node.attr('style')));

          // Remove pointless spans
          if (node.name === 'span' && node.parent && !node.attributes.length) {
            node.unwrap();
          }
        }
      });

      // Check the class attribute for comments or del items and remove those
      domParser.addAttributeFilter('class', function (nodes) {
        var i = nodes.length, node, className;

        while (i--) {
          node = nodes[i];

          className = node.attr('class');
          if (/^(MsoCommentReference|MsoCommentText|msoDel)$/i.test(className)) {
            node.remove();
          }

          // Class names are never kept
          node.attr('class', null);
        }
      });

      // Remove all del elements since we don't want the track changes code in the editor
      domParser.addNodeFilter('del', function (nodes) {
        var i = nodes.length;

        while (i--) {
          nodes[i].remove();
        }
      });

      // Keep some of the links and anchors
      domParser.addNodeFilter('a', function (nodes) {
        var i = nodes.length, node, href, name;

        while (i--) {
          node = nodes[i];
          href = node.attr('href');
          name = node.attr('name');

          // Remove links pointing to word comments
          if (href && href.indexOf('#_msocom_') !== -1) {
            node.remove();
            continue;
          }

          // Reduce local file links to their hash part, if they have one
          if (href && href.indexOf('file://') === 0) {
            href = href.split('#')[1];
            if (href) {
              href = '#' + href;
            }
          }

          if (!href && !name) {
            node.unwrap();
          } else {
            // Remove all named anchors that aren't specific to TOC, Footnotes or Endnotes
            if (name && !/^_?(?:toc|edn|ftn)/i.test(name)) {
              node.unwrap();
              continue;
            }

            node.attr({
              href: href,
              name: name
            });
          }
        }
      });

      // Parse into DOM structure
      var rootNode = domParser.parse(content);

      // Process DOM
      if (Settings.shouldConvertWordFakeLists(editor)) {
        convertFakeListsToProperLists(rootNode);
      }

      // Serialize DOM back to HTML
      content = new Serializer({
        validate: editor.settings.validate
      }, schema).serialize(rootNode);

      return content;
    };
    /**
     * Runs the word filter on the content if the default filters are enabled for
     * the editor, otherwise the content is passed through as is.
     *
     * @param {tinymce.Editor} editor Editor instance to read the settings from.
     * @param {String} content HTML content to process.
     * @return {String} Filtered content or the untouched input content.
     */
    var preProcess = function (editor, content) {
      if (!Settings.shouldUseDefaultFilters(editor)) {
        return content;
      }

      return filterWordContent(editor, content);
    };
    return {
      preProcess: preProcess,
      isWordContent: isWordContent
    };
  }
);