/*
 * Copyright (C) 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// How we handle the base tag better.
// Current status:
// Currently, the normal way we handle the base tag is:
// a) For those links which have corresponding local saved files, such as
// savable CSS, JavaScript files, they will be written to relative URLs which
// point to local saved file. Why those links can not be resolved as absolute
// file URLs, because if they are resolved as absolute URLs, after moving the
// file location from one directory to another directory, the file URLs will
// be dead links.
// b) For those links which have not corresponding local saved files, such as
// links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// FireFox also uses above way to handle base tag.
//
// Problem:
// This way can not handle the following situation:
// the base tag is written by JavaScript.
// For example. The page "www.yahoo.com" use
// "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL
// of page when loading page. So when saving page as completed-HTML, we assume
// that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved
// completed-HTML page, then the JavaScript will insert a base tag
// <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to
// local saved resource files will be resolved as
// "http://www.yahoo.com/yahoo_files/...", which will cause all saved  resource
// files can not be loaded correctly. Also the page will be rendered ugly since
// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
// files can not be fetched.
// Now FireFox, IE and WebKit based Browser all have this problem.
//
// Solution:
// My solution is that we comment old base tag and write new base tag:
// <base href="." ...> after the previous commented base tag. In WebKit, it
// always uses the latest "href" attribute of base tag to set document's base
// URL. Based on this behavior, when we encounter a base tag, we comment it and
// write a new base tag <base href="."> after the previous commented base tag.
// The new added base tag can help engine to locate correct base URL for
// correctly loading local saved resource files. Also I think we need to inherit
// the base target value from document object when appending new base tag.
// If there are multiple base tags in original document, we will comment all old
// base tags and append new base tag after each old base tag because we do not
// know those old base tags are original content or added by JavaScript. If
// they are added by JavaScript, it means when loading saved page, the script(s)
// will still insert base tag(s) to DOM, so the new added base tag(s) can
// override the incorrect base URL and make sure we always load correct local
// saved resource files.

#include "web/WebFrameSerializerImpl.h"

#include "core/HTMLNames.h"
#include "core/dom/Document.h"
#include "core/dom/DocumentType.h"
#include "core/dom/Element.h"
#include "core/editing/serializers/Serialization.h"
#include "core/frame/FrameSerializer.h"
#include "core/html/HTMLAllCollection.h"
#include "core/html/HTMLElement.h"
#include "core/html/HTMLFormElement.h"
#include "core/html/HTMLFrameElementBase.h"
#include "core/html/HTMLFrameOwnerElement.h"
#include "core/html/HTMLHtmlElement.h"
#include "core/html/HTMLMetaElement.h"
#include "core/loader/DocumentLoader.h"
#include "core/loader/FrameLoader.h"
#include "public/platform/WebCString.h"
#include "public/platform/WebVector.h"
#include "web/WebLocalFrameImpl.h"
#include "wtf/text/TextEncoding.h"

namespace blink {

// Maximum length of the data buffer used to temporarily hold generated HTML
// content before it is encoded and handed to the client. This is a soft
// limit: the check happens only after a string has been appended, so it can
// be exceeded if a very large contiguous string is found in the HTML document.
static const unsigned dataBufferCapacity = 65536;

// Sets up the state that is threaded through the serialization of one frame.
// Records the frame |url|, the output |textEncoding| and the |document| being
// serialized, caches whether that document is an HTML document, and starts
// with every progress flag cleared and no META element marked for skipping.
// |document| is dereferenced here, so it must not be null.
WebFrameSerializerImpl::SerializeDomParam::SerializeDomParam(
    const KURL& url,
    const WTF::TextEncoding& textEncoding,
    Document* document)
    : url(url),
      textEncoding(textEncoding),
      document(document),
      isHTMLDocument(document->isHTMLDocument()),
      haveSeenDocType(false),
      haveAddedCharsetDeclaration(false),
      skipMetaElement(nullptr),
      haveAddedXMLProcessingDirective(false),
      haveAddedContentsBeforeEnd(false) {}

// Builds the markup that has to be written immediately before the open tag of
// |element|, and reports through |needSkip| whether the open tag itself must
// be omitted from the output.
//
// XML documents: emits the XML declaration and the doctype, each at most once.
// HTML documents:
//   - META declaring a valid charset: remembered in |param| and skipped.
//   - HTML element: emits the doctype (once) and the Mark-of-the-Web comment.
//   - BASE element: emits the opening of an HTML comment so that the tag ends
//     up commented out.
String WebFrameSerializerImpl::preActionBeforeSerializeOpenTag(
    const Element* element,
    SerializeDomParam* param,
    bool* needSkip) {
  StringBuilder prefix;
  *needSkip = false;

  if (!param->isHTMLDocument) {
    // The XML declaration is written once, ahead of the first element.
    if (!param->haveAddedXMLProcessingDirective) {
      param->haveAddedXMLProcessingDirective = true;
      // Encoding name preference: the document's XML encoding, then its
      // encoding name, then UTF-8 as the last resort.
      String encodingName = param->document->xmlEncoding();
      if (encodingName.isEmpty())
        encodingName = param->document->encodingName();
      if (encodingName.isEmpty())
        encodingName = UTF8Encoding().name();
      prefix.append("<?xml version=\"");
      prefix.append(param->document->xmlVersion());
      prefix.append("\" encoding=\"");
      prefix.append(encodingName);
      if (param->document->xmlStandalone())
        prefix.append("\" standalone=\"yes");
      prefix.append("\"?>\n");
    }
    // The doctype declaration, if the original document has one, is also
    // written only once.
    if (!param->haveSeenDocType) {
      param->haveSeenDocType = true;
      prefix.append(createMarkup(param->document->doctype()));
    }
    return prefix.toString();
  }

  DCHECK(element);
  // The original charset META is dropped because a replacement META with the
  // correct charset is written right after the open tag of HEAD.
  const bool declaresCharset =
      isHTMLMetaElement(element) &&
      toHTMLMetaElement(element)->computeEncoding().isValid();
  if (declaresCharset) {
    // Remember the element so that its end tag can be skipped as well.
    param->skipMetaElement = element;
    *needSkip = true;
  } else if (isHTMLHtmlElement(*element)) {
    // The doctype declaration goes in front of the HTML element.
    if (!param->haveSeenDocType) {
      param->haveSeenDocType = true;
      prefix.append(createMarkup(param->document->doctype()));
    }

    // The MOTW declaration is placed just before the html tag.
    // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
    prefix.append(
        WebFrameSerializer::generateMarkOfTheWebDeclaration(param->url));
  } else if (isHTMLBaseElement(*element)) {
    // Open the comment that hides the original BASE tag.
    prefix.append("<!--");
  }
  return prefix.toString();
}

// Builds the markup that has to follow the open tag of |element|.
// Always resets |param->haveAddedContentsBeforeEnd|; sets it back to true only
// when content is actually injected. The only injection is a META charset
// declaration as the first child of the first HEAD element of an HTML
// document.
String WebFrameSerializerImpl::postActionAfterSerializeOpenTag(
    const Element* element,
    SerializeDomParam* param) {
  StringBuilder addedMarkup;

  param->haveAddedContentsBeforeEnd = false;

  const bool shouldInjectCharsetMeta = param->isHTMLDocument &&
                                       !param->haveAddedCharsetDeclaration &&
                                       isHTMLHeadElement(*element);
  if (shouldInjectCharsetMeta) {
    param->haveAddedCharsetDeclaration = true;
    // WebKit only pre-parses the first 512 bytes of the document. If the
    // whole <HEAD> is larger and the meta is at the end of the head part,
    // such documents are not decoded correctly. Writing the META as the
    // first child of the head tag avoids that.
    // See http://bugs.webkit.org/show_bug.cgi?id=16621.
    addedMarkup.append(WebFrameSerializer::generateMetaCharsetDeclaration(
        String(param->textEncoding.name())));

    param->haveAddedContentsBeforeEnd = true;
    // Every original META that declares a charset is skipped later, in
    // preActionBeforeSerializeOpenTag.
  }

  return addedMarkup.toString();
}

// Decides whether the end tag of |element| must be omitted and returns the
// markup to write before it (currently always a null string).
// The end tag is skipped only in HTML documents, and only for the element
// that was recorded as the charset META to skip. There is no need to check
// that it is a META tag: skipMetaElement is only ever set to a META element.
String WebFrameSerializerImpl::preActionBeforeSerializeEndTag(
    const Element* element,
    SerializeDomParam* param,
    bool* needSkip) {
  *needSkip = param->isHTMLDocument && param->skipMetaElement == element;
  return String();
}

// Called once the end tag of |element| has been serialized, giving the
// element a chance to append extra markup after itself.
// For a BASE element in an HTML document this closes the comment opened
// before its open tag and then writes a replacement base tag that carries the
// document's base target. Every other case yields an empty string.
String WebFrameSerializerImpl::postActionAfterSerializeEndTag(
    const Element* element,
    SerializeDomParam* param) {
  StringBuilder trailer;

  if (param->isHTMLDocument && isHTMLBaseElement(*element)) {
    // Finish commenting out the original BASE tag.
    trailer.append("-->");
    // Follow it with the new base tag declaration.
    trailer.append(WebFrameSerializer::generateBaseTagDeclaration(
        param->document->baseTarget()));
  }

  return trailer.toString();
}

// Appends |result| to the pending data buffer and then gives the buffer a
// chance to flush. The flush is not forced, so data is only sent to the
// client once the buffer has grown past its capacity.
void WebFrameSerializerImpl::saveHTMLContentToBuffer(const String& result,
                                                     SerializeDomParam* param) {
  m_dataBuffer.append(result);

  // More content may still follow for this frame.
  const auto frameStatus = WebFrameSerializerClient::CurrentFrameIsNotFinished;
  encodeAndFlushBuffer(frameStatus, param, DoNotForceFlush);
}

// Encodes the buffered markup with the frame's text encoding and delivers it
// to the client together with |status|. Nothing happens unless the buffer has
// grown past dataBufferCapacity or |flushOption| is ForceFlush. Characters
// the encoding cannot represent are written as entities.
void WebFrameSerializerImpl::encodeAndFlushBuffer(
    WebFrameSerializerClient::FrameSerializationStatus status,
    SerializeDomParam* param,
    FlushOption flushOption) {
  const bool mustFlush = flushOption == ForceFlush;
  if (!mustFlush && m_dataBuffer.length() <= dataBufferCapacity)
    return;

  // Take the pending text out of the buffer before encoding it.
  String pendingText = m_dataBuffer.toString();
  m_dataBuffer.clear();

  CString encodedBytes =
      param->textEncoding.encode(pendingText, WTF::EntitiesForUnencodables);

  // Hand the encoded chunk over to the client.
  m_client->didSerializeDataForFrame(WebCString(encodedBytes), status);
}

// TODO(yosin): We should utilize |MarkupFormatter| here to share code,
// especially escaping attribute values, done by |WebEntities| |m_htmlEntities|
// and |m_xmlEntities|.
//
// Appends ` attrName="attrValue"` to |result|. The value is passed through
// the HTML entity converter when |isHTMLDocument| is true and through the XML
// entity converter otherwise; the name is written as given.
void WebFrameSerializerImpl::appendAttribute(StringBuilder& result,
                                             bool isHTMLDocument,
                                             const String& attrName,
                                             const String& attrValue) {
  auto& entityConverter = isHTMLDocument ? m_htmlEntities : m_xmlEntities;

  result.append(' ');
  result.append(attrName);
  result.append("=\"");
  result.append(entityConverter.convertEntitiesInString(attrValue));
  result.append('\"');
}

// Serializes the open tag of |element| (with all of its attributes) into the
// data buffer, surrounded by whatever the pre/post open-tag actions add.
// Link attributes are rewritten on the way: frame sources and links the
// delegate knows a local copy for are replaced by the delegate's value, and
// all other links are made absolute. "javascript:" links are left untouched.
void WebFrameSerializerImpl::openTagToString(Element* element,
                                             SerializeDomParam* param) {
  StringBuilder markup;
  bool skipOpenTag;
  // Give the pre action a chance to add a prefix or veto the tag entirely.
  markup.append(preActionBeforeSerializeOpenTag(element, param, &skipOpenTag));
  if (skipOpenTag)
    return;

  // Start the tag itself.
  markup.append('<');
  markup.append(element->nodeName().lower());

  // Work out whether frame-specific link rewriting applies to this element.
  WebFrame* ownedFrame =
      element->isFrameOwnerElement()
          ? WebFrame::fromFrame(
                toHTMLFrameOwnerElement(element)->contentFrame())
          : nullptr;
  WebString frameLink;
  const bool shouldRewriteFrameSrc =
      ownedFrame && m_delegate->rewriteFrameSource(ownedFrame, &frameLink);
  bool wroteFrameLink = false;

  // Serialize every attribute in order.
  for (const auto& attribute : element->attributes()) {
    const QualifiedName& name = attribute.name();

    // srcdoc is dropped when a src attribute will be emitted (for frames).
    if (shouldRewriteFrameSrc && name == HTMLNames::srcdocAttr)
      continue;

    String value = attribute.value();

    // Links that start with "javascript:" are never changed.
    const bool isRewritableLink =
        element->hasLegalLinkAttribute(name) &&
        !value.startsWith("javascript:", TextCaseInsensitive);
    if (isRewritableLink) {
      // Resolve the link against the document first.
      KURL absoluteURL = param->document->completeURL(value);

      // Prefer the frame link, then a local file, then the absolute URL.
      WebString localLink;
      if (shouldRewriteFrameSrc) {
        value = frameLink;
        wroteFrameLink = true;
      } else if (m_delegate->rewriteLink(absoluteURL, &localLink)) {
        value = localLink;
      } else {
        value = absoluteURL;
      }
    }

    appendAttribute(markup, param->isHTMLDocument, name.toString(), value);
  }

  // When frame link rewriting was requested, make sure a src attribute is
  // written even if the original element had none (mainly needed for iframes
  // that have srcdoc but no src attribute).
  const bool needsSyntheticSrc = shouldRewriteFrameSrc && !wroteFrameLink &&
                                 isHTMLIFrameElement(element);
  if (needsSyntheticSrc) {
    appendAttribute(markup, param->isHTMLDocument,
                    HTMLNames::srcAttr.toString(), frameLink);
  }

  // The post action must run before haveAddedContentsBeforeEnd is consulted,
  // because it is the one that updates that flag.
  const String injectedContents =
      postActionAfterSerializeOpenTag(element, param);
  // Close the open tag only if something will follow inside the element.
  if (element->hasChildren() || param->haveAddedContentsBeforeEnd)
    markup.append('>');
  // Then write whatever the post action generated.
  markup.append(injectedContents);

  saveHTMLContentToBuffer(markup.toString(), param);
}

// Serializes the end of |element| into the data buffer.
// Elements whose open tag was already completed get a regular closing tag.
// For the rest, the still-unterminated open tag is finished here: HTML
// documents get '>' plus a closing tag unless the element forbids one, and
// XML documents get a self-closing " />".
void WebFrameSerializerImpl::endTagToString(Element* element,
                                            SerializeDomParam* param) {
  StringBuilder markup;
  bool skipEndTag;
  // Give the pre action a chance to add a prefix or veto the tag entirely.
  markup.append(preActionBeforeSerializeEndTag(element, param, &skipEndTag));
  if (skipEndTag)
    return;

  auto appendClosingTag = [&markup, element]() {
    markup.append("</");
    markup.append(element->nodeName().lower());
    markup.append('>');
  };

  // True when the open tag was terminated with '>' because the element has
  // children or had contents injected after its open tag.
  const bool openTagWasCompleted =
      element->hasChildren() || param->haveAddedContentsBeforeEnd;
  if (openTagWasCompleted) {
    appendClosingTag();
  } else if (!param->isHTMLDocument) {
    // XML-based document: self-close the empty element.
    markup.append(" />");
  } else {
    markup.append('>');
    // FIXME: This code is horribly wrong.  WebFrameSerializerImpl must die.
    const bool forbidsEndTag = element->isHTMLElement() &&
                               toHTMLElement(element)->ieForbidsInsertHTML();
    // Empty elements still get an end tag whenever one is required.
    if (!forbidsEndTag)
      appendClosingTag();
  }

  // Let the element append trailing markup after its end tag.
  markup.append(postActionAfterSerializeEndTag(element, param));

  saveHTMLContentToBuffer(markup.toString(), param);
}

// Recursively serializes |node| and its subtree into the data buffer.
// Elements are written as open tag, children, end tag. Every other node type
// that may appear is written with its default markup; a doctype node
// additionally marks the doctype as seen. Attribute, document and document
// fragment nodes are not expected here.
void WebFrameSerializerImpl::buildContentForNode(Node* node,
                                                 SerializeDomParam* param) {
  switch (node->getNodeType()) {
    case Node::kElementNode: {
      Element* element = toElement(node);
      // Open tag first.
      openTagToString(element, param);
      // Then each child subtree, in document order.
      Node* child = node->firstChild();
      while (child) {
        buildContentForNode(child, param);
        child = child->nextSibling();
      }
      // Finally the end tag.
      endTagToString(element, param);
      return;
    }
    case Node::kAttributeNode:
    case Node::kDocumentNode:
    case Node::kDocumentFragmentNode:
      // Should not exist.
      NOTREACHED();
      return;
    // Document type node can be in DOM?
    case Node::kDocumentTypeNode:
      param->haveSeenDocType = true;
      break;
    default:
      break;
  }

  // Text, doctype and all remaining node types use the default markup.
  saveHTMLContentToBuffer(createMarkup(node), param);
}

// Creates a serializer for |frame|.
// |client| receives the serialized data chunks and |delegate| is consulted
// when links and frame sources are rewritten. All three arguments are
// expected to be non-null (checked with DCHECK only). Two entity converters
// are set up, one for HTML output and one for XML output.
WebFrameSerializerImpl::WebFrameSerializerImpl(
    WebLocalFrame* frame,
    WebFrameSerializerClient* client,
    WebFrameSerializer::LinkRewritingDelegate* delegate)
    : m_client(client),
      m_delegate(delegate),
      m_htmlEntities(false),
      m_xmlEntities(true) {
  // Must specify available webframe.
  DCHECK(frame);
  m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);
  // Make sure we have non null client and delegate.
  DCHECK(client);
  DCHECK(delegate);

  // The output buffer is expected to start out empty.
  DCHECK(m_dataBuffer.isEmpty());
}

bool WebFrameSerializerImpl::serialize() {
  bool didSerialization = false;

  Document* document = m_specifiedWebLocalFrameImpl->frame()->document();
  const KURL& url = document->url();

  if (url.isValid()) {
    didSerialization = true;

    const WTF::TextEncoding& textEncoding =
        document->encoding().isValid() ? document->encoding() : UTF8Encoding();
    if (textEncoding.isNonByteBasedEncoding()) {
      const UChar byteOrderMark = 0xFEFF;
      m_dataBuffer.append(byteOrderMark);
    }

    SerializeDomParam param(url, textEncoding, document);

    Element* documentElement = document->documentElement();
    if (documentElement)
      buildContentForNode(documentElement, &param);

    encodeAndFlushBuffer(WebFrameSerializerClient::CurrentFrameIsFinished,
                         &param, ForceFlush);
  } else {
    // Report empty contents for invalid URLs.
    m_client->didSerializeDataForFrame(
        WebCString(), WebFrameSerializerClient::CurrentFrameIsFinished);
  }

  DCHECK(m_dataBuffer.isEmpty());
  return didSerialization;
}

}  // namespace blink