READABILITY_SCRIPT

Constant READABILITY_SCRIPT 

Source
/// Mozilla Readability.js library (minified).
///
/// This is the complete Readability algorithm from Mozilla, used to extract
/// the main content from web pages while filtering out navigation, ads,
/// sidebars, etc. The script defines a `Readability(document, options)`
/// constructor whose `parse()` method returns an object with `title`,
/// `byline`, `dir`, `lang`, `content`, `textContent`, `length`, `excerpt`,
/// `siteName` and `publishedTime`, or `null` when no article is found.
///
/// Source: <https://github.com/mozilla/readability>
///
/// License: Apache-2.0
//
// NOTE: keep the literal on ONE physical line. A raw line break inside a Rust
// string literal becomes a real newline in the JavaScript, and the minified
// code is not newline-safe: a newline inside a regex or quoted string literal
// is a syntax error, and a newline right after `return` triggers automatic
// semicolon insertion so the function returns `undefined`.
pub const READABILITY_SCRIPT: &str = "function Readability(e,t){if(t&&t.documentElement)e=t,t=arguments[2];else if(!e||!e.documentElement)throw new Error(\"First argument to Readability constructor should be a document object.\");if(t=t||{},this._doc=e,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._metadata={},this._debug=!!t.debug,this._maxElemsToParse=t.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=t.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=t.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(t.classesToPreserve||[]),this._keepClasses=!!t.keepClasses,this._serializer=t.serializer||function(e){return e.innerHTML},this._disableJSONLD=!!t.disableJSONLD,this._allowedVideoRegex=t.allowedVideoRegex||this.REGEXPS.videos,this._linkDensityModifier=t.linkDensityModifier||0,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY,this._debug){let t=function(e){if(e.nodeType==e.TEXT_NODE)return`${e.nodeName} (\"${e.textContent}\")`;var t=Array.from(e.attributes||[],function(e){return`${e.name}=\"${e.value}\"`}).join(\" \");return`<${e.localName} ${t}>`};this.log=function(){if(\"undefined\"!=typeof console){let e=Array.from(arguments,e=>e&&e.nodeType==this.ELEMENT_NODE?t(e):e);e.unshift(\"Reader: (Readability)\"),console.log(...e)}else{var e;\"undefined\"!=typeof dump&&(e=Array.prototype.map.call(arguments,function(e){return e&&e.nodeName?t(e):e}).join(\" \"),dump(\"Reader: (Readability) \"+e+\"\\n\"))}}}else this.log=function(){}}Readability.prototype={FLAG_STRIP_UNLIKELYS:1,FLAG_WEIGHT_CLASSES:2,FLAG_CLEAN_CONDITIONALLY:4,ELEMENT_NODE:1,TEXT_NODE:3,DEFAULT_MAX_ELEMS_TO_PARSE:0,DEFAULT_N_TOP_CANDIDATES:5,DEFAULT_TAGS_TO_SCORE:\"section,h2,h3,h4,h5,h6,p,td,pre\".toUpperCase().split(\",\"),DEFAULT_CHAR_THRESHOLD:500,REGEXPS:{unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow/i,positive:/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,negative:/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i,extraneous:/print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility/i,byline:/byline|author|dateline|writtenby|p-author/i,replaceFonts:/<(\\/?)font[^>]*>/gi,normalize:/\\s{2,}/g,videos:/\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq)\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,shareElements:/(\\b|_)(share|sharedaddy)(\\b|_)/i,nextLink:/(next|weiter|continue|>([^\\|]|$)|\u{bb}([^\\|]|$))/i,prevLink:/(prev|earl|old|new|<|\u{ab})/i,tokenize:/\\W+/g,whitespace:/^\\s*$/,hasContent:/\\S$/,hashUrl:/^#.+/,srcsetUrl:/(\\S+)(\\s+[\\d.]+[xw])?(\\s*(?:,|$))/g,b64DataUrl:/^data:\\s*([^\\s;,]+)\\s*;\\s*base64\\s*,/i,commas:/\\u002C|\\u060C|\\uFE50|\\uFE10|\\uFE11|\\u2E41|\\u2E34|\\u2E32|\\uFF0C/g,jsonLdArticleTypes:/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,adWords:/^(ad(vertising|vertisement)?|pub(licit\u{e9})?|werb(ung)?|\u{5e7f}\u{544a}|\u{420}\u{435}\u{43a}\u{43b}\u{430}\u{43c}\u{430}|Anuncio)$/iu,loadingWords:/^((loading|\u{6b63}\u{5728}\u{52a0}\u{8f7d}|\u{417}\u{430}\u{433}\u{440}\u{443}\u{437}\u{43a}\u{430}|chargement|cargando)(\u{2026}|\\.\\.\\.)?)$/iu},UNLIKELY_ROLES:[\"menu\",\"menubar\",\"complementary\",\"navigation\",\"alert\",\"alertdialog\",\"dialog\"],DIV_TO_P_ELEMS:new Set([\"BLOCKQUOTE\",\"DL\",\"DIV\",\"IMG\",\"OL\",\"P\",\"PRE\",\"TABLE\",\"UL\"]),ALTER_TO_DIV_EXCEPTIONS:[\"DIV\",\"ARTICLE\",\"SECTION\",\"P\",\"OL\",\"UL\"],PRESENTATIONAL_ATTRIBUTES:[\"align\",\"background\",\"bgcolor\",\"border\",\"cellpadding\",\"cellspacing\",\"frame\",\"hspace\",\"rules\",\"style\",\"valign\",\"vspace\"],DEPRECATED_SIZE_ATTRIBUTE_ELEMS:[\"TABLE\",\"TH\",\"TD\",\"HR\",\"PRE\"],PHRASING_ELEMS:[\"ABBR\",\"AUDIO\",\"B\",\"BDO\",\"BR\",\"BUTTON\",\"CITE\",\"CODE\",\"DATA\",\"DATALIST\",\"DFN\",\"EM\",\"EMBED\",\"I\",\"IMG\",\"INPUT\",\"KBD\",\"LABEL\",\"MARK\",\"MATH\",\"METER\",\"NOSCRIPT\",\"OBJECT\",\"OUTPUT\",\"PROGRESS\",\"Q\",\"RUBY\",\"SAMP\",\"SCRIPT\",\"SELECT\",\"SMALL\",\"SPAN\",\"STRONG\",\"SUB\",\"SUP\",\"TEXTAREA\",\"TIME\",\"VAR\",\"WBR\"],CLASSES_TO_PRESERVE:[\"page\"],HTML_ESCAPE_MAP:{lt:\"<\",gt:\">\",amp:\"&\",quot:\'\"\',apos:\"\'\"},_postProcessContent(e){this._fixRelativeUris(e),this._simplifyNestedElements(e),this._keepClasses||this._cleanClasses(e)},_removeNodes(e,t){if(this._docJSDOMParser&&e._isLiveNodeList)throw new Error(\"Do not pass live node lists to _removeNodes\");for(var i=e.length-1;0<=i;i--){var a=e[i],r=a.parentNode;!r||t&&!t.call(this,a,i,e)||r.removeChild(a)}},_replaceNodeTags(e,t){if(this._docJSDOMParser&&e._isLiveNodeList)throw new Error(\"Do not pass live node lists to _replaceNodeTags\");for(const i of e)this._setNodeTag(i,t)},_forEachNode(e,t){Array.prototype.forEach.call(e,t,this)},_findNode(e,t){return Array.prototype.find.call(e,t,this)},_someNode(e,t){return Array.prototype.some.call(e,t,this)},_everyNode(e,t){return Array.prototype.every.call(e,t,this)},_getAllNodesWithTag(t,e){return t.querySelectorAll?t.querySelectorAll(e.join(\",\")):[].concat.apply([],e.map(function(e){e=t.getElementsByTagName(e);return Array.isArray(e)?e:Array.from(e)}))},_cleanClasses(e){var t=this._classesToPreserve,i=(e.getAttribute(\"class\")||\"\").split(/\\s+/).filter(e=>t.includes(e)).join(\" \");for(i?e.setAttribute(\"class\",i):e.removeAttribute(\"class\"),e=e.firstElementChild;e;e=e.nextElementSibling)this._cleanClasses(e)},_isUrl(e){try{return new URL(e),!0}catch{return!1}},_fixRelativeUris(e){var t=this._doc.baseURI,i=this._doc.documentURI;function r(e){if(t==i&&\"#\"==e.charAt(0))return e;try{return new URL(e,t).href}catch(e){}return e}var a=this._getAllNodesWithTag(e,[\"a\"]),a=(this._forEachNode(a,function(e){var t=e.getAttribute(\"href\");if(t)if(0===t.indexOf(\"javascript:\"))if(1===e.childNodes.length&&e.childNodes[0].nodeType===this.TEXT_NODE){var i=this._doc.createTextNode(e.textContent);e.parentNode.replaceChild(i,e)}else{for(var a=this._doc.createElement(\"span\");e.firstChild;)a.appendChild(e.firstChild);e.parentNode.replaceChild(a,e)}else e.setAttribute(\"href\",r(t))}),this._getAllNodesWithTag(e,[\"img\",\"picture\",\"figure\",\"video\",\"audio\",\"source\"]));this._forEachNode(a,function(e){var t=e.getAttribute(\"src\"),i=e.getAttribute(\"poster\"),a=e.getAttribute(\"srcset\");t&&e.setAttribute(\"src\",r(t)),i&&e.setAttribute(\"poster\",r(i)),a&&(t=a.replace(this.REGEXPS.srcsetUrl,function(e,t,i,a){return r(t)+(i||\"\")+a}),e.setAttribute(\"srcset\",t))})},_simplifyNestedElements(e){for(var t=e;t;){if(t.parentNode&&[\"DIV\",\"SECTION\"].includes(t.tagName)&&(!t.id||!t.id.startsWith(\"readability\"))){if(this._isElementWithoutContent(t)){t=this._removeAndGetNext(t);continue}if(this._hasSingleTagInsideElement(t,\"DIV\")||this._hasSingleTagInsideElement(t,\"SECTION\")){for(var i=t.children[0],a=0;a<t.attributes.length;a++)i.setAttributeNode(t.attributes[a].cloneNode());t.parentNode.replaceChild(i,t),t=i;continue}}t=this._getNextNode(t)}},_getArticleTitle(){var e=this._doc,t=\"\",i=\"\";try{\"string\"!=typeof(t=i=e.title.trim())&&(t=i=this._getInnerText(e.getElementsByTagName(\"title\")[0]))}catch(e){}var a,r,s=!1;function n(e){return e.split(/\\s+/).length}if(/ [\\|\\-\\\\\\/>\u{bb}] /.test(t)){s=/ [\\\\\\/>\u{bb}] /.test(t);let e=Array.from(i.matchAll(/ [\\|\\-\\\\\\/>\u{bb}] /gi));n(t=i.substring(0,e.pop().index))<3&&(t=i.replace(/^[^\\|\\-\\\\\\/>\u{bb}]*[\\|\\-\\\\\\/>\u{bb}]/gi,\"\"))}else t.includes(\": \")?(r=this._getAllNodesWithTag(e,[\"h1\",\"h2\"]),a=t.trim(),this._someNode(r,function(e){return e.textContent.trim()===a})||(n(t=i.substring(i.lastIndexOf(\":\")+1))<3?t=i.substring(i.indexOf(\":\")+1):5<n(i.substr(0,i.indexOf(\":\")))&&(t=i))):(150<t.length||t.length<15)&&1===(r=e.getElementsByTagName(\"h1\")).length&&(t=this._getInnerText(r[0]));e=n(t=t.trim().replace(this.REGEXPS.normalize,\" \"));return t=e<=4&&(!s||e!=n(i.replace(/[\\|\\-\\\\\\/>\u{bb}]+/g,\"\"))-1)?i:t},_prepDocument(){var e=this._doc;this._removeNodes(this._getAllNodesWithTag(e,[\"style\"])),e.body&&this._replaceBrs(e.body),this._replaceNodeTags(this._getAllNodesWithTag(e,[\"font\"]),\"SPAN\")},_nextNode(e){for(var t=e;t&&t.nodeType!=this.ELEMENT_NODE&&this.REGEXPS.whitespace.test(t.textContent);)t=t.nextSibling;return t},_replaceBrs(e){this._forEachNode(this._getAllNodesWithTag(e,[\"br\"]),function(e){for(var t=e.nextSibling,i=!1;(t=this._nextNode(t))&&\"BR\"==t.tagName;){var i=!0,a=t.nextSibling;t.remove(),t=a}if(i){var r=this._doc.createElement(\"p\");for(e.parentNode.replaceChild(r,e),t=r.nextSibling;t;){if(\"BR\"==t.tagName){var s=this._nextNode(t.nextSibling);if(s&&\"BR\"==s.tagName)break}if(!this._isPhrasingContent(t))break;s=t.nextSibling;r.appendChild(t),t=s}for(;r.lastChild&&this._isWhitespace(r.lastChild);)r.lastChild.remove();\"P\"===r.parentNode.tagName&&this._setNodeTag(r.parentNode,\"DIV\")}})},_setNodeTag(e,t){if(this.log(\"_setNodeTag\",e,t),this._docJSDOMParser)return e.localName=t.toLowerCase(),e.tagName=t.toUpperCase(),e;for(var i=e.ownerDocument.createElement(t);e.firstChild;)i.appendChild(e.firstChild);e.parentNode.replaceChild(i,e),e.readability&&(i.readability=e.readability);for(var a=0;a<e.attributes.length;a++)i.setAttributeNode(e.attributes[a].cloneNode());return i},_prepArticle(e){this._cleanStyles(e),this._markDataTables(e),this._fixLazyImages(e),this._cleanConditionally(e,\"form\"),this._cleanConditionally(e,\"fieldset\"),this._clean(e,\"object\"),this._clean(e,\"embed\"),this._clean(e,\"footer\"),this._clean(e,\"link\"),this._clean(e,\"aside\");var i=this.DEFAULT_CHAR_THRESHOLD;this._forEachNode(e.children,function(e){this._cleanMatchedNodes(e,function(e,t){return this.REGEXPS.shareElements.test(t)&&e.textContent.length<i})}),this._clean(e,\"iframe\"),this._clean(e,\"input\"),this._clean(e,\"textarea\"),this._clean(e,\"select\"),this._clean(e,\"button\"),this._cleanHeaders(e),this._cleanConditionally(e,\"table\"),this._cleanConditionally(e,\"ul\"),this._cleanConditionally(e,\"div\"),this._replaceNodeTags(this._getAllNodesWithTag(e,[\"h1\"]),\"h2\"),this._removeNodes(this._getAllNodesWithTag(e,[\"p\"]),function(e){return 0===this._getAllNodesWithTag(e,[\"img\",\"embed\",\"object\",\"iframe\"]).length&&!this._getInnerText(e,!1)}),this._forEachNode(this._getAllNodesWithTag(e,[\"br\"]),function(e){var t=this._nextNode(e.nextSibling);t&&\"P\"==t.tagName&&e.remove()}),this._forEachNode(this._getAllNodesWithTag(e,[\"table\"]),function(e){var t=this._hasSingleTagInsideElement(e,\"TBODY\")?e.firstElementChild:e;this._hasSingleTagInsideElement(t,\"TR\")&&(t=t.firstElementChild,this._hasSingleTagInsideElement(t,\"TD\")&&(t=t.firstElementChild,t=this._setNodeTag(t,this._everyNode(t.childNodes,this._isPhrasingContent)?\"P\":\"DIV\"),e.parentNode.replaceChild(t,e)))})},_initializeNode(e){switch(e.readability={contentScore:0},e.tagName){case\"DIV\":e.readability.contentScore+=5;break;case\"PRE\":case\"TD\":case\"BLOCKQUOTE\":e.readability.contentScore+=3;break;case\"ADDRESS\":case\"OL\":case\"UL\":case\"DL\":case\"DD\":case\"DT\":case\"LI\":case\"FORM\":e.readability.contentScore-=3;break;case\"H1\":case\"H2\":case\"H3\":case\"H4\":case\"H5\":case\"H6\":case\"TH\":e.readability.contentScore-=5}e.readability.contentScore+=this._getClassWeight(e)},_removeAndGetNext(e){var t=this._getNextNode(e,!0);return e.remove(),t},_getNextNode(e,t){if(!t&&e.firstElementChild)return e.firstElementChild;if(e.nextElementSibling)return e.nextElementSibling;for(;(e=e.parentNode)&&!e.nextElementSibling;);return e&&e.nextElementSibling},_textSimilarity(e,t){var i=e.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean),e=t.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);return i.length&&e.length?1-e.filter(e=>!i.includes(e)).join(\" \").length/e.join(\" \").length:0},_isValidByline(e,t){var i=e.getAttribute(\"rel\"),a=e.getAttribute(\"itemprop\"),e=e.textContent.trim().length;return(\"author\"===i||a&&a.includes(\"author\")||this.REGEXPS.byline.test(t))&&!!e&&e<100},_getNodeAncestors(e,t){t=t||0;for(var i=0,a=[];e.parentNode&&(a.push(e.parentNode),!t||++i!==t);)e=e.parentNode;return a},_grabArticle(t){this.log(\"**** grabArticle ****\");var i=this._doc,M=null!==t;if(!(t=t||this._doc.body))return this.log(\"No body found in document. Abort.\"),null;for(var H=t.innerHTML;;){this.log(\"Starting grabArticle loop\");var k=this._flagIsActive(this.FLAG_STRIP_UNLIKELYS),a=[],r=this._doc.documentElement;let e=!0;for(;r;){\"HTML\"===r.tagName&&(this._articleLang=r.getAttribute(\"lang\"));var s=r.className+\" \"+r.id;if(this._isProbablyVisible(r))if(\"true\"==r.getAttribute(\"aria-modal\")&&\"dialog\"==r.getAttribute(\"role\"))r=this._removeAndGetNext(r);else if(this._articleByline||this._metadata.byline||!this._isValidByline(r,s))if(e&&this._headerDuplicatesTitle(r))this.log(\"Removing header: \",r.textContent.trim(),this._articleTitle.trim()),e=!1,r=this._removeAndGetNext(r);else{if(k){if(this.REGEXPS.unlikelyCandidates.test(s)&&!this.REGEXPS.okMaybeItsACandidate.test(s)&&!this._hasAncestorTag(r,\"table\")&&!this._hasAncestorTag(r,\"code\")&&\"BODY\"!==r.tagName&&\"A\"!==r.tagName){this.log(\"Removing unlikely candidate - \"+s),r=this._removeAndGetNext(r);continue}if(this.UNLIKELY_ROLES.includes(r.getAttribute(\"role\"))){this.log(\"Removing content with role \"+r.getAttribute(\"role\")+\" - \"+s),r=this._removeAndGetNext(r);continue}}if(\"DIV\"!==r.tagName&&\"SECTION\"!==r.tagName&&\"HEADER\"!==r.tagName&&\"H1\"!==r.tagName&&\"H2\"!==r.tagName&&\"H3\"!==r.tagName&&\"H4\"!==r.tagName&&\"H5\"!==r.tagName&&\"H6\"!==r.tagName||!this._isElementWithoutContent(r)){if(this.DEFAULT_TAGS_TO_SCORE.includes(r.tagName)&&a.push(r),\"DIV\"===r.tagName){for(var n,l=null,o=r.firstChild;o;){var U=o.nextSibling;if(this._isPhrasingContent(o))null!==l?l.appendChild(o):this._isWhitespace(o)||(l=i.createElement(\"p\"),r.replaceChild(l,o),l.appendChild(o));else if(null!==l){for(;l.lastChild&&this._isWhitespace(l.lastChild);)l.lastChild.remove();l=null}o=U}this._hasSingleTagInsideElement(r,\"P\")&&this._getLinkDensity(r)<.25?(n=r.children[0],r.parentNode.replaceChild(n,r),a.push(r=n)):this._hasChildBlockElement(r)||(r=this._setNodeTag(r,\"P\"),a.push(r))}r=this._getNextNode(r)}else r=this._removeAndGetNext(r)}else{for(var W=this._getNextNode(r,!0),h=this._getNextNode(r),d=null;h&&h!=W;){var c=h.getAttribute(\"itemprop\");if(c&&c.includes(\"name\")){d=h;break}h=this._getNextNode(h)}this._articleByline=(d??r).textContent.trim(),r=this._removeAndGetNext(r)}else this.log(\"Removing hidden node - \"+s),r=this._removeAndGetNext(r)}for(var g=[],_=(this._forEachNode(a,function(e){var t,i;!e.parentNode||void 0===e.parentNode.tagName||(t=this._getInnerText(e)).length<25||0!==(e=this._getNodeAncestors(e,5)).length&&(i=0,++i,i=(i+=t.split(this.REGEXPS.commas).length)+Math.min(Math.floor(t.length/100),3),this._forEachNode(e,function(e,t){e.tagName&&e.parentNode&&void 0!==e.parentNode.tagName&&(void 0===e.readability&&(this._initializeNode(e),g.push(e)),e.readability.contentScore+=i/(0===t?1:1===t?2:3*t))}))}),[]),m=0,F=g.length;m<F;m+=1){var u=g[m],p=u.readability.contentScore*(1-this._getLinkDensity(u));u.readability.contentScore=p,this.log(\"Candidate:\",u,\"with score \"+p);for(var N=0;N<this._nbTopCandidates;N++){var E=_[N];if(!E||p>E.readability.contentScore){_.splice(N,0,u),_.length>this._nbTopCandidates&&_.pop();break}}}var f=_[0]||null,b=!1;if(null===f||\"BODY\"===f.tagName){for(f=i.createElement(\"DIV\"),b=!0;t.firstChild;)this.log(\"Moving child out:\",t.firstChild),f.appendChild(t.firstChild);t.appendChild(f),this._initializeNode(f)}else if(f){for(var T=[],A=1;A<_.length;A++).75<=_[A].readability.contentScore/f.readability.contentScore&&T.push(this._getNodeAncestors(_[A]));if(3<=T.length)for(S=f.parentNode;\"BODY\"!==S.tagName;){for(var y=0,v=0;v<T.length&&y<3;v++)y+=Number(T[v].includes(S));if(3<=y){f=S;break}S=S.parentNode}f.readability||this._initializeNode(f);for(var S=f.parentNode,C=f.readability.contentScore,X=C/3;\"BODY\"!==S.tagName;)if(S.readability){var L=S.readability.contentScore;if(L<X)break;if(C<L){f=S;break}C=S.readability.contentScore,S=S.parentNode}else S=S.parentNode;for(S=f.parentNode;\"BODY\"!=S.tagName&&1==S.children.length;)S=(f=S).parentNode;f.readability||this._initializeNode(f)}for(var D=i.createElement(\"DIV\"),$=(M&&(D.id=\"readability-content\"),Math.max(10,.2*f.readability.contentScore)),x=(S=f.parentNode).children,I=0,j=x.length;I<j;I++){var R,V,P,O=x[I],w=!1;this.log(\"Looking at sibling node:\",O,O.readability?\"with score \"+O.readability.contentScore:\"\"),this.log(\"Sibling has score\",O.readability?O.readability.contentScore:\"Unknown\"),O===f?w=!0:(R=0,O.className===f.className&&\"\"!==f.className&&(R+=.2*f.readability.contentScore),O.readability&&O.readability.contentScore+R>=$?w=!0:\"P\"===O.nodeName&&(R=this._getLinkDensity(O),(80<(P=(V=this._getInnerText(O)).length)&&R<.25||P<80&&0<P&&0===R&&-1!==V.search(/\\.( |$)/))&&(w=!0))),w&&(this.log(\"Appending node:\",O),this.ALTER_TO_DIV_EXCEPTIONS.includes(O.nodeName)||(this.log(\"Altering sibling:\",O,\"to div.\"),O=this._setNodeTag(O,\"DIV\")),D.appendChild(O),x=S.children,--I,--j)}if(this._debug&&this.log(\"Article content pre-prep: \"+D.innerHTML),this._prepArticle(D),this._debug&&this.log(\"Article content post-prep: \"+D.innerHTML),b)f.id=\"readability-page-1\",f.className=\"page\";else{var B=i.createElement(\"DIV\");for(B.id=\"readability-page-1\",B.className=\"page\";D.firstChild;)B.appendChild(D.firstChild);D.appendChild(B)}this._debug&&this.log(\"Article content after paging: \"+D.innerHTML);var b=!0,G=this._getInnerText(D,!0).length;if(G<this._charThreshold)if(b=!1,t.innerHTML=H,this._attempts.push({articleContent:D,textLength:G}),this._flagIsActive(this.FLAG_STRIP_UNLIKELYS))this._removeFlag(this.FLAG_STRIP_UNLIKELYS);else if(this._flagIsActive(this.FLAG_WEIGHT_CLASSES))this._removeFlag(this.FLAG_WEIGHT_CLASSES);else if(this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);else{if(this._attempts.sort(function(e,t){return t.textLength-e.textLength}),!this._attempts[0].textLength)return null;D=this._attempts[0].articleContent,b=!0}if(b)return G=[S,f].concat(this._getNodeAncestors(S)),this._someNode(G,function(e){if(!e.tagName)return!1;e=e.getAttribute(\"dir\");return!!e&&(this._articleDir=e,!0)}),D}},_unescapeHtmlEntities(e){if(!e)return e;var i=this.HTML_ESCAPE_MAP;return e.replace(/&(quot|amp|apos|lt|gt);/g,function(e,t){return i[t]}).replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi,function(e,t,i){i=parseInt(t||i,t?16:10);return(0==i||1114111<i||55296<=i&&i<=57343)&&(i=65533),String.fromCodePoint(i)})},_getJSONLD(e){var l,e=this._getAllNodesWithTag(e,[\"script\"]);return this._forEachNode(e,function(e){if(!l&&\"application/ld+json\"===e.getAttribute(\"type\"))try{var t=e.textContent.replace(/^\\s*<!\\[CDATA\\[|\\]\\]>\\s*$/g,\"\"),i=JSON.parse(t);if(Array.isArray(i)&&!(i=i.find(e=>e[\"@type\"]&&e[\"@type\"].match(this.REGEXPS.jsonLdArticleTypes))))return;var a,r,s,n=/^https?\\:\\/\\/schema\\.org\\/?$/;if(!(\"string\"==typeof i[\"@context\"]&&i[\"@context\"].match(n)||\"object\"==typeof i[\"@context\"]&&\"string\"==typeof i[\"@context\"][\"@vocab\"]&&i[\"@context\"][\"@vocab\"].match(n)))return;if(!(i=!i[\"@type\"]&&Array.isArray(i[\"@graph\"])?i[\"@graph\"].find(e=>(e[\"@type\"]||\"\").match(this.REGEXPS.jsonLdArticleTypes)):i)||!i[\"@type\"]||!i[\"@type\"].match(this.REGEXPS.jsonLdArticleTypes))return;l={},\"string\"==typeof i.name&&\"string\"==typeof i.headline&&i.name!==i.headline?(a=this._getArticleTitle(),r=.75<this._textSimilarity(i.name,a),s=.75<this._textSimilarity(i.headline,a),l.title=s&&!r?i.headline:i.name):\"string\"==typeof i.name?l.title=i.name.trim():\"string\"==typeof i.headline&&(l.title=i.headline.trim()),i.author&&(\"string\"==typeof i.author.name?l.byline=i.author.name.trim():Array.isArray(i.author)&&i.author[0]&&\"string\"==typeof i.author[0].name&&(l.byline=i.author.filter(function(e){return e&&\"string\"==typeof e.name}).map(function(e){return e.name.trim()}).join(\", \"))),\"string\"==typeof i.description&&(l.excerpt=i.description.trim()),i.publisher&&\"string\"==typeof i.publisher.name&&(l.siteName=i.publisher.name.trim()),\"string\"==typeof i.datePublished&&(l.datePublished=i.datePublished.trim())}catch(e){this.log(e.message)}}),l||{}},_getArticleMetadata(e){var t={},s={},i=this._doc.getElementsByTagName(\"meta\"),n=/\\s*(article|dc|dcterm|og|twitter)\\s*:\\s*(author|creator|description|published_time|title|site_name)\\s*/gi,l=/^\\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\\s*[-\\.:]\\s*)?(author|creator|pub-date|description|title|site_name)\\s*$/i,i=(this._forEachNode(i,function(e){var t,i,a=e.getAttribute(\"name\"),r=e.getAttribute(\"property\"),e=e.getAttribute(\"content\");e&&(i=t=null,r&&(t=r.match(n))&&(i=t[0].toLowerCase().replace(/\\s/g,\"\"),s[i]=e.trim()),!t&&a&&l.test(a)&&(i=a,e&&(i=i.toLowerCase().replace(/\\s/g,\"\").replace(/\\./g,\":\"),s[i]=e.trim())))}),t.title=e.title||s[\"dc:title\"]||s[\"dcterm:title\"]||s[\"og:title\"]||s[\"weibo:article:title\"]||s[\"weibo:webpage:title\"]||s.title||s[\"twitter:title\"]||s[\"parsely-title\"],t.title||(t.title=this._getArticleTitle()),\"string\"!=typeof s[\"article:author\"]||this._isUrl(s[\"article:author\"])?void 0:s[\"article:author\"]);return t.byline=e.byline||s[\"dc:creator\"]||s[\"dcterm:creator\"]||s.author||s[\"parsely-author\"]||i,t.excerpt=e.excerpt||s[\"dc:description\"]||s[\"dcterm:description\"]||s[\"og:description\"]||s[\"weibo:article:description\"]||s[\"weibo:webpage:description\"]||s.description||s[\"twitter:description\"],t.siteName=e.siteName||s[\"og:site_name\"],t.publishedTime=e.datePublished||s[\"article:published_time\"]||s[\"parsely-pub-date\"]||null,t.title=this._unescapeHtmlEntities(t.title),t.byline=this._unescapeHtmlEntities(t.byline),t.excerpt=this._unescapeHtmlEntities(t.excerpt),t.siteName=this._unescapeHtmlEntities(t.siteName),t.publishedTime=this._unescapeHtmlEntities(t.publishedTime),t},_isSingleImage(e){for(;e;){if(\"IMG\"===e.tagName)return!0;if(1!==e.children.length||\"\"!==e.textContent.trim())return!1;e=e.children[0]}return!1},_unwrapNoscriptImages(o){var e=Array.from(o.getElementsByTagName(\"img\")),e=(this._forEachNode(e,function(e){for(var t=0;t<e.attributes.length;t++){var i=e.attributes[t];switch(i.name){case\"src\":case\"srcset\":case\"data-src\":case\"data-srcset\":return}if(/\\.(jpg|jpeg|png|webp)/i.test(i.value))return}e.remove()}),Array.from(o.getElementsByTagName(\"noscript\")));this._forEachNode(e,function(e){if(this._isSingleImage(e)){var t=o.createElement(\"div\"),i=(t.innerHTML=e.innerHTML,e.previousElementSibling);if(i&&this._isSingleImage(i)){for(var a=i,r=(\"IMG\"!==a.tagName&&(a=i.getElementsByTagName(\"img\")[0]),t.getElementsByTagName(\"img\")[0]),s=0;s<a.attributes.length;s++){var n,l=a.attributes[s];\"\"===l.value||\"src\"!==l.name&&\"srcset\"!==l.name&&!/\\.(jpg|jpeg|png|webp)/i.test(l.value)||r.getAttribute(l.name)===l.value||(n=l.name,r.hasAttribute(n)&&(n=\"data-old-\"+n),r.setAttribute(n,l.value))}e.parentNode.replaceChild(t.firstElementChild,i)}}})},_removeScripts(e){this._removeNodes(this._getAllNodesWithTag(e,[\"script\",\"noscript\"]))},_hasSingleTagInsideElement(e,t){return 1==e.children.length&&e.children[0].tagName===t&&!this._someNode(e.childNodes,function(e){return e.nodeType===this.TEXT_NODE&&this.REGEXPS.hasContent.test(e.textContent)})},_isElementWithoutContent(e){return!(e.nodeType!==this.ELEMENT_NODE||e.textContent.trim().length||e.children.length&&e.children.length!=e.getElementsByTagName(\"br\").length+e.getElementsByTagName(\"hr\").length)},_hasChildBlockElement(e){return this._someNode(e.childNodes,function(e){return this.DIV_TO_P_ELEMS.has(e.tagName)||this._hasChildBlockElement(e)})},_isPhrasingContent(e){return e.nodeType===this.TEXT_NODE||this.PHRASING_ELEMS.includes(e.tagName)||(\"A\"===e.tagName||\"DEL\"===e.tagName||\"INS\"===e.tagName)&&this._everyNode(e.childNodes,this._isPhrasingContent)},_isWhitespace(e){return e.nodeType===this.TEXT_NODE&&0===e.textContent.trim().length||e.nodeType===this.ELEMENT_NODE&&\"BR\"===e.tagName},_getInnerText(e,t){t=void 0===t||t;e=e.textContent.trim();return t?e.replace(this.REGEXPS.normalize,\" \"):e},_getCharCount(e,t){return t=t||\",\",this._getInnerText(e).split(t).length-1},_cleanStyles(e){if(e&&\"svg\"!==e.tagName.toLowerCase()){for(var t=0;t<this.PRESENTATIONAL_ATTRIBUTES.length;t++)e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[t]);this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.includes(e.tagName)&&(e.removeAttribute(\"width\"),e.removeAttribute(\"height\"));for(var i=e.firstElementChild;null!==i;)this._cleanStyles(i),i=i.nextElementSibling}},_getLinkDensity(e){var t=this._getInnerText(e).length;if(0===t)return 0;var i=0;return this._forEachNode(e.getElementsByTagName(\"a\"),function(e){var t=e.getAttribute(\"href\"),t=t&&this.REGEXPS.hashUrl.test(t)?.3:1;i+=this._getInnerText(e).length*t}),i/t},_getClassWeight(e){if(!this._flagIsActive(this.FLAG_WEIGHT_CLASSES))return 0;var t=0;return\"string\"==typeof e.className&&\"\"!==e.className&&(this.REGEXPS.negative.test(e.className)&&(t-=25),this.REGEXPS.positive.test(e.className)&&(t+=25)),\"string\"==typeof e.id&&\"\"!==e.id&&(this.REGEXPS.negative.test(e.id)&&(t-=25),this.REGEXPS.positive.test(e.id)&&(t+=25)),t},_clean(e,t){var i=[\"object\",\"embed\",\"iframe\"].includes(t);this._removeNodes(this._getAllNodesWithTag(e,[t]),function(e){if(i){for(var t=0;t<e.attributes.length;t++)if(this._allowedVideoRegex.test(e.attributes[t].value))return!1;if(\"object\"===e.tagName&&this._allowedVideoRegex.test(e.innerHTML))return!1}return!0})},_hasAncestorTag(e,t,i,a){i=i||3,t=t.toUpperCase();for(var r=0;e.parentNode;){if(0<i&&i<r)return!1;if(e.parentNode.tagName===t&&(!a||a(e.parentNode)))return!0;e=e.parentNode,r++}return!1},_getRowAndColumnCount(e){for(var t=0,i=0,a=e.getElementsByTagName(\"tr\"),r=0;r<a.length;r++){var s=a[r].getAttribute(\"rowspan\")||0;t+=(s=s&&parseInt(s,10))||1;for(var n=0,l=a[r].getElementsByTagName(\"td\"),o=0;o<l.length;o++){var h=l[o].getAttribute(\"colspan\")||0;n+=(h=h&&parseInt(h,10))||1}i=Math.max(i,n)}return{rows:t,columns:i}},_markDataTables(e){for(var t=e.getElementsByTagName(\"table\"),i=0;i<t.length;i++){var a,r=t[i];\"presentation\"==r.getAttribute(\"role\")?r._readabilityDataTable=!1:\"0\"==r.getAttribute(\"datatable\")?r._readabilityDataTable=!1:r.getAttribute(\"summary\")||(a=r.getElementsByTagName(\"caption\")[0])&&a.childNodes.length?r._readabilityDataTable=!0:[\"col\",\"colgroup\",\"tfoot\",\"thead\",\"th\"].some(function(e){return!!r.getElementsByTagName(e)[0]})?(this.log(\"Data table because found data-y descendant\"),r._readabilityDataTable=!0):r.getElementsByTagName(\"table\")[0]||1==(a=this._getRowAndColumnCount(r)).columns||1==a.rows?r._readabilityDataTable=!1:10<=a.rows||4<a.columns?r._readabilityDataTable=!0:r._readabilityDataTable=10<a.rows*a.columns}},_fixLazyImages(e){this._forEachNode(this._getAllNodesWithTag(e,[\"img\",\"picture\",\"figure\"]),function(e){if(e.src&&this.REGEXPS.b64DataUrl.test(e.src)){var t=this.REGEXPS.b64DataUrl.exec(e.src);if(\"image/svg+xml\"===t[1])return;for(var i=!1,a=0;a<e.attributes.length;a++){var r=e.attributes[a];if(\"src\"!==r.name&&/\\.(jpg|jpeg|png|webp)/i.test(r.value)){i=!0;break}}i&&(t=t[0].length,e.src.length-t<133&&e.removeAttribute(\"src\"))}if(!(e.src||e.srcset&&\"null\"!=e.srcset)||e.className.toLowerCase().includes(\"lazy\"))for(var s,n,l=0;l<e.attributes.length;l++)\"src\"!==(r=e.attributes[l]).name&&\"srcset\"!==r.name&&\"alt\"!==r.name&&(s=null,/\\.(jpg|jpeg|png|webp)\\s+\\d/.test(r.value)?s=\"srcset\":/^\\s*\\S+\\.(jpg|jpeg|png|webp)\\S*\\s*$/.test(r.value)&&(s=\"src\"),s&&(\"IMG\"===e.tagName||\"PICTURE\"===e.tagName?e.setAttribute(s,r.value):\"FIGURE\"!==e.tagName||this._getAllNodesWithTag(e,[\"img\",\"picture\"]).length||((n=this._doc.createElement(\"img\")).setAttribute(s,r.value),e.appendChild(n))))})},_getTextDensity(e,t){var i=this._getInnerText(e,!0).length;if(0===i)return 0;var a=0,e=this._getAllNodesWithTag(e,t);return this._forEachNode(e,e=>a+=this._getInnerText(e,!0).length),a/i},_cleanConditionally(e,T){this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)&&this._removeNodes(this._getAllNodesWithTag(e,[T]),function(e){function t(e){return e._readabilityDataTable}var i,a=\"ul\"===T||\"ol\"===T;if(a||(i=0,m=this._getAllNodesWithTag(e,[\"ul\",\"ol\"]),this._forEachNode(m,e=>i+=this._getInnerText(e).length),a=.9<i/this._getInnerText(e).length),\"table\"===T&&t(e))return!1;if(this._hasAncestorTag(e,\"table\",-1,t))return!1;if(this._hasAncestorTag(e,\"code\"))return!1;if([...e.getElementsByTagName(\"table\")].some(e=>e._readabilityDataTable))return!1;var r=this._getClassWeight(e);this.log(\"Cleaning Conditionally\",e);if(r+0<0)return!0;if(this._getCharCount(e,\",\")<10){for(var s=e.getElementsByTagName(\"p\").length,n=e.getElementsByTagName(\"img\").length,l=e.getElementsByTagName(\"li\").length-100,o=e.getElementsByTagName(\"input\").length,h=this._getTextDensity(e,[\"h1\",\"h2\",\"h3\",\"h4\",\"h5\",\"h6\"]),d=0,c=this._getAllNodesWithTag(e,[\"object\",\"embed\",\"iframe\"]),g=0;g<c.length;g++){for(var _=0;_<c[g].attributes.length;_++)if(this._allowedVideoRegex.test(c[g].attributes[_].value))return!1;if(\"object\"===c[g].tagName&&this._allowedVideoRegex.test(c[g].innerHTML))return!1;d++}var m=this._getInnerText(e);if(this.REGEXPS.adWords.test(m)||this.REGEXPS.loadingWords.test(m))return!0;var u=m.length,p=this._getLinkDensity(e),m=[\"SPAN\",\"LI\",\"TD\"].concat(Array.from(this.DIV_TO_P_ELEMS)),N=this._getTextDensity(e,m),E=this._hasAncestorTag(e,\"figure\"),f=(()=>{const e=[];return!E&&1<n&&s/n<.5&&e.push(`Bad p to img ratio (img=${n}, p=${s})`),!a&&s<l&&e.push(`Too many li\'s outside of a list. (li=${l} > p=${s})`),o>Math.floor(s/3)&&e.push(`Too many inputs per p. (input=${o}, p=${s})`),!a&&!E&&h<.9&&u<25&&(0===n||2<n)&&0<p&&e.push(`Suspiciously short. (headingDensity=${h}, img=${n}, linkDensity=${p})`),!a&&r<25&&p>.2+this._linkDensityModifier&&e.push(`Low weight and a little linky. (linkDensity=${p})`),25<=r&&p>.5+this._linkDensityModifier&&e.push(`High weight and mostly links. (linkDensity=${p})`),(1===d&&u<75||1<d)&&e.push(`Suspicious embed. (embedCount=${d}, contentLength=${u})`),0===n&&0===N&&e.push(`No useful content. (img=${n}, textDensity=${N})`),!!e.length&&(this.log(\"Checks failed\",e),!0)})();if(a&&f){for(var b=0;b<e.children.length;b++)if(1<e.children[b].children.length)return f;m=e.getElementsByTagName(\"li\").length;if(n==m)return!1}return f}return!1})},_cleanMatchedNodes(e,t){for(var i=this._getNextNode(e,!0),a=this._getNextNode(e);a&&a!=i;)a=t.call(this,a,a.className+\" \"+a.id)?this._removeAndGetNext(a):this._getNextNode(a)},_cleanHeaders(e){e=this._getAllNodesWithTag(e,[\"h1\",\"h2\"]);this._removeNodes(e,function(e){var t=this._getClassWeight(e)<0;return t&&this.log(\"Removing header with low class weight:\",e),t})},_headerDuplicatesTitle(e){if(\"H1\"!=e.tagName&&\"H2\"!=e.tagName)return!1;e=this._getInnerText(e,!1);return this.log(\"Evaluating similarity of header:\",e,this._articleTitle),.75<this._textSimilarity(this._articleTitle,e)},_flagIsActive(e){return 0<(this._flags&e)},_removeFlag(e){this._flags=this._flags&~e},_isProbablyVisible(e){return(!e.style||\"none\"!=e.style.display)&&(!e.style||\"hidden\"!=e.style.visibility)&&!e.hasAttribute(\"hidden\")&&(!e.hasAttribute(\"aria-hidden\")||\"true\"!=e.getAttribute(\"aria-hidden\")||e.className&&e.className.includes&&e.className.includes(\"fallback-image\"))},parse(){if(0<this._maxElemsToParse){var e=this._doc.getElementsByTagName(\"*\").length;if(e>this._maxElemsToParse)throw new Error(\"Aborting parsing document; \"+e+\" elements found\")}this._unwrapNoscriptImages(this._doc);var e=this._disableJSONLD?{}:this._getJSONLD(this._doc),e=(this._removeScripts(this._doc),this._prepDocument(),this._getArticleMetadata(e)),t=(this._metadata=e,this._articleTitle=e.title,this._grabArticle());if(!t)return null;this.log(\"Grabbed: \"+t.innerHTML),this._postProcessContent(t),e.excerpt||(i=t.getElementsByTagName(\"p\")).length&&(e.excerpt=i[0].textContent.trim());var i=t.textContent;return{title:this._articleTitle,byline:e.byline||this._articleByline,dir:this._articleDir,lang:this._articleLang,content:this._serializer(t),textContent:i,length:i.length,excerpt:e.excerpt,siteName:e.siteName||this._articleSiteName,publishedTime:e.publishedTime}}},\"object\"==typeof module&&(module.exports=Readability);";
Expand description

Mozilla Readability.js library (minified)

This is the complete Readability algorithm from Mozilla, used to extract the main content from web pages while filtering out navigation, ads, sidebars, etc.

Source: https://github.com/mozilla/readability

License: Apache-2.0