0a1,516
+ [@
+ %3c?php if (!defined('PmWiki')) exit();
+ /* AutomaticLinks2 / autolink2
+
+ Large parts by Christian Heller / http://www.plomlompom.de
+ But a lot of this code and the whole foundation is from AutomaticLinks / autolink.php
+ by Karl Loncarek:
+ http://www.pmwiki.org/wiki/Cookbook/AutomaticLinks
+
+ This file is part of PmWiki; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version. See pmwiki.php for full details.
+
+
+
+ SHORT EXPLANATION:
+
+ Automatically links phrases in wiki page texts to pages that have a sufficiently similar name, giving a certain
+ tolerance to different word endings, German umlauts and a certain case insensitivity.
+
+ Also provides for a (:autolink2backlinks:) directive that outputs a linked list of pages that seem to automatically
+ backlink to the current page in autolink2 fashion.
+
+
+
+ LONG EXPLANATION:
+
+ autolink2.php reads the names of all the files in wiki.d/ that belong to groups given to the Autolink2Activate function
+ and creates an entry for each of them in a file .autolink2-$group and an array $AutolinkListFull of the format
+
+ $id of a page =>
+ ($pagename in the format $group.something,
+ $regexp to match the non-group part of this pagename,
+ $references (an array of $ids) to other pages in the page text of $pagename)
+
+ On a given wiki page of $current_id in the $AutolinkListFull array, this data is used to match (via $regexps
+ for the $ids given in $references) and markup otherwise non-linked phrases in the page text to create links
+ to other wiki pages.
+ (see functions Autolink2Activate, AutolinkSet)
+
+ The $references have previously been collected from reading .autolink2-group and during the updating of
+ $AutolinkListFull (which, of course, also writes back to .autolink2-group).
+ (see functions AutolinkListFullRead, AutolinkListFullUpdate and CollectPossibleReferences)
+
+ The $regexp is created with a certain tolerance towards other ways to write the phrase matching the pagename,
+ allowing for case insensitivity, different ways to write umlauts and, to a certain degree, different word endings.
+ It's double-checked by 1) matching the phrase in the page text in the Markup function, which opens the function
+ AutolinkSet for link creation which 2) tries to translate the phrase found via $regexp back into a pagename to link;
+ this may lead to cases where the $regexp of one $id matches a phrase that is found out by AutolinkSet to more
+ closely match another $id; a desired effect.
+ (see functions GenerateAutolinkRegexp, AutolinkSet)
+
+ There's also a Markup to output a list of all the pages that have $current_id in their list of references,
+ callable via the (:autolink2backlinks:) directive.
+ (see functions Autolink2Activate, GenerateBacklinks)
+
+
+
+ TO DO:
+ - certain number words (i.e. word divisions that follow a word ending in a number) don't get matched yet
+ - French accents etc.
+ - words like "ATripToParis" with one-letter-words
+ - uppercase-umlaut letter rules
+ - limits on the word-lengths that the tolerances for different word endings are effective for
+ */
+
+
+
+ # Module-wide state shared by the functions below.
+ # $AutolinkFmt:       link format handed to MakeLink() by AutolinkSet.
+ # $AutolinkTime:      modification time of the cache file last read by AutolinkListFullRead.
+ # $AutolinkListShort: the entries of $AutolinkListFull referenced by the current page (filled by Autolink2Activate).
+ # $AutolinkListFull:  $id => array($pagename, $regexp, $references) for every known page.
+ $AutolinkFmt = "%3ca class=\"autolink\" href='\$LinkUrl'>\$LinkText%3c/a>";
+ $AutolinkTime = 0;
+ $AutolinkListShort = array();
+ $AutolinkListFull = array();
+
+ function AutolinkListFullRead($group) {
+     # Read the group-specific cache file "$WorkDir/.autolink2-$group" and merge its
+     # entries into $AutolinkListFull as $id => array($pagename, $regexp, $references).
+     # Also stores the cache file's modification time in $AutolinkTime.
+     #
+     # $group: name of the wiki group whose cache file is to be read.
+     # Returns nothing; does nothing if the cache file does not exist or is empty.
+     #
+     # NOTE(review): entries are keyed by the id stored in the file, so ids read for
+     # different groups would overwrite each other - confirm before activating several groups.
+
+     global $AutolinkListFull,$AutolinkTime,$WorkDir;
+
+     $filename_autolink = "$WorkDir/.autolink2-$group";
+     if (!file_exists($filename_autolink)) {
+         return;
+     }
+     $AutolinkTime = filemtime($filename_autolink);
+
+     # The second parameter of file_get_contents() is $use_include_path, not an fopen() mode,
+     # so no "r" is passed here.
+     $contents = file_get_contents($filename_autolink);
+     if ($contents === false || $contents === '') {
+         return;
+     }
+
+     foreach (explode('!!!', $contents) as $entry) { # delimiter between entries is string "!!!"
+         $data = explode('%250a', $entry); # delimiter between data inside of entries is "%250a"
+         if (!isset($data[3])) {
+             continue; # truncated or malformed entry: skip it instead of reading undefined offsets
+         }
+         # An entry without references is stored as an empty field; explode() would turn that
+         # into array(''), i.e. one bogus reference, so map it to an empty list instead.
+         $references = ($data[3] === '') ? array() : explode(',', $data[3]); # delimiter between references is ","
+         $AutolinkListFull[$data[0]] = array($data[1], $data[2], $references);
+     }
+ }
+
+
+
+ function AutolinkListFullUpdate($group) {
+     # Synchronise $AutolinkListFull and the cache file "$WorkDir/.autolink2-$group"
+     # with the page files of $group currently present in $WorkDir:
+     #  - pages without an entry get a fresh id, a regexp and a reference list,
+     #  - pages whose file is at least as new as the cache get their references recounted,
+     #  - entries whose page file no longer exists are dropped, together with references to them.
+     # The cache file is rewritten whenever any of this changed something.
+     #
+     # $group: name of the wiki group (the part of a page file name before the first dot).
+
+     global $AutolinkTime,$AutolinkListFull,$WorkDir,$SearchPatterns;
+
+     $filename_autolink = "$WorkDir/.autolink2-$group";
+
+     # Fill $matches with all pagenames in $WorkDir to consider for autolink2.
+     # It starts out empty so that the loops below also work when the directory
+     # cannot be opened or holds no page of this group.
+     $matches = array();
+     if ($handle = opendir($WorkDir)) {
+         while (false !== ($file = readdir($handle))) {
+             # only accept group-specific files and ignore deleted pages
+             if ($file != "." && $file != ".." && substr($file,0,strlen($group)+1) == "$group." && 0 == preg_match('/,del-[0-9]*$/', $file)) {
+                 $count = 0;
+                 if (isset($SearchPatterns['default'])) {
+                     foreach ($SearchPatterns['default'] as $v) { # check for pages excluded from searches
+                         $count += preg_match($v,$file);
+                     }
+                 }
+                 if ($count == 0) { # keep only pages that are not excluded
+                     $matches[] = $file;
+                 }
+             }
+         }
+         closedir($handle);
+     }
+
+     # Check which pages are new and which have been modified.
+     # $modify becomes a filename => id array of modified pages, $new_pages a list of new filenames.
+     $list_filenames_ids = array();
+     foreach ($AutolinkListFull as $id => $pageData) {
+         $list_filenames_ids[$pageData[0]] = $id;
+     }
+     $modify = array();
+     $new_pages = array();
+     foreach ($matches as $filename) {
+         if (isset($list_filenames_ids[$filename])) { # keyed lookup instead of rebuilding array_keys() per page
+             if (filemtime($WorkDir.'/'.$filename) >= $AutolinkTime) {
+                 $modify[$filename] = $list_filenames_ids[$filename];
+             }
+         }
+         else {
+             $new_pages[] = $filename;
+         }
+     }
+
+     # If there are new pages ...
+     $create = array();
+     if (count($new_pages) > 0) {
+
+         # ... first create new IDs for these, higher than all existing ones.
+         # $create becomes a filename => id array of the new pages.
+         $list_IDs = array_keys($AutolinkListFull);
+         $highest_id = 0;
+         if ($list_IDs != array()) {
+             $highest_id = max(max($list_IDs), 0);
+         }
+         foreach ($new_pages as $filename) {
+             $highest_id = $highest_id + 1;
+             $create[$filename] = $highest_id;
+         }
+
+         # Create entries for the new pages in $AutolinkListFull.
+         # At first they only hold their name and regexp; their own references are counted
+         # further below together with the modified pages, once all new pages are in the list.
+         foreach ($create as $filename => $id) {
+             $AutolinkListFull[$id] = array($filename, GenerateAutolinkRegexp($filename), array());
+         }
+         $modify = array_merge($modify, $create);
+
+         # Now that $AutolinkListFull has all new pages added, re-sort it by filename length, longest first.
+         $filenames_with_id = array();
+         foreach ($AutolinkListFull as $id => $pageData) {
+             $filenames_with_id[$pageData[0]] = $id;
+         }
+         $filenames = array_keys($filenames_with_id);
+         uasort($filenames,'sizersort');
+         $AutolinkListFullNew = array();
+         foreach ($filenames as $filename) {
+             $id = $filenames_with_id[$filename];
+             $AutolinkListFullNew[$id] = $AutolinkListFull[$id];
+         }
+         $AutolinkListFull = $AutolinkListFullNew;
+
+         # The unmodified old pages hold no references to the new pages yet, and only new or
+         # modified pages get a full recount below. So scan the text of every unmodified old
+         # page for the new pages' regexps. Each page is read once, however many pages are new.
+         $IDsToBeIgnored = array_values($modify);
+         foreach ($AutolinkListFull as $id => $pageData) {
+             if (in_array($id, $IDsToBeIgnored)) {
+                 continue;
+             }
+             $page = ReadPage($pageData[0]);
+             $pagetext = $page['text'];
+             foreach ($create as $id_new) {
+                 $regexp = $AutolinkListFull[$id_new][1];
+                 if (1 == preg_match("/$regexp/i", $pagetext)) {
+                     $AutolinkListFull[$id][2][] = $id_new;
+                 }
+             }
+         }
+     }
+
+     # For all pages mentioned in $modify (i.e. the modified as well as the new ones):
+     # analyze their text to find possible references to other pages via regular expressions.
+     foreach ($modify as $filename => $id) {
+         $AutolinkListFull[$id][2] = CollectPossibleReferences($filename);
+     }
+
+     # Drop entries whose page file does not exist any more ...
+     $removed_ids = array();
+     foreach ($AutolinkListFull as $id => $pageData) {
+         if (!file_exists("$WorkDir/".$pageData[0])) {
+             $removed_ids[] = $id;
+             unset($AutolinkListFull[$id]);
+         }
+     }
+     # ... and remove their ids from the reference lists of the remaining pages, so that
+     # nobody looks up an entry that is gone. array_diff() compares string representations,
+     # which copes with ids being ints (fresh) or strings (read from the cache file).
+     if ($removed_ids != array()) {
+         foreach ($AutolinkListFull as $id => $pageData) {
+             $AutolinkListFull[$id][2] = array_values(array_diff($pageData[2], $removed_ids));
+         }
+     }
+
+     # Now write the $AutolinkListFull content into the group-specific cache file,
+     # if anything was added, recounted or removed.
+     if ($modify != array() || $removed_ids != array()) {
+         $array_to_string = array();
+         foreach ($AutolinkListFull as $id => $pageData) {
+             $array_to_string[] = $id.'%250a'.$pageData[0].'%250a'.$pageData[1].'%250a'.implode(',', $pageData[2]);
+         }
+         file_put_contents($filename_autolink, implode('!!!', $array_to_string));
+     }
+ }
+
+
+
+ function GenerateAutolinkRegexp($filename) {
+     # Generate a regular expression (without delimiters) that matches phrases resembling
+     # the non-group part of the pagename $filename. Callers wrap it in "/.../i".
+     #
+     # The rules for the regexp generation are the following:
+     # 1) strip the group part of the filename
+     # 2) find occurrences of CamelCase and therefore multiple words contained in the filename;
+     #    allow for any number of occurrences of [.,\-: ] in between multiple words thus found;
+     #    also give a certain tolerance for more or less or different characters in the suffixes
+     #    of words preceding uppercase CamelCase words
+     #    a) do this first for words where there are two lowercase characters preceding uppercase characters
+     #    b) do this second for the words left where there is only one lowercase character preceding an uppercase character
+     #    This rule combination makes sense for German umlauts (single characters) which have been
+     #    transformed into two latin characters, for which the path is opened in the next step:
+     # 3) allow two-character combinations typical for the transformation of one German umlaut
+     #    into two latin characters to be read as either these two characters or as the umlaut.
+     #
+     # Returns the regexp as a string.
+
+     $filename = preg_replace('/([A-Za-z0-9]*)\.([A-Za-z0-9]*)/', '$2', $filename); # we strip the group part of the filename
+     $regexp = preg_replace('/[a-z][a-z][-]?([A-Z0-9])/', '[a-zäöüß]?[a-zäöüß]?[a-zäöüß]?[.,\-: ]*$1', $filename);
+     $regexp = preg_replace('/[a-z][-]?([A-Z0-9])/', '[a-zäöüß]?[a-zäöüß]?[.,\-: ]*$1', $regexp);
+
+     # The regexp is applied with /i but without /u, so case-insensitivity does not reach the
+     # multi-byte umlauts. The capitalised forms therefore list both the uppercase and the
+     # lowercase umlaut explicitly (previously "Oe"/"Ue" only accepted lowercase "ö"/"ü",
+     # and "Ae" only uppercase "Ä").
+     $searcharray = array( "ae", "Ae", "oe", "Oe", "ue", "Ue", "ss");
+     $replacearray = array( "((ae)|ä)", "((Ae)|Ä|ä)", "((oe)|ö)", "((Oe)|Ö|ö)", "((ue)|ü)", "((Ue)|Ü|ü)", "((ss)|ß)");
+     return str_replace($searcharray,$replacearray,$regexp);
+ }
+
+
+
+ function sizersort($a,$b) {
+     # Comparison callback for sorting strings by byte length, longest first:
+     # returns -1 if $a is longer than $b, 1 if it is shorter and 0 if both are equally long.
+
+     $length_a = strlen($a);
+     $length_b = strlen($b);
+     if ($length_a > $length_b) {
+         return -1;
+     }
+     if ($length_a == $length_b) {
+         return 0;
+     }
+     return 1;
+ }
+
+
+
+ function CollectPossibleReferences($filename) {
+     # Return the ids of all entries in $AutolinkListFull whose regexp matches
+     # (case-insensitively) somewhere in the text of the page $filename.
+     #
+     # $filename: pagename passed on to ReadPage().
+     # Returns a list of ids in the order in which they appear in $AutolinkListFull.
+
+     global $AutolinkListFull;
+
+     $page = ReadPage($filename);
+     $pagetext = $page['text'];
+
+     $found_ids = array();
+     foreach ($AutolinkListFull as $id => $pageData) {
+         if (preg_match('/'.$pageData[1].'/i', $pagetext) != 1) {
+             continue;
+         }
+         $found_ids[] = $id;
+     }
+     return $found_ids;
+ }
+
+
+
+
+ function AutolinkSet($pattern) {
+     # AutolinkSet is called by the autolink2 Markup rules on a matched text $pattern to decide
+     # whether it fits a pagename in $AutolinkListShort well enough to turn it into a link,
+     # and if so, which page to link to.
+     #
+     # A sequence of comparison rules of growing tolerance is tried for each pagename in
+     # $AutolinkListShort. An exact match (level 0) is linked immediately. A laxer match is
+     # only remembered, so that the remaining pagenames can still produce a better one:
+     #   level 1 (priority 2): equal when hyphens are ignored
+     #   level 2 (priority 1): same number of words, each word equal up to one trailing character
+     #
+     # Returns the link markup produced by MakeLink(), or $pattern unchanged if nothing fits.
+
+     global $AutolinkListShort,$pagename,$AutolinkFmt;
+     $local_page = $pagename; # renamed to avoid confusion with the many $pagename... variables below
+
+     # Replace umlauts in $pattern by their two-letter transcriptions.
+     # (We assume that in pagenames umlauts have already been transformed in a comparable way.)
+     $pattern_temp = str_replace(
+         array('ä', 'Ä', 'ö', 'Ö', 'ü', 'Ü', 'ß'),
+         array('ae', 'Ae', 'oe', 'Oe', 'ue', 'Ue', 'ss'),
+         $pattern
+     );
+
+     # Different versions of $pattern for the different comparison rules:
+     # $pattern_HyphenPossible drops '.', ',', ':' and ' ', but keeps '-' (unlike $pattern_NoHyphen).
+     # $pattern_divided marks word boundaries with '!!!': words are separated where uppercase
+     #   characters or numbers follow lowercase letters and/or by '.', ',', '-', ':', ' '.
+     #   It is built from the umlaut-free $pattern_temp so that its words are comparable to
+     #   the (umlaut-free) words of the pagenames.
+     # $pattern_parts is the list of the single words found that way.
+     $pattern_HyphenPossible = strtolower(preg_replace('/[.,: ]*/', '', $pattern_temp));
+     $pattern_NoHyphen = preg_replace('/[-]*/', '', $pattern_HyphenPossible);
+     $pattern_divided = preg_replace('/([a-z])[.,\-: ]*([A-Z0-9])/', '$1!!!$2', $pattern_temp);
+     $pattern_divided = preg_replace('/([a-z])[.,\-: ]+([a-z0-9])/', '$1!!!$2', $pattern_divided);
+     $pattern_parts = explode('!!!', strtolower($pattern_divided));
+     $number_pattern_parts = count($pattern_parts);
+
+     # $FuzzyMatch remembers the best non-exact match found so far:
+     # $FuzzyMatch[0] is its priority (0 = none; the higher, the better), $FuzzyMatch[1] the pagename.
+     $FuzzyMatch = array(0, '');
+
+     foreach ($AutolinkListShort as $id => $pageData) {
+         $pagenameAbsolute = $pageData[0];
+         if ($pagenameAbsolute == $local_page) {
+             continue; # never link a page to itself
+         }
+
+         # Strip the group part (alphanumeric characters and hyphens ending in a dot) from the pagename.
+         $pagenameRelative = preg_replace('/([A-Za-z0-9\-]*)\.([A-Za-z0-9]*)/', '$2', $pagenameAbsolute);
+         $pagename_lowercase = strtolower($pagenameRelative);
+
+         # COMPARISON TOLERANCE LEVEL 0: exact match - return a link straight away.
+         # Strict comparison, so that numeric-looking names such as "1e3" and "1000" stay different.
+         if ($pagename_lowercase === $pattern_HyphenPossible) {
+             return(MakeLink($local_page,$pagenameAbsolute,$pattern,NULL,$AutolinkFmt));
+         }
+
+         # A fuzzy match of priority 2 cannot be improved by anything but an exact match.
+         if (!($FuzzyMatch[0] %3c 2)) {
+             continue;
+         }
+
+         # COMPARISON TOLERANCE LEVEL 1: exact match when hyphens are ignored.
+         $pagename_NoHyphen = preg_replace('/-/', '', $pagename_lowercase);
+         if ($pagename_NoHyphen === $pattern_NoHyphen) {
+             $FuzzyMatch = array(2, $pagenameAbsolute);
+             continue;
+         }
+
+         # Level 2 is only tried if no fuzzy match exists yet and $pattern has more than one word.
+         if (($FuzzyMatch[0] != 0) or ($number_pattern_parts == 1)) {
+             continue;
+         }
+
+         # Split the pagename into words: they are separated where uppercase characters or numbers
+         # follow lowercase letters, optionally with a '-' in between. (Simpler than the rule for
+         # $pattern; less tolerance is assumed for the naming of pages.)
+         $pagename_divided = preg_replace('/([a-z])[-]?([A-Z0-9])/', '$1!!!$2', $pagenameRelative);
+         $pagename_parts = explode('!!!', strtolower($pagename_divided));
+         if (count($pagename_parts) != $number_pattern_parts) {
+             continue;
+         }
+
+         # COMPARISON TOLERANCE LEVEL 2: every word has to fit its counterpart, where "fit" means
+         # equal either as they are or after cutting the last character off one or both of them.
+         $all_parts_similar = true;
+         foreach ($pattern_parts as $counter => $pattern_part) {
+             $pagename_part = $pagename_parts[$counter];
+             $pagename_part_short = substr($pagename_part, 0, -1);
+             $pattern_part_short = substr($pattern_part, 0, -1);
+             if (($pagename_part !== $pattern_part) and ($pagename_part_short !== $pattern_part_short) and ($pagename_part_short !== $pattern_part) and ($pagename_part !== $pattern_part_short)) {
+                 $all_parts_similar = false;
+                 break; # one differing word is enough to rule this pagename out
+             }
+         }
+         if ($all_parts_similar) {
+             # Remember it; it is used if no better match turns up in the remaining pagenames.
+             $FuzzyMatch = array(1, $pagenameAbsolute);
+         }
+     }
+
+     # If nothing but a fuzzy match has been found, and it is not $local_page, return it as the link.
+     if (($FuzzyMatch[0] > 0) and ($FuzzyMatch[1] !== $local_page)) {
+         return(MakeLink($local_page,$FuzzyMatch[1],$pattern,NULL,$AutolinkFmt));
+     }
+
+     # If all else fails, at least return $pattern without a link.
+     return $pattern;
+ }
+
+
+
+
+
+ function GenerateBacklinks($current_id) {
+     # Build a list in PmWiki markup with one link per page that references the page
+     # identified by $current_id, i.e. whose reference list in $AutolinkListFull contains it.
+     #
+     # Returns the concatenated list items, sorted by pagename; '' if no page references it.
+
+     global $AutolinkListFull;
+
+     # Collect the names of all referencing pages.
+     # (The comparison is deliberately loose: ids may be ints or numeric strings.)
+     $referrers = array();
+     foreach ($AutolinkListFull as $pageData) {
+         if (!in_array($current_id, $pageData[2])) {
+             continue;
+         }
+         $referrers[] = $pageData[0];
+     }
+
+     # Sort them alphabetically and format each as a linked list item.
+     sort($referrers);
+     $markup = '';
+     foreach ($referrers as $referrer) {
+         $markup .= '* [['.$referrer.']] (:nl:)';
+     }
+     return $markup;
+ }
+
+
+
+ function Autolink2Activate($groups, $ExcludePagesFromAutolink2links) {
+     # Entry point of the recipe: refresh the autolink data and register the Markup rules
+     # for the current page ($pagename).
+     #
+     # $groups: space-separated names of the groups autolink2 works for.
+     # $ExcludePagesFromAutolink2links: list of regexp patterns (with delimiters); if one of them
+     #   matches $pagename, no Markup rule at all is registered for this page.
+
+     global $AutolinkListFull,$AutolinkListShort,$pagename;
+
+     # Read and update all group-specific cache files as well as the $AutolinkListFull array.
+     foreach (explode(" ", $groups) as $v) {
+         AutolinkListFullRead($v);
+         AutolinkListFullUpdate($v);
+     }
+
+     # Stop here if the current page ($pagename) is included in $ExcludePagesFromAutolink2links.
+     foreach ($ExcludePagesFromAutolink2links as $pattern) {
+         if (1 == preg_match($pattern, $pagename)) {
+             return;
+         }
+     }
+
+     # Try to find the current page in $AutolinkListFull. Stop here if it cannot be found.
+     $current_id = '';
+     foreach ($AutolinkListFull as $id => $pageData) {
+         if ($pageData[0] == $pagename) {
+             $current_id = $id;
+         }
+     }
+     if ($current_id == '') {
+         return;
+     }
+
+     # $current_references lists the ids of all pages that seem to be referenced in this page's text.
+     # Walk $AutolinkListFull rather than $current_references itself, for two reasons:
+     #  - ids that have no entry (any more) are skipped instead of producing a rule with an
+     #    empty regexp, which would match everywhere;
+     #  - the order of $AutolinkListFull (longest pagenames first, see AutolinkListFullUpdate)
+     #    carries over to $AutolinkListShort and to the order of the Markup rules.
+     # The best place for the rules in the order of PmWiki Markups seems to be after 'style'.
+     $current_references = $AutolinkListFull[$current_id][2];
+     $LastMarkupName = 'style';
+     foreach ($AutolinkListFull as $referenced_id => $pageData) {
+         if (!in_array($referenced_id, $current_references)) {
+             continue;
+         }
+         $AutolinkListShort[$referenced_id] = $pageData;
+
+         $regexp = $pageData[1];
+         if ($regexp == '') {
+             continue; # nothing to match on
+         }
+         $NewMarkupName = 'autolink2links'.$referenced_id;
+
+         # Try to match only Autolink regular expressions outside of HTML tags (i.e. which are not
+         # followed in any distance by a '>' if there's no '%3c' or line-end in-between).
+         # AutolinkSet has the final responsibility for deciding whether the Markup is to do
+         # anything, i.e. whether to create a link and where it points to.
+         # NOTE(review): the "e" pattern modifier no longer exists in PHP 7+; how Markup() treats
+         # such a rule depends on the PmWiki version - confirm before running on current PHP.
+         Markup($NewMarkupName, ">$LastMarkupName", "/($regexp)(?=[^>]*($|%3c(?!\/a)))/ie", "Keep(AutolinkSet('$1'), 'L')");
+         $LastMarkupName = $NewMarkupName;
+     }
+
+     # Markup to output a backlink list via the GenerateBacklinks function.
+     $backlinks = GenerateBacklinks($current_id);
+     Markup('autolink2backlinks', '%3cnl0', '/\(:autolink2backlinks:\)/', $backlinks);
+ }
+
+
+
+ # Local configuration and activation of the recipe.
+ # $groups gives the names of the groups this plugin is supposed to work for, separated by single spaces
+ # (Autolink2Activate splits the string on " ").
+ # $ExcludePagesFromAutolink2links gives regexp patterns for pagenames on which the autolink2links Markup is not to be used.
+ # (Notice that this does not mean that they won't be taken into account for autolink2backlinks generation.)
+ $groups = 'Mind';
+ $ExcludePagesFromAutolink2links = array('/Mind.RecentChanges/', '/Mind.TwitterHistory-/');
+ Autolink2Activate($groups, $ExcludePagesFromAutolink2links);
+ @]