UPDATED (and intentionally reinserted into the feed):
I've made a bunch of changes to this code, and updated it.
It's quite a bit slower, but I really don't care (-:
It uses my new pet project, the tokalizer.
You'll probably want to grab the newly-compiled diff-php as this is the one I'll be "maintaining" (ie, when someone complains, or when it breaks for me).
(end update)
I've told a few people that I'd blog about this "soon" and that was a while ago, so I figured I'd better get on the ball.
I tweeted this almost two weeks ago:
Derick responded saying that
diff -p does this for C. I tried it with PHP, and it gave me
the outermost block where the change occurred (ie, the class, not the
function). The interesting thing, though, is that it changed the @@ line:
@@ -32,7 +32,7 @@ class Foo2 {
Almost what I was looking for, but not quite. I really wanted a php-aware diff that could tell me context.
So, what's a developer with almost no spare time on his hands (but an idea of how to actually accomplish this pet project) to do? Write it himself, of course! (-:
So, I did. Here's an example of the output:
--- tmp/left.php +++ tmp/right.php @@ -1,7 +1,7 @@ (root) <?php class Foo { function bar() { - // baz! + // bax! } } @@ -32,7 +32,7 @@ (root):Foo2(class) // k // l function bar2() { - // baz2! + // bax2! } } @@ -63,7 +63,7 @@ (root):Foo3(class):bar3(function) // k // l $test = "foo {$test}"; - // baz2! + // bax2! } function bar4() { @@ -93,7 +93,7 @@ (root):Foo3(class):bar4(function):bar5(function) // k // l $test = "foo {$test}"; - //baz5 + //bax5 // a // b // c
Here's the code for my php-aware diff. I use it as my default svn diff command now (see comments). Hope you find it useful, I sure do.
#!/usr/bin/php
<?php
/// PHP-Aware diff
/// Copyright 2008, Sean Coates
/// Usage of the works is permitted provided that this instrument is retained
/// with the works, so that any entity that uses the works is notified of this
/// instrument.
/// DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
/// (Fair License - http://www.opensource.org/licenses/fair.php )
/// Short license: do whatever you like with this.

//// save this file as diff-php
//// and make sure /path/to/diff-php is chmod +x
//// TO USE from cli:
//// /path/to/diff-php leftfile rightfile # (compares files, as diff does)
////
//// TO USE from svn:
//// in ~/.subversion/config, add: diff-cmd = /path/to/diff-php
//// You might need to adjust DIFF_PATH, below

// the tokenizer scares me a bit (-:

/**
 * Runs the external diff program on two files and, when the left file looks
 * like PHP, appends the enclosing class/function hierarchy to every "@@"
 * identifier line of the unified diff.
 *
 * The whole process runs from the constructor; output is echoed.
 */
class DiffPHP {
    const DEBUG_SYNTAX = false; // set to true to get syntax error data (== broken diffs)
    const DIFF_PATH = '/usr/bin/diff';
    const DIFF_OPTS = '-u';

    /**
     * The "left" file, as passed by svn (or cli)
     */
    protected $left;

    /**
     * The "right" file, as passed by svn (or cli)
     */
    protected $right;

    /**
     * A "nice" version of the left file.
     *
     * Instead of foo/bar/.svn/base/whatever.php, it would just be whatever.php
     */
    protected $niceLeft;

    /**
     * A "nice" version of the right file.
     *
     * Instead of foo/bar/.svn/base/whatever.php, it would just be whatever.php
     */
    protected $niceRight;

    /**
     * Captured file contents (prevents reading the file twice + diff)
     */
    protected $fileContents;

    /**
     * The raw output from the diff executable (always a string)
     */
    protected $diff;

    /**
     * Each chunk of the diff goes in here (begins with a @@ identifier line)
     */
    protected $chunks;

    /**
     * Array of tokens from the Left file
     */
    protected $tokens;

    /**
     * Mapping of source lines to source class/functions
     *
     * Entry N holds the context that was active when the Nth line break of
     * the left file was seen; entry 0 is a placeholder.
     */
    protected $lineMap;

    /**
     * Current context (used to construct line map)
     */
    protected $context;

    /**
     * Brace depth (used to determine if we're still in the current context)
     *
     * Indexed like $context: one counter per open context, root included.
     */
    protected $braceDepth;

    /**
     * Bool flag to indicate that syntax is somehow broken
     */
    protected $isBroken;

    /**
     * Object-wide index to keep track of the current token number
     */
    protected $tokenIndex;

    /**
     * Currently parsing token value
     */
    protected $currentValue;

    /**
     * Constructor. The magic happens here. Once instantiated, the entire
     * process runs
     */
    public function __construct() {
        $this->parseArgs();

        // doDiff() checks that both files are readable (and exits if not),
        // so it has to run before the left file is read.
        $this->doDiff();

        $contents = file_get_contents($this->left);
        $this->fileContents = ($contents === false) ? '' : $contents;

        // $_ENV is empty unless variables_order contains "E", so ask
        // getenv() as well; otherwise the opt-out silently does nothing.
        $disabled = getenv('NODIFFPHP') !== false || isset($_ENV['NODIFFPHP']);

        if (!$disabled && strpos($this->fileContents, '<?') !== false) {
            // subject (probably) IS a PHP file:
            $this->splitDiff();
            $this->determineHierarchy();
            $this->reconstructDiff();
        } else {
            // not a PHP file; return regular diff:
            echo $this->diff;
        }
    }

    /**
     * Parses the passed arguments.
     *
     * Determines if it's svn (7 args) or cli (2 args), and stores the parsed
     * arguments. Exits with a usage hint for any other argument count.
     */
    protected function parseArgs() {
        $argc = $_SERVER['argc'];
        $argv = $_SERVER['argv'];

        if (8 == $argc) {
            // called from svn: 7 arguments (the 8th is argv 0 == this script);
            // the labels sit at positions 3 and 5, the real files at 6 and 7
            $this->niceLeft  = $argv[3];
            $this->niceRight = $argv[5];
            $this->left      = $argv[6];
            $this->right     = $argv[7];
        } else if (3 == $argc) {
            // 2 arguments means a regular diff
            $this->niceLeft  = $argv[1];
            $this->niceRight = $argv[2];
            $this->left      = $this->niceLeft;
            $this->right     = $this->niceRight;
        } else {
            die("See " . __FILE__ . " for details on how to use this script\n");
        }
    }

    /**
     * Calls the external diff program to get the base diff
     *
     * Exits if either file is not readable.
     */
    protected function doDiff() {
        if (!is_readable($this->left) || !is_readable($this->right)) {
            die("{$this->left} or {$this->right} is not readable\n");
        }

        // File names come from the command line: quote them so that spaces
        // and shell metacharacters cannot break (or hijack) the command.
        $diffCmd = self::DIFF_PATH . ' ' . self::DIFF_OPTS
            . ' ' . escapeshellarg($this->left)
            . ' ' . escapeshellarg($this->right);

        // shell_exec() returns null when there is no output (identical files)
        $this->diff = (string) shell_exec($diffCmd);
    }

    /**
     * Takes an identifier line (looks like: @@ -30,23 +30,79 @@) and returns
     * the begin line number
     *
     * @param string $identifier the "@@ ... @@" line
     * @return int first line of the chunk in the left file
     */
    protected function parseLineNum($identifier) {
        list(, $from) = explode(" ", $identifier);
        list($from) = explode(',', $from);
        // drop the leading "-"
        return (int) substr($from, 1);
    }

    /**
     * Sanitizes CRLF or CR into just LF
     *
     * @param string $data
     * @return string
     */
    protected function sanitizeLineEndings($data) {
        $data = str_replace("\r\n", "\n", $data);
        $data = str_replace("\r", "\n", $data);
        return $data;
    }

    /**
     * Actually splits the diff into chunks and stores chunks + line numbers
     *
     * The raw diff text in $this->diff is left untouched.
     */
    protected function splitDiff() {
        $text = $this->sanitizeLineEndings($this->diff);

        // The diff ends with a newline; without this, explode() yields a
        // trailing empty element that would be emitted as an extra blank line.
        if (substr($text, -1) === "\n") {
            $text = substr($text, 0, -1);
        }

        $lines = explode("\n", $text);
        $maxLine = count($lines);

        $this->chunks = array();

        // skip first 2 lines as left, right files
        $line = 2;

        while ($line < $maxLine) {
            // next line is the chunk identifier
            $dataChunk = array();
            $dataChunk['identifier'] = $lines[$line++];
            $dataChunk['line'] = $this->parseLineNum($dataChunk['identifier']);
            $dataChunk['data'] = array();

            // everything up to the next "@@ ... @@" line belongs to this chunk
            while ($line < $maxLine
                && !(substr($lines[$line], 0, 2) == '@@' && substr($lines[$line], -2) == '@@')
            ) {
                $dataChunk['data'][] = $lines[$line++];
            }

            $this->chunks[] = $dataChunk;
        }
    }

    /**
     * Reconstructs the diff (with adjusted identifier lines, and outputs the
     * result)
     */
    protected function reconstructDiff() {
        if (!$this->chunks) {
            // Nothing to annotate (identical files, or a message such as
            // "Binary files ... differ"): pass diff's own output through
            // instead of printing a bare file header.
            echo $this->diff;
            return;
        }

        $out = "--- {$this->niceLeft}\n+++ {$this->niceRight}\n";
        foreach ($this->chunks as $chunk) {
            $out .= $chunk['identifier'] . "\n";
            $out .= implode("\n", $chunk['data']) . "\n";
        }
        echo $out;
    }

    /**
     * Advances to the next token and stores its text in $this->currentValue.
     *
     * @return int|null|false token id, null for a single-character token,
     *                        or false when there are no more tokens
     */
    protected function nextToken() {
        if ($this->tokenIndex + 1 >= count($this->tokens)) {
            return false;
        }

        $tok = $this->tokens[++$this->tokenIndex];
        if (is_array($tok)) {
            $this->currentValue = $tok[1];
            return $tok[0];
        }

        $this->currentValue = $tok;
        return null;
    }

    /**
     * Tells whether the token at the current index directly follows "::"
     * (as the "class" in Foo::class does), ignoring whitespace.
     *
     * @return bool
     */
    protected function followsDoubleColon() {
        for ($i = $this->tokenIndex - 1; $i >= 0; $i--) {
            $tok = $this->tokens[$i];
            if (is_array($tok) && $tok[0] === T_WHITESPACE) {
                continue;
            }
            return is_array($tok) && $tok[0] === T_DOUBLE_COLON;
        }
        return false;
    }

    /**
     * Tells whether the "function" keyword at the current index has no name,
     * i.e. is followed (after optional whitespace and "&") by "(".
     *
     * @return bool
     */
    protected function isAnonymousFunction() {
        $total = count($this->tokens);
        for ($i = $this->tokenIndex + 1; $i < $total; $i++) {
            $tok = $this->tokens[$i];
            if (is_array($tok) && $tok[0] === T_WHITESPACE) {
                continue;
            }
            // "&" is a plain character on older PHP versions and an array
            // token on newer ones, so compare by text
            $text = is_array($tok) ? $tok[1] : $tok;
            if ($text === '&') {
                continue;
            }
            return $text === '(';
        }
        return false;
    }

    /**
     * Descends into a deeper context
     *
     * Expects the current token to be the "class"/"function" keyword. On
     * success the current token is the opening "{" of the body, or the ";"
     * of a body-less declaration (in which case no context stays open).
     * On unexpected input $this->isBroken is set.
     *
     * @param string $type friendly name, either class or function
     */
    protected function enterContext($type) {
        // next comes whitespace:
        $token = $this->nextToken();
        if ($token !== T_WHITESPACE) {
            // syntax is broken, let's get out of here
            if (self::DEBUG_SYNTAX) {
                die("Syntax broken in whitespace assertion, " . $this->context[count($this->context) - 1] . "\n");
            }
            $this->isBroken = true;
            return;
        }
        $this->checkLineBreak();

        // next comes the name:
        if ($this->nextToken() === false) {
            $this->isBroken = true;
            return;
        }
        $this->context[] = $this->currentValue . "({$type})";

        // these are all valid before the brace (single-character tokens such
        // as "(", ")", "," and "=" have a null token id and are accepted too):
        $allowed = array(
            T_WHITESPACE,
            T_VARIABLE,
            T_EXTENDS,
            T_IMPLEMENTS,
            T_STRING,
            T_ARRAY,
            T_CONSTANT_ENCAPSED_STRING,
            T_LNUMBER,
        );

        // chew through the next few tokens until we get a "{"
        while ($this->currentValue !== '{') {
            $token = $this->nextToken();
            if ($token === false) {
                // ran out of tokens before finding a body
                $this->isBroken = true;
                return;
            }
            $this->checkLineBreak();

            if ($token === null && $this->currentValue === ';') {
                // declaration without a body (abstract or interface method):
                // there is nothing to descend into
                array_pop($this->context);
                return;
            }

            if ($token !== null && !in_array($token, $allowed, true)) {
                // if another token is found, then there's a syntax error
                // (this was added to prevent really deep looping)
                if (self::DEBUG_SYNTAX) {
                    die("Syntax broken in token assertion, " . $this->context[count($this->context) - 1] . "," . token_name($token) . "\n");
                }
                $this->isBroken = true;
                return;
            }
        }

        // found the starting brace
        $this->braceDepth[count($this->context) - 1] = 1;
    }

    /**
     * Tokenizes the code and creates a line map
     *
     * Stops early (keeping the partial map) when the syntax looks broken.
     */
    protected function tokenizeHierarchy() {
        $this->context = array('(root)');
        // the root context gets a counter too, so that braces at the top
        // level (a root-level "if", say) never close it
        $this->braceDepth = array(0);
        $this->lineMap = array('');
        $this->tokens = token_get_all($this->sanitizeLineEndings($this->fileContents));
        $this->isBroken = false;

        $total = count($this->tokens);
        for ($this->tokenIndex = 0; $this->tokenIndex < $total; $this->tokenIndex++) {
            if ($this->isBroken) {
                // syntax is somehow broken; return progress, but don't go further
                return;
            }

            $current = $this->tokens[$this->tokenIndex];
            if (is_array($current)) {
                list($token, $this->currentValue) = $current;
            } else {
                $token = null;
                $this->currentValue = $current;
            }

            // "class" as in Foo::class is not a declaration
            if ($token === T_CLASS && !$this->followsDoubleColon()) {
                $this->enterContext('class');
                continue;
            }

            // anonymous functions have no name to show; their braces are
            // simply counted as part of the enclosing context
            if ($token === T_FUNCTION && !$this->isAnonymousFunction()) {
                $this->enterContext('function');
                continue;
            }

            $idx = count($this->context) - 1;

            // Decide on the token TYPE: "{$x}" and "${x}" inside strings open
            // a brace that is closed by a plain "}", while string fragments
            // that merely consist of "{" or "}" must not be counted.
            $opens = $token === T_CURLY_OPEN
                || $token === T_DOLLAR_OPEN_CURLY_BRACES
                || ($token === null && $this->currentValue === '{');
            $closes = $token === null && $this->currentValue === '}';

            if ($opens) {
                ++$this->braceDepth[$idx];
            } else if ($closes) {
                --$this->braceDepth[$idx];
                if ($this->braceDepth[$idx] < 0) {
                    // bad stuff! (only reachable at the root)
                    if (self::DEBUG_SYNTAX) {
                        die("Syntax broken in brace close assertion, " . $this->context[count($this->context) - 1] . "\n");
                    }
                    $this->isBroken = true;
                } else if ($this->braceDepth[$idx] == 0 && $idx > 0) {
                    // we're out of this context
                    array_pop($this->context);
                    unset($this->braceDepth[$idx]);
                }
            } else {
                $this->checkLineBreak();
            }
        }
    }

    /**
     * Determines if the currently processing token contains line breaks, and
     * if so, adjusts the lineMap accordingly
     */
    protected function checkLineBreak() {
        $breaks = substr_count((string) $this->currentValue, "\n");
        if ($breaks === 0) {
            return;
        }

        // one map entry per line break, all labelled with the current context
        $label = implode(':', $this->context);
        for ($j = 0; $j < $breaks; $j++) {
            $this->lineMap[] = $label;
        }
    }

    /**
     * Matches the chunk map to the line map
     *
     * Appends the context label to each chunk's identifier line. Chunks whose
     * start line is not in the map (broken syntax, or a last line without a
     * trailing newline) are left unlabelled.
     */
    protected function determineHierarchy() {
        $this->tokenizeHierarchy();

        foreach ($this->chunks as $chunknum => $chunk) {
            $line = $chunk['line'];
            $label = isset($this->lineMap[$line]) ? $this->lineMap[$line] : '';
            if ($label !== '') {
                $this->chunks[$chunknum]['identifier'] .= ' ' . $label;
            }
        }
    }
}

new DiffPHP;

// komode: le=unix language=php codepage=utf8 tab=4 notabs indent=4
The most up-to-date version of this file can also be found in my personal svn repository: https://svn.caedmon.net/svn/public/diff-php/diff-php.
Please let me know if you run into any bugs.. I'm sure there are a few, but it works pretty well for me.
