#
# Indentation finder, by Philippe Fremy
# Copyright 2002-2008 Philippe Fremy
#
# This program is distributed under the BSD license. You should have received
# a copy of the file LICENSE.txt along with this software.
#

import sys
import re

help = \
"""Usage : %s [ --vim-output ] [ --verbose ] file1 file2 ... fileN

Display indentation used in the list of files. Possible answers are (with X
being the number of spaces used for indentation):
space X
tab 8
mixed tab X space Y

mixed means that indentation style is tab at the beginning of the line (tab
being 8 positions) and then spaces to do the indentation, unless you reach 8
spaces which are replaced by a tab. This is the vim source file indentation
for example. In my opinion, this is the worst possible style.

--vim-output: output suitable to use inside vim:
set sts=0 | set tabstop=4 | set noexpandtab | set shiftwidth=4

"""

VERSION = '1.4'

### Used when indentation is tab, to set tabstop in vim
DEFAULT_TAB_WIDTH = 8

### default values for files where indentation is not meaningful (empty files)
# possible values:
# DEFAULT_RESULT = ('space', 4 )
# DEFAULT_RESULT = ('space', 2 )
# DEFAULT_RESULT = ('space', 8 )
# DEFAULT_RESULT = ('tab', DEFAULT_TAB_WIDTH )

DEFAULT_RESULT = ('space', 8)

VERBOSE_QUIET = 0
VERBOSE_INFO = 1
VERBOSE_DEBUG = 2
VERBOSE_DEEP_DEBUG = 3

DEFAULT_VERBOSITY = VERBOSE_QUIET


###

class LineType:
    """Symbolic names for the indentation category of a single line."""
    NoIndent = 'NoIndent'
    SpaceOnly = 'SpaceOnly'
    TabOnly = 'TabOnly'
    Mixed = 'Mixed'
    BeginSpace = 'BeginSpace'


def info(s):
    """Print s when verbosity is at least VERBOSE_INFO."""
    log(VERBOSE_INFO, s)


def dbg(s):
    """Print s when verbosity is at least VERBOSE_DEBUG."""
    log(VERBOSE_DEBUG, s)


def deepdbg(s):
    """Print s when verbosity is at least VERBOSE_DEEP_DEBUG."""
    log(VERBOSE_DEEP_DEBUG, s)


def log(level, s):
    """Print s on stdout if level does not exceed IndentFinder.VERBOSITY."""
    if level <= IndentFinder.VERBOSITY:
        # Single-argument call form: behaves the same on Python 2 and 3.
        print(s)


class IndentFinder:
    """
    IndentFinder reports the indentation used in a source file. Its approach
    is not tied to any particular language. It was tested successfully with
    python, C, C++ and Java code.

    How does it work ?

    It scans each line of the entry file for a space character (white space or
    tab) repeated until a non space character is found. Such a line
    is considered to be a properly indented line of code. Blank lines and
    comment lines (starting with # or /* or * ) are ignored. Lines coming
    after a line ending in '\\' have a higher chance of being not properly
    indented, and are thus ignored too.

    Only the increments in indentation are fed in. Dedentation or maintaining
    the same indentation is not taken into account when analysing a file.

    Each line is analysed as:
    - NoIndent: the line starts with a non-blank character
    - SpaceOnly: indentation of 8 spaces or more
    - TabOnly: indentation of tab only
    - Mixed: indentation of tab, then less than 8 spaces
    - BeginSpace: indentation of less than 8 spaces, that could be either a
      mixed indentation or a pure space indentation.
    - non-significant

    Then two consecutive significant lines are considered. The only valid
    combinations are:
    - (NoIndent, BeginSpace)    => space or mixed
    - (NoIndent, SpaceOnly)     => space
    - (NoIndent, TabOnly)       => tab
    - (BeginSpace, BeginSpace)  => space or mixed
    - (BeginSpace, SpaceOnly)   => space
    - (BeginSpace, TabOnly)     => mixed
    - (SpaceOnly, SpaceOnly)    => space
    - (TabOnly, TabOnly)        => tab
    - (TabOnly, Mixed)          => mixed
    - (Mixed, TabOnly)          => mixed

    The increment in number of spaces is then recorded.

    At the end, the number of lines with space indentation, mixed space and
    tab indentation are compared and a decision is made.

    If no decision can be made, DEFAULT_RESULT is returned.

    If IndentFinder ever reports wrong indentation, send me immediately a
    mail, if possible with the offending file.
    """

    # Shared by all instances; main() raises it for each --verbose flag.
    VERBOSITY = DEFAULT_VERBOSITY

    def __init__(self, default_result=DEFAULT_RESULT):
        """Create a finder; default_result is returned when undecided."""
        self.clear()
        self.default_result = default_result

    def parse_file_list(self, file_list):
        """Accumulate indentation statistics over every file of file_list."""
        for fname in file_list:
            self.parse_file(fname)

    def parse_file(self, fname):
        """Feed every line of the file named fname to analyse_line().

        The file is closed even if the analysis raises.
        """
        with open(fname) as f:
            for line in f:
                self.analyse_line(line)

    def clear(self):
        """Reset all counters and per-line state collected so far."""
        self.lines = {}
        for i in range(2, 9):
            self.lines['space%d' % i] = 0
        for i in range(2, 9):
            self.lines['mixed%d' % i] = 0
        self.lines['tab'] = 0

        self.nb_processed_lines = 0
        self.nb_indent_hint = 0
        # leading blanks followed by the first non-blank token
        self.indent_re = re.compile("^([ \t]+)([^ \t]+)")
        # an indentation made of tabs followed by spaces, nothing else
        self.mixed_re = re.compile("^(\t+)( +)$")
        self.skip_next_line = False
        self.previous_line_info = None

    def analyse_line(self, line):
        """Analyse one line of text and update the statistics.

        Returns the key of the counter that was incremented ('tab',
        'spaceN' or 'mixedN'), or None when the line gave no hint.
        """
        if line[-1:] == '\n':
            line = line[:-1]
        deepdbg('analyse_line: "%s"'
                % line.replace(' ', '.').replace('\t', '\\t'))
        self.nb_processed_lines += 1

        skip_current_line = self.skip_next_line
        self.skip_next_line = False
        if line[-1:] == '\\':
            deepdbg('analyse_line: Ignoring next line!')
            # skip lines after lines ending in \
            self.skip_next_line = True

        if skip_current_line:
            deepdbg('analyse_line: Ignoring current line!')
            return None

        ret = self.analyse_line_indentation(line)
        if ret:
            self.nb_indent_hint += 1
        deepdbg('analyse_line: Result of line analysis: %s' % str(ret))
        return ret

    def analyse_line_type(self, line):
        '''Analyse the type of line and return a tuple describing it.

        The tuple is (LineType.NoIndent, ''), (LineType.TabOnly, indent),
        (LineType.BeginSpace, indent), (LineType.SpaceOnly, indent) or
        (LineType.Mixed, tab_part, space_part).

        The function rejects (returns None for) improperly formatted lines
        (mixture of tab and space for example), blank lines and comment
        lines.
        '''
        if len(line) > 0 and line[0] != ' ' and line[0] != '\t':
            return (LineType.NoIndent, '')

        mo = self.indent_re.match(line)
        if not mo:
            deepdbg('analyse_line_type: line is not indented')
            return None

        indent_part = mo.group(1)
        text_part = mo.group(2)

        deepdbg('analyse_line_type: indent_part="%s" text_part="%s"'
                % (indent_part.replace(' ', '.').replace('\t', '\\t')
                   .replace('\n', '\\n'),
                   text_part))

        if text_part[0] == '*':
            # continuation of a C/C++ comment, unlikely to be indented
            # correctly
            return None

        if text_part[0:2] == '/*' or text_part[0] == '#':
            # python, C/C++ comment, might not be indented correctly
            return None

        if '\t' in indent_part and ' ' in indent_part:
            # mixed mode
            mo = self.mixed_re.match(indent_part)
            if not mo:
                # line is not composed of '\t\t\t    ', ignore it
                return None
            tab_part = mo.group(1)
            space_part = mo.group(2)
            if len(space_part) >= 8:
                # this is not mixed mode, this is garbage !
                return None
            return (LineType.Mixed, tab_part, space_part)

        if '\t' in indent_part:
            return (LineType.TabOnly, indent_part)

        if ' ' in indent_part:
            if len(indent_part) < 8:
                # this could be mixed mode too
                return (LineType.BeginSpace, indent_part)
            # this is really a line indented with spaces
            return (LineType.SpaceOnly, indent_part)

        # indent_re guarantees indent_part holds a space or a tab; raise
        # explicitly so the check survives python -O.
        raise AssertionError('We should never get there !')

    def _record_increment(self, nb_space, *prefixes):
        """Count an indentation step of nb_space columns.

        Increments the '<prefix><nb_space>' counter for every prefix when
        1 < nb_space <= 8 and returns the first key; returns None otherwise.
        """
        if not 1 < nb_space <= 8:
            return None
        keys = ['%s%d' % (prefix, nb_space) for prefix in prefixes]
        for key in keys:
            self.lines[key] += 1
        return keys[0]

    def analyse_line_indentation(self, line):
        """Compare line with the previous significant line.

        Returns the key of the incremented counter, or None when the pair of
        lines is not a recognised indentation increment.
        """
        previous_line_info = self.previous_line_info
        current_line_info = self.analyse_line_type(line)
        self.previous_line_info = current_line_info

        if current_line_info is None or previous_line_info is None:
            deepdbg('analyse_line_indentation: Not enough line info to '
                    'analyse line: %s, %s'
                    % (str(previous_line_info), str(current_line_info)))
            return None

        t = (previous_line_info[0], current_line_info[0])
        deepdbg('analyse_line_indentation: Indent analysis: %s %s' % t)

        if t in ((LineType.TabOnly, LineType.TabOnly),
                 (LineType.NoIndent, LineType.TabOnly)):
            # only a step of exactly one tab counts
            if len(current_line_info[1]) - len(previous_line_info[1]) == 1:
                self.lines['tab'] += 1
                return 'tab'

        elif t in ((LineType.SpaceOnly, LineType.SpaceOnly),
                   (LineType.BeginSpace, LineType.SpaceOnly),
                   (LineType.NoIndent, LineType.SpaceOnly)):
            nb_space = len(current_line_info[1]) - len(previous_line_info[1])
            return self._record_increment(nb_space, 'space')

        elif t in ((LineType.BeginSpace, LineType.BeginSpace),
                   (LineType.NoIndent, LineType.BeginSpace)):
            # fewer than 8 spaces: cannot tell space from mixed, count both
            nb_space = len(current_line_info[1]) - len(previous_line_info[1])
            return self._record_increment(nb_space, 'space', 'mixed')

        elif t == (LineType.BeginSpace, LineType.TabOnly):
            # we assume that mixed indentation used 8 characters tabs
            if len(current_line_info[1]) == 1:
                # more than one tab on the line --> not mixed mode !
                nb_space = (len(current_line_info[1]) * 8
                            - len(previous_line_info[1]))
                return self._record_increment(nb_space, 'mixed')

        elif t == (LineType.TabOnly, LineType.Mixed):
            tab_part, space_part = current_line_info[1:3]
            if len(previous_line_info[1]) == len(tab_part):
                return self._record_increment(len(space_part), 'mixed')

        elif t == (LineType.Mixed, LineType.TabOnly):
            tab_part, space_part = previous_line_info[1:3]
            if len(tab_part) + 1 == len(current_line_info[1]):
                # the spaces were completed up to the next 8-column tab
                return self._record_increment(8 - len(space_part), 'mixed')

        return None

    def _dominant_indent(self, prefix):
        """Return the most likely indent width among '<prefix>2'..'<prefix>8'.

        Widths are scanned from 8 down to 2; a smaller width only replaces
        the current candidate if its count exceeds it by more than 10%.
        Returns None when every counter is zero.
        """
        nb = 0
        indent_value = None
        for i in range(8, 1, -1):
            count = self.lines['%s%d' % (prefix, i)]
            if count > int(nb * 1.1):  # give a 10% threshold
                indent_value = i
                nb = count
        return indent_value

    def results(self):
        """Return the detected indentation.

        The result is ('space', N), ('tab', DEFAULT_TAB_WIDTH),
        ('mixed', (8, N)) or self.default_result when no decision can be
        made.
        """
        dbg("Nb of scanned lines : %d" % self.nb_processed_lines)
        dbg("Nb of indent hint : %d" % self.nb_indent_hint)
        dbg("Collected data:")
        for key in self.lines:
            if self.lines[key] > 0:
                dbg('%s: %d' % (key, self.lines[key]))

        max_line_space = max([self.lines['space%d' % i]
                              for i in range(2, 9)])
        max_line_mixed = max([self.lines['mixed%d' % i]
                              for i in range(2, 9)])
        max_line_tab = self.lines['tab']

        dbg('max_line_space: %d' % max_line_space)
        dbg('max_line_mixed: %d' % max_line_mixed)
        dbg('max_line_tab: %d' % max_line_tab)

        ### Result analysis
        #
        # 1. Space indented file
        #    - lines indented with less than 8 space will fill mixed and
        #      space array
        #    - lines indented with 8 space or more will fill only the space
        #      array
        #    - almost no lines indented with tab
        #
        # => more lines with space than lines with mixed
        # => a lot more lines with space than tab
        #
        # 2. Tab indented file
        #    - most lines will be tab only
        #    - very few lines as mixed
        #    - very few lines as space only
        #
        # => a lot more lines with tab than lines with mixed
        # => a lot more lines with tab than lines with space
        #
        # 3. Mixed tab/space indented file
        #    - some lines are tab-only (lines with exactly 8 step indentation)
        #    - some lines are space only (less than 8 space)
        #    - all other lines are mixed
        #
        #    If mixed is tab + 2 space indentation:
        #    - a lot more lines with mixed than with tab
        #    If mixed is tab + 4 space indentation
        #    - as many lines with mixed than with tab
        #
        #    If no lines exceed 8 space, there will be only lines with space
        #    and tab but no lines with mixed. Impossible to detect mixed
        #    indentation in this case, the file looks like it's actually
        #    indented as space only and will be detected so.
        #
        # => same or more lines with mixed than lines with tab only
        # => same or more lines with mixed than lines with space only
        #

        # Detect space indented file
        if max_line_space >= max_line_mixed and max_line_space > max_line_tab:
            indent_value = self._dominant_indent('space')
            if indent_value is None:  # no lines
                result = self.default_result
            else:
                result = ('space', indent_value)

        # Detect tab files
        elif max_line_tab > max_line_mixed and max_line_tab > max_line_space:
            result = ('tab', DEFAULT_TAB_WIDTH)

        # Detect mixed files
        elif (max_line_mixed >= max_line_tab
              and max_line_mixed > max_line_space):
            indent_value = self._dominant_indent('mixed')
            if indent_value is None:  # no lines
                result = self.default_result
            else:
                result = ('mixed', (8, indent_value))

        else:
            # not enough information to make a decision
            result = self.default_result

        info("Result: %s" % str(result))
        return result

    def __str__(self):
        """Return 'space N', 'tab N' or 'mixed tab N space M'."""
        itype, ival = self.results()
        if itype != 'mixed':
            return '%s %d' % (itype, ival)
        itab, ispace = ival
        return '%s tab %d space %d' % (itype, itab, ispace)

    def vim_output(self):
        """Return the vim command line matching the detected indentation.

        Returns None if results() yields an indentation type other than
        'space', 'tab' or 'mixed'.
        """
        indent_type, n = self.results()
        if indent_type == "space":
            # spaces:
            #   => set sts to the number of spaces
            #   => set tabstop to the number of spaces
            #   => expand tabs to spaces
            #   => set shiftwidth to the number of spaces
            return ("set sts=%d | set tabstop=%d | set expandtab "
                    "| set shiftwidth=%d \" (%s %d)"
                    % (n, n, n, indent_type, n))

        elif indent_type == "tab":
            # tab:
            #   => set sts to 0
            #   => set tabstop to preferred value
            #   => set expandtab to false
            #   => set shiftwidth to tabstop
            return ("set sts=0 | set tabstop=%d | set noexpandtab "
                    "| set shiftwidth=%d \" (%s)"
                    % (DEFAULT_TAB_WIDTH, DEFAULT_TAB_WIDTH, indent_type))

        elif indent_type == 'mixed':
            tab_indent, space_indent = n
            # mixed:
            #   => set tabstop to tab_indent
            #   => set expandtab to false
            #   => set shiftwidth to space_indent
            # NOTE(review): the original comment said "set sts to 0" but the
            # emitted command uses sts=4; output kept unchanged -- confirm
            # which one is intended.
            return ("set sts=4 | set tabstop=%d | set noexpandtab "
                    "| set shiftwidth=%d \" (%s %d)"
                    % (tab_indent, space_indent, indent_type, space_indent))

        return None


def main():
    """Command line entry point: report the indentation of the given files."""
    vim_output = False
    file_list = []

    for opt in sys.argv[1:]:
        if opt == "--vim-output":
            vim_output = True
        elif opt == "--verbose" or opt == '-v':
            IndentFinder.VERBOSITY += 1
        elif opt == "--version":
            print('IndentFinder v%s' % VERSION)
            return
        elif opt.startswith("-"):
            # startswith() is safe on an empty argument, unlike opt[0]
            print(help % sys.argv[0])
            return
        else:
            file_list.append(opt)

    fi = IndentFinder()

    if len(file_list) > 1:
        # multiple files: analyse each one separately, prefix with its name
        for fname in file_list:
            fi.clear()
            fi.parse_file(fname)
            if vim_output:
                print("%s : %s" % (fname, fi.vim_output()))
            else:
                print("%s : %s" % (fname, str(fi)))
        return

    # only one file, don't print filename
    fi.parse_file_list(file_list)
    if vim_output:
        sys.stdout.write(fi.vim_output())
    else:
        print(str(fi))


if __name__ == "__main__":
    main()