
Python 3 ID3 Decision Tree Implementation for Determining Whether a Loan Application Is Successful

1. Defining the tree-building functions

# -*- coding: utf-8 -*-
# Functions for building the decision tree

from numpy import * 
import numpy as np
import pandas as pd
from math import log 
import operator 

# Calculate the information entropy of a dataset (information entropy is called Shannon entropy in machine-learning practice)
def calcInfoEnt(dataSet): # the class label (here, whether the loan is approved) sits in the last column of dataSet; every other column is an attribute
 numEntries = len(dataSet) # Each line is a sample
 labelCounts = {} # Create a dictionary labelCounts for all possible categories.
 for featVec in dataSet: # Loop by row: i.e. rowVev takes every row in the dataset
  currentLabel = featVec[-1] # featVec[-1] takes the last value of each row, i.e. the label
  if currentLabel not in labelCounts: # if the current label is not yet in the dictionary
   labelCounts[currentLabel] = 0 # create the key and initialise its count to 0
  labelCounts[currentLabel] += 1 # count the number of samples per label (this line is not inside the if)
 InfoEnt = 0.0
 for key in labelCounts: # Iterate over each type of Label
  prob = float(labelCounts[key])/numEntries #Entropy accumulation for each type of Label
  InfoEnt -= prob * log(prob,2) # the Shannon entropy formula used by ID3
 return InfoEnt
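
# Optional sanity check on a toy label list (made-up values, not the loan data):
# a node holding 9 positive and 5 negative samples should give an entropy of about 0.940.
# print(calcInfoEnt([['yes']] * 9 + [['no']] * 5))   # expected: roughly 0.9403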

### For a discrete feature: take all samples where the value of the feature is value.
def splitDiscreteDataSet(dataSet, axis, value): #dataSet is the set of current nodes (to be divided), axis indicates the attribute on which the division is based, value is the value used for the division.
 retDataSet = []  # list that will hold the matching samples to be returned
 for featVec in dataSet:
  if featVec[axis] == value:
   reducedFeatVec = featVec[:axis]   # keep the features that come before this feature
   reducedFeatVec.extend(featVec[axis+1:]) # keep the features that come after this feature
   retDataSet.append(reducedFeatVec)  # add this sample to the list
 return retDataSet

### For continuous features: return all samples where the feature has a value greater than value (split the set into two parts using value as a threshold).
def splitContinuousDataSet(dataSet, axis, value): 
 retDataSetG = []  # will store samples with values greater than value
 retDataSetL = []  # will store samples with values less than or equal to value
 for featVec in dataSet: 
  if featVec[axis] > value: 
   reducedFeatVecG = featVec[:axis]
   reducedFeatVecG.extend(featVec[axis+1:])
   retDataSetG.append(reducedFeatVecG)  # add the sample to the "greater than" set
  else:
   reducedFeatVecL = featVec[:axis]
   reducedFeatVecL.extend(featVec[axis+1:])
   retDataSetL.append(reducedFeatVecL)  # add the sample to the "less than or equal" set
 return retDataSetG,retDataSetL  # Returns two sets, in the form of a tuple with two elements.
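
# Toy usage of the two split functions (values are made up for illustration only):
# rows = [['A', 1.2, 'yes'], ['B', 0.4, 'no'], ['A', 0.9, 'yes']]
# splitDiscreteDataSet(rows, 0, 'A')    # -> [[1.2, 'yes'], [0.9, 'yes']]
# splitContinuousDataSet(rows, 1, 1.0)  # -> ([['A', 'yes']], [['B', 'no'], ['A', 'yes']])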

### Choose the best current segmentation feature based on InfoGain (and for continuous variables also choose what value to segment by)
def chooseBestFeatureToSplit(dataSet,labels): 
 numFeatures = len(dataSet[0])-1
 baseEntropy = calcInfoEnt(dataSet) 
 bestInfoGain = 0.0; bestFeature = -1
 bestSplitDict = {}
 for i in range(numFeatures):
  #Iterate over all features: the following line takes the i-th of each row, which is the value of the i-th feature of all samples in the current set.
  featList = [example[i] for example in dataSet]
  #Determine if a feature is discrete
  if not (type(featList[0]).__name__=='float' or type(featList[0]).__name__=='int'): 
   # For a discrete feature: compute the information gain obtained by splitting on this feature
   uniqueVals = set(featList)  # Create a set from a list (get the value of the unique element of the list)
   newEntropy = 0.0
   for value in uniqueVals:  # Iterate through each value of the discrete feature
    subDataSet = splitDiscreteDataSet(dataSet, i, value)# Calculate the information entropy for each value taken
    prob = len(subDataSet)/float(len(dataSet))
    newEntropy += prob * calcInfoEnt(subDataSet)# Entropy accumulation for each value taken
   infoGain = baseEntropy - newEntropy # information gain obtained by splitting on this feature
  # For a continuous feature: compute the information gain when splitting on it (difference: n values yield n-1 candidate split points, from which the best one must be chosen)
  else: # generate the n-1 candidate split points and evaluate each of them
   sortfeatList=sorted(featList) 
   splitList=[] 
   for j in range(len(sortfeatList)-1): # Generate n-1 candidate division points
    splitList.append((sortfeatList[j] + sortfeatList[j+1])/2.0)
   bestSplitEntropy = 10000     #Set a very large entropy value (to be used later)
   #Iterate over n-1 candidate points: find the entropy increase of the jth candidate point, and select the best point.
   for j in range(len(splitList)):
    value = splitList[j] 
    newEntropy = 0.0 
    DataSet = splitContinuousDataSet(dataSet, i, value)
    subDataSetG = DataSet[0]
    subDataSetL = DataSet[1] 
    probG = len(subDataSetG) / float(len(dataSet)) 
    newEntropy += probG * calcInfoEnt(subDataSetG) 
    probL = len(subDataSetL) / float(len(dataSet)) 
    newEntropy += probL * calcInfoEnt(subDataSetL)
    if newEntropy < bestSplitEntropy: 
     bestSplitEntropy = newEntropy
     bestSplit = j
   bestSplitDict[labels[i]] = splitList[bestSplit]# Dictionary to record the current optimal division point for consecutive attributes
   infoGain = baseEntropy - bestSplitEntropy  # information gain when splitting on this feature at its best split point
  # Among all features (continuous and discrete), select the one that gives the largest information gain
  if infoGain > bestInfoGain: 
   bestInfoGain = infoGain
   bestFeature = i
 #If the best segmentation feature of the current node is a continuous feature, it should be binarized according to "whether it is less than or equal to its best segmentation point".
 # i.e. change the feature to "whether it is less than or equal to bestSplitValue", e.g. change "Density" to "Density<=0.3815".
 #Note: The following paragraph directly manipulates the original dataSet data, and the previous float values become 0 and 1 accordingly.
 #[Why do this?] At the end of the function createTree() you will see the explanation
 if type(dataSet[0][bestFeature]).__name__=='float' or type(dataSet[0][bestFeature]).__name__=='int':  
  bestSplitValue = bestSplitDict[labels[bestFeature]] 
  labels[bestFeature] = labels[bestFeature] + '<=' + str(bestSplitValue)
  for i in range(shape(dataSet)[0]): 
   if dataSet[i][bestFeature] <= bestSplitValue: 
    dataSet[i][bestFeature] = 1 
   else: 
    dataSet[i][bestFeature] = 0
 return bestFeature  
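
# Illustration of the binarisation above with made-up numbers: if a continuous attribute
# "income" gets 3.5 as its best split point, its label becomes "income<=3.5" and every
# value in that column is rewritten as 1 (value <= 3.5) or 0 (value > 3.5).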

# If all features have been used up but the samples at the node still don't all share one label, a vote is needed: count the samples of each label and take the majority.
def majorityCnt(classList): 
 classCount = {}  # will create a dictionary with keys of type Label
 for vote in classList: 
  if vote not in classCount: 
   classCount[vote] = 0  # First occurrence of Label added to dictionary
  classCount[vote] += 1  # Counting
 return max(classCount, key=classCount.get)  # return the label with the highest count
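
As a quick, hedged check of the functions above, the snippet below runs them on a tiny hand-made dataset (the column names and values are invented for illustration and are not the loan data); with these rows the discrete 'age' column should be picked as the best split:

toyData = [['young',  1.5, 'no'],
           ['young',  3.0, 'no'],
           ['middle', 2.5, 'yes'],
           ['old',    4.0, 'yes'],
           ['old',    0.5, 'no']]
toyLabels = ['age', 'income']
print(majorityCnt(['no', 'no', 'yes']))              # -> 'no'
print(chooseBestFeatureToSplit(toyData, toyLabels))  # -> 0, i.e. the hypothetical 'age' column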

2. Recursive generation of decision trees

# Main program: recursive generation of decision trees
 # dataSet: the dataset used to build the current subtree. It starts out as data_full and shrinks with every split, because each recursive call works on one branch of the tree. In the original watermelon example, all 17 melons sit at the root node; the first bestFeat chosen is texture, whose values are clear, slightly blurry and blurry, so the melons are divided into groups of 9, 5 and 3, and each group is then split further with one attribute fewer.
 # labels: the attribute names still available for the current dataset (kept separately because some attribute values may no longer appear in the current subset, e.g. at some node all remaining melons may be light white and none dark green)
 # data_full: the complete dataset
 # labels_full: all attribute names

numLine = numColumn = 2 # defined at module level because createTree() below declares "global numLine" ...... as for why global is needed here,

# I don't fully understand it myself: with plain local variables it kept raising an error, so I resorted to globals inside the function. I'm still looking for a better solution.

def createTree(dataSet,labels,data_full,labels_full): 
 classList = [example[-1] for example in dataSet] 
 # Recursive stop condition 1: all samples of the current node belong to the same class; (Note: the count() method counts the number of times an element appears in the list)
 if classList.count(classList[0]) == len(classList): 
  return classList[0]
 
# Recursive stop condition 2: the set of samples on the current node is the empty set (i.e., there are no more samples at a given value of the feature):
 global numLine,numColumn
 (numLine,numColumn) = shape(dataSet)
 if float(numLine) == 0: 
  return 'empty'
 
# Recursive stop condition 3: all features available for segmentation are used, then majorityCnt() is called to vote on Label;
 if float(numColumn) == 1: 
  return majorityCnt(classList) 
 
#Continue to divide when you don't stop:
 bestFeat = chooseBestFeatureToSplit(dataSet,labels)# Call the function to find out which is the current best segmentation feature
 bestFeatLabel = labels[bestFeat]  # Current best segmentation characteristics
 myTree = {bestFeatLabel:{}} 
 featValues = [example[bestFeat] for example in dataSet] 
 uniqueVals = set(featValues) 
 if type(dataSet[0][bestFeat]).__name__=='str': 
  currentlabel = labels_full.index(labels[bestFeat]) 
  featValuesFull = [example[currentlabel] for example in data_full] 
  uniqueValsFull = set(featValuesFull) 
 del(labels[bestFeat]) #After segmentation, the current feature has been used, so it is deleted from the "feature set to be segmented".

 # [Recursive call] Divide a subtree for each value of the feature (beatFeat) currently used for division.
 for value in uniqueVals: # Iterate over the [existing] values of the feature
  subLabels = labels[:] 
  if type(dataSet[0][bestFeat]).__name__=='str': 
   uniqueValsFull.remove(value)  # this value has now been handled, so remove it from uniqueValsFull
  myTree[bestFeatLabel][value] = createTree(splitDiscreteDataSet(dataSet,bestFeat,value),subLabels,data_full,labels_full)# use splitDiscreteDataSet()
 # splitDiscreteDataSet() can be used in the call above because chooseBestFeatureToSplit() has already turned any continuous feature into the discrete values 0/1.
 if type(dataSet[0][bestFeat]).__name__=='str': # if the feature is discrete [see the comments below]
  for value in uniqueValsFull: # there may be values of this feature that no longer appear in the current subset;
   # that is why the loop above removed every handled value from uniqueValsFull:
   # only the values never seen in the current subset remain, and each of them
   myTree[bestFeatLabel][value] = majorityCnt(classList) # gets a leaf labelled by a majority vote over the current samples

 return myTree 
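
createTree() returns the tree as nested dictionaries: each key is an attribute name (or an "attribute<=threshold" string for a binarised continuous attribute), each sub-key is an attribute value, and the leaves are class labels. A purely hypothetical shape, with invented attribute names rather than the real loan output, might look like this:

# {'has_house': {'yes': 'approve',
#                'no': {'income<=3.5': {1: 'reject', 0: 'approve'}}}}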

3. Calling the tree-building function

# Statements that call the tree-building function
df = pd.read_excel(r'E:\BaiduNetdiskDownload\spspss\data\experimental data\bank loans.xlsx') 
data = df.values[:,1:].tolist()  # drop the first column; each remaining row is [attribute values ..., label]
data_full = data[:] 
labels = df.columns.values[1:-1].tolist()  # attribute names (the first column and the final label column are excluded)
labels_full = labels[:] 
myTree = createTree(data,labels,data_full,labels_full) 
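
If the Excel file is not at hand, the same call can be exercised on a small hand-made DataFrame with the layout the slicing above assumes (a first column that is skipped, then the attribute columns, with the class label in the last column). The column names and rows below are invented purely for illustration:

toy_df = pd.DataFrame({'id':        [1, 2, 3, 4],
                       'has_house': ['yes', 'no', 'no', 'yes'],
                       'income':    [8.2, 3.1, 4.7, 6.0],
                       'approved':  ['yes', 'no', 'no', 'yes']})
toy_data = toy_df.values[:, 1:].tolist()
toy_labels = toy_df.columns.values[1:-1].tolist()
toy_tree = createTree(toy_data, toy_labels, toy_data[:], toy_labels[:])
print(toy_tree)  # with these made-up rows the tree should split just once, on 'has_house'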

View the data

data

(screenshot of the data output omitted)

labels

(screenshot of the labels output omitted)

4. Plotting the decision tree

# Functions for plotting the decision tree
import matplotlib.pyplot as plt 
decisionNode = dict(boxstyle = "sawtooth",fc = "0.8") # Define branch point styles
leafNode = dict(boxstyle = "round4",fc = "0.8") # Define the style of leaf nodes
arrow_args = dict(arrowstyle = "<-") #Define the arrow logo style

# Calculate the number of leaf nodes in the tree
def getNumLeafs(myTree):
 numLeafs = 0 
 firstStr = list(myTree.keys())[0]
 secondDict = myTree[firstStr]
 for key in secondDict.keys(): 
  if type(secondDict[key]).__name__=='dict': 
   numLeafs += getNumLeafs(secondDict[key]) 
  else: numLeafs += 1
 return numLeafs

# Calculate the maximum depth of the tree
def getTreeDepth(myTree): 
 maxDepth = 0 
 firstStr = list(myTree.keys())[0] 
 secondDict = myTree[firstStr] 
 for key in secondDict.keys(): 
  if type(secondDict[key]).__name__=='dict': 
   thisDepth = 1 + getTreeDepth(secondDict[key]) 
  else: thisDepth = 1 
  if thisDepth > maxDepth: 
   maxDepth = thisDepth
 return maxDepth 

# Draw the nodes
def plotNode(nodeTxt,centerPt,parentPt,nodeType): 
 createPlot.ax1.annotate(nodeTxt,xy = parentPt,xycoords = 'axes fraction',xytext = centerPt,textcoords = 'axes fraction',va = "center", ha = "center",bbox = nodeType,arrowprops = arrow_args) 

# Mark the text on the arrow
def plotMidText(cntrPt,parentPt,txtString): 
 lens = len(txtString) 
 xMid = (parentPt[0] + cntrPt[0]) / 2.0 - lens*0.002 
 yMid = (parentPt[1] + cntrPt[1]) / 2.0 
 createPlot.ax1.text(xMid,yMid,txtString) 

def plotTree(myTree,parentPt,nodeTxt): 
 numLeafs = getNumLeafs(myTree) 
 depth = getTreeDepth(myTree) 
 firstStr = list(myTree.keys())[0] 
 cntrPt = (plotTree.x0ff + (1.0 + float(numLeafs))/2.0/plotTree.totalW,plotTree.y0ff) 
 plotMidText(cntrPt,parentPt,nodeTxt) 
 plotNode(firstStr,cntrPt,parentPt,decisionNode) 
 secondDict = myTree[firstStr] 
 plotTree.y0ff = plotTree.y0ff - 1.0/plotTree.totalD 
 for key in secondDict.keys(): 
  if type(secondDict[key]).__name__=='dict': 
   plotTree(secondDict[key],cntrPt,str(key)) 
  else: 
   plotTree.x0ff = plotTree.x0ff + 1.0/plotTree.totalW 
   plotNode(secondDict[key],(plotTree.x0ff,plotTree.y0ff),cntrPt,leafNode) 
   plotMidText((plotTree.x0ff,plotTree.y0ff),cntrPt,str(key)) 
 plotTree.y0ff = plotTree.y0ff + 1.0/plotTree.totalD 

def createPlot(inTree): 
 fig = plt.figure(1,facecolor = 'white') 
 fig.clf()  # clear the current figure
 axprops = dict(xticks = [],yticks = []) 
 createPlot.ax1 = plt.subplot(111,frameon = False,**axprops) 
 plotTree.totalW = float(getNumLeafs(inTree))  # total width of the tree = number of leaf nodes
 plotTree.totalD = float(getTreeDepth(inTree))  # total depth of the tree
 plotTree.x0ff = -0.5/plotTree.totalW 
 plotTree.y0ff = 1.0 
 plotTree(inTree,(0.5,1.0),'') 
 plt.show()

5. Calling the plotting function

# The following command plots the decision tree
createPlot(myTree)

myTree

Summary

This concludes the article on Python 3 ID3 decision tree code for determining whether a loan application is approved. For more on Python ID3 decision trees, search my earlier articles or continue browsing the related articles below, and I hope you will keep supporting me.