Difference between revisions of "Python Data Mining"

From rbachwiki
Jump to navigation Jump to search
(Created page with "==Analyze csv files using pandas== import pandas as pd df=pd.read_csv('filename.csv) df.shape # prints the number of columns and rows df.info() # prints out the names and...")
 
Line 1: Line 1:
==Analyze csv files using pandas==
==Analyze csv files using pandas==
import pandas as pd
<pre>
df=pd.read_csv('filename.csv)
 
df.shape # prints the number of columns and rows
import pandas as pd
df.info() # prints out the names and datatypes of all the columns
 
 
df = pd.read_csv('survey_results_public.csv')
schema_df = pd.read_csv('survey_results_schema.csv')
 
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)
 
df[['OpenSourcer', 'Employment']].tail(2)
 
df.shape # prints out the number of rows and columns
 
schema_df.tail(2)
 
df.columns # gives you the name of the columns
 
df.iloc[[0,1]] # returns the first  and 2nd records with all columns
# the inner list [0,1] selects the rows; with no column selector, all columns are returned
 
 
df.iloc[0:3,[5,6,7]] # returns the first three records (positions 0, 1, 2) with specific columns
# the slice 0:3 selects the rows (end excluded) and the list [5,6,7] selects the columns
#iloc uses integer positions
 
#loc uses labels
df.loc[0:10, 'Hobbyist':'Country']
 
#loc uses labels
df.loc[0:10, ['Hobbyist','Student','Country']]
 
# get a count of yes and no responses
df['Hobbyist'].value_counts()
</pre>

Revision as of 21:48, 10 January 2020

Analyze csv files using pandas


import pandas as pd


# Load the survey responses and the accompanying schema file into DataFrames.
df = pd.read_csv('survey_results_public.csv')
schema_df = pd.read_csv('survey_results_schema.csv')

# Raise the display limits so that up to 85 columns and 85 rows are shown
# before pandas truncates the output.
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

# A list inside the brackets selects several columns by name; tail(2) keeps the last 2 rows.
df[['OpenSourcer', 'Employment']].tail(2)

df.shape # tuple of (number of rows, number of columns)

# Last 2 rows of the schema DataFrame.
schema_df.tail(2)

df.columns # gives you the names (labels) of the columns

# iloc selects by integer position: df.iloc[rows, columns]
df.iloc[[0,1]] # returns the 1st and 2nd records with all columns
# the inner list [0,1] selects the rows; with no column selector, all columns are returned


df.iloc[0:3,[5,6,7]] # returns the first three records (positions 0, 1, 2) with specific columns
# the slice 0:3 selects the rows (end excluded) and the list [5,6,7] selects the columns by position

# loc selects by label; unlike iloc, a label slice includes BOTH endpoints
# rows labelled 0 through 10, columns from 'Hobbyist' through 'Country'
df.loc[0:10, 'Hobbyist':'Country']

# loc with a list of labels returns only the named columns
df.loc[0:10, ['Hobbyist','Student','Country']]

# count how many times each distinct value occurs (e.g. yes / no responses)
df['Hobbyist'].value_counts()