-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.cpp
131 lines (113 loc) · 4.49 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#include <algorithm>
#include <cctype>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
// Returns a copy of `str` in which every character has been passed through
// std::tolower. Each character is widened via unsigned char first, so bytes
// outside the ASCII range are handed to std::tolower as valid arguments.
std::string toLowerCase(const std::string &str)
{
    std::string result;
    result.reserve(str.size());
    for (const unsigned char ch : str)
    {
        result.push_back(static_cast<char>(std::tolower(ch)));
    }
    return result;
}
// Returns true when `str` contains at least one decimal digit character.
// An empty string yields false.
bool containsNumber(const std::string &str)
{
    // std::isdigit requires a value representable as unsigned char (or EOF);
    // passing a plain, possibly negative char (e.g. a UTF-8 byte) is undefined
    // behaviour, so each character is converted through unsigned char first.
    return std::any_of(str.begin(), str.end(),
                       [](unsigned char c) { return std::isdigit(c) != 0; });
}
// Removes leading and trailing punctuation characters from `word` in place.
// A word consisting solely of punctuation becomes empty.
static void stripEdgePunctuation(std::string &word)
{
    const auto is_not_punct = [](unsigned char c) { return !std::ispunct(c); };
    word.erase(word.begin(), std::find_if(word.begin(), word.end(), is_not_punct));
    word.erase(std::find_if(word.rbegin(), word.rend(), is_not_punct).base(), word.end());
}

// Finds every URL in `line`, stores it in `urls` and replaces it in `line`
// with a single space so that it is not counted as a word afterwards.
static void extractUrls(std::string &line, const std::regex &url_pattern, std::set<std::string> &urls)
{
    std::smatch match;
    while (std::regex_search(line, match, url_pattern))
    {
        std::string url = match.str();
        // The pattern also swallows a sentence-ending dot. Drop it before the
        // URL is stored so that "www.a.com." and "www.a.com" count as one link.
        if (!url.empty() && url.back() == '.')
        {
            url.pop_back();
        }
        urls.insert(url);
        line = match.prefix().str() + " " + match.suffix().str();
    }
}

// Program entry point.
// 1. Lists the .txt files in the current directory and asks for a file name
//    until a file can be opened (exits with status 1 if input ends first).
// 2. Reads the file line by line, collecting URLs and counting every word
//    (lowercased, edge punctuation removed, words containing digits skipped)
//    together with the line numbers it was found on.
// 3. Writes the links and all words that occur more than once to
//    rezultatai.txt and prints a short summary to standard output.
// Returns 0 on success, 1 when no input file name could be read or the
// result file could not be created.
int main()
{
    // Listing the directory is only a convenience for the user, so a failure
    // here is reported but does not stop the program.
    std::cout << "Available .txt files in the current directory: \n";
    try
    {
        for (const auto &entry : std::filesystem::directory_iterator(std::filesystem::current_path()))
        {
            if (entry.path().extension() == ".txt")
                std::cout << entry.path().filename() << "\n";
        }
    }
    catch (const std::filesystem::filesystem_error &error)
    {
        std::cerr << "Could not list the current directory: " << error.what() << std::endl;
    }

    // Ask for a file name until one can be opened.
    std::string file_name;
    std::ifstream in_file;
    do {
        std::cout << "\nEnter file name: ";
        if (!(std::cin >> file_name))
        {
            // End of input (or a broken stream): asking again would loop forever.
            std::cerr << "No file name provided, exiting" << std::endl;
            return 1;
        }
        in_file.open(file_name);
        if (!in_file)
        {
            std::cerr << "Error opening file, try again" << std::endl;
        }
    } while (!in_file);

    // word -> (occurrence count, line numbers); std::map keeps the words sorted alphabetically
    std::map<std::string, std::pair<int, std::vector<int>>> word_counter;
    std::set<std::string> urls;
    const std::regex url_pattern(R"(((http|https)://|www\.)[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]*)"); // regex for urls

    std::string line;
    int line_number = 1;
    while (std::getline(in_file, line)) // read file line by line
    {
        extractUrls(line, url_pattern, urls);

        std::istringstream iss(line);
        std::string word;
        while (iss >> word) // read line word by word
        {
            stripEdgePunctuation(word);
            word = toLowerCase(word);
            if (word.empty() || containsNumber(word)) // skip empty words and words with numbers
            {
                continue;
            }
            auto &stats = word_counter[word]; // single lookup for both updates
            ++stats.first;
            stats.second.push_back(line_number);
        }
        ++line_number;
    }
    in_file.close();

    std::ofstream out_file("rezultatai.txt");
    if (!out_file)
    {
        std::cerr << "Error creating rezultatai.txt" << std::endl;
        return 1;
    }

    out_file << "LINKS (" << urls.size() << ")\n";
    for (const auto &url : urls)
    {
        out_file << url << "\n";
    }
    out_file << "\n";

    // Table layout: word column, " | ", count column, " | ", line numbers.
    const int word_width = 20;
    const int count_width = 5;
    const std::string separator = " | ";
    const int numbers_per_row = 10;
    // Continuation rows start where the line-number column starts.
    const std::string continuation_indent(
        word_width + count_width + 2 * separator.size(), ' ');

    out_file << std::left << "WORD COUNTER\n";
    out_file << std::setw(word_width) << "Word" << separator << std::setw(count_width) << "Count" << separator << "Found in rows\n";
    out_file << "--------------------------------------------------------------\n";

    int word_count = 0; // number of words that occur more than once
    for (const auto &[word, stats] : word_counter)
    {
        if (stats.first <= 1)
        {
            continue;
        }
        ++word_count;
        out_file << std::setw(word_width) << word << separator << std::setw(count_width) << stats.first << separator;
        int printed = 0;
        for (const int found_on : stats.second)
        {
            if (printed != 0 && printed % numbers_per_row == 0) // wrap after every 10 line numbers
                out_file << "\n" << continuation_indent;
            out_file << found_on << " ";
            ++printed;
        }
        out_file << "\n--------------------------------------------------------------\n";
    }
    out_file.close();

    std::cout << "Number of links: " << urls.size() << std::endl;
    std::cout << "Number of unique words (count >= 2): " << word_count << std::endl;
    std::cout << "Check rezultatai.txt for results\n" << std::endl;
    return 0;
}